## Current Work (03/30)
* Load in BERT base model from Transformers
* Build classification head as modeled in Transformers
* Load in our datasets and tokenize
* Fine-tune classification head on our data
* Evaluate performance

#### Sources

* [HuggingFace Transformers models for PyTorch - BERT](https://github.com/huggingface/transformers/blob/94ae1ba5b55e79ba766582de8a199d8ccf24a021/src/transformers/models/bert/modeling_bert.py#L957)
* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805)
* [Word Representations: A Simple and General Method for Semi-Supervised Learning](https://www.researchgate.net/publication/220873681_Word_Representations_A_Simple_and_General_Method_for_Semi-Supervised_Learning)
* [HuggingFace documentation for Fine Tuning Transformer for MultiLabel Text Classification](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb#scrollTo=mZ7lTlkyaG7u)

In [115]:
# packages
import csv
import math
import numpy as np
import torch
from transformers.models.bert import BertModel
from transformers.models.bert.configuration_bert import BertConfig
from transformers import BertTokenizer

In [116]:
# constants
DEBUG = True
cfg = BertConfig()
BATCH_SIZE = 16

if not torch.cuda.is_available():
    print('GPU not available, running script on CPU..')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [174]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.data = df.review
        self.labels = df.sentiment
        self.tokenizer = tokenizer
        self.max_length = cfg.max_length

        #self.classes = ["positive", "negative"]
        self.classes = [0, 1]
    
    def get_num_classes(self):
        return len(self.classes)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        data = str(self.data[idx])
        data = " ".join(data.split())

        inputs = self.tokenizer.encode_plus(
            data,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.target_transform(idx), dtype=torch.long)
        }

    def target_transform(self, idx):
        if self.labels[idx] == 'positive':
            #return [0, 1]
            return 1
        elif self.labels[idx] == 'negative':
            #return [1, 0]
            return 0
        else:
            print('ERROR')

In [175]:
# load and split data functions
import pandas as pd
def load_data_from_file(filepath):
    print('...loading dataset from file')
    
    df = pd.read_csv(filepath)
    #print(df.head())
    df['sentiment'] = df[df.columns[0]].values.tolist()
    new_df = df[['review', 'sentiment']].copy()
    #print(new_df.head())

    return new_df

In [176]:
new_df = load_data_from_file('/kaggle/input/archeage/archeage.csv')

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_set = SentimentDataset(train_dataset, tokenizer)

train_params = {'batch_size': BATCH_SIZE, 'shuffle': True}

train_dataloader = torch.utils.data.DataLoader(train_set, **train_params)

...loading dataset from file
FULL Dataset: (1718, 2)
TRAIN Dataset: (1374, 2)
TEST Dataset: (344, 2)


In [79]:
# create dataloaders
archeage_data, archeage_labels = load_data_from_file('/kaggle/input/archeage/archeage.csv')
ntua_data, ntua_labels = load_data_from_file('/kaggle/input/ntua-dataset/ntua.csv')

data = archeage_data + ntua_data
labels = archeage_labels + ntua_labels
train_data, val_data, test_data, train_labels, val_labels, test_labels = split_data(data, labels)

def load_dataset(batch_size=BATCH_SIZE):
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    train_dataset = SentimentDataset(train_data, train_labels, tokenizer)
    val_dataset = SentimentDataset(val_data, val_labels, tokenizer)
    test_dataset = SentimentDataset(test_data, test_labels, tokenizer)

    num_labels = train_dataset.get_num_classes()

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    return train_dataloader, val_dataloader, test_dataloader, num_labels

train_dataloader, val_dataloader, test_dataloader, num_labels = load_dataset()

...loading dataset from file
  sentiment                                             review
0  positive  @archeage i appreciate the "some few of you" :...
1  positive                               @archeage i love you
2  positive  @archeage been playing for the pass 3 days and...
3  positive  yay #archeage is working for me now... dunno a...
4  positive  played archeage (pc); guild wars 2 (pc) and 1 ...
                                              review sentiment
0  @archeage i appreciate the "some few of you" :...  positive
1                               @archeage i love you  positive
2  @archeage been playing for the pass 3 days and...  positive
3  yay #archeage is working for me now... dunno a...  positive
4  played archeage (pc); guild wars 2 (pc) and 1 ...  positive
...loading dataset from file
  sentiment                                             review
0  positive  @ddlovato hey, I just wanted to no when your d...
1  positive  Great Morning USA! @JoeJonasLoverxo @Courtney7.

In [169]:
class BERTForSentimentAnalysis(BertModel):
    """
    from Bert For Sequence Classification
    """
    def __init__(self, config):
        super().__init__(config=config)
        self.num_labels = 2
        self.config = config

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(config.hidden_size, self.num_labels)

        # TODO: only weights for the classifier head should show as uninitialized
        self.post_init()

    def forward(self,
               input_ids=None,
               attention_mask=None,
               token_type_ids=None,
               position_ids=None,
               head_mask=None,
               inputs_embeds=None,
               labels=None,
               output_attentions=None,
               output_hidden_states=None,
               return_dict=None):        
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [170]:
print('...creating BERT model')
model = BERTForSentimentAnalysis(cfg).from_pretrained('bert-base-cased')
model.to(DEVICE)

...creating BERT model


Some weights of BERTForSentimentAnalysis were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 

BERTForSentimentAnalysis(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1

In [180]:
def loss_fn(outputs, targets):
    #print(outputs)
    #print(targets)
    #return torch.nn.BCEWithLogitsLoss()(outputs, targets)
    return torch.nn.CrossEntropyLoss()(outputs, targets)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

In [181]:
def train(model, dataloader, device, epoch):
    model.train()
    for _,data in enumerate(dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [182]:
for epoch in range(2):
    train(model, train_dataloader, DEVICE, epoch)

Epoch: 0, Loss:  0.1493157297372818
Epoch: 1, Loss:  0.06309366226196289


In [None]:
def loss_fn(outputs, targets):
    nn.CrossEntropyLoss()(outputs, targets)
optimizer = torch.optim.Adam(params=BERT_ins.parameters(), lr=2e-5)

def train(train_dataloader, model, device):
    model.train()
    #running_acc = 0.0
    #running_loss = 0.0
    
    for i, (data, targets) in enumerate(train_dataloader):
        #optimizer.zero_grad()
        print('in train loop')
        print(tokenizer.decode(data['input_ids'][0]))
        print(tokenizer.decode(data['token_type_ids'][0]))
        #input_ids = data['input_ids'].to(device)
        #token_type_ids = data['token_type_ids'].to(device)
        #attention_mask = data['attention_mask'].to(device)
        input_ids = torch.tensor(data['input_ids'], dtype=torch.long)
        token_type_ids = torch.tensor(data['token_type_ids'], dtype=torch.long)
        attention_mask = torch.tensor(data['attention_mask'], dtype=torch.long)
        targets = targets.to(device)
        targets = torch.tensor(targets, dtype=torch.float)

        #outputs = model(images)
        outputs = model(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        print('loss:', loss.item())
        #loss = criterion(outputs, )
        #hidden_state = outputs.last_hidden_state
        #predictions.append(torch.argmax(hidden_state, 1).cpu())
        #predictions.append(outputs.cpu())

        #loss = loss_function(outputs, labels)
        #acc = compute_accuracy(outputs,labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #running_loss += loss.item()
        #running_acc += acc

        #if (i+1) % 256 == 0:
        #    print(
        #        f'TRAINING --> Epoch: {epoch+1}/{num_epochs}, ' +
        #        f'Step: {i+1}/{total_steps}, ' +
        #        f'Loss: {running_loss / (i+1)}, '
        #        f'Accuracy: {running_acc / (i+1)}'
        #    )

    #running_loss = running_loss / total_steps
    #running_acc = running_acc / total_steps

    #return running_loss, running_acc
    return predictions

In [None]:
def save_best_model(
    model: torch.nn.Module,
    model_save_path,
    val_loss: float,
    val_losses: list,
    epoch: int,
    keep_models: bool = False
):
    """Save the model if it is the first epoch. Subsequently, save the model
    only if a lower validation loss is achieved whilst training.

    :param model: The model to save.
    :type model: torch.nn.Module
    :param model_save_path: The location to save the model to.
    :type model_save_path: Path
    :param val_loss: The current epoch's validation loss.
    :type val_loss: float
    :param val_losses: The history of all other validation losses.
    :type val_losses: list
    :param epoch: The current epoch number.
    :type epoch: int
    :param keep_models: Should all models be saved, defaults to False
    :type keep_models: bool, optional
    """
    # Should we keep all models or just one
    if keep_models:
        model_save_path = model_save_path / f'model_{epoch+1}_{val_loss}.pt'
    else:
        model_save_path = model_save_path / f'model_state_dict3.pt'
    # Save the first model
    if len(val_losses) == 0:
        torch.save(
            model.state_dict(),
            model_save_path
        )
        print(
            'SAVING --> First epoch: \n' +
            f'Val Loss: {val_loss}\n' +
            f'Saving new model to:\n{model_save_path}'
        )
    elif val_loss < min(val_losses):
        # If our new validation loss is less than the previous best save the
        # model
        print(
            'SAVING --> Found model with better validation loss: \n' +
            f'New Best Val Loss: {val_loss}\n' +
            f'Old Best Val Loss: {min(val_losses)}\n'
            f'Saving new model to:\n{model_save_path}'
        )
        torch.save(
            model.state_dict(),
            model_save_path
        )

In [None]:
def compute_accuracy(outputs, labels):
    predictions = torch.argmax(outputs, 1)
    num_predictions = len(predictions)
    num_incorrect = torch.count_nonzero(predictions-labels)
    accuracy = (num_predictions-num_incorrect)/num_predictions

    return accuracy.item()

In [None]:
def validation(val_dataloader: torch.utils.data.DataLoader, model: torch.nn.Module, loss_function,
               epoch: int, num_epochs: int, total_steps: int, device: str) -> tuple:
    running_loss = 0.0
    running_acc = 0.0

    with torch.no_grad():
        for i, (images, labels) in enumerate(val_dataloader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)

            loss = loss_function(outputs, labels)
            acc = compute_accuracy(outputs, labels)

            running_loss += loss.item()
            running_acc += acc

            if (i+1) % 256 == 0:
                print(
                    f'VALIDATION --> Epoch: {epoch+1}/{num_epochs}, ' +
                    f'Step: {i+1}/{total_steps}, ' +
                    f'Val Loss: {running_loss / (i+1)}, ' +
                    f'Val Acc: {running_acc / (i+1)}'
                )
    running_loss = running_loss / total_steps
    running_acc = running_acc / total_steps

    return running_loss, running_acc

In [None]:
def train_loop(train_dataloader, model, loss, optimizer, num_epochs, device, model_save_path, val_dataloader):
    print(f'Models will be saved to: {model_save_path}')
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    # Create the save path for the model
    if not model_save_path.exists():
        model_save_path.mkdir(exist_ok=True, parents=True)

    # Total number batches in training set
    train_total_steps = len(train_dataloader)
    val_total_steps = len(val_dataloader)

    # Perform training loop
    for epoch in range(num_epochs):
        # Enable model training
        model.train(True)

        # Enter the training function loop
        train_loss, train_acc = train(
            train_dataloader,
            model,
            loss_function,
            optimizer,
            epoch,
            num_epochs,
            train_total_steps,
            device
        )
        print(
            f'TRAINING --> Epoch {epoch+1}/{NUM_EPOCHS} DONE, ' +
            f'Avg Loss: {train_loss}, Avg Accuracy: {train_acc}'
        )

        # Enter the validation loop
        val_loss, val_acc = validation(
            val_dataloader,
            model,
            loss_function,
            epoch,
            num_epochs,
            val_total_steps,
            device
        )
        print(
            f'VALIDATION --> Epoch {epoch+1}/{NUM_EPOCHS} DONE, ' +
            f'Avg Loss: {val_loss}, Avg Accuracy: {val_acc}'
        )

        # Determine if we should save the model
        save_best_model(model, model_save_path, val_loss, val_losses, epoch)

        # Record the stats
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

    return (train_losses, train_accs), (val_losses, val_accs)

In [None]:
# TODO: train and validate
NUM_EPOCHS = 2
# TODO: loss function and optimizer
#loss = 0
#opt = 0
print('training..')
ret = train(train_dataloader, BERT_ins, DEVICE)
print(ret[0])
#(train_losses, train_accs), (val_losses, val_accs) = train_model(train_dataloader,BERT_ins,loss,opt,NUM_EPOCHS,DEVICE,model_save_path=Path('./models/BERT'),val_dataloader=val_dataloader)
# Get the best validation loss and accuracy
#print(f'Best Validation Loss: {min(val_losses)} after epoch {np.argmin(val_losses) + 1}')
#print(f'Best Validation Acc: {max(val_accs)} after epoch {np.argmax(val_accs) + 1}')

In [None]:
# inference
"""
BERT_ins.eval()
with torch.no_grad():
    predictions = []
    
    for i, (data) in enumerate(val_dataloader):
        input_ids = data['input_ids'].to(DEVICE)
        token_type_ids = data['token_type_ids'].to(DEVICE)
        attention_mask = data['attention_mask'].to(DEVICE)
    
        outputs = BERT_ins(input_ids=input_ids,
                          token_type_ids=token_type_ids,
                          attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state
        predictions.append(torch.argmax(hidden_state, 1).cpu())
print(predictions[0])
"""