# BERT For Sentiment Analysis

### Current Work (03/30)
* ~Load in BERT base model from Transformers~
* ~Build classification head as modeled in Transformers~
* ~Load in our datasets and tokenize~
* Fine-tune classification head on our data
* ~Evaluate performance on test dataset~
* Strategies to prevent overfitting on train dataset

### Data
* [Archeage - Sentiment Analysis Datasets](https://github.com/hadis-1/Sentiment-Analysis-Datasets/blob/main/archeage.csv)
* [Ntua - Sentiment Analysis Datasets](https://github.com/hadis-1/Sentiment-Analysis-Datasets/blob/main/ntua.csv)
* [HCR - Sentiment Analysis Datasets](https://github.com/hadis-1/Sentiment-Analysis-Datasets/blob/main/hcr.csv)


#### Sources

* [HuggingFace Transformers models for PyTorch - BERT](https://github.com/huggingface/transformers/blob/94ae1ba5b55e79ba766582de8a199d8ccf24a021/src/transformers/models/bert/modeling_bert.py#L957)
* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805)
* [Word Representations: A Simple and General Method for Semi-Supervised Learning](https://www.researchgate.net/publication/220873681_Word_Representations_A_Simple_and_General_Method_for_Semi-Supervised_Learning)
* [HuggingFace documentation for Fine Tuning Transformer for MultiLabel Text Classification](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb#scrollTo=mZ7lTlkyaG7u)

In [38]:
# packages
import csv
import math
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import BertTokenizer
from transformers.models.bert import BertModel
from transformers.models.bert.configuration_bert import BertConfig

In [39]:
# constants
DEBUG = True
cfg = BertConfig()
BATCH_SIZE = 16
NUM_EPOCHS = 10

if not torch.cuda.is_available():
    print('GPU not available, running script on CPU..')
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [40]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.data = df.review
        self.labels = df.sentiment
        self.tokenizer = tokenizer
        self.max_length = cfg.max_length

        self.classes = [0, 1]  # negative, positive
    
    def get_num_classes(self):
        return len(self.classes)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        inputs = self.tokenize(idx)

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.target_transform(idx), dtype=torch.long)
        }

    def tokenize(self, idx):
        data = str(self.data[idx])
        data = " ".join(data.split())

        inputs = self.tokenizer.encode_plus(
            data,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        return inputs    

    def target_transform(self, idx):
        if self.labels[idx] == 'positive':
            return 1
        elif self.labels[idx] == 'negative':
            return 0
        else:
            print('[ERROR]: label not accepted:', self.labels[idx], 'must be positive or negative')

In [41]:
# load and split data functions
def load_data_from_file(filepath):
    print('...loading dataset from file')
    
    df = pd.read_csv(filepath)
    df['sentiment'] = df[df.columns[0]].values.tolist()
    new_df = df[['review', 'sentiment']].copy()

    return new_df

In [42]:
archeage_df = load_data_from_file('/kaggle/input/archeage/archeage.csv')
ntua_df = load_data_from_file('/kaggle/input/ntua-dataset/ntua.csv')
hcr_df = load_data_from_file('/kaggle/input/hcr-dataset/hcr.csv')
all_data = pd.concat([archeage_df, ntua_df, hcr_df], ignore_index=True)

train_size = 0.7
val_size = 0.2

train_dataset = all_data.sample(frac=train_size,random_state=200)
remaining = all_data.drop(train_dataset.index).reset_index(drop=True)
val_dataset = remaining.sample(frac=train_size+val_size,random_state=200)
test_dataset = remaining.drop(val_dataset.index).reset_index(drop=True)

train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(all_data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALIDATION Dataset: {}".format(val_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_dataset = SentimentDataset(train_dataset, tokenizer)
val_dataset = SentimentDataset(val_dataset, tokenizer)
test_dataset = SentimentDataset(test_dataset, tokenizer)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

...loading dataset from file
...loading dataset from file
...loading dataset from file
FULL Dataset: (3904, 2)
TRAIN Dataset: (2733, 2)
VALIDATION Dataset: (1054, 2)
TEST Dataset: (117, 2)


In [43]:
class BERTForSentimentAnalysis(BertModel):
    """
    from Bert For Sequence Classification
    """
    def __init__(self, config):
        super().__init__(config=config)
        self.num_labels = 2
        self.config = config

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(config.hidden_size, self.num_labels)

        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,):        
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

In [44]:
print('...creating BERT model')
model = BERTForSentimentAnalysis(cfg).from_pretrained('bert-base-cased')
model.to(DEVICE)

...creating BERT model


Some weights of BERTForSentimentAnalysis were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 

BERTForSentimentAnalysis(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1

In [45]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

In [46]:
def compute_accuracy(outputs, targets):
    predictions = torch.argmax(outputs, 1)
    num_predictions = len(predictions)

    predictions = predictions.cpu()
    targets = targets.cpu()
    num_incorrect = 0
    for i in range(len(predictions)):
        if not predictions[i] == targets[i]:
            num_incorrect = num_incorrect + 1
    accuracy = (num_predictions-num_incorrect)/num_predictions

    return accuracy

In [47]:
def train(model, dataloader, device, epoch, num_epochs, total_steps):
    running_loss = 0.0
    running_acc = 0.0

    for i,data in enumerate(dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        accuracy = compute_accuracy(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += accuracy

        if (i+1) % 20 == 0:
            print(
                f'TRAINING --> Epoch: {epoch+1}/{num_epochs}, ' +
                f'Step: {i+1}/{total_steps}, ' +
                f'Loss: {running_loss / (i+1)}, '
                f'Accuracy: {running_acc / (i+1)}'
            )
    
    running_loss = running_loss / total_steps
    running_acc = running_acc / total_steps

    return running_loss, running_acc

In [48]:
def validate(model, dataloader, device, epoch, num_epochs, total_steps):
    running_loss = 0.0
    running_acc = 0.0

    with torch.no_grad():
        for i,data in enumerate(dataloader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
    
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            accuracy = compute_accuracy(outputs, targets)
    
            running_loss += loss.item()
            running_acc += accuracy
    
            if (i+1) % 20 == 0:
                print(
                    f'VALIDATION --> Epoch: {epoch+1}/{num_epochs}, ' +
                    f'Step: {i+1}/{total_steps}, ' +
                    f'Loss: {running_loss / (i+1)}, '
                    f'Accuracy: {running_acc / (i+1)}'
                )
    running_loss = running_loss / total_steps
    running_acc = running_acc / total_steps

    return running_loss, running_acc

In [49]:
def save_best_model(
    model: torch.nn.Module,
    model_save_path,
    val_loss: float,
    val_losses: list,
    epoch: int,
    keep_models: bool = False
):
    """Save the model if it is the first epoch. Subsequently, save the model
    only if a lower validation loss is achieved whilst training.

    :param model: The model to save.
    :type model: torch.nn.Module
    :param model_save_path: The location to save the model to.
    :type model_save_path: Path
    :param val_loss: The current epoch's validation loss.
    :type val_loss: float
    :param val_losses: The history of all other validation losses.
    :type val_losses: list
    :param epoch: The current epoch number.
    :type epoch: int
    :param keep_models: Should all models be saved, defaults to False
    :type keep_models: bool, optional
    """
    # Should we keep all models or just one
    if keep_models:
        model_save_path = model_save_path / f'model_{epoch+1}_{val_loss}.pt'
    else:
        model_save_path = model_save_path / f'model_state_dict.pt'
    # Save the first model
    if len(val_losses) == 0:
        torch.save(
            model.state_dict(),
            model_save_path
        )
        print(
            'SAVING --> First epoch: \n' +
            f'Val Loss: {val_loss}\n' +
            f'Saving new model to:\n{model_save_path}'
        )
    elif val_loss < min(val_losses):
        # If our new validation loss is less than the previous best save the
        # model
        print(
            'SAVING --> Found model with better validation loss: \n' +
            f'New Best Val Loss: {val_loss}\n' +
            f'Old Best Val Loss: {min(val_losses)}\n'
            f'Saving new model to:\n{model_save_path}'
        )
        torch.save(
            model.state_dict(),
            model_save_path
        )

In [50]:
def train_loop(model, train_dataloader, val_dataloader, device, num_epochs, model_save_path=Path('./models')):
    print(f'Models will be saved to: {model_save_path}')
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    if not model_save_path.exists():
        model_save_path.mkdir(exist_ok=True, parents=True)

    train_total_steps = len(train_dataloader)
    val_total_steps = len(val_dataloader)

    for epoch in range(num_epochs):
        model.train()
        train_loss, train_accuracy = train(model, train_dataloader, device, epoch, num_epochs, train_total_steps)
        print(
            f'TRAINING --> Epoch {epoch+1}/{NUM_EPOCHS} DONE, ' +
            f'Avg Loss: {train_loss}, Avg Accuracy: {train_accuracy}'
        )

        val_loss, val_accuracy = validate(model, val_dataloader, device, epoch, num_epochs, val_total_steps)
        print(
            f'VALIDATION --> Epoch {epoch+1}/{NUM_EPOCHS} DONE, ' +
            f'Avg Loss: {val_loss}, Avg Accuracy: {val_accuracy}'
        )

        save_best_model(model, model_save_path, val_loss, val_losses, epoch)
        
        train_losses.append(train_loss)
        train_accs.append(train_accuracy)
        val_losses.append(val_loss)
        val_accs.append(val_accuracy)
    return (train_losses, train_accs), (val_losses, val_accs)

In [51]:
(train_losses, train_accs), (val_losses, val_accs) = train_loop(model, train_dataloader, val_dataloader, DEVICE, NUM_EPOCHS)
print(f'Best Validation Loss: {min(val_losses)} after epoch {np.argmin(val_losses) + 1}')
print(f'Best Validation Acc: {max(val_accs)} after epoch {np.argmax(val_accs) + 1}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Models will be saved to: models




TRAINING --> Epoch: 1/2, Step: 10/171, Loss: 0.7005204200744629, Accuracy: 0.49375
TRAINING --> Epoch: 1/2, Step: 20/171, Loss: 0.6806706845760345, Accuracy: 0.54375
TRAINING --> Epoch: 1/2, Step: 30/171, Loss: 0.6654908458391825, Accuracy: 0.5791666666666667
TRAINING --> Epoch: 1/2, Step: 40/171, Loss: 0.6514295548200607, Accuracy: 0.6078125
TRAINING --> Epoch: 1/2, Step: 50/171, Loss: 0.6238089364767074, Accuracy: 0.64
TRAINING --> Epoch: 1/2, Step: 60/171, Loss: 0.6054089933633804, Accuracy: 0.6625
TRAINING --> Epoch: 1/2, Step: 70/171, Loss: 0.5919532435280936, Accuracy: 0.6785714285714286
TRAINING --> Epoch: 1/2, Step: 80/171, Loss: 0.5829774748533965, Accuracy: 0.68828125
TRAINING --> Epoch: 1/2, Step: 90/171, Loss: 0.582604418694973, Accuracy: 0.6923611111111111
TRAINING --> Epoch: 1/2, Step: 100/171, Loss: 0.5832276819646358, Accuracy: 0.693125
TRAINING --> Epoch: 1/2, Step: 110/171, Loss: 0.5747283040122553, Accuracy: 0.7005681818181818
TRAINING --> Epoch: 1/2, Step: 120/171, 

In [52]:
def evaluate(model, dataloader, device, total_steps):
    model.eval()
    running_loss = 0.0
    running_acc = 0.0

    with torch.no_grad():
        for i,data in enumerate(dataloader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
    
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            accuracy = compute_accuracy(outputs, targets)
    
            running_loss += loss.item()
            running_acc += accuracy
    
            if (i+1) % 256 == 0:
                print(
                    f'TEST' +
                    f'Step: {i+1}/{total_steps}, ' +
                    f'Loss: {running_loss / (i+1)}, '
                    f'Accuracy: {running_acc / (i+1)}'
                )
    running_loss = running_loss / total_steps
    running_acc = running_acc / total_steps

    return running_loss, running_acc

In [53]:
test_loss, test_accuracy = evaluate(model, test_dataloader, DEVICE, len(test_dataloader))
print(
    f'TEST --> DONE, ' +
    f'Avg Loss: {test_loss}, Avg Accuracy: {test_accuracy}'
)

TEST --> DONE, Avg Loss: 0.4594200160354376, Avg Accuracy: 0.8265625
