# Prerequisites

First, we install and import libraries we'll need later.

In [1]:
# %pip install datasets
# %pip install transformers

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
# from transformers import AutoTokenizer, GPT2Config, PreTrainedModel
# from transformers import AutoModelForCausalLM, PretrainedConfig, AutoConfig, AutoModel, AutoImageProcessor

# import torchtext.datasets as datasets

import os, math
import numpy as np
from tqdm.notebook import tqdm
import random
import copy

# import matplotlib.pyplot as plt
# %matplotlib inline

from IPython.display import clear_output

clear_output()

Next, we'll set the random seeds for reproducibility.

In [2]:
# One seed value shared by every random number generator used in this notebook.
SEED = 43

# Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA) in a single pass.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(SEED)

# Restrict cuDNN to deterministic kernels.
torch.backends.cudnn.deterministic = True

Setting the device option:

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The Config

In [4]:
# Experiment record. The first group holds hyper-parameters and the expressions
# that later cells pass to eval() to build the loss and optimizers; the second
# group holds result slots that stay at 0 until the corresponding cell fills them.
config = {
    'LLM_BATCH_SIZE': 50,
    'LLM_EPOCHS': 1,
    'LLM_loss>0': True,
    'temperature': 1,
    'loss_function': 'sina_loss()',
    'classifier_lr': 5e-1,
    'classifier_batch': 50,
    'classifier_epochs': 3,
    'base_model_optimizer': 'optim.Adam(base_model.parameters(), lr=1e-5)',
    'classifier_optimizer': 'optim.Adam(model.parameters(), lr=1e-5)',
    'use_scheduler': False,
    'scheduler': 'optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.01, max_lr=0.5)',
}

# Zero-initialised result slots, appended in the order they are reported.
config.update(dict.fromkeys(
    (
        'base_model_train_loss',
        'base_model_valid_loss',
        'loss_train',
        'loss_valid',
        'acc_train',
        'acc_valid',
        'precision_test',
        'recall_test',
        'f1_test',
        'acc_test',
    ),
    0,
))

# The Data

In this exercise we are going to use the MultiNLI dataset.

In [5]:
from datasets import load_dataset

# Load MultiNLI through the `datasets` library; `dataset` is indexed by split name below.
dataset = load_dataset("multi_nli")
# Clear whatever the call above printed into the cell output.
clear_output()

## Data fields

The data fields are the same among all splits.

*   *promptID:* Unique identifier for prompt

*   *pairID:* Unique identifier for pair
*   *{premise,hypothesis}:* combination of premise and hypothesis
*   *{premise,hypothesis} parse:* Each sentence as parsed by the Stanford PCFG Parser 3.5.2
*   *{premise,hypothesis} binary parse:* parses in unlabeled binary-branching format
*   *genre:* a string feature.
*   *label:* a classification label, with possible values including entailment (0), neutral (1), contradiction (2). Dataset instances which don't have any gold label are marked with -1 label. Make sure you filter them before starting the training using datasets.Dataset.filter.

You can see the number of rows in each split of the data.

In [6]:
# Distinct label ids that occur in the training split.
set(dataset['train']['label'])

{0, 1, 2}

In [7]:
# Echo the loaded object to list each split with its columns and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9832
    })
})

Defining some key variables that will be used later on in the training.

In [8]:
# Mini-batch size used by the train/validation/test DataLoaders.
BATCH_SIZE = 120
# NOTE(review): LR is not referenced in the cells shown here; the optimizer
# expressions stored in `config` hard-code lr=1e-5 themselves.
LR = 1e-05
# NOTE(review): `truncation=True` is usually an encode-time option; confirm it
# has any effect when passed to from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', truncation=True, do_lower_case=True)
# Length passed to MNLIDataset as `max_len` (used there as the tokenizer's max_length).
MAX_LEN = 320 #tokenizer.max_model_input_sizes['bert-base-uncased']
clear_output()

In [9]:
class MNLIDataset(Dataset):
    """Map-style dataset that tokenizes MultiNLI premise/hypothesis pairs on access.

    Args:
        data: Column-oriented container with 'premise', 'hypothesis' and
            'label' entries, each supporting ``len`` and integer indexing
            (for example the dict of lists returned by slicing a split).
        tokenizer: Object exposing ``encode_plus``; it must return
            'input_ids', 'attention_mask' and 'token_type_ids' tensors with a
            leading batch dimension of 1.
        max_len: Value passed to the tokenizer as ``max_length``.

    The three columns are looked up once in ``__init__``. Indexing
    ``data[column]`` inside ``__getitem__`` would repeat that lookup for every
    sample, which can be very expensive when ``data`` is a full dataset object
    rather than a plain dict (presumably it rebuilds the column each time --
    verify against the installed `datasets` version). Replacing ``self.data``
    after construction therefore has no effect on the samples returned.
    """

    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len
        # Cache column handles so each __getitem__ is a plain index lookup.
        self._premises = data['premise']
        self._hypotheses = data['hypothesis']
        self._labels = data['label']

    def __len__(self):
        """Return the number of premise/hypothesis pairs."""
        return len(self._premises)

    @staticmethod
    def _collapse_whitespace(text):
        """Stringify `text` and squeeze every run of whitespace into one space."""
        return " ".join(str(text).split())

    def __getitem__(self, index):
        """Tokenize pair `index`.

        Returns:
            dict with 1-D tensors 'ids', 'mask', 'token_type_ids' and a scalar
            long tensor 'targets' holding the label.
        """
        premise = self._collapse_whitespace(self._premises[index])
        hypothesis = self._collapse_whitespace(self._hypotheses[index])

        inputs = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            return_tensors='pt',
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence dimension whenever it happens to have length 1.
        return {
            'ids': inputs['input_ids'].squeeze(0),
            'mask': inputs['attention_mask'].squeeze(0),
            'token_type_ids': inputs['token_type_ids'].squeeze(0),
            'targets': torch.tensor(self._labels[index], dtype=torch.long)
        }

In [10]:
# Slice a split to choose how many examples to use, e.g. dataset['train'][:3000].
# Training uses the first 100,000 training rows, validation the next 10,000,
# and testing the whole `validation_matched` split.
train_dataset = MNLIDataset(data=dataset['train'][:100000], tokenizer=tokenizer, max_len=MAX_LEN)
valid_dataset = MNLIDataset(data=dataset['train'][100000:110000], tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = MNLIDataset(data=dataset['validation_matched'], tokenizer=tokenizer, max_len=MAX_LEN)

In [11]:
# Number of examples in the training subset.
len(train_dataset)

100000

Here we define our dataloaders:

In [12]:
# The training and validation loaders reshuffle every epoch; the test loader
# keeps the dataset order. No loader drops the final, possibly smaller, batch.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

# Contrastive Objective

Now we define our contrastive loss function.

In [13]:
class sina_loss():
    """Supervised contrastive loss over the first-token ([CLS]) vectors of a batch.

    Let z_i = logits[i, 0]. Every anchor i that has at least one other sample
    with the same label (positive) and at least one sample with a different
    label (negative) contributes

        -log( sum_p exp(z_i . z_p / t) / sum_n exp(z_i . z_n / t) )

    and the loss is the sum of these terms over all such anchors.

    Unlike a version that copies the positive/negative vectors through Python
    lists, everything here stays inside autograd, so gradients reach the
    positives and negatives as well as the anchor. The computation runs on
    whatever device `logits` lives on.

    Args:
        temperature: Divisor t applied to the dot products (default 35).
    """

    def __init__(self, temperature=35):
        self.temperature = temperature

    def __call__(self, logits, labels):
        """Allow `criterion(logits, labels)` in addition to `.forward(...)`."""
        return self.forward(logits, labels)

    def forward(self, logits, labels):
        """Compute the summed contrastive loss for one batch.

        Args:
            logits: Tensor indexed as (batch, position, feature); only
                position 0 is used.
            labels: One class id per batch element (tensor or sequence).

        Returns:
            0-d tensor. When no anchor has both a positive and a negative, a
            zero tensor with requires_grad=True on the logits' device is
            returned so that `.backward()` is still legal.
        """
        cls_logits = logits[:, 0]
        labels = torch.as_tensor(labels, device=cls_logits.device)
        batch_size = cls_logits.shape[0]

        # Pairwise scaled dot products: similarity[i, j] = z_i . z_j / t.
        similarity = (cls_logits @ cls_logits.T) / self.temperature

        same_label = labels.unsqueeze(0) == labels.unsqueeze(1)
        off_diagonal = ~torch.eye(batch_size, dtype=torch.bool, device=cls_logits.device)
        positive_mask = same_label & off_diagonal  # same class, excluding the anchor itself
        negative_mask = ~same_label

        # Anchors need at least one positive and one negative to contribute.
        usable = positive_mask.any(dim=1) & negative_mask.any(dim=1)
        if not bool(usable.any()):
            return torch.zeros((), dtype=cls_logits.dtype, device=cls_logits.device, requires_grad=True)

        # Keep only usable rows BEFORE logsumexp so no row is entirely -inf
        # (an all -inf row would produce NaN gradients).
        similarity = similarity[usable]
        minus_inf = float('-inf')
        log_pos_sum = torch.logsumexp(similarity.masked_fill(~positive_mask[usable], minus_inf), dim=1)
        log_neg_sum = torch.logsumexp(similarity.masked_fill(~negative_mask[usable], minus_inf), dim=1)

        # -log(pos_sum / neg_sum) == log(neg_sum) - log(pos_sum)
        return (log_neg_sum - log_pos_sum).sum()

# The Base Model

In [14]:
# Pretrained BERT encoder placed on the selected device; the classifier defined
# further down wraps this same object.
base_model = BertModel.from_pretrained('bert-base-uncased').to(device)
clear_output()

# The Base Model Finetuning

In [15]:
# Optimizer and loss objects for the base model.
# NOTE(review): both are produced by eval() on expressions stored in `config`;
# only ever put trusted strings there.
base_model_optimizer = eval(config['base_model_optimizer'])
base_model_criterion = eval(config['loss_function'])  # not moved with .to(device)

# Secondary criterion: cross-entropy, moved to the active device.
base_model_criterion2 = nn.CrossEntropyLoss().to(device)

In [19]:
# def train_base_model(model, optimizer, data_loader, dataset, criterion, criterion2):
#     # Set model to train mode
#     model.train()
#     # loss per epoch and number of correct predictions in order to calculate the accuracy
#     epoch_loss = 0
#     n_correct = 0

#     for idx, data in tqdm(enumerate(data_loader), desc='Training', leave=False):

#         ## Step 1: Move input data to device (only strictly necessary if we use GPU)
#         ids = data['ids'].to(device, dtype = torch.long)
#         mask = data['mask'].to(device, dtype = torch.long)
#         token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#         targets = data['targets'].to(device, dtype = torch.long)

#         ## Step 2: Run the model on the input data        
#         output = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
#         last_hidden_state = output[0] # last_hidden_state (batch_size, sequence_length, hidden_size)
#         # pooler = last_hidden_state[:, 0] # take only the CLS token from last_hidden_state
        
#         ## Step 3: Calculate the loss and accuracy
#         if idx % 2 == 0:
#             loss = criterion.forward(last_hidden_state, targets)
#         else:
#             loss = criterion2()

#         ## Step 4: Perform backpropagation
#         # Before calculating the gradients, we need to ensure that they are all zero.
#         # The gradients would not be overwritten, but actually added to the existing ones.
#         optimizer.zero_grad()
#         # Perform backpropagation
#         loss.backward()

#         ## Step 5: Update the parameters
#         optimizer.step()

#         epoch_loss += loss.item()

#     return epoch_loss / len(data_loader)

In [17]:
# def eval_base_model(model, data_loader, dataset, criterion):
#     # Set model to eval mode
#     model.eval()
#     # loss per epoch
#     epoch_loss = 0

#     with torch.no_grad(): # Deactivate gradients for the following code
#         for data in tqdm(data_loader, desc='Evaluation', leave=False):

#             ## Step 1: Move input data to device (only strictly necessary if we use GPU)
#             ids = data['ids'].to(device, dtype = torch.long)
#             mask = data['mask'].to(device, dtype = torch.long)
#             token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
#             targets = data['targets'].to(device, dtype = torch.long)

#             ## Step 2: Run the model on the input data
#             output = model(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids)
#             last_hidden_state = output[0] # last_hidden_state (batch_size, sequence_length, hidden_size)

#             ## Step 3: Calculate the loss and accuracy
#             loss = criterion.forward(last_hidden_state, targets)
            
#             epoch_loss += loss.item()

#     return epoch_loss / len(data_loader)

In [20]:
# EPOCHS = 3

# best_valid_loss_base_model = float('inf')

# # Training loop
# for epoch in tqdm(range(EPOCHS), desc='Epochs'):

#     train_loss_base_model = train_base_model(base_model, 
#                                              base_model_optimizer,
#                                              train_dataloader,
#                                              train_dataset, 
#                                              base_model_criterion,
#                                              base_model_criterion2)
    
#     valid_loss_base_model = eval_base_model(base_model, 
#                                             valid_dataloader,
#                                             valid_dataset, 
#                                             base_model_criterion)

#     if valid_loss_base_model < best_valid_loss_base_model:
#         best_valid_loss_base_model = valid_loss_base_model
#         torch.save(base_model.state_dict(), 'MNLI_bert_base_uncased_sina_loss_checkpoint3.pt')

#     print(f'Epoch: {epoch + 1:02}')
#     print(f'\tTrain loss: {train_loss_base_model:.3f}')
#     print(f'\tValid loss: {valid_loss_base_model:.3f}')

Epochs:   0%|          | 0/2 [00:00<?, ?it/s]

Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Evaluation:   0%|          | 0/200 [00:00<?, ?it/s]

Epoch: 01
	Train loss: 169.082
	Valid loss: 175.799


Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Evaluation:   0%|          | 0/200 [00:00<?, ?it/s]

Epoch: 02
	Train loss: 163.949
	Valid loss: 185.103


MNLI_bert_base_uncased_sina_loss_checkpoint2.pt

Epoch: 02
	Train loss: 173.986
	Valid loss: 174.286
    
    
cosine similarity

merge loss

new language

In [21]:
# base_model.load_state_dict(torch.load('./MNLI_bert_base_uncased_sina_loss_checkpoint2.pt'))

<All keys matched successfully>

In [65]:
# config['base_model_train_loss'] = round(train_loss_base_model, 2)
# config['base_model_valid_loss'] = round(valid_loss_base_model, 2)

# The Model

The model we are going to use is BERT.

In [22]:
# BERT encoder with a classification head on top of it
class BertClass(nn.Module):
    """Three-way classifier: encoder -> Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 3).

    Args:
        base_model: Encoder module. It is called with `input_ids`,
            `attention_mask`, `token_type_ids` and `output_hidden_states=True`,
            and element 0 of its output is read as the last hidden state
            (batch_size, sequence_length, hidden_size).
    """

    def __init__(self, base_model):
        super().__init__()
        # The encoder is used as passed in, not copied.
        self.bare_bert = base_model
        self.dense = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return `(class_scores, encoder_output)`.

        `class_scores` has one row of 3 scores per input sequence;
        `encoder_output` is whatever the wrapped encoder returned.
        """
        encoder_output = self.bare_bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True,
        )
        # Sentence representation: the first ([CLS]) position of the last hidden state.
        # (Alternative tried earlier: the mean over the remaining positions.)
        cls_vector = encoder_output[0][:, 0]
        features = self.dropout(F.relu(self.dense(cls_vector)))
        return self.classifier(features), encoder_output

In [23]:
# Wrap the encoder in the classification head and move everything to the device.
model = BertClass(base_model).to(device)
clear_output()

# Fine-tuning

Here we fine-tune the model defined in the previous section.

In [24]:
# Optimizer and loss for the classifier.
# NOTE(review): the optimizer is built by eval() from the expression stored in
# `config`; keep that string trusted.
optimizer = eval(config['classifier_optimizer'])

# Cross-entropy over the three classes, placed on the active device.
criterion = nn.CrossEntropyLoss().to(device)

In [25]:
# Correct-prediction counter (callers divide by the dataset size to get accuracy)
def calculate_accuracy(preds, targets):
    """Return the number of positions where `preds` equals `targets`, as a Python number."""
    matches = preds == targets
    return matches.sum().item()

In [26]:
def train_model(model, optimizer, data_loader, dataset, criterion, contrastive_criterion=None):
    """Train `model` for one epoch.

    Args:
        model: Callable as `model(ids, mask, token_type_ids)` returning
            `(class_scores, encoder_output)`; element 0 of `encoder_output`
            is used as the last hidden state.
        optimizer: Optimizer that updates the model's parameters.
        data_loader: Yields dict batches with 'ids', 'mask', 'token_type_ids'
            and 'targets' tensors.
        dataset: Dataset behind `data_loader`; only its length is used, to
            turn the correct-prediction count into an accuracy.
        criterion: Classification loss called as `criterion(class_scores, targets)`.
        contrastive_criterion: Optional representation loss exposing
            `.forward(last_hidden_state, targets)` (e.g. the notebook's
            `base_model_criterion`). When given, it replaces `criterion` on
            every odd-numbered batch. When omitted, every batch uses `criterion`.

    Returns:
        (mean loss per batch, fraction of correctly classified samples).
        With a contrastive criterion the mean mixes both loss scales.

    The alternate branch used to call `criterion.forward(last_hidden_state,
    targets)`. With a cross-entropy `criterion` that feeds hidden states of
    shape (batch, seq_len, hidden) against (batch,) targets, which is a shape
    error; the alternate loss is therefore a separate, explicit argument.
    """
    # Set model to train mode
    model.train()
    # Running loss and correct-prediction count for the epoch
    epoch_loss = 0
    n_correct = 0

    for idx, batch in enumerate(tqdm(data_loader, desc='Training', leave=False)):

        ## Step 1: Move input data to device (only strictly necessary if we use GPU)
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        ## Step 2: Run the model on the input data
        preds, bert_rep = model(ids, mask, token_type_ids)

        ## Step 3: Calculate the loss and accuracy
        # Odd batches use the contrastive loss when one was supplied.
        if contrastive_criterion is not None and idx % 2 == 1:
            last_hidden_state = bert_rep[0]
            loss = contrastive_criterion.forward(last_hidden_state, targets)
        else:
            loss = criterion(preds, targets)

        # Predicted class = index of the largest score; detached from autograd.
        _, predicted = torch.max(preds.detach(), dim=1)
        n_correct += calculate_accuracy(predicted, targets)

        ## Step 4: Perform backpropagation
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        loss.backward()

        ## Step 5: Update the parameters
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(data_loader), (n_correct / len(dataset))

In [27]:
def eval_model(model, data_loader, dataset, criterion):
    """Evaluate `model` on `data_loader` without updating it.

    Args:
        model: Callable as `model(ids, mask, token_type_ids)` returning
            `(class_scores, encoder_output)`.
        data_loader: Yields dict batches with 'ids', 'mask', 'token_type_ids'
            and 'targets' tensors.
        dataset: Dataset behind `data_loader`; only its length is used.
        criterion: Loss called as `criterion(class_scores, targets)`.

    Returns:
        (mean loss per batch, accuracy, preds_list, targets_list) where
        `preds_list` holds one list of class scores per sample and
        `targets_list` the matching labels, both in iteration order.

    The class scores keep their batch dimension. Squeezing them would turn a
    final batch containing a single sample into a 1-D vector, which breaks the
    `dim=1` reduction, the loss call and the per-sample layout of `preds_list`.
    """
    # Set model to eval mode
    model.eval()

    # Running loss, correct-prediction count and collected outputs
    epoch_loss = 0
    n_correct = 0
    preds_list = list()
    targets_list = list()

    with torch.no_grad():  # Deactivate gradients for the following code
        for batch in tqdm(data_loader, desc='Evaluation', leave=False):

            ## Step 1: Move input data to device (only strictly necessary if we use GPU)
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)
            targets_list.extend(targets.tolist())

            ## Step 2: Run the model on the input data
            preds, _ = model(ids, mask, token_type_ids)
            preds_list.extend(preds.tolist())

            ## Step 3: Calculate the loss and accuracy
            loss = criterion(preds, targets)

            _, predicted = torch.max(preds, dim=1)
            n_correct += calculate_accuracy(predicted, targets)

            epoch_loss += loss.item()

    return epoch_loss / len(data_loader), (n_correct / len(dataset)), preds_list, targets_list

In [None]:
# NOTE(review): config['classifier_epochs'] is 3 while this loop runs 6 epochs;
# confirm which value should end up in the results log.
EPOCHS = 6

# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_valid_loss = float('inf')

# NOTE(review): only the cross-entropy `criterion` is handed to train_model here;
# the contrastive `base_model_criterion` built earlier is not used by this loop.
for epoch in tqdm(range(EPOCHS), desc='Epochs'):

    train_loss, train_acc = train_model(
        model=model,
        optimizer=optimizer,
        data_loader=train_dataloader,
        dataset=train_dataset,
        criterion=criterion,
    )
    valid_loss, valid_acc, _, _ = eval_model(
        model=model,
        data_loader=valid_dataloader,
        dataset=valid_dataset,
        criterion=criterion,
    )

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'MNLI_bert_base_uncased_classifier_sina_loss_checkpoint2.pt')

    print(f'Epoch: {epoch + 1:02}')
    print(f'\tTrain loss: {train_loss:.3f} | Train acc: {train_acc * 100:.2f}%')
    print(f'\tValid loss: {valid_loss:.3f} | Valid acc: {valid_acc * 100:.2f}%')

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/2000 [00:00<?, ?it/s]

In [67]:
# Store the final epoch's losses and accuracies, rounded to two decimals, in the
# experiment record.
config.update(
    loss_train=round(train_loss, 2),
    loss_valid=round(valid_loss, 2),
    acc_train=round(train_acc, 2),
    acc_valid=round(valid_acc, 2),
)

# Evaluation

Now, we evaluate the performance of the fine-tuned model on the test set by computing metrics like accuracy, precision, recall, and F1-score. First we load the best checkpoint:

In [19]:
# model.load_state_dict(torch.load('/content/drive/MyDrive/NLP 01 - Dr. Pilehvar/checkpoint_Rotten_RoBERTa.pt'))

## Accuracy

In [58]:
# Score the model on the test loader and keep the raw outputs for the metrics below.
test_loss, test_acc, preds_list, targets_list = eval_model(
    model=model,
    data_loader=test_dataloader,
    dataset=test_dataset,
    criterion=criterion,
)
print(f'Test loss: {test_loss:.3f} | Test acc: {test_acc * 100:.2f}%')

Evaluation:   0%|          | 0/197 [00:00<?, ?it/s]

Test loss: 0.610 | Test acc: 80.43%


Using torchmetrics to calculate remaining metrics.

In [21]:
# %pip install torchmetrics
# clear_output()

In [59]:
from torchmetrics.classification import Precision, Recall, F1Score

In [60]:
# Sanity check: the prediction and target lists should have the same length.
print(len(preds_list), len(targets_list), sep='\n')

9815
9815


In [61]:
# torchmetrics works on tensors, so convert both Python lists.
preds_tensor, targets_tensor = torch.tensor(preds_list), torch.tensor(targets_list)

In [62]:
# Row-wise maximum of the class scores; `.indices` holds the predicted class ids.
preds_tensor_max = preds_tensor.max(dim=1)

In [63]:
# All three metrics are macro-averaged over the 3 classes and computed from the
# predicted class ids.

# --- Precision ---
precision_metric = Precision(task="multiclass", num_classes=3, average='macro')
precision = precision_metric(preds_tensor_max.indices, targets_tensor).item()
print(f'Precision: {precision * 100:.2f}%')

# --- Recall ---
recall_metric = Recall(task="multiclass", num_classes=3, average='macro')
recall = recall_metric(preds_tensor_max.indices, targets_tensor).item()
print(f'Recall: {recall * 100:.2f}%')

# --- F-1 score ---
# `f1score` stays a tensor here; a later cell calls `.item()` on it.
f1score_metric = F1Score(task="multiclass", num_classes=3, average='macro')
f1score = f1score_metric(preds_tensor_max.indices, targets_tensor)
print(f'F-1 Score: {f1score * 100:.2f}%')

Precision: 80.38%
Recall: 80.35%
F-1 Score: 80.35%


In [73]:
# Store the test-set metrics, rounded to four decimals, in the experiment record.
config.update(
    precision_test=round(precision, 4),
    recall_test=round(recall, 4),
    f1_test=round(f1score.item(), 4),
    acc_test=round(test_acc, 4),
)

In [76]:
import json

# Append this run's `config` to the JSON list kept in `filename`.
filename = 'dict_results.json'
# 1. Read file contents; on the very first run the file does not exist yet,
#    so start from an empty list instead of failing.
try:
    with open(filename, "r") as file:
        data_json = json.load(file)
except FileNotFoundError:
    data_json = []
# 2. Update json object
data_json.append(config)
# 3. Write json file
with open(filename, "w") as file:
    json.dump(data_json, file)

In [27]:
# Look up the ids of the tokenizer's special tokens. Only the value of the last
# expression (the unknown-token id) is echoed as the cell output.
tokenizer.sep_token_id
tokenizer.cls_token_id
tokenizer.pad_token_id
tokenizer.unk_token_id

100

In [28]:
# Look up the special tokens as strings. Only the value of the last expression
# (the unknown token) is echoed as the cell output.
tokenizer.sep_token
tokenizer.cls_token
tokenizer.pad_token
tokenizer.unk_token

'[UNK]'

In [29]:
# inputs = tokenizer.encode_plus(
#     'Hi my name is sina.',
#     'Where are you?',
#     add_special_tokens=True,
#     max_length=MAX_LEN,
#     return_tensors= 'pt',
#     padding='max_length',
#     return_token_type_ids=True,
#     truncation=True
# )

In [30]:
# inputs['input_ids'].tolist()

In [31]:
# tokenizer.decode(inputs['input_ids'])