In [1]:
%%capture
!pip install transformers

In [2]:
%%capture
# important imports, do not modify this cell
import transformers
import subprocess as sp
import os, gc, sys
import torch
from torch import nn, optim
import pandas as pd
import numpy as np
import multiprocessing, time, warnings
import matplotlib.pyplot as plt
import psutil

# for XLM-RoBERTa model
from transformers import AutoTokenizer, XLMRobertaModel
# for DistillBERT
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
# for ALBERT
from transformers import AlbertTokenizer, AlbertModel
# for RoBERTa model
from transformers import RobertaTokenizer, RobertaModel

# suppress warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Interactive model selection. On success this cell defines three names that the rest of
# the notebook relies on: `model_name`, `tokenizer` and `model`.
# If selection fails, those names are NOT (re)defined, so re-run this cell before continuing.
print("Choose one of the models to continue:- 1) RoBERTa 2) ALBERT 3) DistillBERT 4) XLM-RoBERTa")
try:
    try:
        val = int(input())
    except ValueError:
        # non-numeric input is reported exactly like an out-of-range choice
        val = None
    # explicit raise instead of `assert`, so the check survives `python -O`
    if val not in range(1, 5):
        raise AssertionError("model choice must be an integer from 1 to 4")
    if val == 1:
        model_name = "RoBERTa Base"
        # use RoBERTa-BASE model with input embedding size 768
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaModel.from_pretrained('roberta-base')
        # replace the pooler's dense layer with a 3-way classification head (3 sentiment categories);
        # train() reads the result from `output.pooler_output`
        model.pooler.dense = nn.Linear(768, 3)
    elif val == 2:
        model_name = "ALBERT"
        # use ALBERT model with input dimensions 768
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = AlbertModel.from_pretrained("albert-base-v2")
        # replace the pooler with a 3-way classification head (3 sentiment categories);
        # train() reads the result from `output.pooler_output`
        model.pooler = nn.Linear(768, 3)
        model.pooler_activation = nn.Tanh()
    elif val == 3:
        model_name = "DistillBERT"
        # use DistillBERT model with input dimensions 768
        tokenizer = AutoTokenizer.from_pretrained("Tejas3/distillbert_base_uncased_80")
        model = AutoModelForSequenceClassification.from_pretrained("Tejas3/distillbert_base_uncased_80")
        # attach a 768 -> 3 linear layer under the name `vocab_projector`.
        # NOTE(review): train() reads `output.logits` for this model; it is not evident from this
        # notebook that `vocab_projector` takes part in the forward pass of a sequence-classification
        # model. Kept unchanged because saved checkpoints may contain its weights -- confirm.
        model.vocab_projector = nn.Linear(768, 3)
    else:
        model_name = "XLM-RoBERTa Base"
        # use XLM-RoBERTa BASE model with embedding dimensions 768
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
        model = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")
        # replace the pooler's dense layer with a 3-way classification head (3 sentiment categories);
        # train() reads the result from `output.pooler_output`
        model.pooler.dense = nn.Linear(768, 3)
except AssertionError:
    print("Value error, please try again.(run this cell again to continue)")
except Exception as exc:
    # `Exception` (not a bare except) so that a kernel interrupt still stops the cell,
    # and the underlying cause (e.g. a failed download) is shown instead of being hidden
    print("Program has run into an exception, please try again.(run this cell again to continue)")
    print("Reason: {}: {}".format(type(exc).__name__, exc))
finally:
    print("Selection logic end!")

Choose one of the models to continue:- 1) RoBERTa 2) ALBERT 3) DistillBERT 4) XLM-RoBERTa


 3


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Selection logic end!


In [4]:
# prints number of parameters in the model
# Total number of parameters (every tensor yielded by model.parameters()) in the selected model.
c = sum(p.numel() for p in model.parameters())
print("Number of parameters in model {} (in millions) is {:.4f}".format(model_name, c/1e6))

# Reference values noted for each model (kept as comments so that the cell does not echo
# a raw string as its output):
#   11.095   million parameters in ALBERT
#   124.0573 million parameters in RoBERTa Base
#   66.9573  million parameters in DistillBERT
#   277.4554 million parameters in XLM-RoBERTa Base

Number of parameters in model DistillBERT (in millions) is 66.9581


'\n11.095 million parameters in Alberta \n124.0573 million parameters in RoBERTa Base\n66.9573 million parameters in DistillBERT\n277.4554 million parameters in XLMR-RoBERTa Base\n'

Dataloader reference:- https://stackoverflow.com/questions/44429199/how-to-load-a-list-of-numpy-arrays-to-pytorch-dataset-loader

BERT model variations reference:- https://360digitmg.com/blog/bert-variants-and-their-differences 

Reference for Jupyter markdown cell formatting:- IBM Documentation

BERT implementation from scratch and related explanation:- https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891

BERT paper reference:- https://arxiv.org/pdf/1810.04805

Logical processors a system and hyperthreading:- https://unix.stackexchange.com/questions/743285/difference-between-cores-and-logical-processors

GPU P100 vs GPU T4 x2:- https://www.linkedin.com/pulse/kaggle-accelerators-comparison-rukshar-alam-ki9bc

In [5]:
# Batch sizes handed to the English (Twitter) and Thai dataset helpers respectively.
batch_size_en = 50
batch_size_th = 25

start_time = time.time()

# The preprocessing helpers live in a Kaggle input dataset, so that directory has to be
# on sys.path before they can be imported (hence the import inside this cell).
sys.path.insert(1,"/kaggle/input/minor-project-helper")
import twitter_dataset_preprocess
import thai_dataset_preprocess

# Maps a dataset key to whatever that helper's get_data_loaders() returns.
nlp_dataloaders = {
    'twitter': twitter_dataset_preprocess.get_data_loaders(batch_size_en),
    'thai': thai_dataset_preprocess.get_data_loaders(batch_size_th),
}

end_time = time.time()
print(f"Created {len(nlp_dataloaders)} language datasets in {end_time-start_time:.10f} seconds.")

# release unreferenced Python objects and PyTorch's cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

Created 2 language datasets in 2.5201516151 seconds.


In [6]:
# Smoke test: run one batch from the first Twitter dataloader through the tokenizer and the
# chosen model, to confirm that they work together before any training is attempted.
sentences, labels = next(iter(nlp_dataloaders['twitter'][0]))

# truncate to 512 tokens, the same limit train() uses, so an over-long sentence cannot
# trigger a sequence-length mismatch here
encoded_input = tokenizer(list(sentences), return_tensors='pt', padding=True, truncation=True, max_length=512)

# No gradients are needed for a smoke test. Without no_grad() the global `output` would keep
# the whole autograd graph of this batch alive, which the gc.collect() below could not release.
with torch.no_grad():
    output = model(**encoded_input) # output of the chosen model

# perform garbage collection and cache clearing
gc.collect()
torch.cuda.empty_cache()

In [7]:
# ROUGH
# output.pooler_output.shape # for ALBERT, XLMR-RoBERTa Base and RoBERTa Base models
# output.logits # for DistillBERT model

In [8]:
# defining the devices on which training needs to be done
# Train on the GPU whenever CUDA is present; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Adam over every parameter of the selected model.
# NOTE(review): lr = 0.003 is far above the learning rates usually quoted for fine-tuning
# pretrained transformers -- confirm this value is intentional.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=3e-3,
    weight_decay=1e-4,
)

# Scheduler driven by the value passed to lr_scheduler.step(): once that value has stopped
# improving for longer than `patience` (1) epoch, the learning rate is multiplied by 0.7,
# i.e. reduced by 30%.
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    factor=0.7,
    patience=1,
)

# Loss function comparing the model's class scores with the true labels.
criterion = nn.CrossEntropyLoss()

# release unreferenced Python objects and PyTorch's cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

In [9]:
# Load the fine-tuned weights of the selected model from its checkpoint file.

# The two RoBERTa variants use hand-written file names; every other model uses
# "<model_name>_checkpoint.pth".
if model_name == "XLM-RoBERTa Base":
    checkpoint_file_name = "XLMR_RoBERTa_Base_checkpoint.pth"
elif model_name == "RoBERTa Base":
    checkpoint_file_name = "RoBERTa_Base_checkpoint.pth"
else:
    checkpoint_file_name = model_name + "_checkpoint.pth"

checkpoint_directory = "/kaggle/input/minorproject-checkpoints/"
checkpoint_path = os.path.join(checkpoint_directory, checkpoint_file_name)

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code -- only load
# checkpoints from a trusted source.
# map_location moves the stored tensors onto the active device.
checkpoint_dict = torch.load(checkpoint_path, map_location = device)

try:
    model.load_state_dict(checkpoint_dict['model_state_dict'])
    print("Model has been loaded from the checkpoint successfully!")
except Exception as exc:
    # Still best-effort (the notebook keeps running), but report *why* the weights were not
    # restored, e.g. a missing 'model_state_dict' key or mismatched layer names/shapes.
    # `Exception` rather than a bare except, so a kernel interrupt is not swallowed.
    print("An exception has occured. Please try again!")
    print("Reason: {}: {}".format(type(exc).__name__, exc))
finally:
    # perform garbage collection and cache clearing
    gc.collect()
    torch.cuda.empty_cache()

Model has been loaded from the checkpoint successfully!


In [9]:
# for logging GPU memory usage in Python
def get_gpu_memory():
    """Print the free memory reported by nvidia-smi for each GPU.

    Runs `nvidia-smi --query-gpu=memory.free --format=csv`, skips the first output line
    (the CSV header) and the final element left after the last newline, and parses the
    leading integer of every remaining line. The values are printed as a list labelled
    "MB". Returns None.

    Raises whatever subprocess.check_output raises when the command is missing or fails.
    """
    query = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
    report_lines = sp.check_output(query).decode('ascii').split('\n')
    # report_lines[0] is the header; the last element follows the trailing newline
    memory_free_values = []
    for row in report_lines[1:-1]:
        memory_free_values.append(int(row.split()[0]))
    print("Available GPU:", memory_free_values, "MB")
    
# for logging RAM memory usage in Python
def get_ram_memory():
    """Print the system's available RAM.

    Reads `available` from psutil.virtual_memory() and divides it by 1024**2 before
    printing it with the label "MB". Returns None.
    """
    bytes_per_unit = 1024**2
    ram_available = psutil.virtual_memory().available / bytes_per_unit
    print("Available RAM:", ram_available, "MB")

# Housekeeping repeated at the end of most cells in this notebook: run Python's garbage
# collector, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## TODOs

1) Find out why importing torchtext library gives error, while no such error for torchvision or torch

2) Find out why passing Thai sentences to RoBERTa model, in torchtext, didn't give large embedding error, as obtained in case of ALBERT used here 

3) Find out why importing PyTorch/XLA gives error

In [10]:
# Wrap the selected model in DataParallel and move it onto the training device
# (Module.to() returns the module itself, so the wrapper can be built and moved in one step).
nlp_parallel_model = nn.DataParallel(model).to(device)

# Cached once: train() uses it to decide whether GPU memory can be logged.
cuda_is_available = torch.cuda.is_available()

# Number of processed training samples after which train() records its memory/time logs.
sample_size_th = 5000

# define training loop within a modularised function
# is_log_needed: Boolean variable defining the need for computing logs or not. If set to False, no logs are recorded and training done on entire dataset
def _log_free_memory(stage_message):
    """Print `stage_message`, then the free GPU memory (only when CUDA is available) and the free RAM."""
    print(stage_message)
    if cuda_is_available:
        get_gpu_memory()
    get_ram_memory()


def _class_scores(output):
    """Return the tensor used as class scores: `logits` for DistillBERT, `pooler_output` otherwise."""
    if model_name == "DistillBERT":
        return output.logits.to(device)
    return output.pooler_output.to(device)


def train(train_dataloader, validation_dataloader, optimizer, epochs, is_log_needed):
    """Train and validate `nlp_parallel_model`, optionally profiling memory and time.

    Relies on notebook globals defined in earlier cells: `nlp_parallel_model`, `tokenizer`,
    `criterion`, `lr_scheduler`, `device`, `model_name`, `sample_size_th`,
    `cuda_is_available`, `get_gpu_memory` and `get_ram_memory`.

    Args:
        train_dataloader: iterable yielding (sentences, labels) training batches.
        validation_dataloader: iterable yielding (sentences, labels) validation batches.
        optimizer: optimizer whose zero_grad()/step() are called once per training batch.
        epochs: number of epochs to run.
        is_log_needed: when True, the first batch that starts after at least
            `sample_size_th` samples have been processed is profiled (free memory is
            printed after every stage), the elapsed time is printed, and the function
            returns immediately. If an epoch holds fewer samples than that, no profiling
            happens and training proceeds normally.

    Returns:
        The string "Usage log tests being done!" when a profiling run finished, otherwise
        a tuple (training_loss_list, validation_loss_list) holding the mean per-batch loss
        of every epoch.
    """
    training_loss_list, validation_loss_list = [], []
    for e in range(1, epochs+1):
        training_loss = 0.0
        validation_loss = 0.0
        time1 = time.time() # start the timer

        # number of training samples processed so far in this epoch
        processed_samples = 0

        # put the model in training mode
        nlp_parallel_model.train()
        for sentence, label in train_dataloader:
            length_samples = len(sentence)

            # `>=` rather than `==`: with a batch size that does not divide sample_size_th
            # the counter never equals the threshold exactly and profiling would never start.
            # For batch sizes that do divide it, both comparisons trigger on the same batch.
            log_this_batch = is_log_needed and processed_samples >= sample_size_th

            # zero out the gradients in the optimizer, to avoid accumulation
            optimizer.zero_grad()
            if log_this_batch:
                _log_free_memory("After clearing gradients from the PyTorch optimizer, free memory is logged as....")

            encoded_input = tokenizer(list(sentence), padding=True, return_tensors='pt', truncation=True, max_length=512, add_special_tokens = True)
            del sentence
            if log_this_batch:
                _log_free_memory("After tokenizing the sentences, free memory is logged as....")

            output = nlp_parallel_model(**encoded_input)
            del encoded_input
            if log_this_batch:
                _log_free_memory("After processing the tokenized sentences, free memory is logged as....")

            label = torch.Tensor(label).to(device, dtype=torch.int64)
            output = _class_scores(output)
            if log_this_batch:
                _log_free_memory("After computing the tensor for output values, free memory is logged as....")

            loss = criterion(output, label)
            if log_this_batch:
                _log_free_memory("After computing the loss, free memory is logged as....")

            del output, label

            loss.backward(retain_graph = False)
            optimizer.step()
            if log_this_batch:
                _log_free_memory("After computing loss gradients and performing backpropagation, free memory is logged as....")

            training_loss += loss.item()

            if log_this_batch:
                # profiling run: stop as soon as the profiled batch has been trained
                time2 = time.time() # stop the timer
                print("Time taken to train per {} samples-> {:.4f} seconds".format(sample_size_th, time2-time1))
                return "Usage log tests being done!"

            processed_samples += length_samples
            del length_samples

            # drop the per-batch references explicitly and release PyTorch's cached CUDA
            # memory, to keep memory usage low between batches
            del loss
            gc.collect()
            torch.cuda.empty_cache()

        # put model in validation mode
        nlp_parallel_model.eval()
        with torch.no_grad():
            for sentence, label in validation_dataloader:
                encoded_input = tokenizer(list(sentence), padding=True, return_tensors='pt', truncation=True, max_length=512, add_special_tokens = True)
                output = nlp_parallel_model(**encoded_input)
                del sentence

                label = torch.Tensor(label).to(device, dtype=torch.int64)
                output = _class_scores(output)
                loss = criterion(output, label)
                del output, label

                validation_loss += loss.item()

                # same explicit clean-up as in the training loop
                del loss
                gc.collect()
                torch.cuda.empty_cache()

        time2 = time.time() # stop the timer (the epoch time includes validation)
        # the scheduler monitors the summed (not averaged) validation loss of the epoch
        lr_scheduler.step(validation_loss)

        mean_training_loss = training_loss / len(train_dataloader)
        mean_validation_loss = validation_loss / len(validation_dataloader)
        print("Epoch {}-> Training loss {:.10f}, Validation loss {:.10f}, Time taken {:.10f} seconds".format(e, mean_training_loss, mean_validation_loss, time2-time1))
        training_loss_list.append(mean_training_loss)
        validation_loss_list.append(mean_validation_loss)
    return (training_loss_list, validation_loss_list)

<p style="text-align:center;"><b>RAM / GPU Usage and Training Time LOG</b></p>
    
__<u>Note that the readings are taken per 5000 processed training samples for both the English and Thai datasets. Batch sizes for the English and Thai datasets are assumed to be 50 and 25 respectively, unless stated otherwise. The tests for the English and Thai datasets have been run one after the other, in that order, to determine which language requires more computational resources during sentence processing.</u>__
    
__<u>It shall be assumed that observations are recorded for English followed by Thai.</u>__
    
__<u>It shall be assumed that Adam optimizer is being used, unless stated otherwise.</u>__
    
<br> __1==>__ _Using ALBERT model:-_ 
   
* Using CPU: 
    
    _A sentence embedding size-mismatch error occurs for the Thai dataset, because ALBERT can process at most 512 tokens (its context length),     <br> while some tokenized sentences are longer than 512 tokens._
    <br> __This occurs due to model assigning more tokens to words outside of its vocabulary set.__
    <br> _Reference for error:-_ https://stackoverflow.com/questions/64320883/the-size-of-tensor-a-707-must-match-the-size-of-tensor-b-512-at-non-singleto 
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available RAM: 23849.35546875 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available RAM: 23849.35546875 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available RAM: 24313.61328125 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available RAM: 24313.61328125 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available RAM: 24313.61328125 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available RAM: 24260.5 MB
    <br> Time taken to train per 5000 samples-> **1319.1426 seconds (~22 minutes)**
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available RAM: 24454.15625 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available RAM: 24454.15625 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available RAM: 24454.15625 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available RAM: 24454.15625 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available RAM: 24454.15625 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available RAM: 24454.15625 MB
    <br> Time taken to train per 5000 samples-> **3227.5199 seconds (~54 minutes)** 

<br> __2==>__ _Using RoBERTa model:-_ 
   
* Using CPU: 
    * English Dataset:- <br>
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available RAM: 23727.98046875 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available RAM: 23727.98046875 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available RAM: 23726.83984375 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available RAM: 23726.83984375 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available RAM: 23726.83984375 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available RAM: 23733.60546875 MB
    <br> Time taken to train per 5000 samples-> **1155.1060 seconds (~20 minutes)**
    
    * Thai Dataset:- <br>
    <br> **Taking more than 3 hours to process 5000 samples with RAM usage upto 24-26 GB**
    
    It can be hence seen that RoBERTa occupies huge memory and takes more processing time, hence I feel     <br> XLM-RoBERTa should not have its logs recorded, as it is based on RoBERTa architecture. For         <br> DistillBERT, I feel CPU is inefficient, so enough logs have been taken.   
    
__Conclusion-__ _As can be seen, the CPU is inefficient at training tasks, as its instruction cycle is optimised for small dimensions of data. For higher-dimensional data, GPUs are preferred as they promote multiprocessing by allocating a thread to each GPU core, with thousands of cores present inside a GPU device. For small-scale data, the CPU outperforms the GPU due to the optimised fetch-decode-execute cycle of the CPU and the additional overhead of creating threads for subunits of data in the case of the GPU._

_Also, it can be seen that Thai sentences require a large amount of training time, which seems obvious, as the ALBERT model is not trained on Thai and the words used in Thai are outside the vocabulary set of ALBERT._
    
___Thus, we will be using GPU for recording logs. Two types of GPUs will be used:- GPU P100 and GPU T4 x2___ 
    
<u><em>Training on GPU P100 and GPU T4 x2:-</em></u>
    
__1==>__ _Using ALBERT model:-_ 
    
* Using GPU P100:
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [15668] MB
    <br> Available RAM: 26858.90625 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [15668] MB
    <br> Available RAM: 26858.875 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [13106] MB
    <br> Available RAM: 26848.046875 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [13106] MB
    <br> Available RAM: 26848.046875 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [13106] MB
    <br> Available RAM: 26848.046875 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [12996] MB
    <br> Available RAM: 26848.046875 MB
    <br> Time taken to train per 5000 samples-> **52.5883 seconds**

    For Thai dataset, **OutOfMemoryError** came as the GPU usage exceeded permissible limit. (16 GiB)
    
* Using GPU T4 x2:
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [14631, 14835] MB
    <br> Available RAM: 26296.4609375 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [14631, 14835] MB
    <br> Available RAM: 26295.2109375 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [13087, 13193] MB
    <br> Available RAM: 26296.84765625 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [13087, 13193] MB
    <br> Available RAM: 26296.07421875 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [13087, 13193] MB
    <br> Available RAM: 26296.07421875 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [13027, 13133] MB
    <br> Available RAM: 26287.22265625 MB
    <br> Time taken to train per 5000 samples-> **69.0760 seconds (~1 minute)**
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [14627, 14835] MB
    <br> Available RAM: 26249.203125 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [14627, 14835] MB
    <br> Available RAM: 26245.515625 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [14003, 14153] MB
    <br> Available RAM: 26245.125 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [14003, 14153] MB
    <br> Available RAM: 26244.828125 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [14003, 14153] MB
    <br> Available RAM: 26244.828125 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [13983, 14133] MB
    <br> Available RAM: 26244.48046875 MB
    <br> Time taken to train per 5000 samples-> **115.7149 seconds (~2 minutes)**
   
__2==>__ _Using RoBERTa model:-_ 
    
* Using GPU P100:    
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [13690] MB
    <br> Available RAM: 28158.8046875 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [13690] MB
    <br> Available RAM: 28169.6953125 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [12346] MB
    <br> Available RAM: 28159.0 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [12346] MB
    <br> Available RAM: 28158.47265625 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [12346] MB
    <br> Available RAM: 28177.46484375 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [12090] MB
    <br> Available RAM: 28166.8828125 MB
    <br> Time taken to train per 5000 samples-> **45.3848 seconds**
    
    For Thai dataset, **OutOfMemoryError** came as the GPU usage exceeded permissible limit. (16 GiB)
    
* Using GPU T4 x2:
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [12489, 14835] MB
    <br> Available RAM: 27663.0078125 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [12489, 14835] MB
    <br> Available RAM: 27662.9921875 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [12455, 13605] MB
    <br> Available RAM: 27662.4140625 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [12455, 13605] MB
    <br> Available RAM: 27662.1796875 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [12455, 13605] MB
    <br> Available RAM: 27662.1796875 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [12207, 13457] MB
    <br> Available RAM: 27659.9140625 MB
    <br> Time taken to train per 5000 samples-> **78.9255 seconds (~1 minute 30 seconds)**
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [12659, 14833] MB
    <br> Available RAM: 27617.72265625 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [12659, 14833] MB
    <br> Available RAM: 27617.48828125 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [5037, 6841] MB
    <br> Available RAM: 27616.4140625 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [5037, 6841] MB
    <br> Available RAM: 27616.4140625 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [5037, 6841] MB
    <br> Available RAM: 27615.99609375 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [4803, 6841] MB
    <br> Available RAM: 27615.984375 MB
    <br> Time taken to train per 5000 samples-> **354.7913 seconds (~6 minutes)**
    
__3==>__ _Using DistillBERT model:-_ 
    
* Using GPU P100:    
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [14564] MB
    <br> Available RAM: 28590.79296875 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [14564] MB
    <br> Available RAM: 28590.078125 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [14378] MB
    <br> Available RAM: 28590.078125 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [14378] MB
    <br> Available RAM: 28590.078125 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [14378] MB
    <br> Available RAM: 28590.078125 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [14268] MB
    <br> Available RAM: 28590.11328125 MB
    <br> Time taken to train per 5000 samples-> **31.4232 seconds**
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [14654] MB
    <br> Available RAM: 28595.55859375 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [14654] MB
    <br> Available RAM: 28595.5390625 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [13750] MB
    <br> Available RAM: 28595.37109375 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [13750] MB
    <br> Available RAM: 28595.35546875 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [13750] MB
    <br> Available RAM: 28595.35546875 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [13474] MB
    <br> Available RAM: 28595.21875 MB
    <br> Time taken to train per 5000 samples-> **60.8670 seconds (~1 minute)**
    
* Using GPU T4 x2:
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [13513, 14835] MB
    <br> Available RAM: 28342.21484375 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [13513, 14835] MB
    <br> Available RAM: 28343.65234375 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [13503, 14211] MB
    <br> Available RAM: 28350.3671875 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [13503, 14211] MB
    <br> Available RAM: 28349.70703125 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [13503, 14211] MB
    <br> Available RAM: 28336.08984375 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [13393, 14121] MB
    <br> Available RAM: 28335.90234375 MB
    <br> Time taken to train per 5000 samples-> **55.7659 seconds**
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [13599, 14833] MB
    <br> Available RAM: 28306.3203125 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [13599, 14833] MB
    <br> Available RAM: 28294.51953125 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [13589, 14341] MB
    <br> Available RAM: 28294.62890625 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [13589, 14341] MB
    <br> Available RAM: 28294.3828125 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [13589, 14341] MB
    <br> Available RAM: 28294.3828125 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [13399, 14251] MB
    <br> Available RAM: 28298.25 MB
    <br> Time taken to train per 5000 samples-> **89.3750 seconds (~1 minute 30 seconds)** 
    
__Conclusion:-__ _It can be clearly observed that ALBERT and RoBERTa fail to train even 5000 samples of the Thai dataset on GPU P100, but are able to do so easily on GPU T4 x2. Even though GPU P100 is ideal for training, one plausible reason is that it provides less memory than T4 x2. Two further things can be concluded:-
<br> 1) ALBERT and RoBERTa are memory intensive models, especially when working with non-English languages.
<br> 2) RoBERTa consumes much memory on T4 x2. Thus, for multilingual version XLMR, we will assume T4 x2 works best.
<br> One more thing that can be seen is DistillBERT achieves low latency, in terms of time and memory , be it GPU P100 or GPU T4 x2. This shows that using distilled network, taking in semantic knowledge from larger teacher network (BERT) plays a major role in computation time reduction.
<br> Interesting thing is that ALBERT is designed to reduce model parameters. Yet, it is memory intensive._
    
__Assuming that GPU T4 x2 is ideal for training the models, result log for XLM-RoBERTa BASE is shown below:__
    
* Using GPU T4 x2:
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [9861, 14835] MB
    <br> Available RAM: 27270.69140625 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [9861, 14835] MB
    <br> Available RAM: 27270.31640625 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [9821, 12873] MB
    <br> Available RAM: 27269.25390625 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [9821, 12873] MB
    <br> Available RAM: 27268.0625 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [9821, 12873] MB
    <br> Available RAM: 27264.57421875 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [9087, 12139] MB
    <br> Available RAM: 27264.5234375 MB
    <br> Time taken to train per 5000 samples-> **92.7214 seconds (~1 minute 30 seconds)**
    
    After clearing gradients from the PyTorch optimizer, free memory is logged as....
    <br> Available GPU: [9341, 14833] MB
    <br> Available RAM: 27237.44140625 MB
    <br> After tokenizing the sentences, free memory is logged as....
    <br> Available GPU: [9341, 14833] MB
    <br> Available RAM: 27237.20703125 MB
    <br> After processing the tokenized sentences, free memory is logged as....
    <br> Available GPU: [9313, 12915] MB
    <br> Available RAM: 27225.46875 MB
    <br> After computing the tensor for output values, free memory is logged as....
    <br> Available GPU: [9313, 12915] MB
    <br> Available RAM: 27225.375 MB
    <br> After computing the loss, free memory is logged as....
    <br> Available GPU: [9313, 12915] MB
    <br> Available RAM: 27225.56640625 MB
    <br> After computing loss gradients and performing backpropagation, free memory is logged as....
    <br> Available GPU: [9233, 12915] MB
    <br> Available RAM: 27224.55859375 MB
    <br> Time taken to train per 5000 samples-> **201.5807 seconds (~3 minutes 30 seconds)**
    
__Conclusion:-__ _It can be seen that on GPU T4 x2, RoBERTa seems to perform worse than XLM-RoBERTa (expected, since XLMR can handle multiple languages), but for English, the performance of XLMR is the worst._

One thing that is commonly missed during model testing is that not all the GPUs are utilised, even when this is done properly during training. This may cause issues such as CUDA errors when testing on bulk language datasets. To counter this, we use the DataParallel wrapper to distribute model processing across all available GPUs, irrespective of whether the hardware is GPU P100 or GPU T4 x2.
    

In [11]:
# Sanity-check the training hardware before any training/testing cell runs.
expected_msg = "Voila! You have set it up all correct."

if not cuda_is_available:
    # give an explicit reason instead of printing an empty message
    msg = "Oops! No GPU detected, please choose a GPU accelerator!"
else:
    num_gpu_cores = torch.cuda.device_count()  # number of visible CUDA devices
    # just to be on safe side, using T4 x2 for more memory
    msg = expected_msg if num_gpu_cores >= 2 else "Oops! Wrong training hardware chosen!"

print(msg)
# stop the notebook here, with the reason attached, when the hardware is unsuitable
assert msg == expected_msg, msg

Voila! You have set it up all correct.


In [12]:
# call training modules for English and Thai language datasets
try:
    if not cuda_is_available:
        print("Please train on GPUs!")
        raise AssertionError("Appropriate training hardware unit not chosen")

    # change hyperparameters here
    epochs = 3
    optimizer.param_groups[0]['lr'] = 0.03
    optimizer.param_groups[0]['weight_decay'] = 0.007

    # the answer (0 or 1) is forwarded to train() as its logging flag
    is_log_needed = bool(int(input("Want to record logs or not(0/1)?:->")))
    if not is_log_needed:
        print("Training {} model with learning rate: {}, weight decay: {} for {} number of epochs".format(model_name, optimizer.param_groups[0]['lr'], optimizer.param_groups[0]['weight_decay'], epochs))

    # the same optimizer is reused: English (twitter) first, then Thai
    l = train(nlp_dataloaders['twitter'][0], nlp_dataloaders['twitter'][1], optimizer, epochs, is_log_needed)
    l = train(nlp_dataloaders['thai'][0], nlp_dataloaders['thai'][1], optimizer, epochs, is_log_needed)
except Exception as exc:
    # Only genuine errors are handled here (KeyboardInterrupt/SystemExit still propagate),
    # and the cause is reported so that a failed run can actually be diagnosed.
    print("Exception has occured! Please try again.")
    print("Reason: {}: {}".format(type(exc).__name__, exc))

Want to record logs or not(0/1)?:-> 0


Training DistillBERT model with learning rate: 0.03, weight decay: 0.007 for 3 number of epochs
Epoch 1-> Training loss 0.9807632326, Validation loss 0.6863198912, Time taken 1670.5965814590 seconds
Epoch 2-> Training loss 1.8362497858, Validation loss 0.6860823764, Time taken 1664.6386559010 seconds
Epoch 3-> Training loss 0.6924033436, Validation loss 0.7234542374, Time taken 1652.1677691936 seconds
Epoch 1-> Training loss 4.5085909089, Validation loss 0.9921664245, Time taken 407.0619394779 seconds
Epoch 2-> Training loss 1.2584961391, Validation loss 0.9845785437, Time taken 412.9943861961 seconds
Epoch 3-> Training loss 0.9818475802, Validation loss 0.9948842360, Time taken 409.3536829948 seconds


__<center>Model Training logs</center>__

1) Using XLM-RoBERTa Base model:-
   <br> Training XLM-RoBERTa Base model with learning rate: 0.003, weight decay: 0.0001 for 1 number of epochs
   <br> Epoch 1-> Training loss **0.7590388735**, Validation loss **0.7540896507**, Time taken **3059.0003983974 seconds (~51 minutes)**
   <br> Epoch 1-> Training loss **1.0206603203**, Validation loss **0.9978039391**, Time taken **894.9079222679 seconds (~15 minutes)**
   <br> Testing on English Language dataset with batch size 50:-
   <br> Correct predictions **0 out of 30000 test samples**
   <br> Testing on Thai Language dataset with batch size 25:- 
   <br> Correct predictions **1510 out of 2675 test samples**

    <br> Training XLM-RoBERTa Base model with learning rate: 0.007, weight decay: 0.001 for 2 number of epochs
    <br> Epoch 1-> Training loss **0.7582048333**, Validation loss **0.7552044338**, Time taken **2969.9936687946 seconds**
    <br> Epoch 2-> Training loss **0.7570085176**, Validation loss **0.7557407733**, Time taken **2940.7576088905 seconds**
    <br> Epoch 1-> Training loss **1.0431717821**, Validation loss **0.9942249988**, Time taken **887.8887867928 seconds**
    <br> Epoch 2-> Training loss **0.9859426652**, Validation loss **0.9968404985**, Time taken **888.1932497025 seconds**
    <br> Testing on English Language dataset with batch size 50
    <br> Correct predictions **0 out of 30000 test samples**
    <br> Testing on Thai Language dataset with batch size 25
    <br> Correct predictions **1510 out of 2675 test samples**

2) Using DistillBERT model:-
    <br> Training DistillBERT model with learning rate: 0.03, weight decay: 0.007 for 3 number of epochs
    <br> Epoch 1-> Training loss **0.9807632326**, Validation loss **0.6863198912**, Time taken **1670.5965814590 seconds (~28 minutes)**
    <br> Epoch 2-> Training loss **1.8362497858**, Validation loss **0.6860823764**, Time taken **1664.6386559010 seconds**
    <br> Epoch 3-> Training loss **0.6924033436**, Validation loss **0.7234542374**, Time taken **1652.1677691936 seconds**
    <br> Epoch 1-> Training loss **4.5085909089**, Validation loss **0.9921664245**, Time taken **407.0619394779 seconds (~7 minutes)**
    <br> Epoch 2-> Training loss **1.2584961391**, Validation loss **0.9845785437**, Time taken **412.9943861961 seconds**
    <br> Epoch 3-> Training loss **0.9818475802**, Validation loss **0.9948842360**, Time taken **409.3536829948 seconds**
    <br> Testing on English Language dataset with batch size 50
    <br> Correct predictions **0 out of 30000 test samples**
    <br> Testing on Thai Language dataset with batch size 25
    <br> Correct predictions **1510 out of 2675 test samples**

In [13]:
# Persist the trained weights together with the optimiser settings in a checkpoint
# file, so the model can be reloaded later for inference.

optimizer_name = "Adam"
lr_scheduler_name = "ReduceLROnPlateau"
model_state_dict = model.state_dict()

# hyperparameters are read from the first parameter group of the optimizer
first_param_group = optimizer.param_groups[0]

checkpoint_dict = {
    'model_state_dict': model_state_dict,
    'optimizer_name': optimizer_name,
    'lr': first_param_group['lr'],
    'weight_decay': first_param_group['weight_decay'],
    'lr_scheduler_name': lr_scheduler_name,
}

# Kaggle input files are read-only and cannot be overwritten, so the checkpoint is
# written to the working directory; it then has to be downloaded and uploaded to
# the dataset manually.
if not cuda_is_available:
    print("Not able to save file as GPU has not been set up!")
else:
    checkpoint_path = '/kaggle/working/' + checkpoint_file_name
    torch.save(checkpoint_dict, checkpoint_path)

# release unreferenced Python objects and cached GPU memory
gc.collect()
torch.cuda.empty_cache()

In [14]:
# test the model's performance
# wrap the model in DataParallel and move it to CPU/GPU in one step (`.to` returns the module itself)
nlp_parallel_model = nn.DataParallel(model).to(device)

def test(test_dataloader, batch_size):
    """Evaluate the DataParallel-wrapped model on a test split and print the hit count.

    The model is switched to evaluation mode and run without gradient tracking for
    the duration of the call; its previous training/eval mode is restored afterwards.

    Args:
        test_dataloader: iterable yielding ``(sentence, label)`` batches.
        batch_size: nominal batch size of ``test_dataloader``. Kept for backward
            compatibility only; the reported total is counted from the labels that
            were actually seen, so a smaller final batch is not over-counted.

    Returns:
        None. Prints "Correct predictions X out of Y test samples".
    """
    correct_predictions = 0
    total_samples = 0

    was_training = nlp_parallel_model.training
    nlp_parallel_model.eval()  # disable dropout etc. while measuring accuracy
    try:
        with torch.no_grad():  # no autograd graph is needed for inference
            for sentence, label in test_dataloader:
                encoded_input = tokenizer(list(sentence), padding=True, return_tensors='pt', truncation=True, max_length=512, add_special_tokens = True)
                output = nlp_parallel_model(**encoded_input)
                del sentence, encoded_input

                # flatten so the element-wise comparison below is always 1-D vs 1-D
                label = torch.Tensor(label).to(device, dtype=torch.int64).reshape(-1)

                # DistillBERT returns classification scores as `logits`;
                # the other models are read through `pooler_output`
                if model_name == "DistillBERT":
                    scores = output.logits.to(device)
                else:
                    scores = output.pooler_output.to(device)
                del output

                # index of the highest score per sample
                output_label = scores.argmax(dim=1)
                del scores

                # correct predictions made by the model in this batch
                correct_predictions += int((output_label == label).sum().item())
                total_samples += label.numel()

                # drop per-batch tensors and release cached GPU memory before the next batch
                del output_label, label
                gc.collect()
                torch.cuda.empty_cache()
    finally:
        # leave the model in whatever mode the caller had it in
        nlp_parallel_model.train(was_training)

    # printing the result
    print("Correct predictions {} out of {} test samples".format(correct_predictions, total_samples))

if cuda_is_available:
    # (announcement template, dataloader key, batch size) for each test run, in execution order
    test_runs = (
        ("Testing on English Language dataset with batch size {}", 'twitter', batch_size_en),
        ("Testing on Thai Language dataset with batch size {}", 'thai', batch_size_th),
    )
    for announcement, dataset_key, test_batch_size in test_runs:
        print(announcement.format(test_batch_size))
        # index 2 of each dataloader group is the one evaluated by test()
        test(nlp_dataloaders[dataset_key][2], test_batch_size)

Testing on English Language dataset with batch size 50
Correct predictions 0 out of 30000 test samples
Testing on Thai Language dataset with batch size 25
Correct predictions 1510 out of 2675 test samples


## TODOs:

1) State proper reasons for GPU P100 being better for training and validation than GPU T4 x2

2) State proper reasons for why ALBERT may be memory intensive

3) State proper reasons for 0% accuracy on English dataset

   Cue:- Due to the limitations of the available public datasets and label imbalance, this work could not be completed; future work can continue on symmetric datasets with 3 sentiment labels (positive, negative and neutral)

In [6]:
'''
A plausible reason for the poor performance on the English dataset is that the model seems to get
stuck in a local minimum, optimising its parameters towards classifying sentences as irrelevant, while
no such label is present in the original dataset, thereby indicating that a class imbalance issue might be there.


can refer to https://huggingface.co/datasets/prithivMLmods/GPT-Sentiment-Analysis-200K for eng dataset
'''

# take a single batch from the English (twitter) dataloader used for testing, for manual inspection
english_test_loader = nlp_dataloaders['twitter'][2]
sentence, label = next(iter(english_test_loader))

In [11]:
# Run one batch through the bare (non-DataParallel) model to inspect the predicted labels.
encoded_input = tokenizer(list(sentence), padding=True, return_tensors='pt', truncation=True, max_length=512, add_special_tokens = True)
# keep the inputs on the same device as the model weights
encoded_input = encoded_input.to(next(model.parameters()).device)

with torch.no_grad():  # inspection only, no gradients are needed
    output = model(**encoded_input)

# same output selection as in test(): DistillBERT exposes `logits`,
# the other models are read through `pooler_output`
if model_name == "DistillBERT":
    output = output.logits
else:
    output = output.pooler_output

# top-1 class index per sentence, kept as a column of shape (batch, 1)
_, output_label = output.topk(1, dim = -1)

output_label

tensor([[1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1]])

In [12]:
# convert the batch labels to an int64 tensor so they can be compared with the predicted indices
label = torch.Tensor(label).long()
label

tensor([2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 2, 2,
        2, 0])