IMPORTING LIBRARIES

In [1]:
#installing packages
!pip install datasets
!pip install transformers


#libraries import
import csv
import torch
import datasets
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1

DATASET

In [2]:
train,test  = load_dataset("imdb", split =['train', 'test'])

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


DATA PREPROCESSING
- Tokenization
- Formatting

In [4]:
def tokenization(batched_text):
  tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
  return tokenizer(batched_text['text'], padding = True, truncation=True)

In [5]:
train_data = train.map(tokenization, batched = True, batch_size = len(train))
test_data = test.map(tokenization, batched = True, batch_size = len(test))
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

TRAINING ARGUMENTS 
- Compute metrics on train and test data after 1 epoch run

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
training_args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/Colab Notebooks/Chase/output_dir',
    num_train_epochs=1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size= 8,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=500,    #after how many steps does learning rate reduce 
    weight_decay=0.01,   #L2 regularisation
    logging_steps = 8, 
    fp16 = False,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/Chase/logs',
    dataloader_num_workers = 8,
    run_name = 'roberta-classification'
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.2159,0.192767,0.94348,0.942305,0.962305,0.92312


TrainOutput(global_step=390, training_loss=0.3069078607436938, metrics={'train_runtime': 3149.3907, 'train_samples_per_second': 7.938, 'train_steps_per_second': 0.124, 'total_flos': 6567251941785600.0, 'train_loss': 0.3069078607436938, 'epoch': 1.0})

In [8]:
model1 = RobertaForSequenceClassification.from_pretrained('roberta-base')

for module in model1.roberta.encoder.layer[:-2]:
  for param in module.parameters():
      param.requires_grad = False


training_args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/Colab Notebooks/Chase/output_dir',
    num_train_epochs=1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,    
    per_device_eval_batch_size= 8,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=500,    #after how many steps does learning rate reduce 
    weight_decay=0.01,   #L2 regularisation
    logging_steps = 8, 
    fp16 = False,
    logging_dir='/content/drive/MyDrive/Colab Notebooks/Chase/logs',
    dataloader_num_workers = 8,
    run_name = 'roberta-classification_finetune_last2layers'
)

trainer = Trainer(
    model=model1,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

'cuda'

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,0.1805,0.155704,0.94212,0.94182,0.94673,0.93696


TrainOutput(global_step=390, training_loss=0.3840401012163896, metrics={'train_runtime': 2601.8254, 'train_samples_per_second': 9.609, 'train_steps_per_second': 0.15, 'total_flos': 6567251941785600.0, 'train_loss': 0.3840401012163896, 'epoch': 1.0})

Observations:
1. Faster Training when compared with fine tuning all layers than last 2 layers


Could have done?
1. Confusion Matrix 

Faster Training