# <font color='salmon'>Final project - sentiment analysis comparison part 4</font>
Using the Hugging Face wrapper and PyTorch

# Imports & functions

In [None]:
import pandas as pd
import numpy as np
import math
import string
#connect to drive
from google.colab import drive
# visualisation
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [None]:
!pip install -qq transformers

import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel,DistilBertConfig, DistilBertForSequenceClassification, AdamW, AutoModelForSequenceClassification ,DistilBertPreTrainedModel
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from transformers.modeling_outputs import (SequenceClassifierOutput)
# Pick the compute device once: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Custom dataset: tokenizes one (text, label) row per lookup
class SentimentDataset(Dataset):
    """Map-style dataset over two columns of a tabular `data` object.

    Args:
        data: object indexable by column name whose columns expose `.tolist()`.
        x_name: name of the column holding the raw text.
        y_name: name of the column holding the label.
        tokenizer: object providing `encode_plus(...)`; called once per item.
        max_length: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, data,x_name,y_name, tokenizer, max_length):
        # Columns are copied into plain Python lists up front so that item
        # lookup is a simple list index.
        self.texts = data[x_name].tolist()
        self.labels = data[y_name].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize sample `idx` and return its tensors on the module-level `device`.

        Returns:
            dict with keys 'input_ids', 'attention_mask' and 'label'; the label
            is cast to `torch.long`.
        """
        raw_text = self.texts[idx]
        raw_label = self.labels[idx]
        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' is requested above; squeeze() drops the
        # size-1 dimensions of the returned tensors before they are moved.
        sample = {
            key: encoded[key].squeeze().to(device)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(raw_label).to(device, dtype=torch.long)
        return sample

In [None]:
class DistilBERTClass(DistilBertPreTrainedModel):
    """DistilBERT backbone with frozen parameters plus a small trainable head.

    The head is: Linear(dim, dim) -> ReLU -> Dropout -> Linear(dim, 2), applied
    to the hidden state at sequence position 0.

    NOTE(review): `num_labels` is fixed to 2 in `__init__`, so the MSE branch in
    `forward` is only taken if `num_labels` is reassigned after construction.
    NOTE(review): the backbone attribute is named `distil_bert`; presumably
    Hugging Face checkpoints store the backbone under `distilbert`, in which
    case `from_pretrained` would not load the backbone weights - confirm.
    """

    def __init__(self, config):
        super().__init__(config)
        self.distil_bert = DistilBertModel(config)
        # Freeze every backbone parameter; only the head layers below stay trainable.
        self.distil_bert.requires_grad_(False)
        self.num_labels = 2
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.dropout = nn.Dropout(config.seq_classif_dropout)
        self.classifier = nn.Linear(config.dim, 2)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the backbone and the head; compute a loss when `labels` is given.

        Returns:
            `SequenceClassifierOutput` when dict-style output is requested (or
            enabled by `config.use_return_dict` while `return_dict` is None);
            otherwise a tuple `(logits, *extra_backbone_outputs)`, prefixed
            with the loss when one was computed.
        """
        backbone_out = self.distil_bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # First backbone output, sliced at sequence position 0.
        first_token_state = backbone_out[0][:, 0]
        head_features = torch.relu(self.pre_classifier(first_token_state))
        logits = self.classifier(self.dropout(head_features))

        loss = None
        if labels is not None:
            flat_labels = labels.view(-1)
            if self.num_labels == 1:
                loss = nn.MSELoss()(logits.view(-1), flat_labels)
            else:
                loss = nn.CrossEntropyLoss()(logits.view(-1, self.num_labels), flat_labels)

        if return_dict is None:
            return_dict = self.config.use_return_dict

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_out.hidden_states,
                attentions=backbone_out.attentions,
            )

        tuple_output = (logits,) + backbone_out[1:]
        return tuple_output if loss is None else (loss,) + tuple_output


In [None]:
def training_loop(model,train_data_loader,len_train_dataset,epoch):
    """Train `model` for one pass over `train_data_loader`.

    Args:
        model: callable as ``model(input_ids, attention_mask=..., labels=...)``
            returning an object with ``.loss`` and ``.logits``.
        train_data_loader: iterable of batches; each batch is a mapping with the
            keys 'input_ids', 'attention_mask' and 'label'.
        len_train_dataset: number of training samples; denominator for the
            returned accuracy and averaged loss.
        epoch: epoch index supplied by the caller; it only selects the
            learning rate (see the schedule below).

    Returns:
        ``(train_accuracy, train_loss)`` where the loss is the sample-weighted
        mean of the per-batch losses.
    """
    model.train()

    # Step-decayed learning rate keyed on the caller's epoch index:
    #   epoch 0-5 -> 5e-4, epoch 6-8 -> 1e-4, epoch 9 and later -> 1e-5.
    if epoch < 6:
        learning_rate = 5e-4
    elif epoch < 9:
        learning_rate = 1e-4
    else:
        learning_rate = 1e-5

    # torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
    # eps / weight_decay are pinned to what transformers.AdamW used by default
    # (1e-6 / 0.0) - NOTE(review): confirm against the installed version.
    # A fresh optimizer is built on every call, so Adam's moment estimates do
    # not carry over between epochs (unchanged from the previous behaviour).
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )

    train_loss = 0.0
    train_correct = 0
    for batch_number, batch in enumerate(train_data_loader, start=1):
        if batch_number % 250 == 0:
            print (f'batch number {batch_number}')
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backpropagate and update for EVERY batch. Previously these two calls
        # sat after the loop, so an epoch performed a single update computed
        # from the last batch only.
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(logits, dim=1)
        train_correct += (predicted == labels).sum().item()
        # Weight the batch loss by the batch size so the final division
        # yields a per-sample average.
        train_loss += loss.item() * input_ids.size(0)

    train_accuracy = train_correct / len_train_dataset
    train_loss /= len_train_dataset # avg loss per sample
    return train_accuracy,train_loss


In [None]:
def val_model(model,data_loader,len_dataset,phase):# phase = test or else
    """Evaluate `model` on `data_loader` without updating any weights.

    Args:
        model: callable as ``model(input_ids, attention_mask=..., labels=...)``
            returning an object with ``.loss`` and ``.logits``.
        data_loader: iterable of batches; each batch is a mapping with the keys
            'input_ids', 'attention_mask' and 'label'.
        len_dataset: number of samples served by `data_loader`; denominator for
            accuracy and the averaged loss.
        phase: "test" selects the test report; any other value is treated as
            validation.

    Returns:
        ``(accuracy, precision, recall)`` when ``phase == "test"`` (the scores
        are also printed); otherwise ``(accuracy, loss)``. Precision and recall
        treat label 1 as the positive class and are reported as 0.0 when their
        denominator is zero.
    """
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_true_positives = 0
    val_false_positives = 0
    val_false_negatives = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['label']
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            _, predicted = torch.max(logits, dim=1)
            val_correct += (predicted == labels).sum().item()
            # Weight by batch size so the final division is a per-sample mean.
            val_loss += loss.item() * input_ids.size(0)
            val_true_positives += torch.logical_and(predicted == 1, labels == 1).sum().item()
            val_false_positives += torch.logical_and(predicted == 1, labels == 0).sum().item()
            val_false_negatives += torch.logical_and(predicted == 0, labels == 1).sum().item()

    val_accuracy = val_correct / len_dataset
    val_loss /= len_dataset

    # A model that never predicts class 1 (or data with no class-1 labels)
    # makes these denominators zero; report 0.0 instead of raising
    # ZeroDivisionError, which previously also aborted validation runs.
    predicted_positives = val_true_positives + val_false_positives
    actual_positives = val_true_positives + val_false_negatives
    val_precision = val_true_positives / predicted_positives if predicted_positives else 0.0
    val_recall = val_true_positives / actual_positives if actual_positives else 0.0

    if phase=="test":
        print(f"Test Accuracy: {val_accuracy:.4f} | Test Precision: {val_precision:.4f} | Test Recall: {val_recall:.4f}")
        return val_accuracy,val_precision,val_recall
    else: #phase = validation
        return val_accuracy,val_loss

# Load data

In [None]:
# data is from : https://www.kaggle.com/datasets/therohk/ireland-historical-news?datasetId=30661
# NOTE(review): the URL above names an Irish news dataset, while the frame loaded
# below carries a `label` column and review text - confirm the provenance link.
drive.mount('/content/drive')
path = "/content/drive/MyDrive/final project/data_files/nlp_data.csv"
# Read the CSV, discard the saved index column and expose the cleaned text as `review`.
df = (
    pd.read_csv(path)
    .drop(columns=['Unnamed: 0'])
    .rename(columns={"clean_reviews": "review"})
)
print(df.shape)
df.head()

Mounted at /content/drive
(50000, 3)


Unnamed: 0,label,review,final_tokenized
0,0.0,thats keep ask many fight scream match swear g...,"['thats', 'keep', 'ask', 'many', 'fight', 'scr..."
1,0.0,watch entire movie could watch entire movie st...,"['watch', 'entire', 'movie', 'could', 'watch',..."
2,1.0,touch love story reminiscent mood love draw he...,"['touch', 'love', 'story', 'reminiscent', 'moo..."
3,0.0,latterday Fulci schlocker totally abysmal conc...,"['latterday', 'Fulci', 'schlocker', 'totally',..."
4,0.0,First firmly believe Norwegian movie continual...,"['First', 'firmly', 'believe', 'Norwegian', 'm..."


## Split into train, validation and test

In [None]:
# Two-stage split: hold out 30% of the rows, then cut that hold-out in half,
# i.e. 70% train / 15% validation / 15% test. Both calls pin random_state to 0.

train_df, temp_val_df = train_test_split(df, random_state=0, test_size=0.3)
val_df, test_df = train_test_split(temp_val_df, random_state=0, test_size=0.5)

print(*(part.shape for part in (train_df, val_df, test_df)))

(35000, 3) (7500, 3) (7500, 3)


# Tokenize and encode

In [None]:
# Pre-trained DistilBERT tokenizer, fetched by checkpoint name
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Batch size and the length every encoded sequence is padded / truncated to
batch_size = 64
max_length = 512

# One SentimentDataset per split, all reading the 'review' text and 'label' columns
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_frame, 'review', 'label', tokenizer, max_length)
    for split_frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)

# create model's instance
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model.to(device)

# Freeze every parameter, then re-enable gradients for the two head layers
# (pre_classifier and classifier) that follow the encoder.
# Every parameter of those layers is unfrozen: previously only `.weight` was
# re-enabled, which left the biases of both layers frozen.
for params in model.parameters():
    params.requires_grad=False
for head_layer in (model.pre_classifier, model.classifier):
    for params in head_layer.parameters():
        params.requires_grad=True

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
#check model's architecture to know which layers to freeze and which to allow grad
# model.modules

In [None]:
# # Train the model to find ideal hyperparameters
# num_epochs = 12
# results_summary = pd.DataFrame(columns=['epoch','phase', 'loss', 'accuracy'])

# for epoch in range(num_epochs):
#     #train
#     train_accuracy,train_loss=training_loop(model,train_loader,len(train_dataset),epoch)
#     #validation
#     val_accuracy,val_loss=val_model(model,val_loader,len(val_dataset),"validaion")

#     epoch_results_dict={'epoch':[epoch+1,epoch+1],
#         'phase':['train','val'],
#         'loss':[train_loss,val_loss],
#         'accuracy':[train_accuracy,val_accuracy]
#        }
#     epoch_results_df = pd.DataFrame(epoch_results_dict)
#     results_summary = pd.concat([results_summary, epoch_results_df], ignore_index = True)

#     print(f"Epoch {epoch+1}/{num_epochs}")
#     print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
#     print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")
#     print("--------------------")

# results_summary.head(20)



batch number 500
batch number 1000
Epoch 1/15
Train Loss: 0.8104 | Train Accuracy: 0.7693
Val Loss: 0.6118 | Val Accuracy: 0.7949
--------------------




batch number 500
batch number 1000
Epoch 2/15
Train Loss: 0.6647 | Train Accuracy: 0.7727
Val Loss: 0.5027 | Val Accuracy: 0.8055
--------------------




batch number 500
batch number 1000
Epoch 3/15
Train Loss: 0.5323 | Train Accuracy: 0.7896
Val Loss: 0.4221 | Val Accuracy: 0.8151
--------------------




batch number 500
batch number 1000
Epoch 4/15
Train Loss: 0.4400 | Train Accuracy: 0.8018
Val Loss: 0.4088 | Val Accuracy: 0.8233
--------------------




batch number 500
batch number 1000
Epoch 5/15
Train Loss: 0.4220 | Train Accuracy: 0.8062
Val Loss: 0.4064 | Val Accuracy: 0.8272
--------------------




batch number 500
batch number 1000
Epoch 6/15
Train Loss: 0.4105 | Train Accuracy: 0.8210
Val Loss: 0.3995 | Val Accuracy: 0.8296
--------------------




batch number 500
batch number 1000
Epoch 7/15
Train Loss: 0.4067 | Train Accuracy: 0.8203
Val Loss: 0.3945 | Val Accuracy: 0.8285
--------------------




batch number 500
batch number 1000
Epoch 8/15
Train Loss: 0.4028 | Train Accuracy: 0.8166
Val Loss: 0.3942 | Val Accuracy: 0.8284
--------------------




batch number 500
batch number 1000
Epoch 9/15
Train Loss: 0.4040 | Train Accuracy: 0.8168
Val Loss: 0.3941 | Val Accuracy: 0.8292
--------------------




batch number 500
batch number 1000
Epoch 10/15
Train Loss: 0.4042 | Train Accuracy: 0.8175
Val Loss: 0.3940 | Val Accuracy: 0.8292
--------------------




batch number 500


In [None]:
# fig = px.line(results_summary, x="epoch", y="loss", color='phase',color_discrete_sequence=[px.colors.qualitative.Pastel[4],px.colors.qualitative.Pastel[0]],title="loss per epoch")
# fig.show()

# Merge train and validation for final training

In [None]:
# Stack the train and validation rows for the final fit; the index is rebuilt from 0.
final_train = pd.concat((train_df, val_df), axis=0, ignore_index=True)

In [None]:
# Wrap the merged frame with the same tokenizer and max_length as the earlier splits
final_train_dataset = SentimentDataset(
    final_train,
    x_name='review',
    y_name='label',
    tokenizer=tokenizer,
    max_length=max_length,
)

# Shuffled mini-batches for the final fit
final_train_loader = DataLoader(dataset=final_train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Train the model on train + validation, then score it once on the test split
num_epochs = 6
# training_loop() picks its learning rate from the epoch index it receives.
# Offsetting by 6 starts the final fit at the 1e-4 stage of that schedule
# (schedule epochs 6-8) and then drops to 1e-5 (schedule epoch 9 onwards).
lr_schedule_offset = 6
final_records = []

for final_epoch in range(num_epochs):
    epoch = final_epoch + lr_schedule_offset
    #train
    train_accuracy,train_loss=training_loop(model,final_train_loader,len(final_train_dataset),epoch)

    # Collect plain records and build the frame once after the loop instead of
    # growing a DataFrame with pd.concat on every iteration.
    final_records.append({'epoch':epoch,
        'phase':'final_train',
        'loss':train_loss,
        'accuracy':train_accuracy
       })

    # Progress counts the epochs of THIS run (1..num_epochs). The previous
    # message printed the offset schedule epoch against num_epochs ("Epoch 7/6").
    print(f"Epoch {final_epoch+1}/{num_epochs}")
    print(f"Final Train Loss: {train_loss:.4f} | Final Train Accuracy: {train_accuracy:.4f}")
    print("--------------------")

final_results_summary = pd.DataFrame(final_records, columns=['epoch','phase', 'loss', 'accuracy'])

test_accuracy,test_precision,test_recall=val_model(model,test_loader,len(test_dataset),"test")



batch number 500
Epoch 7/2
Final Train Loss: 0.7759 | Final Train Accuracy: 0.8079
--------------------




batch number 500
Epoch 8/2
Final Train Loss: 0.7613 | Final Train Accuracy: 0.8072
--------------------
Test Accuracy: 0.8189 | Test Precision: 0.8717 | Test Recall: 0.7448
