In [None]:
# Group - 14
# Department of Computer Science, University at Buffalo



# PS-2: Sentiment Analysis
# Data Set - IMDB movie reviews - Kaggle
# Bert

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import matplotlib.pyplot as plt
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset,random_split, RandomSampler,SequentialSampler
from torch.utils.data import random_split


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,f1_score,recall_score

In [None]:
# Install the Hugging Face transformers package; this step is required when running on Google Colab.
!pip install transformers



In [None]:
# Load the IMDB reviews. Malformed rows (Colab reported parser errors about unexpected
# characters on a few lines) are skipped instead of aborting the read.
# `on_bad_lines='skip'` replaces the deprecated `error_bad_lines=False`, which was removed in pandas 2.0.
df=pd.read_csv('IMDB Dataset.csv',on_bad_lines='skip',engine="python")



  df=pd.read_csv('IMDB Dataset.csv',error_bad_lines=False,engine="python")


In [None]:
from transformers import BertTokenizer
from transformers import InputExample, InputFeatures
from transformers import BertForSequenceClassification

# WordPiece tokenizer matching the pretrained "bert-base-uncased" checkpoint.
# The same `note` object is used later both to encode the dataset and to encode custom text at inference time.
note = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Quick sanity check of the raw data: first rows, then column dtypes and non-null counts.
print(df.head())
df.info()

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Encode the string labels as integers: positive -> 0, negative -> 1.
# NOTE(review): with this encoding, class 1 is "negative"; any binary metric that scores
# label 1 as the positive class therefore describes the negative reviews.
sen_update={'positive':0, 'negative':1}
df['sentiment'] = df['sentiment'].replace(sen_update)

In [None]:
df['review'].isnull().sum(axis=0) # number of missing reviews; 0 means every row has review text

0

In [None]:
dataset = df.drop_duplicates() # fully duplicated rows are dropped; the original index labels are kept (not reset)

In [None]:
# Confirm the row count after de-duplication and that `sentiment` is now an integer column.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49582 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [None]:
# Prepare the inputs for BERT: add the special tokens, truncate every review to at most
# 256 tokens and pad shorter reviews so that every encoded sequence has exactly 256 tokens.
data_bert_input = note.batch_encode_plus(
    dataset['review'].tolist(),   # plain list of strings; avoids relying on the Series' non-contiguous index
    return_tensors='pt',
    padding='max_length',         # always pad to max_length, independent of the longest review present
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=256,
    truncation=True
)
input_ids = data_bert_input['input_ids']
attention_mask = data_bert_input['attention_mask']
labels = torch.tensor(dataset['sentiment'].values)

tensoredDataset = TensorDataset(input_ids, attention_mask, labels)
# 70% of the data for training, 20% for testing and the remaining ~10% for validation
train_size = int(0.7 * len(tensoredDataset))
test_size =int(0.2 * len(tensoredDataset))
val_size = len(tensoredDataset) - train_size - test_size

# batch size is kept low because larger batches exhaust the Colab GPU memory
batch_size = 8

# A seeded generator makes the train/val/test partition identical on every run, so the
# reported validation and test numbers are comparable across kernel restarts.
train_dataset, val_dataset, test_dataset= random_split(
    tensoredDataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
# Data loaders: training batches are shuffled; evaluation batches are read in a fixed order
# because shuffling has no benefit there and only makes runs harder to compare.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset),batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset,sampler=SequentialSampler(test_dataset),batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Instantiate BERT with a sequence-classification head on top of the pretrained "bert-base-uncased" encoder.
# The task is binary (positive / negative), so the head has num_labels = 2 outputs.
# Attention maps and hidden states are not needed downstream, so the model is told not to return them.
Bertimdb = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels = 2, output_attentions = False, output_hidden_states = False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
# `device` is read as a global by the training, evaluation, test and prediction functions below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
#evaluation of the model, used in this notebook for the validation dataset.
def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradient tracking and collect metrics.

    Relies on the module-level ``device`` and ``criterion``.

    Args:
        model: classifier whose forward pass returns an object with ``.logits``
            holding two logits per example.
        dataloader: iterable of batches whose first three elements are
            ``input_ids``, ``attention_mask`` and integer class ``labels``.

    Returns:
        dict with
            'val_loss': mean per-batch loss (``nan`` if the loader is empty),
            'predictions': list of predicted class ids,
            'true_vals': list of true class ids,
            'val_correct_total': number of correctly classified examples.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    all_predictions = []
    all_true_vals = []

    with torch.no_grad():
        for values in dataloader:
            input_ids, attention_mask, labels = (t.to(device) for t in values[:3])
            # Labels are deliberately not passed to the model: doing so would make it compute
            # its own loss, which is never used here (the loss comes from `criterion`).
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # one-hot encode the labels so they match the (batch, 2) shape of the logits
            labels_onehot = torch.nn.functional.one_hot(labels, num_classes=2).float()
            loss = criterion(logits,labels_onehot)
            total_loss += loss.item()
            _, predicted_labels = torch.max(logits, 1)
            # count the correct predictions for the accuracy computed by the caller
            total_correct += (predicted_labels == labels).sum().item()
            all_predictions.extend(predicted_labels.tolist())
            all_true_vals.extend(labels.tolist())

    num_batches = len(dataloader)
    # An empty loader would otherwise raise ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    return {
        'val_loss': avg_loss,
        'predictions': all_predictions,
        'true_vals': all_true_vals,
        'val_correct_total': total_correct,
    }


In [None]:
# Release cached GPU memory left over from earlier cells before training starts.
torch.cuda.empty_cache()
# Fine-tuning a pretrained BERT needs a very small step size; 1e-5 matches the
# learning_rate passed to train_model below. The previous value of 1e-3 is orders of
# magnitude above the usual fine-tuning range and risks destroying the pretrained weights.
optimizer = torch.optim.Adam(Bertimdb.parameters(), lr=1e-5)
# The training/evaluation code one-hot encodes the labels, so the loss compares the two
# logits against a two-element 0/1 target.
criterion = nn.BCEWithLogitsLoss()

In [None]:
#training block for the model
def train_model(Bertimdb,train_dataloader, val_dataloader, epochs, learning_rate):
    """Fine-tune ``Bertimdb`` and report loss, accuracy and F1 after every epoch.

    Relies on the module-level ``device`` and ``criterion`` and on ``evaluate_model``.

    Args:
        Bertimdb: classifier whose forward pass returns an object with ``.logits``
            holding two logits per example.
        train_dataloader: yields ``(input_ids, attention_mask, labels)`` batches used
            for the gradient updates.
        val_dataloader: batches handed to ``evaluate_model`` at the end of each epoch.
        epochs: number of full passes over ``train_dataloader``.
        learning_rate: step size of the Adam optimizer created inside this function.

    Returns:
        dict of per-epoch lists with keys 'train_loss', 'train_accuracy',
        'val_loss', 'val_accuracy' and 'val_f1' (accuracies are percentages).
    """
    Bertimdb.to(device)
    # Build the optimizer here so that `learning_rate` is actually honoured and the
    # optimizer is guaranteed to update the parameters of the model passed in.
    optimizer = torch.optim.Adam(Bertimdb.parameters(), lr=learning_rate)
    trainloss_values=[]
    trainaccuracy_values=[]
    valloss_values=[]
    valaccuracy_values=[]
    valf1_values=[]
    #to display the progress in bars tqdm is used
    bars = tqdm(range(1, epochs+1), desc='Epochs', leave=False)
    for epoch in bars:
        Bertimdb.train()
        # Accumulators are reset every epoch so that the reported loss and accuracy
        # describe this epoch only instead of a running total over all epochs.
        train_loss = 0.0
        train_correct_total = 0
        train_total_samples = 0
        for batch_index, batch in enumerate(train_dataloader):
            Bertimdb.zero_grad()
            batch = tuple(map(lambda b: b.to(device), batch))
            labels = batch[2]
            # Labels are not passed to the model: its built-in loss would be computed
            # and thrown away, since the loss below comes from `criterion`.
            outputs = Bertimdb(input_ids=batch[0], attention_mask=batch[1])
            logits = outputs.logits
            # one-hot encode the labels so they match the (batch, 2) shape of the logits
            labels_onehot = torch.nn.functional.one_hot(labels, num_classes=2).float()

            loss = criterion(logits,labels_onehot)
            train_loss += loss.item()
            loss.backward()
            # clip the gradient norm to 1.0 to keep the updates stable
            torch.nn.utils.clip_grad_norm_(Bertimdb.parameters(), 1.0)
            optimizer.step()
            _, predicted_labels = torch.max(logits, 1)
            # compare predictions with the true labels to track training accuracy
            train_correct_total += (predicted_labels == labels).sum().item()
            train_total_samples += len(labels)
            # live progress: which epoch/batch is running and the current batch loss
            # (`loss` is the value returned by `criterion` for this batch; it is shown as-is,
            # without the former division by len(batch), which is the tuple length 3).
            bars.set_postfix({
            'Epoch': epoch,
            'Batch': batch_index,
            'Training Loss': '{:.3f}'.format(loss.item())
            })
        #mean batch loss for the epoch
        avgtrainloss = train_loss / max(len(train_dataloader), 1)
        trainloss_values.append(avgtrainloss)
        #validation dataset used to evaluate the model
        validation_output = evaluate_model(Bertimdb,val_dataloader)
        validationloss = validation_output['val_loss']
        predictions = validation_output['predictions']
        originalvalues= validation_output['true_vals']
        val_correct_total= validation_output['val_correct_total']
        #f1 score calculation; `predictions` already holds class ids (a flat list),
        #so no argmax is needed (argmax over axis 1 of a 1-D array raises AxisError).
        f1value = f1_score(np.array(originalvalues), np.array(predictions))
        valf1_values.append(f1value)
        valloss_values.append(validationloss)
        train_accuracy = 100* train_correct_total / max(train_total_samples, 1)
        validationaccuracy =100* val_correct_total / max(len(val_dataloader.dataset), 1)
        trainaccuracy_values.append(train_accuracy)
        valaccuracy_values.append(validationaccuracy)
        tqdm.write(f"|Epoch {epoch}/{epochs} | Train Loss: {avgtrainloss} |Train Accuracy: {train_accuracy} | Val Loss: {validationloss} | Val Accuracy: {validationaccuracy}| F1 score: {f1value}")

    # Hand the collected curves back to the caller (e.g. for plotting).
    return {
        'train_loss': trainloss_values,
        'train_accuracy': trainaccuracy_values,
        'val_loss': valloss_values,
        'val_accuracy': valaccuracy_values,
        'val_f1': valf1_values,
    }




In [None]:
# Fine-tune the classifier: two passes over the training split at a learning rate of 1e-5.
epochs = 2

train_model(
    Bertimdb,
    train_dataloader,
    val_dataloader,
    epochs=epochs,
    learning_rate=1e-5,
)



Epochs:   0%|          | 0/2 [26:28<?, ?it/s, Epoch=1, Batch=4338, Training Loss=0.001]
  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:09,  2.23it/s][A
  1%|▏         | 2/155 [00:00<01:09,  2.20it/s][A
  2%|▏         | 3/155 [00:01<01:08,  2.22it/s][A
  3%|▎         | 4/155 [00:01<01:08,  2.21it/s][A
  3%|▎         | 5/155 [00:02<01:07,  2.21it/s][A
  4%|▍         | 6/155 [00:02<01:07,  2.22it/s][A
  5%|▍         | 7/155 [00:03<01:06,  2.22it/s][A
  5%|▌         | 8/155 [00:03<01:06,  2.22it/s][A
  6%|▌         | 9/155 [00:04<01:06,  2.21it/s][A
  6%|▋         | 10/155 [00:04<01:05,  2.21it/s][A
  7%|▋         | 11/155 [00:04<01:05,  2.20it/s][A
  8%|▊         | 12/155 [00:05<01:04,  2.21it/s][A
  8%|▊         | 13/155 [00:05<01:04,  2.21it/s][A
  9%|▉         | 14/155 [00:06<01:03,  2.21it/s][A
 10%|▉         | 15/155 [00:06<01:03,  2.21it/s][A
 10%|█         | 16/155 [00:07<01:02,  2.21it/s][A
 11%|█         | 17/155 [00:07<01:02,  2.21it/

|Epoch 1/2 | Train Loss: 0.3000410685439844 |Train Accuracy: 89.26729478203244 | Val Loss: 0.29469912809229665 | Val Accuracy: 91.7322040734019| F1 score: 0.9172401934089471


Epochs:  50%|█████     | 1/2 [54:08<27:38, 1658.85s/it, Epoch=2, Batch=4338, Training Loss=0.639]
  0%|          | 0/155 [00:00<?, ?it/s][A
  1%|          | 1/155 [00:00<01:08,  2.26it/s][A
  1%|▏         | 2/155 [00:00<01:09,  2.21it/s][A
  2%|▏         | 3/155 [00:01<01:08,  2.23it/s][A
  3%|▎         | 4/155 [00:01<01:08,  2.22it/s][A
  3%|▎         | 5/155 [00:02<01:07,  2.22it/s][A
  4%|▍         | 6/155 [00:02<01:07,  2.22it/s][A
  5%|▍         | 7/155 [00:03<01:06,  2.21it/s][A
  5%|▌         | 8/155 [00:03<01:06,  2.22it/s][A
  6%|▌         | 9/155 [00:04<01:06,  2.21it/s][A
  6%|▋         | 10/155 [00:04<01:05,  2.22it/s][A
  7%|▋         | 11/155 [00:04<01:05,  2.21it/s][A
  8%|▊         | 12/155 [00:05<01:04,  2.21it/s][A
  8%|▊         | 13/155 [00:05<01:04,  2.21it/s][A
  9%|▉         | 14/155 [00:06<01:03,  2.20it/s][A
 10%|▉         | 15/155 [00:06<01:03,  2.21it/s][A
 10%|█         | 16/155 [00:07<01:02,  2.21it/s][A
 11%|█         | 17/155 [00:07<01:02

|Epoch 2/2 | Train Loss: 0.2052658158826669 |Train Accuracy: 91.88204108681246 | Val Loss: 0.31719807696288393 | Val Accuracy: 92.09518047993546| F1 score: 0.9209309146843441




In [None]:
# Persist only the model's state_dict (its weights) to "bert.pt" so the trained weights can be reused for testing later.
# To reuse them, an identically configured model has to be created first and the state_dict loaded into it.
torch.save(Bertimdb.state_dict(), "bert.pt")

In [None]:
#for testing on the unknown (held-out) test dataset
def test_model(Bertimdb,test_loader):
    """Compute and print the classification accuracy of ``Bertimdb`` on ``test_loader``.

    Relies on the module-level ``device``.

    Args:
        Bertimdb: classifier whose forward pass returns an object with ``.logits``.
        test_loader: yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        Accuracy as a percentage (0.0 when the loader yields no examples).
    """
    # Make sure the model sits on the same device as the batches and that dropout is
    # disabled; otherwise the result depends on whichever mode an earlier cell left behind.
    Bertimdb.to(device)
    Bertimdb.eval()
    correct =0
    total=0
    with torch.no_grad():
        for batch in tqdm(test_loader):
            batch = tuple(map(lambda b: b.to(device), batch))
            labels = batch[2]
            # Labels are only needed for the comparison below, not for the forward pass.
            outputs = Bertimdb(input_ids=batch[0], attention_mask=batch[1])
            _, predicted_labels = torch.max(outputs.logits, 1)
            correct += (predicted_labels == labels).sum().item()
            total += len(labels)
    # Guard against an empty loader instead of raising ZeroDivisionError.
    accuracy=100 * (correct /total) if total else 0.0
    print('Test Accuracy : ', accuracy)
    return accuracy

# Keep the score in a variable; assigning it also prevents the cell from echoing the value a second time.
test_accuracy = test_model(Bertimdb,test_dataloader)

100%|██████████| 310/310 [02:20<00:00,  2.21it/s]

Test Accuracy :  91.82129891085114





In [None]:
#using our trained model to classify a new review that is passed in.
def predict_custom_text(text):
    """Classify a single review with the fine-tuned model.

    Relies on the module-level tokenizer ``note``, model ``Bertimdb`` and ``device``.

    Args:
        text: the raw review text.

    Returns:
        Tuple ``(predicted_class_label, probabilities)`` where the label is
        'positive' or 'negative' and ``probabilities`` is the softmax over the two
        logits, ordered as [positive, negative].
    """
    # Truncate to the same 256-token limit that was used when encoding the training data,
    # so the model sees inputs of the length it was fine-tuned on.
    inputs = note(text, return_tensors='pt', truncation=True, max_length=256, padding=True)
    inputs=inputs.to(device)
    # The model must be on the same device as the inputs; this is a no-op if it already is
    # and avoids a device-mismatch error when the model was not trained in this session.
    Bertimdb.to(device)
    Bertimdb.eval()
    with torch.no_grad():
        outputs = Bertimdb(**inputs)
    logits = outputs.logits
    _, predicted_label = torch.max(logits, dim=1)
    class_labels = ['positive','negative'] # 'positive' comes first because the dataset labels were encoded as positive -> 0 and negative -> 1
    predicted_class_label = class_labels[predicted_label.item()]
    #softmax turns the two logits into class probabilities that sum to 1
    probabilities = torch.softmax(logits, dim=1)
    return predicted_class_label, probabilities

# Demo: classify one hand-picked (clearly negative) review and show the verdict next to the text.
custom_text = "My first exposure to the Templarios & not a good one. I was excited to find this title among the offerings from Anchor Bay Video, which has brought us other cult classics such as 'Spider Baby'. The print quality is excellent, but this alone can't hide the fact that the film is deadly dull. There's a thrilling opening sequence in which the villagers exact a terrible revenge on the Templars (& set the whole thing in motion), but everything else in the movie is slow, ponderous &, ultimately, unfulfilling. Adding insult to injury: the movie was dubbed, not subtitled, as promised on the video jacket. "
predicted_label, probabilities = predict_custom_text(custom_text)
print(f"Predicted Label: {predicted_label}")
print(f"Review: {custom_text}")

Predicted Label: negative
Review: My first exposure to the Templarios & not a good one. I was excited to find this title among the offerings from Anchor Bay Video, which has brought us other cult classics such as 'Spider Baby'. The print quality is excellent, but this alone can't hide the fact that the film is deadly dull. There's a thrilling opening sequence in which the villagers exact a terrible revenge on the Templars (& set the whole thing in motion), but everything else in the movie is slow, ponderous &, ultimately, unfulfilling. Adding insult to injury: the movie was dubbed, not subtitled, as promised on the video jacket. 


References:
1. https://www.analyticsvidhya.com/blog/2021/12/fine-tune-bert-model-for-sentiment-analysis-in-google-colab/
2. https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook
3. https://www.kaggle.com/code/chayan8/sentiment-analysis-using-bert-pytorch