In [1]:
!pip install transformers
import pandas as pd
import datetime
import torch
import sys
import numpy as np
import random
import time
from tqdm import tqdm
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
#Upload IMDB Dataset
df = pd.read_csv('IMDB_Dataset.csv', header = 0, sep = ",")
# df = df.head(100)

In [3]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [4]:
df.shape
df.info()

#Replace sentiment with 0 and 1
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

#Clean reviews
import re
clean = re.compile('<.*?>')
re.sub(clean,'',df.iloc[1].review)
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'',text)
df['review'] = df['review'].apply(clean_html)

#Convert chars to lowercase (RoBERTa Tokenizer handles this)
# def convert_lower(text):
#     return text.lower()
# df['review'] = df['review'].apply(convert_lower)

#Remove special characters (non-alphanumeric)
def remove_special(text):
    x=''
    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x=x+' '
    return x
df['review'] = df['review'].apply(remove_special)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     100 non-null    object
 1   sentiment  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there s a family where a little boy ...,0
4,Petter Mattei s Love in the Time of Money is...,1


In [5]:
# max_length = df['review'].apply(lambda x: len(x.split())).max()
# print(max_length)

In [6]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

#Remove stopwords
def remove_stopwords(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y
df['review'] = df['review'].apply(remove_stopwords)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saniyanangia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment
0,"[One, reviewers, mentioned, watching, 1, Oz, e...",1
1,"[A, wonderful, little, production, The, filmin...",1
2,"[I, thought, wonderful, way, spend, time, hot,...",1
3,"[Basically, family, little, boy, Jake, thinks,...",0
4,"[Petter, Mattei, Love, Time, Money, visually, ...",1


In [7]:
#RoBERTa Model
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

max_len = 300
batch_size = 8
epochs = 5
learning_rate = 5e-6
seed = 42
# model_type = "cardiffnlp/twitter-roberta-base-sentiment"
model_type = "roberta-base"
random.seed(seed)

In [8]:
reviews = df["review"]
sentiments = df["sentiment"]

input_ids = []
attention_masks = []

#Tokenize the text
tokenizer = RobertaTokenizer.from_pretrained(model_type, do_lower_case=True)

for review in reviews:
    encoded_dict = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",
        truncation=True
    )
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(sentiments)

# print(input_ids[:10])

In [9]:
data = list(zip(input_ids, attention_masks, labels))

#Separate data into input features and labels
X = [(ids, masks) for ids, masks, label in data]
y = [label for ids, masks, label in data]

#Perform initial train-test split to get a combined train and test set
train_input_ids, test_input_ids, train_attention_masks, test_attention_masks, train_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42, stratify=labels
)

#Perform second split to get training and validation sets from the combined train set
train_input_ids, val_input_ids, train_attention_masks, val_attention_masks, train_labels, val_labels = train_test_split(
    train_input_ids, train_attention_masks, train_labels, test_size=0.25, random_state=42, stratify=train_labels
)

train_dataset = TensorDataset(
    torch.tensor(train_input_ids),
    torch.tensor(train_attention_masks),
    torch.tensor(train_labels)
)

valid_dataset = TensorDataset(
    torch.tensor(val_input_ids),
    torch.tensor(val_attention_masks),
    torch.tensor(val_labels)
)

test_dataset = TensorDataset(
    torch.tensor(test_input_ids),
    torch.tensor(test_attention_masks),
    torch.tensor(test_labels)
)


  torch.tensor(train_input_ids),
  torch.tensor(train_attention_masks),
  torch.tensor(train_labels)
  torch.tensor(val_input_ids),
  torch.tensor(val_attention_masks),
  torch.tensor(val_labels)
  torch.tensor(test_input_ids),
  torch.tensor(test_attention_masks),
  torch.tensor(test_labels)


In [10]:
#Dataloaders
train_dataloader = DataLoader(
    train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size
)

valid_dataloader = DataLoader(
    valid_dataset, sampler=SequentialSampler(valid_dataset), batch_size=batch_size
)

test_dataloader = DataLoader(
    test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size
)

In [11]:
#Print sample input
for i in range(5):
    sample = train_dataset[i]
    input_ids, attention_masks, labels = sample
    print(f"Input IDs: {input_ids}, Attention Masks: {attention_masks}, Labels: {labels}")

Input IDs: tensor([    0, 33082, 22760, 11584,     3, 37745,  2977, 26355,  3583,  8773,
            3, 23375, 44590, 23167,  5881,     3, 35621,  8773,     3,  1264,
            3, 23375,     3,     3,     3, 22760, 27367,  8773,     3, 12338,
         9408,     3, 28084,     3, 17693, 12005, 19827,   100,  7048, 12891,
         4162, 22760, 20042, 27066,   133, 31233,     3,  7078,     3,     3,
         2716,     3, 18891,  8396, 19746, 33456,  9502, 22760,     2,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,    

In [12]:
#Get labels from each dataset
train_labels = [sample[-1] for sample in train_dataset]
valid_labels = [sample[-1] for sample in valid_dataset]
test_labels = [sample[-1] for sample in test_dataset]

#Count occurrences of each label
train_label_counts = {0: train_labels.count(0), 1: train_labels.count(1)}
valid_label_counts = {0: valid_labels.count(0), 1: valid_labels.count(1)}
test_label_counts = {0: test_labels.count(0), 1: test_labels.count(1)}

#Print counts
print("Training Label Counts:", train_label_counts)
print("Validation Label Counts:", valid_label_counts)
print("Test Label Counts:", test_label_counts)

Training Label Counts: {0: 34, 1: 26}
Validation Label Counts: {0: 12, 1: 8}
Test Label Counts: {0: 12, 1: 8}


In [13]:
#Load the model
model = RobertaForSequenceClassification.from_pretrained(model_type, num_labels = 2, ignore_mismatched_sizes=True)
optimizer = AdamW(model.parameters(), lr = learning_rate)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
#Test model with finetuning
model.eval()
predictions, true_labels = [], []

for batch in tqdm(test_dataloader, position=0, file=sys.stdout, leave=True):
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(
            b_input_ids, token_type_ids=None, attention_mask=b_input_mask
        )
    
    logits = outputs[0]
    probabilities = torch.softmax(logits, dim=1)

    # print("Logits:", logits)
    # print("Probabilities:", probabilities)
    
    probabilities = probabilities.cpu().detach().numpy()
    label_ids = b_labels.cpu().detach().numpy()
    
    predictions.append(probabilities)
    true_labels.append(label_ids)

# Print intermediate outputs
# print("Probabilities:", predictions)
# print("True Labels:", true_labels)

threshold = 0.5
binary_predictions = [[1 if prob[1] > threshold else 0 for prob in batch] for batch in predictions]

# Print binary predictions
# print("Binary Predictions:", binary_predictions)

100%|██████████| 3/3 [00:08<00:00,  2.89s/it]


In [15]:
#Model performance (without finetuning)
true_labels_flat = [label for sublist in true_labels for label in sublist]
predicted_labels_flat = [label for sublist in binary_predictions for label in sublist]

unique_labels = set(true_labels_flat)

for label in unique_labels:
    true_labels_label = [1 if l == label else 0 for l in true_labels_flat]
    predicted_labels_label = [1 if l == label else 0 for l in predicted_labels_flat]

    accuracy_label = accuracy_score(true_labels_label, predicted_labels_label)
    precision_label = precision_score(true_labels_label, predicted_labels_label)
    recall_label = recall_score(true_labels_label, predicted_labels_label)
    f1_label = f1_score(true_labels_label, predicted_labels_label)

    print(f"\nLabel {label}:")
    print(f"  Accuracy: {accuracy_label:.4f}")
    print(f"  Precision: {precision_label:.4f}")
    print(f"  Recall: {recall_label:.4f}")
    print(f"  F1 Score: {f1_label:.4f}")

#Overall accuracy
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
print(f"\nOverall Accuracy: {accuracy}")

#Generate confusion matrix
conf_matrix = confusion_matrix(true_labels_flat, predicted_labels_flat)
print("\nConfusion Matrix:")
print(conf_matrix)


Label 0:
  Accuracy: 0.4000
  Precision: 0.0000
  Recall: 0.0000
  F1 Score: 0.0000

Label 1:
  Accuracy: 0.4000
  Precision: 0.4000
  Recall: 1.0000
  F1 Score: 0.5714

Overall Accuracy: 0.4

Confusion Matrix:
[[ 0 12]
 [ 0  8]]


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
#Train the model
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    print(f"Epoch: {epoch + 1}")
    model.train()
    epoch_losses = []
    
    for batch in tqdm(train_dataloader, position=0, file=sys.stdout, leave=True):
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[0]
        loss = outputs.loss
        
        if loss is not None:
            epoch_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    avg_train_loss = sum(epoch_losses) / len(epoch_losses)
    print(f"Average training loss: {avg_train_loss:.4f}")

    #Validate the model
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(valid_dataloader, position=0, file=sys.stdout, leave=True):
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(
                b_input_ids, token_type_ids=None, attention_mask=b_input_mask
            )
        
        logits = outputs[0]
        probabilities = torch.softmax(logits, dim=1)
        
        # print("Logits:", logits)
        # print("Probabilities:", probabilities)

        probabilities = probabilities.detach().numpy()
        label_ids = b_labels.detach().numpy()
        
        predictions.append(probabilities)
        true_labels.append(label_ids)

    # Print intermediate outputs
    # print("Probabilities:", predictions)
    # print("True Labels:", true_labels)

    threshold = 0.5
    binary_predictions = [[1 if prob[1] > threshold else 0 for prob in batch] for batch in predictions]

    true_labels_flat = [label for sublist in true_labels for label in sublist]
    predicted_labels_flat = [label for sublist in binary_predictions for label in sublist]

    # Print binary predictions
    # print("Binary Predictions:", binary_predictions)
    print("Validation Accuracy: ", accuracy_score(true_labels_flat, predicted_labels_flat))


Epoch: 1
100%|██████████| 8/8 [02:14<00:00, 16.75s/it]
Average training loss: 0.6883
100%|██████████| 3/3 [00:07<00:00,  2.54s/it]
Validation Accuracy:  0.5
Epoch: 2
100%|██████████| 8/8 [01:51<00:00, 13.88s/it]
Average training loss: 0.6837
100%|██████████| 3/3 [00:06<00:00,  2.10s/it]
Validation Accuracy:  0.6
Epoch: 3
100%|██████████| 8/8 [02:01<00:00, 15.18s/it]
Average training loss: 0.6765
100%|██████████| 3/3 [00:06<00:00,  2.24s/it]
Validation Accuracy:  0.55
Epoch: 4
100%|██████████| 8/8 [01:51<00:00, 13.91s/it]
Average training loss: 0.6654
100%|██████████| 3/3 [00:06<00:00,  2.12s/it]
Validation Accuracy:  0.55
Epoch: 5
100%|██████████| 8/8 [01:41<00:00, 12.66s/it]
Average training loss: 0.6395
100%|██████████| 3/3 [00:06<00:00,  2.16s/it]
Validation Accuracy:  0.65


In [19]:
#Test the model's performance on test dataset
model.eval()
predictions, true_labels = [], []

for batch in tqdm(test_dataloader, position=0, file=sys.stdout, leave=True):
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    logits = outputs[0]
    probabilities = torch.softmax(logits, dim=1)
    
    probabilities = probabilities.detach().numpy()
    label_ids = b_labels.detach().numpy()
    
    predictions.append(probabilities)
    true_labels.append(label_ids)

# Print intermediate outputs
# print("Probabilities:", predictions)
# print("True Labels:", true_labels)

threshold = 0.5
binary_predictions = [[1 if prob[1] > threshold else 0 for prob in batch] for batch in predictions]

# Print binary predictions
# print("Binary Predictions:", binary_predictions)

100%|██████████| 3/3 [00:07<00:00,  2.41s/it]


In [20]:
#Model performance (with finetuning)
true_labels_flat = [label for sublist in true_labels for label in sublist]
predicted_labels_flat = [label for sublist in binary_predictions for label in sublist]

unique_labels = set(true_labels_flat)

for label in unique_labels:
    true_labels_label = [1 if l == label else 0 for l in true_labels_flat]
    predicted_labels_label = [1 if l == label else 0 for l in predicted_labels_flat]

    accuracy_label = accuracy_score(true_labels_label, predicted_labels_label)
    precision_label = precision_score(true_labels_label, predicted_labels_label)
    recall_label = recall_score(true_labels_label, predicted_labels_label)
    f1_label = f1_score(true_labels_label, predicted_labels_label)

    print(f"\nLabel {label}:")
    print(f"  Accuracy: {accuracy_label:.4f}")
    print(f"  Precision: {precision_label:.4f}")
    print(f"  Recall: {recall_label:.4f}")
    print(f"  F1 Score: {f1_label:.4f}")

#Overall accuracy
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
print(f"\nOverall Accuracy: {accuracy}")

#Generate confusion matrix
conf_matrix = confusion_matrix(true_labels_flat, predicted_labels_flat)
print("\nConfusion Matrix:")
print(conf_matrix)


Label 0:
  Accuracy: 0.6500
  Precision: 0.6471
  Recall: 0.9167
  F1 Score: 0.7586

Label 1:
  Accuracy: 0.6500
  Precision: 0.6667
  Recall: 0.2500
  F1 Score: 0.3636

Overall Accuracy: 0.65

Confusion Matrix:
[[11  1]
 [ 6  2]]
