In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from collections import Counter
import re
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
import torch.optim as optim
import wandb
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


**Download necessary NLTK Packages**

nltk.download('stopwords')
nltk.download('punkt')

In [3]:
dataset = load_dataset("stanfordnlp/imdb")

In [4]:
df = pd.DataFrame(dataset['train'])

In [5]:
df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [6]:
df.shape

(25000, 2)

## Datasets and Dataloader

In [7]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def clean_n_remove_stop(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    word_tokens = word_tokenize(text)
    filtered_sentence = [] 
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence

    

In [8]:
df['tokens'] = df['text'].apply(clean_n_remove_stop)
df = df[df['tokens'].map(len).between(100, 500)]

In [9]:
df['tokens'] = df['tokens'].apply(lambda x: [stemmer.stem(token) for token in x])

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10912 entries, 0 to 24998
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10912 non-null  object
 1   label   10912 non-null  int64 
 2   tokens  10912 non-null  object
dtypes: int64(1), object(2)
memory usage: 341.0+ KB


In [11]:
df.shape

(10912, 3)

In [12]:
df.label.unique()

array([0, 1])

In [13]:
word_counts = Counter([word for tokens in df['tokens'] for word in tokens])
vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common())}
df['indexed_tokens'] = df['tokens'].apply(lambda x: [vocab[token] for token in x])

In [78]:
len(vocab)

70598

In [14]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [15]:
type(train_df['tokens'].to_numpy())

numpy.ndarray

In [37]:
class IMDBDataset(Dataset):
    def __init__(self, reviews, sentiments):
        self.reviews = reviews
        self.sentiments = sentiments

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return torch.tensor(self.reviews[idx]), torch.tensor(self.sentiments[idx])

def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    xx_pad = torch.nn.utils.rnn.pad_sequence(xx, batch_first=True, padding_value=0)
    return xx_pad, torch.tensor(yy), x_lens

In [38]:
train_dataset = IMDBDataset(train_df['indexed_tokens'].to_numpy(), train_df['label'].to_numpy())
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_collate)
val_dataset = IMDBDataset(val_df['indexed_tokens'].to_numpy(), val_df['label'].to_numpy())
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True, collate_fn=pad_collate)
test_dataset = IMDBDataset(test_df['indexed_tokens'].to_numpy(), test_df['label'].to_numpy())
test_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=pad_collate)

In [44]:
len(train_dataset)

7856

In [45]:
a,b = train_dataset[0]

In [46]:
b

tensor(1)

In [61]:
a

tensor([  669,     2,  5063,  3199,   340,   996,  2145,    87,  3274,  5063,
         5174, 63736, 25051, 15380, 14280,  6554,  1262,  6241,  1942,   443,
           21,  3214, 25052, 30528,  2086,  7358,  1166,    86,    42,  2386,
        63737,    23,  2050,   116,   714,  1166,  5063,    15,  2348,     1,
          755,  9435,   626, 32011,  2050, 63738, 12624,   698,    27,   482,
          356,    87,  3274,  5063,   187,     3,   397,    53,  4833,  1156,
          160,   118,    89,   784,   957,   392,    87,  3274,   974,   274,
          103,  2342,  3185, 63739,  6144,   694,  1363,  2514,  1225,   302,
         8271,     1,    13,     2,  5063,  1021, 63740,  1572,   944,  1956,
          499,  2742, 63741,    87,  1082,  1094,   580,  2454,  1257,  5285,
         1207,   905,   838, 32342,  2266,    42,  5063,    43,  2651,   241,
        11803, 32343, 12928, 63742, 13175,   765, 11855,  8551, 63743,  5174,
        32343,   698,   360,   614,  1166, 12442,   522,     1, 

##  RNN and LSTM Models

In [114]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.0,output_strategy='last'):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.output_strategy = output_strategy
        # if dropout:
        #     self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.rnn(packed_embedded)
        if self.output_strategy == 'last':
            #print(hidden.squeeze(0).shape)
            out = self.fc(hidden.squeeze(0))[-1]
        elif self.output_strategy == 'mean':
            output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
            out = self.fc(output.mean(dim=1))
        return out

class LSTMModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.2, output_strategy='last'):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.output_strategy = output_strategy
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, _) = self.lstm(packed_embedded)
        if self.output_strategy == 'last':
            out = self.fc(hidden.squeeze(0))[-1]
        elif self.output_strategy == 'mean':
            output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
            out = self.fc(output.mean(dim=1))
        return out


**Training with Binary Cross Entropy Loss**

In [63]:
def train_bce(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        total = 0

        for inputs, labels, lengths in train_loader:
            inputs, labels = inputs.to(device), labels.to(device, dtype=torch.float)
            optimizer.zero_grad()
            outputs = model(inputs, lengths).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            
            optimizer.step()

            train_loss += loss.item()
            predicted = torch.round(torch.sigmoid(outputs))
            train_correct += (predicted == labels).sum().item()
            total += labels.size(0)

        train_accuracy = 100 * train_correct / total
        train_loss /= len(train_loader)

        val_loss, val_accuracy = evaluate_bce(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

        wandb.log({"epoch": epoch+1, "train_loss": train_loss, "train_accuracy": train_accuracy,
                   "val_loss": val_loss, "val_accuracy": val_accuracy})

def evaluate_bce(model, dataloader, criterion):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    total_loss = 0.0
    total_correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels, lengths in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, lengths).squeeze(1)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            predicted = torch.round(torch.sigmoid(outputs))
            total_correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = 100 * total_correct / total
    total_loss /= len(dataloader)
    return total_loss, accuracy


In [64]:
# model = RNNModel(len(vocab) + 1, config.embedding_dim, config.hidden_dim, config.output_dim, config.num_layers, config.dropout)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

# # Train the model
# train(model, train_loader, val_loader, criterion, optimizer, num_epochs=config.num_epochs)

# # Evaluate on test set
# test_loss, test_accuracy = evaluate(model, test_loader, criterion)
# print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
# wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})


**Training with Cross Entropy**
With experimentations we note that both the losses perform equally, hence we stick with Cross Entropy

In [117]:
def train_ce(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        total = 0

        for inputs, labels, lengths in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, lengths).softmax(dim=1)
            #print(outputs.shape)
            loss = criterion(outputs, labels)
            loss.backward()
            
            optimizer.step()

            train_loss += loss.item()
            #predicted = torch.round(torch.sigmoid(outputs))
            _, predicted = torch.max(outputs, 1)
            train_correct += (predicted == labels).sum().item()
            total += labels.size(0)

        train_accuracy = 100 * train_correct / total
        train_loss /= len(train_loader)

        val_loss, val_accuracy = evaluate_ce(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

        wandb.log({"epoch": epoch+1, "train_loss": train_loss, "train_accuracy": train_accuracy,
                   "val_loss": val_loss, "val_accuracy": val_accuracy})

def evaluate_ce(model, dataloader, criterion):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #device = torch.device("cpu")
    total_loss = 0.0
    total_correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels, lengths in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, lengths)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = 100 * total_correct / total
    total_loss /= len(dataloader)
    return total_loss, accuracy


**In the following experiments we note that taking the mean of all the intermediate outputs greatly improves the performance of both LSTM and RNN**

In [None]:
# wandb.init(project="Imdb_rnn_classification")
# config = wandb.config
# config.embedding_dim = 100
# config.hidden_dim = 200
# config.output_dim = 2
# config.num_layers = 2
# config.dropout = 0.1
# config.learning_rate = 0.001
# config.num_epochs = 20
# config.batch_size = 64
# config.output_strategy = 'mean'

After different hyperparameter tuning we find the below setup to be apt for our use case.

In [97]:
wandb.init(project="Imdb_rnn_classification")
config = wandb.config
config.embedding_dim = 128
config.hidden_dim = 256
config.output_dim = 2
config.num_layers = 3
config.dropout = 0.2
config.learning_rate = 0.001
config.num_epochs = 20
config.batch_size = 64
config.output_strategy = 'mean'

0,1
epoch,▁▁▂▂▁▂▂▁▂▂▁▂▂▁▁▂▁▁▂▂▁▂▂▃▃▄▄▄▅▅▆▆▇▇██▁▂▂▁
test_accuracy,█▃▃▂▁▆▇
test_loss,▁▁█▅▂▁▁
train_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅██▁
train_loss,████████████████████████████████████▅▁▁█
val_accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇██▁
val_loss,▃▃▃▃▃▃▃▄▄▆▃▃▅▃▃▃▃▃▃▃▄▃▃▄▄▄▅▄▄▄▄▄▅▄▃▄▁▄█▃

0,1
epoch,8.0
test_accuracy,51.0947
test_loss,0.69143
train_accuracy,49.46538
train_loss,0.69327
val_accuracy,50.63001
val_loss,0.69282


In [115]:
model_rnn_last = RNNModel(len(vocab) + 1, config.embedding_dim, config.hidden_dim, config.output_dim, config.num_layers, config.dropout)
criterion = nn.CrossEntropyLoss()
optimizer_rnn_last = optim.Adam(model_rnn_last.parameters(), lr=config.learning_rate)


In [118]:
train_ce(model_rnn_last, train_loader, val_loader, criterion, optimizer_rnn_last, num_epochs=config.num_epochs)

# Evaluate on test set
test_loss, test_accuracy = evaluate_ce(model_rnn, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})

Epoch 1/20, Train Loss: 0.6864, Train Accuracy: 56.58%, Val Loss: 0.6809, Val Accuracy: 57.16%
Epoch 2/20, Train Loss: 0.6635, Train Accuracy: 61.25%, Val Loss: 0.6895, Val Accuracy: 55.78%
Epoch 3/20, Train Loss: 0.6880, Train Accuracy: 56.92%, Val Loss: 1.4029, Val Accuracy: 53.49%
Epoch 4/20, Train Loss: 0.6543, Train Accuracy: 62.89%, Val Loss: 0.7608, Val Accuracy: 60.82%
Epoch 5/20, Train Loss: 0.5987, Train Accuracy: 70.02%, Val Loss: 1.0922, Val Accuracy: 67.01%
Epoch 6/20, Train Loss: 0.6365, Train Accuracy: 63.81%, Val Loss: 1.0668, Val Accuracy: 57.73%
Epoch 7/20, Train Loss: 0.6961, Train Accuracy: 52.46%, Val Loss: 0.7744, Val Accuracy: 52.35%
Epoch 8/20, Train Loss: 0.6942, Train Accuracy: 52.30%, Val Loss: 0.7540, Val Accuracy: 52.92%
Epoch 9/20, Train Loss: 0.6878, Train Accuracy: 53.88%, Val Loss: 0.7917, Val Accuracy: 47.31%
Epoch 10/20, Train Loss: 0.6891, Train Accuracy: 53.17%, Val Loss: 0.7398, Val Accuracy: 56.59%
Epoch 11/20, Train Loss: 0.6639, Train Accuracy: 

In [119]:
model_rnn_mean = RNNModel(len(vocab) + 1, config.embedding_dim, config.hidden_dim, config.output_dim, config.num_layers, config.dropout,config.output_strategy)
optimizer_rnn_mean = optim.Adam(model_rnn_mean.parameters(), lr=config.learning_rate)


In [120]:
# Train the model
train_ce(model_rnn_mean, train_loader, val_loader, criterion, optimizer_rnn_mean, num_epochs=config.num_epochs)

# Evaluate on test set
test_loss, test_accuracy = evaluate_ce(model_rnn_mean, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})


Epoch 1/20, Train Loss: 0.6206, Train Accuracy: 66.26%, Val Loss: 0.5839, Val Accuracy: 72.28%
Epoch 2/20, Train Loss: 0.5272, Train Accuracy: 77.48%, Val Loss: 0.7714, Val Accuracy: 75.49%
Epoch 3/20, Train Loss: 0.4680, Train Accuracy: 83.99%, Val Loss: 0.5309, Val Accuracy: 79.61%
Epoch 4/20, Train Loss: 0.4289, Train Accuracy: 88.23%, Val Loss: 0.7682, Val Accuracy: 79.84%
Epoch 5/20, Train Loss: 0.3985, Train Accuracy: 91.48%, Val Loss: 0.9484, Val Accuracy: 80.07%
Epoch 6/20, Train Loss: 0.3783, Train Accuracy: 93.65%, Val Loss: 0.9752, Val Accuracy: 80.99%
Epoch 7/20, Train Loss: 0.3688, Train Accuracy: 94.59%, Val Loss: 1.2428, Val Accuracy: 80.76%
Epoch 8/20, Train Loss: 0.3615, Train Accuracy: 95.35%, Val Loss: 1.4530, Val Accuracy: 80.87%
Epoch 9/20, Train Loss: 0.3574, Train Accuracy: 95.75%, Val Loss: 1.5294, Val Accuracy: 80.64%
Epoch 10/20, Train Loss: 0.3551, Train Accuracy: 95.93%, Val Loss: 1.1137, Val Accuracy: 80.76%
Epoch 11/20, Train Loss: 0.3514, Train Accuracy: 

## LSTM

In [121]:
wandb.init(project="Imdb_LSTM_classification")
config = wandb.config
config.embedding_dim = 128
config.hidden_dim = 256
config.output_dim = 2
config.num_layers = 3
config.dropout = 0.2
config.learning_rate = 0.001
config.num_epochs = 20
config.batch_size = 64
config.output_strategy = 'mean'

0,1
epoch,▁▁▂▂▃▄▄▅▅▆▇▇██▁▂▂▃▄▄▅▅▆▆▇▇█▁▂▂▃▃▄▄▅▅▆▇▇█
test_accuracy,▁▁▁
test_loss,█▁▁
train_accuracy,▂▁▂▁▄▆▇▇██████▂▂▄▃▁▁▂▃▄▅▅▅▆▃▆▇▇▇██▅▅▇▇██
train_loss,████▆▄▃▂▁▁▁▁▁▁▇█▆▇██▇▇▆▅▅▄▄▇▄▃▂▂▁▁▄▅▂▂▁▁
val_accuracy,▃▃▂▂▆▇████████▃▂▅▃▂▁▃▄▄▅▅▅▄▆▇▇████▆▇▇█▇█
val_loss,▅▂▃▂▂▁▂▃▃▃▃▃▄▄▂▆▄▄▃▃▃▃▄▆▇▆█▂▁▃▄▅▇▄▂▂▁▃▆▄

0,1
epoch,20.0
test_accuracy,98.0779
test_loss,0.21643
train_accuracy,97.11049
train_loss,0.34319
val_accuracy,83.04696
val_loss,1.05965


In [122]:
model_lstm = LSTMModel(len(vocab) + 1, config.embedding_dim, config.hidden_dim, config.output_dim, config.num_layers, config.dropout,config.output_strategy)
criterion = nn.CrossEntropyLoss()
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=config.learning_rate)


In [123]:
# Train the model
train_ce(model_lstm, train_loader, val_loader, criterion, optimizer_lstm, num_epochs=config.num_epochs)

# Evaluate on test set
test_loss, test_accuracy = evaluate_ce(model_lstm, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")
wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})


Epoch 1/20, Train Loss: 0.6164, Train Accuracy: 66.41%, Val Loss: 0.6079, Val Accuracy: 74.11%
Epoch 2/20, Train Loss: 0.5003, Train Accuracy: 80.60%, Val Loss: 0.6832, Val Accuracy: 78.69%
Epoch 3/20, Train Loss: 0.4427, Train Accuracy: 86.77%, Val Loss: 0.6498, Val Accuracy: 82.36%
Epoch 4/20, Train Loss: 0.4058, Train Accuracy: 90.85%, Val Loss: 0.8607, Val Accuracy: 82.13%
Epoch 5/20, Train Loss: 0.3853, Train Accuracy: 92.82%, Val Loss: 0.8301, Val Accuracy: 82.13%
Epoch 6/20, Train Loss: 0.3693, Train Accuracy: 94.53%, Val Loss: 1.0716, Val Accuracy: 81.33%
Epoch 7/20, Train Loss: 0.3540, Train Accuracy: 96.07%, Val Loss: 1.1751, Val Accuracy: 80.64%
Epoch 8/20, Train Loss: 0.3523, Train Accuracy: 96.32%, Val Loss: 1.1727, Val Accuracy: 82.59%
Epoch 9/20, Train Loss: 0.3468, Train Accuracy: 96.74%, Val Loss: 1.0950, Val Accuracy: 82.25%
Epoch 10/20, Train Loss: 0.3427, Train Accuracy: 97.19%, Val Loss: 1.3791, Val Accuracy: 80.30%
Epoch 11/20, Train Loss: 0.3418, Train Accuracy: 

**We find that LSTMs beat RNNs due to their ability to recall long contexts which is essential in use cases like sentiment analysis.**