In [186]:
# dataset url: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download

In [187]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam

In [188]:
# read in the data
df = pd.read_csv("data/IMDB-Dataset.csv")
print(df.head())
print(f"\nNumber of reviews: {len(df.index)}")

# remove duplicate rows (the same review text can appear more than once in the CSV)
df.drop_duplicates(subset=['review'], inplace=True)
print(f"Number of unique reviews: {len(df.index)}\n")

# check the distribution of the labels
print(df.sentiment.value_counts())

# get baseline: the accuracy obtained by always predicting the most frequent label
print(f"\nOur baseline accuracy is : {df.sentiment.value_counts(normalize=True).max():.6f}")


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Number of reviews: 50000
Number of unqiue reviews: 49582

positive    24884
negative    24698
Name: sentiment, dtype: int64

Our baseline accuracy is : 0.501876


In [189]:
# convert labels to 0 and 1
# Only map while the column still holds the original string labels: Series.map
# turns every value that is missing from the dict into NaN, so re-running this
# cell on already-converted 0/1 labels would otherwise wipe out the targets.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [190]:
# build dataset class for our movie dataset
from transformers import DistilBertTokenizerFast
from torch.utils.data import Dataset

class MovieDataset(Dataset):
    """Map-style dataset that tokenizes one movie review per lookup.

    Args:
        dataframe: frame exposing a ``review`` column (text) and a
            ``sentiment`` column (label).
        max_len: value forwarded to the tokenizer as ``max_length``.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``targets``.
    """

    def __init__(self, dataframe, max_len=512):
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        self.max_len = max_len
        self.data = dataframe
        # Plain arrays are indexed by position, so the frame's own index
        # labels play no part in item lookup.
        self.targets = dataframe.sentiment.to_numpy()
        self.reviews = dataframe.review.to_numpy()

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, item):
        """Tokenize the review at position ``item`` and pair it with its label."""
        text = str(self.reviews[item])
        label = self.targets[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tokenizer is asked for 'pt' tensors for a single text; drop the
        # leading batch axis so a DataLoader can stack the samples itself.
        sample = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'targets': torch.tensor(label),
        }
        return sample


In [191]:
from transformers import DistilBertModel, DistilBertForSequenceClassification

# create custom model
class DistilBertClassifier(nn.Module):
    """DistilBERT encoder with a small feed-forward head for binary classification.

    The head is ``Linear(768, 768) -> ReLU -> Linear(768, 1) -> Sigmoid`` and is
    applied to the hidden state of the first token of every sequence.
    """

    def __init__(self):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # 768 matches the input width of the linear layers below and is assumed to
        # be the hidden size of 'distilbert-base-uncased'.
        self.pre_classifier = nn.Linear(768, 768)
        # Without a non-linearity the two linear layers collapse into a single
        # affine map, so the extra layer would add parameters but no capacity.
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per sequence.

        Args:
            input_ids: token ids, shape ``(batch, seq_len)``.
            attention_mask: mask of the same shape as ``input_ids``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        encoder_outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBertModel has no pooling layer: its first (and, by default, only)
        # output is the per-token hidden states, so unpacking it into two values
        # fails. Index 0 works for both tuple and ModelOutput return types.
        hidden_states = encoder_outputs[0]
        # Use the first token's hidden state as the sequence summary.
        sequence_summary = hidden_states[:, 0]
        features = self.activation(self.pre_classifier(sequence_summary))
        return self.sigmoid(self.classifier(features))


In [192]:
# training loop
def train(model, train_data, val_data, learning_rate, epochs,
          batch_size=16, device="cpu", max_train_batches=None):
    """Fine-tune ``model`` as a binary classifier and print per-epoch metrics.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that returns
            one probability in ``[0, 1]`` per sample (shape ``(batch, 1)`` or
            ``(batch,)``).
        train_data: dataframe wrapped in a ``MovieDataset`` for training.
        val_data: dataframe wrapped in a ``MovieDataset`` for validation.
        learning_rate: learning rate handed to Adam.
        epochs: number of passes over the training data.
        batch_size: samples per batch for both data loaders.
        device: anything accepted by ``torch.device``; the model and every batch
            are moved there. Defaults to the CPU.
        max_train_batches: optional cap on the number of training batches per
            epoch, useful for a quick smoke test. ``None`` uses every batch.
    """
    print("Training started...")

    train_dataset, val_dataset = MovieDataset(train_data), MovieDataset(val_data)

    print(f"Training on {len(train_dataset)} samples")

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Validation metrics do not depend on the order of the samples, so no shuffling.
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    print(f"Training on {len(train_dataloader)} batches")

    device = torch.device(device)
    model = model.to(device)

    # The model emits a single probability per sample, so the matching loss is
    # binary cross-entropy on float targets. CrossEntropyLoss would treat the one
    # output column as a 1-class problem and reject the label 1 as out of range.
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    def _forward(batch):
        """Run one batch; return (loss tensor, correct predictions, batch size)."""
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device).float()

        probabilities = model(input_ids, attention_mask).reshape(-1)
        loss = criterion(probabilities, targets)
        correct = ((probabilities >= 0.5).float() == targets).sum().item()
        return loss, correct, targets.numel()

    for epoch in range(epochs):
        model.train()
        print("Starting epoch {}".format(epoch))
        train_loss, train_correct, train_seen, train_batches = 0.0, 0, 0, 0
        for step, batch in enumerate(train_dataloader):
            if max_train_batches is not None and step >= max_train_batches:
                break

            optimizer.zero_grad()  # clear gradients
            loss, correct, count = _forward(batch)  # forward pass + loss
            loss.backward()  # backward pass
            optimizer.step()  # update weights

            train_loss += loss.item()
            train_correct += correct
            train_seen += count
            train_batches += 1

        val_loss, val_correct, val_seen, val_batches = 0.0, 0, 0, 0
        model.eval()
        with torch.no_grad():
            for batch in val_dataloader:
                loss, correct, count = _forward(batch)
                val_loss += loss.item()
                val_correct += correct
                val_seen += count
                val_batches += 1

        # Average over the batches actually processed; max(..., 1) keeps an empty
        # loader from raising ZeroDivisionError.
        print(f"Epoch: {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss/max(train_batches, 1)}")
        print(f"Train Acc: {train_correct/max(train_seen, 1)}")
        print(f"Val Loss: {val_loss/max(val_batches, 1)}")
        print(f"Val Acc: {val_correct/max(val_seen, 1)}")

In [193]:
# split data into train and validation sets
from sklearn.model_selection import train_test_split
# 80/20 hold-out split; the fixed random_state keeps the partition reproducible.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [194]:
# Hyperparameters for the fine-tuning run.
# NOTE(review): 1e-6 looks very small for fine-tuning with Adam — confirm it is intentional.
EPOCHS, LR = 5, 1e-6

# Build a fresh classifier and fine-tune it on the train/validation split.
model = DistilBertClassifier()
train(model, train_data=df_train, val_data=df_val, learning_rate=LR, epochs=EPOCHS)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training started...
Training on 39665 samples
Training on 2480 batches
Starting epoch 0
