# Shashank Suroju
# 19CS10061

## Importing Libraries

In [14]:
import os
import sys
import shutil
from tqdm import tqdm

import numpy as np 
import pandas as pd 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import re
import string
import gensim
from nltk.corpus import stopwords 
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

## Initializing Constants

In [15]:
df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv") 

classes_dict = {
    "negative" : 0,
    "positive" : 1
}

df["sentiment"] = df["sentiment"].map(lambda x : classes_dict[x])
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [16]:
config = dict(
        max_seq_len = 500,
        embedding_dim = 64,
        output_dim = 1,
        hidden_dim = 256,
        no_layers = 2,
        batch_size = 32,
        epochs = 40,
        num_classes = 2,
        seed = 2022,
        learning_rate = 0.0001,
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    )

## Data Splitting

In [17]:
X,y = df['review'].values, df['sentiment'].values

X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.8,stratify=y,random_state=config["seed"])
X_val,X_test,y_val,y_test = train_test_split(X_val,y_val,train_size=0.5,stratify=y_val,random_state=config["seed"])

print(f"Number of Samples in Training data : {len(X_train)}")
print(f"Number of Samples in Validation data : {len(X_val)}")
print(f"Number of Samples in Training data : {len(X_test)}")

Number of Samples in Training data : 40000
Number of Samples in Validation data : 5000
Number of Samples in Training data : 5000


## Tokenization

In [18]:
SOS="<s>"
EOS="</s>"
UNK="<UNK>"

def add_sentence_tokens(sentences):
    return ['{} {} {}'.format(SOS, s, EOS) for s in sentences]

def replace_singletons(tokens):
    vocab = Counter([token for token in tokens])
    return [token if vocab[token] > 1 else UNK for token in tokens]

def preprocess_word(s):
    if s not in [SOS, EOS, UNK]:
        s = re.sub(r"[^\w\s]", '', s) # Remove all non-word characters (everything except numbers and letters)
        s = re.sub(r"\s+", '', s) # Replace all runs of whitespaces with no space
        s = re.sub(r"\d", '', s) # replace digits with no space
        s = s.lower()
    return s

def preprocess_reviews(sentences):
    sentences = add_sentence_tokens(sentences)
    tokens = ' '.join(sentences).split(' ')
    tokens = replace_singletons(tokens)

    ans=[]
    temp=[]
    
    stop_words = set(stopwords.words('english')) 
    
    for token in tokens:
        token = preprocess_word(token)
        #if token not in stop_words:
        temp.append(token)
        if token == EOS:
            ans.append(temp)
            temp=[]
    return ans


train_strings = preprocess_reviews(X_train)
#train_strings = [" ".join(i) for i in train_strings]
word_model = gensim.models.Word2Vec(train_strings, size=config["embedding_dim"], min_count=1, window=5)
pretrained_weights = word_model.wv.vectors
vocab_size, emdedding_size = pretrained_weights.shape

config["vocab_size"] = vocab_size
print(f"Vocabulary Size: {vocab_size}")

def word2idx(word):
    try:
        return word_model.wv.vocab[word].index
    except:
        return word_model.wv.vocab[UNK].index
    
def idx2word(idx):
    return word_model.wv.index2word[idx]

def tokenize(reviews):
    tokens = preprocess_reviews(reviews)
    sequences = np.zeros([len(reviews), config["max_seq_len"]], dtype=np.int32)
    
    for i,review in tqdm(enumerate(tokens)):
        for j,word in enumerate(review[:config["max_seq_len"]]):    
            sequences[i,j] = word2idx(word)
    
    return sequences

Vocabulary Size: 61909


## Data Loader

In [19]:
X_train = tokenize(X_train)
X_val = tokenize(X_val)
X_test = tokenize(X_test)

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

# DataLoader
train_loader = DataLoader(train_data, shuffle=True, batch_size=config["batch_size"])
valid_loader = DataLoader(val_data, shuffle=False, batch_size=config["batch_size"])
test_loader = DataLoader(test_data, shuffle=False, batch_size=config["batch_size"])

40000it [00:05, 6761.74it/s]
5000it [00:00, 7245.56it/s]
5000it [00:00, 7025.60it/s]


In [20]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size())
print('Sample input: \n', sample_x)
print('Sample input: \n', sample_y)

Sample input size:  torch.Size([32, 500])
Sample input: 
 tensor([[   30,    33,     0,  ...,     0,     0,     0],
        [   30, 38005,    26,  ...,     0,     0,     0],
        [   30,    46,    23,  ...,     0,     0,     0],
        ...,
        [   30,     9,    14,  ...,     0,     0,     0],
        [   30,  1258,  1509,  ...,     0,     0,     0],
        [   30,    46,    36,  ...,     0,     0,     0]], dtype=torch.int32)
Sample input: 
 tensor([0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 0])


## Classifier Model

In [21]:
class SentimentClassifier(nn.Module):
    def __init__(self,no_layers,vocab_size,hidden_dim,embedding_dim,output_dim,drop_prob=0.3):
        super(SentimentClassifier,self).__init__()
 
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
    
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # embedding and LSTM layers
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_weights)) # Initializing with Word2Vec 
        
        #Bidirectional LSTM
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=self.hidden_dim,
                           num_layers=no_layers, batch_first=True, bidirectional=True)
        
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
    
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, self.output_dim)
        self.classifier = nn.Sigmoid()
        
    def forward(self,x,hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)  # shape: B x S x Feature   since batch = True
        
        lstm_out, hidden = self.lstm(embeds) # dropout and fully connected layer
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim) 
        
        out = self.dropout(lstm_out) # dropout and fully connected layer
        out = self.fc(out)
        
        sig_out = self.classifier(out) # sigmoid function
        sig_out = sig_out.view(batch_size, -1) # reshape to be batch_size first
        sig_out = sig_out[:, -1] # get last batch of labels
        
        return sig_out, hidden
        
    def init_hidden(self, batch_size):
        ''' 
        Initializes hidden state 
        Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        Initialized to zero, for hidden state and cell state of LSTM
        '''
        h0 = torch.zeros((self.no_layers,config["batch_size"],self.hidden_dim)).to(config["device"])
        c0 = torch.zeros((self.no_layers,config["batch_size"],self.hidden_dim)).to(config["device"])
        hidden = (h0,c0)
        return hidden

## The Model

In [22]:
model = SentimentClassifier(config["no_layers"],config["vocab_size"],config["hidden_dim"],config["embedding_dim"],config["output_dim"],drop_prob=0.3)
print(model)
model.to(config["device"])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config["learning_rate"])

SentimentClassifier(
  (embedding): Embedding(61909, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (classifier): Sigmoid()
)


## Training

In [None]:
epoch_tr_loss,epoch_vl_loss = [],[]
epoch_tr_acc,epoch_vl_acc = [],[]
epoch_tr_f1, epoch_vl_f1 = [], []
val_best_f1 = -1*np.inf

device = config["device"]
for epoch in range(config["epochs"]):
    train_loss, total_instances = 0, 0
    y_true, y_pred = [], []
    model.train()
    # initialize hidden state 
    h = model.init_hidden(config["batch_size"])
    for inputs, labels in train_loader:
        
        inputs, labels = inputs.to(device, dtype=torch.long), labels.to(device)   
        h = tuple([each.data for each in h])
        
        model.zero_grad()
        output,h = model(inputs,h)
        
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        
        num_instances = labels.size(0)
        train_loss += (loss.item()*num_instances)
        total_instances += num_instances
        
        y_true.append(labels.cpu().detach().numpy())
        y_pred.append([1 if i>=0.5 else 0 for i in output.cpu().detach().numpy()])
        
        optimizer.step()
    
    train_loss = train_loss/total_instances
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    
    train_acc = accuracy_score(y_true,y_pred)
    train_f1 = f1_score(y_true,y_pred)
    
    val_h = model.init_hidden(config["batch_size"])
    val_loss, total_instances = 0, 0
    y_true, y_pred = [], []
    model.eval()
    for inputs, labels in valid_loader:
            val_h = tuple([each.data for each in val_h])

            inputs, labels = inputs.to(device, dtype=torch.long), labels.to(device)

            output, val_h = model(inputs,h)
            
            val_loss = criterion(output.squeeze(), labels.float())
            num_instances = labels.size(0)
            val_loss += (loss.item()*num_instances)
            total_instances += num_instances
            
            y_true.append(labels.cpu().detach().numpy())
            y_pred.append([1 if i>=0.5 else 0 for i in output.cpu().detach().numpy()])
    
    val_loss = val_loss/total_instances
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    
    val_acc = accuracy_score(y_true,y_pred)
    val_f1 = f1_score(y_true,y_pred)
    
    epoch_tr_loss.append(train_loss)
    epoch_vl_loss.append(val_loss)
    epoch_tr_acc.append(train_acc)
    epoch_vl_acc.append(val_acc)
    epoch_tr_f1.append(train_f1)
    epoch_vl_f1.append(val_f1)
    
    print(f'Epoch {epoch+1}') 
    print(f'Train_loss : {train_loss} || Val_loss : {val_loss}')
    print(f'Train_accuracy : {train_acc*100} || Val_accuracy : {val_acc*100}')
    print(f'Train_f1_score : {train_f1*100} || Val_f1_score : {val_f1*100}')
    if val_f1 >= val_best_f1:
        torch.save(model.state_dict(), './best_model.pt')
        print('Validation F1 loss increased ({:.6f} --> {:.6f}). Saving model ...'.format(val_best_f1,val_f1))
        val_best_f1 = val_f1
    print(25*'==')

Epoch 1
Train_loss : 0.6932062998771668 || Val_loss : 0.0012452338123694062
Train_accuracy : 50.372499999999995 || Val_accuracy : 50.03999999999999
Train_f1_score : 49.18989480150503 || Val_f1_score : 66.55061596143545
Validation F1 loss increased (-inf --> 0.665506). Saving model ...
Epoch 2
Train_loss : 0.6922527379512787 || Val_loss : 0.0012467908672988415
Train_accuracy : 50.795 || Val_accuracy : 50.63999999999999
Train_f1_score : 43.25664533241077 || Val_f1_score : 66.14540466392319
Epoch 3
Train_loss : 0.6918382560253143 || Val_loss : 0.001244785264134407
Train_accuracy : 50.81250000000001 || Val_accuracy : 50.760000000000005
Train_f1_score : 45.95819485263823 || Val_f1_score : 15.337001375515818
Epoch 4
Train_loss : 0.6897377736568451 || Val_loss : 0.0012352577177807689
Train_accuracy : 51.082499999999996 || Val_accuracy : 51.06
Train_f1_score : 44.690052859202304 || Val_f1_score : 66.88320476383814
Validation F1 loss increased (0.665506 --> 0.668832). Saving model ...
Epoch 5
T

## Training History

In [None]:
fig = plt.figure(figsize = (21, 6))
plt.subplot(1, 3, 1)
plt.plot(epoch_tr_loss, label='Train loss')
plt.plot(epoch_vl_loss, label='Validation loss')
plt.title("Loss")
plt.legend()
plt.grid()
    
plt.subplot(1, 3, 2)
plt.plot(epoch_tr_acc, label='Train Accuracy')
plt.plot(epoch_vl_acc, label='Validation Accuracy')
plt.title("Accuracy")
plt.legend()
plt.grid()

plt.subplot(1, 3, 3)
plt.plot(epoch_tr_f1, label='Train F1 Score')
plt.plot(epoch_vl_f1, label='Validation F1 Score')
plt.title("F1 Score")
plt.legend()
plt.grid()

## Inference

In [None]:
model.load_state_dict(torch.load("../input/best-model/Best_train_model.pt"))

y_true, y_pred = [], []
model.eval()
for inputs, labels in test_loader:
        test_h = tuple([each.data for each in val_h])

        inputs, labels = inputs.to(device, dtype=torch.long), labels.to(device)
        output, test_h = model(inputs,test_h)
            
        y_true.append(labels.cpu().detach().numpy())
        y_pred.append([1 if i>=0.5 else 0 for i in output.cpu().detach().numpy()])
            
y_true = np.concatenate(y_true)
y_pred = np.concatenate(y_pred) 

test_acc = accuracy_score(y_true,y_pred)
test_f1 = f1_score(y_true,y_pred)

print(f"Test Accuracy : {test_acc*100} || Test F1 Score: {test_f1*100}")

## Got Test Accuracy : 89.4 
## Test F1 Score: 89.34459187776437

# [Best_trained_model](https://drive.google.com/file/d/1ks0rDTMqAUCnBFqpkfCYgNoj-3tUjGRm/view?usp=sharing)

In [None]:
plt.plot(y_pred)