# ASSIGNMENT 2
# SENTIMENT CLASSIFICATION
## SHRUTI SHREYASI (19EC10086)

In [None]:
#necessary imports
import numpy as np
import pandas as pd
import os
import torch
from torch.autograd import Variable
from collections import Counter
import argparse
from torch import nn, optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

In [None]:
# loading the dataset
# Read the IMDB reviews CSV, then print its first and last rows as a sanity check.
df = pd.read_csv('/kaggle/input/imdb-dataset/IMDB Dataset.csv')
for preview in (df.head(), df.tail()):
    print(preview)

In [None]:
# removing the stopwords
stop_words = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Tokenize *text* and return its tokens with English stop words removed."""
    # Compare on the lowercased token so capitalised stop words ("The", "And")
    # are removed as well; a case-sensitive test lets them slip through.
    return [w for w in word_tokenize(text) if w.lower() not in stop_words]


# Replace the whole column in one assignment. Writing element by element with
# df['review'][i] = ... is chained assignment: pandas warns about it and, with
# copy-on-write enabled, the write never reaches df.
# After this step every entry of df['review'] is a list of tokens.
df['review'] = df['review'].apply(_drop_stopwords)

In [None]:
# removing non english words
words = set(nltk.corpus.words.words())


def _keep_english(tokens):
    """Join *tokens* with spaces and drop alphabetic words not in the NLTK word list.

    Tokens that are not purely alphabetic (numbers, punctuation) are kept here.
    """
    sentence = ' '.join(tokens)
    return ' '.join(
        w for w in nltk.wordpunct_tokenize(sentence)
        if w.lower() in words or not w.isalpha()
    )


# Whole-column assignment instead of df['review'][i] = ... (chained assignment
# does not reliably write back to df).
# After this step every entry of df['review'] is a single string again.
df['review'] = df['review'].apply(_keep_english)
df.head()

In [None]:
# removing punctuations
# r"\w+" keeps only runs of word characters, so punctuation is discarded; the
# surviving tokens are joined back into one space-separated string per review.
tokenizer = nltk.RegexpTokenizer(r"\w+")
# Whole-column assignment instead of df['review'][i] = ... (chained assignment
# does not reliably write back to df).
df['review'] = df['review'].apply(lambda text: ' '.join(tokenizer.tokenize(text)))
df.head()

In [None]:
# loading GloVe embeddings
# Each line of the file is "<word> <v1> <v2> ...", separated by single spaces.
vocab, embeddings = [], []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and stream line by line instead of holding the whole file
# (plus a split copy of it) in memory.
with open('/kaggle/input/glove6b/glove.6B.50d.txt', 'rt', encoding='utf-8') as fi:
    for line in fi:
        line = line.rstrip('\n')
        if not line:
            continue  # ignore blank lines
        i_word, *values = line.split(' ')
        vocab.append(i_word)
        embeddings.append([float(val) for val in values])

vocab_glove = np.array(vocab)
embs_glove = np.array(embeddings)

In [None]:
# hash of GloVe vocabulary
# Map every GloVe token to its embedding row; if a token appears more than
# once, the first row seen for it is the one that is kept.
vocab_G = {}
for token, vector in zip(vocab_glove, embs_glove):
    vocab_G.setdefault(token, vector)

In [None]:
# hash of review vocabulary
# Every distinct whitespace-separated token occurring in the reviews, mapped to 1.
vocab = dict.fromkeys(
    (token for review in df['review'] for token in review.split()),
    1,
)

In [None]:
# hash of vocabulary of words present in GloVe
# Use a plain membership test: comparing a lookup result with `is not 0` checks
# object identity against an int literal, which only works by accident and
# raises a SyntaxWarning on current Python versions.
vocab_simplified = {word: 1 for word in vocab if word in vocab_G}
print(len(vocab_simplified))

In [None]:
# length of reviews considered
LEN = 200


# removing words not present in GloVe hash
def _keep_known_words(text):
    """Return the first LEN words of *text* found in vocab_simplified, space-joined."""
    kept = []
    for word in text.split():
        if len(kept) >= LEN:
            break  # the review is already long enough; the rest is dropped
        # membership test instead of the identity check `.get(word, 0) is not 0`
        if word in vocab_simplified:
            kept.append(word)
    return ' '.join(kept)


# Whole-column assignment instead of df['review'][i] = ... (chained assignment
# does not reliably write back to df).
df['review'] = df['review'].apply(_keep_known_words)
df.head()

In [None]:
# padding reviews to obtain same length
# Left-pad each review with '<PAD>' tokens until it has LEN tokens. Reviews
# that already have LEN or more tokens are left unchanged, because repeating a
# string a non-positive number of times yields ''.
# Whole-column assignment instead of df['review'][i] = ... (chained assignment
# does not reliably write back to df).
df['review'] = df['review'].apply(
    lambda text: (LEN - len(text.split())) * '<PAD> ' + text
)

In [None]:
# preprocessed dataframe
# NOTE: this is a second name for the same DataFrame object, not a copy.
df_preprocessed = df

# storing GloVe embeddings in hash form
# word -> list of floats; each line of the file is "<word> <v1> <v2> ...".
embeddings_glove_complete = {}
# Decode explicitly as UTF-8 (independent of the platform default) and stream
# the file line by line instead of loading and splitting it all at once.
with open('/kaggle/input/glove6b/glove.6B.50d.txt', 'rt', encoding='utf-8') as fi:
    for line in fi:
        line = line.rstrip('\n')
        if not line:
            continue  # ignore blank lines
        i_word, *values = line.split(' ')
        embeddings_glove_complete[i_word] = [float(val) for val in values]

# Embeddings restricted to the words that actually occur in the reviews.
vocab_imdb = {}
for sentence in df['review']:
    for word in sentence.split():
        if word in embeddings_glove_complete:
            vocab_imdb[word] = embeddings_glove_complete[word]

In [None]:
# dataset class for train set
class Dataset(torch.utils.data.Dataset):
    '''
        Training split of the preprocessed reviews.
        Each item is <embedded review, label>: one GloVe vector per word
        (a zero vector for words without an embedding) and a label that is
        1 for 'positive' and 0 otherwise.
    '''

    def __init__(self, args):
        self.args = args
        (self.train_df, self.train_label) = self.load_words()

    def load_words(self):
        '''
            return train data and train labels
        '''
        # 80% of the rows form the training split. random_state is fixed so the
        # split is reproducible and matches the one the validation and test
        # dataset classes derive their held-out rows from.
        train_df, _ = train_test_split(df_preprocessed, test_size=0.20, random_state=0)
        return (list(train_df['review']), list(train_df['sentiment']))

    def _embed(self, sentence):
        '''
            list with one 50-dimensional vector per word of the sentence
        '''
        return [embeddings_glove_complete.get(word, [0]*50) for word in sentence.split()]

    def words_indexes(self, index):
        '''
            embedding of preprocessed review, wrapped in a one-element list
        '''
        return [self._embed(self.train_df[index])]

    def label(self, index):
        '''
            return label of the review in binary form, as a one-element list
        '''
        return [1 if self.train_label[index] == 'positive' else 0]

    def __len__(self):
        '''
            for dataloader
            number of reviews in the split (returning the batch size here would
            restrict the dataloader to the first batch of samples)
        '''
        return len(self.train_df)

    def __getitem__(self, index):
        '''
            for dataloader
            return <input, label>
        '''
        y = 1 if self.train_label[index] == 'positive' else 0
        return (
            torch.tensor(self._embed(self.train_df[index])).float(),
            torch.tensor([y]),
        )

In [None]:
# validation dataset
class DatasetVal(torch.utils.data.Dataset):
    '''
        Validation split of the preprocessed reviews.
        Each item is <embedded review, label>: one GloVe vector per word
        (a zero vector for words without an embedding) and a label that is
        1 for 'positive' and 0 otherwise.
    '''

    def __init__(self, args):
        self.args = args
        (self.val_df, self.val_label) = self.load_words()

    def load_words(self):
        '''
            return validation data and validation labels
        '''
        # 20% of the rows are held out, and the first half of that part is the
        # validation split. random_state is fixed so the split is reproducible.
        _, held_out = train_test_split(df_preprocessed, test_size=0.20, random_state=0)
        val_df, _ = train_test_split(held_out, test_size=0.50, random_state=0)
        return (list(val_df['review']), list(val_df['sentiment']))

    def _embed(self, sentence):
        '''
            list with one 50-dimensional vector per word of the sentence
        '''
        return [embeddings_glove_complete.get(word, [0]*50) for word in sentence.split()]

    def words_indexes(self, index):
        '''
            embedding of preprocessed review, wrapped in a one-element list
        '''
        return [self._embed(self.val_df[index])]

    def label(self, index):
        '''
            return label of the review in binary form, as a one-element list
        '''
        return [1 if self.val_label[index] == 'positive' else 0]

    def __len__(self):
        '''
            for dataloader
            number of reviews in the split (returning the batch size here would
            restrict the dataloader to the first batch of samples)
        '''
        return len(self.val_df)

    def __getitem__(self, index):
        '''
            for dataloader
            return <input, label>
        '''
        y = 1 if self.val_label[index] == 'positive' else 0
        return (
            torch.tensor(self._embed(self.val_df[index])).float(),
            torch.tensor([y]),
        )

In [None]:
# class for model
class Model(nn.Module):
    def __init__(self, dataset):
        '''
            Bi-LSTM: 50-dimensional input, 1 layer, 100 hidden units per direction
            Linear layer: 2*hidden dimensions -> 2 class logits
            dataset is accepted for call compatibility and is not used
        '''
        super().__init__()
        self.embedding_dim = 50
        self.num_layers = 1
        self.hidden_dim = 100

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(self.hidden_dim*2, 2)

    def forward(self, x, prev_state):
        '''
            x: batch-first input, (batch, sequence, embedding_dim)
            prev_state: (h_0, c_0) as returned by init_state
            return <logits of shape (batch, 2), new LSTM state>
        '''
        output, state = self.lstm(x, prev_state)
        # classify from the LSTM output at the last time step
        return self.fc(output[:, -1]), state

    def init_state(self, batch_size):
        '''
            zero (h_0, c_0), each of shape (num_layers*2, batch_size, hidden_dim),
            created on the device that holds the model parameters
        '''
        # Plain tensors replace the deprecated Variable wrapper.
        device = next(self.parameters()).device
        shape = (self.num_layers*2, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

In [None]:
# training function
def train(dataset, model, args):
    '''
        Train the model with Adam and cross-entropy loss, then run validation
        after every epoch.

        dataset: map-style dataset yielding <embedded review, label>
        model: network providing init_state(batch_size) and forward(x, state)
        args: namespace providing batch_size and max_epochs

        Prints, per epoch, the loss of the last batch and the confusion counts
        accumulated over the epoch.
    '''
    model.train()

    # Work on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    dataloader = DataLoader(dataset, batch_size=int(args.batch_size))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.002)

    # The validation split does not change between epochs, so build it once.
    dataset_val = DatasetVal(args)

    for epoch in range(int(args.max_epochs)):
        # to calculate F1 score
        tp = 0
        fp = 0
        tn = 0
        fn = 0

        loss = None

        for x, y in dataloader:
            x = x.to(device)
            targets = y.to(device).flatten()

            # Every batch holds independent reviews, so each one starts from a
            # fresh zero state sized to the actual batch (the last batch may be
            # smaller than args.batch_size).
            state = tuple(s.to(device) for s in model.init_state(x.size(0)))

            optimizer.zero_grad()
            y_pred, _ = model(x, state)
            loss = criterion(y_pred, targets)
            loss.backward()
            optimizer.step()

            # Confusion counts for the batch, compared by value (==), not identity.
            predicted = y_pred.argmax(dim=1)
            is_pos = targets == 1
            said_pos = predicted == 1
            batch_tp = int((is_pos & said_pos).sum())
            batch_fn = int((is_pos & ~said_pos).sum())
            batch_fp = int((~is_pos & said_pos).sum())
            tp += batch_tp
            fn += batch_fn
            fp += batch_fp
            tn += targets.numel() - (batch_tp + batch_fn + batch_fp)

        last_loss = loss.item() if loss is not None else None
        print({ 'epoch': epoch, 'loss': last_loss, 'tp':tp, 'tn':tn, 'fp':fp, 'fn':fn})

        # validation
        test(dataset_val, model, args)

In [None]:
# args
# Each option is registered separately. Passing several flags to a single
# add_argument() call makes them aliases of ONE option, so '--max-epochs' and
# '--sequence-length' would only overwrite batch_size. type=int yields integers
# directly instead of strings.
parser = argparse.ArgumentParser()
parser.add_argument('--batch-size', type=int, default=200)
parser.add_argument('--max-epochs', type=int, default=13)
parser.add_argument('--sequence-length', type=int, default=100)
# Parse an explicit empty list so the notebook kernel's own command line is
# ignored and the defaults above apply.
args = parser.parse_args([])

In [None]:
# test dataset
class DatasetTest(torch.utils.data.Dataset):
    '''
        Test split of the preprocessed reviews.
        Each item is <embedded review, label>: one GloVe vector per word
        (a zero vector for words without an embedding) and a label that is
        1 for 'positive' and 0 otherwise.
    '''

    def __init__(self, args):
        self.args = args
        (self.test_df, self.test_label) = self.load_words()

    def load_words(self):
        '''
            return test data and test labels
        '''
        # 20% of the rows are held out, and the second half of that part is the
        # test split. random_state is fixed so the split is reproducible.
        _, held_out = train_test_split(df_preprocessed, test_size=0.20, random_state=0)
        _, test_df = train_test_split(held_out, test_size=0.50, random_state=0)
        return (list(test_df['review']), list(test_df['sentiment']))

    def _embed(self, sentence):
        '''
            list with one 50-dimensional vector per word of the sentence
        '''
        return [embeddings_glove_complete.get(word, [0]*50) for word in sentence.split()]

    def words_indexes(self, index):
        '''
            embedding of preprocessed review, wrapped in a one-element list
        '''
        return [self._embed(self.test_df[index])]

    def label(self, index):
        '''
            return label of the review in binary form, as a one-element list
        '''
        return [1 if self.test_label[index] == 'positive' else 0]

    def __len__(self):
        '''
            for dataloader
            number of reviews in the split (returning the batch size here would
            restrict the dataloader to the first batch of samples)
        '''
        return len(self.test_df)

    def __getitem__(self, index):
        '''
            for dataloader
            return <input, label>
        '''
        y = 1 if self.test_label[index] == 'positive' else 0
        return (
            torch.tensor(self._embed(self.test_df[index])).float(),
            torch.tensor([y]),
        )

In [None]:
def test(dataset, model, args):
    '''
        Evaluate the model on a dataset without updating its weights.

        dataset: map-style dataset yielding <embedded review, label>
        model: network providing init_state(batch_size) and forward(x, state)
        args: namespace providing batch_size

        After every batch, prints the confusion counts accumulated so far and
        the F1 score computed from them. Returns nothing.
    '''
    # Work on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    dataloader = DataLoader(dataset, batch_size=int(args.batch_size))

    tp = 0
    fp = 0
    tn = 0
    fn = 0

    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            targets = y.to(device).flatten()

            # Every batch holds independent reviews, so each one starts from a
            # fresh zero state sized to the actual batch (the last batch may be
            # smaller than args.batch_size).
            state = tuple(s.to(device) for s in model.init_state(x.size(0)))
            y_pred, _ = model(x, state)

            # Confusion counts for the batch, compared by value (==), not identity.
            predicted = y_pred.argmax(dim=1)
            is_pos = targets == 1
            said_pos = predicted == 1
            batch_tp = int((is_pos & said_pos).sum())
            batch_fn = int((is_pos & ~said_pos).sum())
            batch_fp = int((~is_pos & said_pos).sum())
            tp += batch_tp
            fn += batch_fn
            fp += batch_fp
            tn += targets.numel() - (batch_tp + batch_fn + batch_fp)

            # F1 is undefined when there are no positives and no positive
            # predictions so far; report 0.0 instead of dividing by zero.
            denominator = 2*tp + fp + fn
            f1 = (2*tp/denominator) if denominator else 0.0
            print({'tp':tp, 'tn':tn, 'fp':fp, 'fn':fn, 'F1':f1})

In [None]:
# Build the training dataset and the model, and move the model to the GPU.
dataset = Dataset(args)
model = Model(dataset).cuda()
# comment out the train() call below to skip training and use a pretrained model
train(dataset, model, args)
# uncomment the line below to load pretrained weights instead of training
# model.load_state_dict(torch.load('/kaggle/input/model.txt'))
# Save the current weights; torch.save writes a serialized state dict, the
# .txt suffix is only a file name.
torch.save(model.state_dict(), '/kaggle/working/model.txt')
# Evaluate on the test split.
dataset_test = DatasetTest(args)
test(dataset_test, model, args)

In [None]:
# Reload the weights from the path the training cell saved them to.
model.load_state_dict(torch.load('/kaggle/working/model.txt'))

In [None]:
# Rebuild the test split and evaluate the model again (run after the weights
# have been reloaded in the previous cell).
dataset_test = DatasetTest(args)
test(dataset_test, model, args)