In [None]:
import warnings; warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Location of the IMDB reviews CSV read in the next cell.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run on another machine until it is edited.
path = Path('C:/Users/sappusamy/Documents/SriWK/datasets/IMDB.csv')

In [None]:
dataset = pd.read_csv(path)

In [None]:
dataset.sentiment.value_counts()

In [None]:
dataset = dataset[:250]

In [None]:
dataset.sentiment.value_counts()

In [None]:
train = dataset[:100]
train.sentiment.value_counts()

In [None]:
test = dataset[100:].reset_index(drop=True)
test.sentiment.value_counts()

## NLP: Cleaning & Preprocessing text
- Remove HTML
- Tokenization + Remove punctuation
- Remove stopwords
- Lemmatization or stemming

In [None]:
text = train.review[0]

In [None]:
stop_words = stopwords.words('english')

In [None]:
def remove_html(text):
    """Return the text content of `text` with HTML markup stripped.

    The string is parsed by BeautifulSoup with the 'lxml' parser and only the
    text nodes are kept.
    """
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text()

In [None]:
def remove_punctuation(text):
    """Return `text` with every character found in `string.punctuation` dropped.

    Characters are deleted rather than replaced by a space, so two words that
    were separated only by punctuation end up joined together.
    """
    kept_chars = (ch for ch in text if ch not in string.punctuation)
    return "".join(kept_chars)

In [None]:
def remove_stopwords(text):
    """Return the tokens of `text`, in order, that are not in the notebook-level `stop_words`."""
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [None]:
def word_lemmatizer(text):
    """Return a list with each token of `text` passed through the notebook-level `lemmatizer`."""
    return list(map(lemmatizer.lemmatize, text))

In [None]:
def word_stemmer(text):
    """Return a list with each token of `text` passed through the notebook-level `stemmer`."""
    return list(map(stemmer.stem, text))

In [None]:
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
stemmer = PorterStemmer()

In [None]:
# Clean the training reviews step by step: strip HTML, delete punctuation,
# lowercase and tokenize, drop stopwords, then lemmatize and stem every token.
# The result is a Series holding one list of tokens per review.
t = (
    train['review']
    .apply(remove_html)
    .apply(remove_punctuation)
    .apply(lambda review: tokenizer.tokenize(review.lower()))
    .apply(remove_stopwords)
    .apply(word_lemmatizer)
    .apply(word_stemmer)
)

In [None]:
labels = train['sentiment']

set(labels)

In [None]:
labels_index = {'negative':0,'positive':1}

#### Represent each word in ONE-HOT encoding
- To keep the one-hot vectors short, only tokens that occur at least `min_freq` times (an integer threshold set below) get their own vocabulary entry; every other token is mapped to `unk`

In [None]:
def token_frequency(l):
    """Count how often each token occurs across all token sequences in `l`.

    Args:
        l: iterable of token sequences, one per document.

    Returns:
        dict mapping token -> total number of occurrences, keyed in order of
        first appearance.
    """
    freq = {}
    for doc in l:
        for tok in doc:
            freq[tok] = freq.get(tok, 0) + 1
    return freq

In [None]:
tf = token_frequency(t)

In [None]:
sorted(tf.items(),key=lambda x:x[1],reverse=True)[:10]

In [None]:
min_freq = 2

In [None]:
def build_vocabulary(tf, min_count=None):
    """Map every sufficiently frequent token to a unique integer index.

    Index 0 is reserved for the padding entry 'pad' and index 1 for the
    unknown-token entry 'unk'; real tokens are numbered from 2 upwards in the
    iteration order of `tf`.

    Args:
        tf: dict mapping token -> corpus frequency.
        min_count: minimum frequency a token needs to get its own index.
            When omitted, the notebook-level `min_freq` is used.

    Returns:
        dict mapping token -> index, with contiguous indices 0..len(vocab)-1.
    """
    threshold = min_freq if min_count is None else min_count
    v = {'pad': 0, 'unk': 1}
    for token, freq in tf.items():
        # A corpus word that happens to be spelled 'pad' or 'unk' must not
        # overwrite a reserved entry: doing so would move the reserved index
        # and hand the very same index to the next token (len(v) would not grow).
        if token in v:
            continue
        if freq >= threshold:
            v[token] = len(v)
    return v

In [None]:
vocab = build_vocabulary(tf)

In [None]:
len(vocab)

## Let's convert tokens to index and make vector for each word in a review

In [None]:
tokens = t[0]
tokens[:10],len(tokens)

In [None]:
# Look up each token's vocabulary index; tokens missing from the vocabulary
# all share the 'unk' index.
indices = [vocab.get(token, vocab['unk']) for token in tokens]
len(indices)

## Let's convert these indices to one-hot vectors
- shape of input will be sequence_length * vocab_size

In [None]:
x = torch.zeros(len(indices),len(vocab))

In [None]:
x[range(len(indices)),indices]=1

In [None]:
x,x.shape

## Equation of RNN:
\begin{equation*}
h_t = \tanh( W_{ih}x_t + b_{ih} + W_{hh}h_{t-1} + b_{hh})
\end{equation*}

In [None]:
rnn_cell = nn.RNNCell(len(vocab),100)

In [None]:
for name,p in rnn_cell.named_parameters():
    print(name,p.shape)

In [None]:
h = torch.zeros(1,100)

In [None]:
h

In [None]:
for i in range(len(indices)):
    h = rnn_cell(x[i].view(1,-1),h)

In [None]:
print("**value of hidden vector after iterating over all input time steps**")
h

`nn.RNNCell` processes a single time step per call,
    
    so we loop over the time steps ourselves and feed the hidden state back in at each step

<a href="https://pytorch.org/docs/stable/nn.html?highlight=rnncell#torch.nn.RNNCell">PyTorch RNNCell reference</a>

## RNN for 1 training example

In [None]:
rnn = nn.RNN(len(vocab),100,batch_first=True)

In [None]:
indices = [vocab[token] if token in vocab else vocab['unk'] for token in tokens]

In [None]:
x = torch.zeros(len(indices),len(vocab))

x[range(len(indices)),indices]=1


In [None]:
x = x.expand((1,-1,len(vocab)))
x,x.shape

**Shape of h:**
    
    h = (num_layers * num_directions, batch_size, hidden_units)
    
    where the integer num_layers parameter sets how many RNN layers are stacked on top of each other (a STACKED RNN)

In [None]:
h = torch.zeros(1,1,100)

In [None]:
output,hidden = rnn(x,h)

In [None]:
output.shape,hidden.shape

**output** - hidden state at every time step, for every sequence in the batch

**hidden** - hidden state after the last time step, for every sequence in the batch

In [None]:
classifier = nn.Linear(100,2)

In [None]:
preds = classifier(hidden.view(1,100))
preds

## RNN for m training examples

In [None]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [None]:
crit = nn.CrossEntropyLoss()

In [None]:
optimizer = optim.SGD([{'params':rnn.parameters()},{'params':classifier.parameters()}],lr=0.01)

In [None]:
# Train on one review at a time (batch size 1), so no padding is required.
for doc,y in zip(indices,labels):
    # One-hot encode the review: one row per token, with a single 1 in the
    # column given by that token's vocabulary index.
    x = torch.zeros(len(doc),len(vocab))
    x[range(len(doc)),doc]=1
    # Prepend a batch dimension of size 1 (the RNN was created with batch_first=True).
    x = x.expand((1,-1,len(vocab)))
    
    optimizer.zero_grad()
    
    # Every review starts from a fresh all-zero hidden state.
    h = torch.zeros(1,1,100)
    output,hidden = rnn(x,h)
    # Classify from the final hidden state only.
    preds = classifier(hidden.view(1,100))
    
    loss = crit(preds,torch.LongTensor([labels_index[y]]))
    loss.backward()
    
    optimizer.step()
    
    print(loss.item())
    

**Why loop over each document?**

    different documents have different lengths

## Batch-wise processing

**Why do we need padding?**
- different documents have different sequence lengths
- so without padding the flow is as follows:<br>
      for doc in documents:
          for token in doc:
              pass to RNNCell
- processing one document and one token at a time like this is slow
- in order to process documents batch-wise, all documents in a batch must have the same length

    
    This is where padding comes in: documents shorter than the max length are padded with 0's

In [None]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [None]:
sequence_lengths = [len(d) for d in indices]

In [None]:
# Token count of the longest review; it sets the time dimension of the padded batch tensor.
max_length = max(len(doc) for doc in t)
max_length

In [None]:
X = torch.zeros(len(indices),max_length,len(vocab))

In [None]:
X.shape

In [None]:
# One-hot encode review i into X[i]: for each real token position, set the
# column given by the token's vocabulary index to 1. Positions beyond the
# review's own length are left as all-zero rows, which act as padding.
for i in range(len(indices)):
    X[[i],range(len(indices[i])),indices[i]]=1

In [None]:
Y = torch.LongTensor([labels_index[i] for i in labels])

In [None]:
class Sentiment(nn.Module):
    """Single-layer RNN sentiment classifier over one-hot encoded tokens.

    Args:
        vocab_size: width of the one-hot input vectors. When omitted, the size
            of the notebook-level `vocab` is used.
        hidden_size: number of hidden units in the RNN.
        num_classes: number of output classes.
    """

    def __init__(self, vocab_size=None, hidden_size=100, num_classes=2):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab)
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x, h, lengths=None):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: float tensor (batch, seq_len, vocab_size); padding timesteps
                are all-zero rows.
            h: initial hidden state (1, batch, hidden_size).
            lengths: optional true length of every sequence. When omitted it
                is inferred as the position of the last non-zero row.
        """
        if lengths is None:
            # A sequence ends at its last row that contains a non-zero entry.
            is_real = (x != 0).any(dim=-1)
            steps = torch.arange(1, x.size(1) + 1, device=x.device)
            lengths = (is_real * steps).max(dim=1).values
        # pack_padded_sequence needs CPU int64 lengths that are at least 1.
        lengths = torch.as_tensor(lengths, dtype=torch.int64).clamp(min=1).cpu()
        # Packing stops the recurrence at each sequence's real end, so `hidden`
        # is the state after the last real token rather than after the padding.
        # This makes a padded batch agree with running each review unpadded.
        packed = nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed, h)
        return self.classifier(hidden.view(-1, self.hidden_size))

In [None]:
model = Sentiment()

In [None]:
crit = nn.CrossEntropyLoss()

In [None]:
optimizer = optim.SGD(model.parameters(),lr=0.1)

In [None]:
bs=16

In [None]:
# Two passes over the training set in mini-batches.
for epoch in range(2):
    # Walk the data in consecutive slices of `bs` reviews; the last slice may be smaller.
    for i in range(0,X.shape[0],bs):
        xb = X[i:i+bs]
        yb = Y[i:i+bs]
        # Fresh all-zero initial hidden state, sized to the actual batch.
        h = torch.zeros(1,xb.shape[0],100)
        
        optimizer.zero_grad()
        preds = model(xb,h)
        loss = crit(preds,yb)
        loss.backward()
        optimizer.step()
        print(loss.item())
        

## Test

In [None]:
# Run the held-out test reviews through the same cleaning steps used for
# training: strip HTML, delete punctuation, lowercase and tokenize, drop
# stopwords, lemmatize, stem.
t = (
    test['review']
    .apply(remove_html)
    .apply(remove_punctuation)
    .apply(lambda review: tokenizer.tokenize(review.lower()))
    .apply(remove_stopwords)
    .apply(word_lemmatizer)
    .apply(word_stemmer)
)

In [None]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [None]:
max_length = len(max(t,key=lambda x:len(x)))
max_length

In [None]:
TY = torch.LongTensor([labels_index[i] for i in test.sentiment])

In [None]:
# Predict one test review at a time (batch size 1, no padding).
predictions=[]
for index_num in range(len(test)):

    # One-hot encode the review and prepend a batch dimension of size 1.
    T = torch.zeros(len(indices[index_num]),len(vocab))
    T[range(len(indices[index_num])),indices[index_num]]=1
    T = T.expand((1,-1,len(vocab)))

    h = torch.zeros(1,T.shape[0],100)
    with torch.no_grad():
        # The logits have shape (1, 2); normalise over the class dimension
        # explicitly instead of relying on softmax's deprecated implicit dim.
        predictions.append(F.softmax(model(T,h),dim=1).argmax().item())

In [None]:
predictions = torch.LongTensor(predictions)

In [None]:
predictions

In [None]:
"Accuracy:",(predictions==TY).float().mean()

In [None]:
"Number of positives predicted",(predictions==1).sum()

In [None]:
"Number of negatives predicted",(predictions==0).sum()

## Use nn.Embedding

In [None]:
# Rebuild the cleaned training tokens: `t` was overwritten with the test
# reviews in the previous "Test" section.
t = train['review'].apply(remove_html)

t = t.apply(remove_punctuation)

t = t.apply(lambda x: tokenizer.tokenize(x.lower()))

t = t.apply(remove_stopwords)

t = t.apply(word_lemmatizer)

t = t.apply(word_stemmer)

In [None]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [None]:
max_length = len(max(t,key=lambda x:len(x)))
max_length

In [None]:
# Right-pad every review, in place, with index 0 (the 'pad' entry of the
# vocabulary) so that all index lists have length max_length.
for i in range(len(indices)):
    if len(indices[i])<max_length:
        indices[i]+=[0]*(max_length-len(indices[i]))

In [None]:
X = torch.LongTensor(indices)
X.shape

In [None]:
Y = torch.LongTensor([labels_index[i] for i in labels])

In [None]:
class Sentiment(nn.Module):
    """Single-layer RNN sentiment classifier over learned token embeddings.

    Args:
        vocab_size: number of rows in the embedding table. When omitted, the
            size of the notebook-level `vocab` is used.
        emb_size: embedding dimension.
        hidden_size: number of hidden units in the RNN.
        num_classes: number of output classes.
    """

    def __init__(self, vocab_size=None, emb_size=200, hidden_size=100, num_classes=2):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab)
        self.hidden_size = hidden_size
        # Index 0 is the padding token; its embedding is fixed at zero.
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.rnn = nn.RNN(emb_size, hidden_size, batch_first=True)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x, h):
        """Return class logits of shape (batch, num_classes).

        Args:
            x: long tensor (batch, seq_len) of token indices, right-padded
                with the padding index.
            h: initial hidden state (1, batch, hidden_size).
        """
        # True length of each sequence = position of its last non-padding token
        # (at least 1, because pack_padded_sequence rejects zero lengths).
        is_real = x != self.emb.padding_idx
        steps = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = (is_real * steps).max(dim=1).values.clamp(min=1).cpu()
        # Packing stops the recurrence at each sequence's real end, so `hidden`
        # is the state after the last real token rather than after the padding.
        # This makes a padded batch agree with running each review unpadded.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.emb(x), lengths, batch_first=True, enforce_sorted=False)
        _, hidden = self.rnn(packed, h)
        return self.classifier(hidden.view(-1, self.hidden_size))

In [None]:
# Fresh embedding-based model, loss and optimizer for this section.
model = Sentiment()

crit = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(),lr=0.1)

# Mini-batch size.
bs=32

# Ten passes over the training set in consecutive mini-batches.
for epoch in range(10):
    for i in range(0,X.shape[0],bs):
        xb = X[i:i+bs]
        yb = Y[i:i+bs]
        # Fresh all-zero initial hidden state, sized to the actual batch.
        h = torch.zeros(1,xb.shape[0],100)
        
        optimizer.zero_grad()
        preds = model(xb,h)
        loss = crit(preds,yb)
        loss.backward()
        optimizer.step()
        print(loss.item())
        

## Test

In [None]:
# Apply the same cleaning steps to the held-out test reviews.
t = test['review'].apply(remove_html)

t = t.apply(remove_punctuation)

t = t.apply(lambda x: tokenizer.tokenize(x.lower()))

t = t.apply(remove_stopwords)

t = t.apply(word_lemmatizer)

t = t.apply(word_stemmer)

In [None]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

max_length = len(max(t,key=lambda x:len(x)))
max_length

In [None]:
TY = torch.LongTensor([labels_index[i] for i in test.sentiment])

In [None]:
# Predict one test review at a time (batch size 1, no padding).
predictions = []
for index_num in range(len(test)):
    T = torch.LongTensor([indices[index_num]])
    h = torch.zeros(1,T.shape[0],100)
    with torch.no_grad():
        # The logits have shape (1, 2); normalise over the class dimension
        # explicitly instead of relying on softmax's deprecated implicit dim.
        predictions.append(F.softmax(model(T,h),dim=1).argmax().item())

In [None]:
predictions = torch.LongTensor(predictions)

In [None]:
(predictions==TY).float().mean()

In [None]:
# Confusion-matrix counts with label 1 (positive) as the positive class.
tp = ((TY==1)*(predictions==1)).sum().item()

tn = ((TY==0)*(predictions==0)).sum().item()

fn = ((TY==1)*(predictions==0)).sum().item()

fp = ((TY==0)*(predictions==1)).sum().item()
# Show all four counts (the original displayed tn twice and never showed fn).
tp,tn,fp,fn

#### Positive label - measures

In [None]:
# Report NaN instead of raising ZeroDivisionError when a denominator is empty
# (e.g. the model never predicts the positive class).
print("Precision:",tp/(tp+fp) if (tp+fp) else float('nan'))

print("Recall:",tp/(tp+fn) if (tp+fn) else float('nan'))

#### Negative label - measures

In [None]:
# Report NaN instead of raising ZeroDivisionError when a denominator is empty
# (e.g. the model never predicts the negative class).
print("Precision:",tn/(tn+fn) if (tn+fn) else float('nan'))

print("Recall:",tn/(tn+fp) if (tn+fp) else float('nan'))

In [None]:
model.rnn.weight_hh_l0.grad

In [None]:
model.rnn.weight_ih_l0.grad