In [1]:
import warnings; warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [3]:
from pathlib import Path
import matplotlib.pyplot as plt

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Location of the IMDB reviews CSV that is loaded in the next cell.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# where this exact directory exists — consider a configurable data directory.
path = Path('C:/Users/sappusamy/Documents/SriWK/datasets/IMDB.csv')

In [6]:
dataset = pd.read_csv(path)

In [7]:
dataset.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [8]:
dataset = dataset[:250]

In [9]:
dataset.sentiment.value_counts()

negative    137
positive    113
Name: sentiment, dtype: int64

In [10]:
train = dataset[:100]
train.sentiment.value_counts()

negative    58
positive    42
Name: sentiment, dtype: int64

In [11]:
test = dataset[100:].reset_index(drop=True)
test.sentiment.value_counts()

negative    79
positive    71
Name: sentiment, dtype: int64

## NLP: Cleaning & Preprocessing text
- Remove HTML
- Tokenization + Remove punctuation
- Remove stopwords
- Lemmatization or stemming

In [12]:
text = train.review[0]

In [13]:
# English stop words, kept as a set: remove_stopwords performs one membership test
# per token, and a set answers that in constant time whereas the list returned by
# stopwords.words() is scanned from the start on every test.
stop_words = set(stopwords.words('english'))

In [14]:
def remove_html(text):
    """Strip HTML markup from ``text`` and return only its text content.

    The markup is parsed by BeautifulSoup using the 'lxml' parser.
    """
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text()

In [15]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

In [16]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``stop_words``, in their original order.

    Args:
        text: iterable of tokens (despite the name, not a raw string).
    """
    kept = []
    for word in text:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [17]:
def word_lemmatizer(text):
    """Lemmatize every token in ``text`` with the notebook-level ``lemmatizer``.

    Args:
        text: iterable of tokens.

    Returns:
        list with one lemmatized token per input token, in the same order.
    """
    return list(map(lemmatizer.lemmatize, text))

In [18]:
def word_stemmer(text):
    """Stem every token in ``text`` with the notebook-level ``stemmer``.

    Args:
        text: iterable of tokens.

    Returns:
        list with one stemmed token per input token, in the same order.
    """
    stems = []
    for word in text:
        stems.append(stemmer.stem(word))
    return stems

In [19]:
tokenizer = RegexpTokenizer(r'\w+')

In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
stemmer = PorterStemmer()

In [22]:
# Cleaning pipeline over the training reviews. Each stage consumes the previous
# stage's output: raw review -> text without HTML -> text without punctuation
# -> lower-cased token list -> tokens without stop words -> lemmas -> stems.
t = (
    train['review']
    .apply(remove_html)
    .apply(remove_punctuation)
    .apply(lambda x: tokenizer.tokenize(x.lower()))
    .apply(remove_stopwords)
    .apply(word_lemmatizer)
    .apply(word_stemmer)
)

In [23]:
labels = train['sentiment']

set(labels)

{'negative', 'positive'}

In [24]:
labels_index = {'negative':0,'positive':1}

#### Represent each word as a ONE-HOT vector
- to keep the input vectors short, only tokens that occur at least `min_freq` times are kept in the vocabulary; every other token is mapped to `unk`

In [25]:
def token_frequency(l):
    """Count how often each token occurs across all token sequences in ``l``.

    Args:
        l: iterable of token sequences (one sequence per document).

    Returns:
        dict mapping token -> total number of occurrences; keys appear in
        order of first occurrence.
    """
    counts = {}
    for doc_tokens in l:
        for tok in doc_tokens:
            counts[tok] = counts.get(tok, 0) + 1
    return counts

In [26]:
tf = token_frequency(t)

In [27]:
sorted(tf.items(),key=lambda x:x[1],reverse=True)[:10]

[('movi', 209),
 ('film', 156),
 ('one', 104),
 ('like', 84),
 ('see', 59),
 ('even', 59),
 ('get', 58),
 ('good', 56),
 ('scene', 55),
 ('go', 54)]

In [28]:
min_freq = 2

In [29]:
def build_vocabulary(tf, min_count=None):
    """Map every sufficiently frequent token to a unique integer index.

    Indices 0 and 1 are reserved for the special entries 'pad' and 'unk'.

    Args:
        tf: dict mapping token -> occurrence count.
        min_count: minimum count a token needs to enter the vocabulary.
            When None, the notebook-level ``min_freq`` is used.

    Returns:
        dict mapping token -> index, where the indices 0..len-1 are each
        used exactly once.
    """
    threshold = min_freq if min_count is None else min_count
    v = {'pad': 0, 'unk': 1}
    for token, count in tf.items():
        # A corpus token spelled 'pad' or 'unk' must keep the reserved index:
        # assigning v[token] = len(v) to an existing key would move the reserved
        # entry without growing the dict, so the next new token would receive
        # the very same index.
        if token in v:
            continue
        if count >= threshold:
            v[token] = len(v)
    return v

In [30]:
vocab = build_vocabulary(tf)

In [31]:
len(vocab)

1665

## Let's convert tokens to index and make vector for each word in a review

In [32]:
tokens = t[0]
tokens[:10],len(tokens)

(['one',
  'review',
  'mention',
  'watch',
  '1',
  'oz',
  'episod',
  'youll',
  'hook',
  'right'],
 168)

In [33]:
# Vocabulary index of each token of the example review; tokens that are not in
# the vocabulary fall back to the index of 'unk'.
indices = [vocab.get(token, vocab['unk']) for token in tokens]
len(indices)

168

## Let's convert these indices to one-hot vectors
- shape of input will be sequence_length * vocab_size

In [34]:
x = torch.zeros(len(indices),len(vocab))

In [35]:
x[range(len(indices)),indices]=1

In [36]:
x,x.shape

(tensor([[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]), torch.Size([168, 1665]))

## Equation of RNN:
\begin{equation*}
h_t = \tanh( W_{ih}x_t + b_{ih} + W_{hh}h_{t-1} + b_{hh})
\end{equation*}

In [37]:
rnn_cell = nn.RNNCell(len(vocab),100)

In [38]:
for name,p in rnn_cell.named_parameters():
    print(name,p.shape)

weight_ih torch.Size([100, 1665])
weight_hh torch.Size([100, 100])
bias_ih torch.Size([100])
bias_hh torch.Size([100])


In [39]:
h = torch.zeros(1,100)

In [40]:
h

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]])

In [41]:
# Feed the one-hot rows of x to the cell one time step at a time: each call takes
# the current row (reshaped to a batch of one) together with the previous hidden
# state and returns the next hidden state, which replaces h.
for i in range(len(indices)):
    h = rnn_cell(x[i].view(1,-1),h)

In [42]:
print("**value of hidden vector after iterating over all input time steps**")
h

**value of hidden vector after iterating over all input time steps**


tensor([[ 1.7929e-01, -1.9317e-02, -1.8906e-01, -6.7048e-02,  6.1098e-02,
         -2.3759e-01, -1.1638e-02,  7.4640e-02,  1.2089e-01,  1.0619e-01,
         -1.1507e-02,  1.4114e-02,  7.5826e-02, -1.2116e-01, -6.0855e-02,
          1.0763e-01, -1.4239e-02,  2.8667e-02,  3.0135e-02, -1.9374e-01,
         -1.1213e-01,  2.2333e-02,  1.3957e-01,  1.9572e-01,  1.9239e-02,
         -6.6040e-03, -1.6054e-03,  1.5771e-01,  6.4664e-02, -2.3021e-03,
         -2.3642e-01,  1.9323e-02, -5.6472e-02, -3.0124e-01,  7.6608e-02,
         -5.2564e-02, -1.8292e-01, -2.9735e-02, -4.2790e-02,  1.3185e-01,
         -2.0286e-01, -2.9462e-02,  5.5448e-02, -6.3531e-02, -1.7999e-01,
          1.3329e-01, -1.4681e-01, -2.1744e-01, -1.8383e-01,  1.0304e-01,
          1.5663e-02,  5.6457e-02,  1.8697e-01, -9.9677e-02, -4.0687e-02,
         -1.5075e-01, -2.4930e-02,  1.6335e-01, -1.6228e-01, -9.4328e-03,
         -1.1788e-01,  2.7047e-01, -4.9756e-02,  2.8024e-02,  1.0957e-01,
          1.2552e-01,  4.3644e-03,  2.

In RNNCell,
    
    a single call processes one time step, so the loop over the time steps is written by hand

<a href="https://pytorch.org/docs/stable/nn.html?highlight=rnncell#torch.nn.RNNCell">PyTorch RNNCell reference</a>

## RNN for 1 training example

In [43]:
rnn = nn.RNN(len(vocab),100,batch_first=True)

In [44]:
indices = [vocab[token] if token in vocab else vocab['unk'] for token in tokens]

In [45]:
x = torch.zeros(len(indices),len(vocab))

x[range(len(indices)),indices]=1


In [46]:
x = x.expand((1,-1,len(vocab)))
x,x.shape

(tensor([[[0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]), torch.Size([1, 168, 1665]))

**Shape of h:-**
    
    h = (num_layers*num_directions, batch_size, hidden_units)
    
    where num_layers is an int parameter; a value greater than 1 builds a STACKED RNN

In [47]:
h = torch.zeros(1,1,100)

In [48]:
output,hidden = rnn(x,h)

In [49]:
output.shape,hidden.shape

(torch.Size([1, 168, 100]), torch.Size([1, 1, 100]))

**output** - hidden state at every time step, for every sequence in the batch

**hidden** - hidden state after the last time step, for every sequence in the batch

In [50]:
classifier = nn.Linear(100,2)

In [51]:
preds = classifier(hidden.view(1,100))
preds

tensor([[-0.0723,  0.1606]], grad_fn=<AddmmBackward>)

## RNN for m training examples

In [52]:
# One index list per training document; out-of-vocabulary tokens map to 'unk'.
indices = [[vocab.get(token, vocab['unk']) for token in doc] for doc in t]

In [53]:
crit = nn.CrossEntropyLoss()

In [54]:
optimizer = optim.SGD([{'params':rnn.parameters()},{'params':classifier.parameters()}],lr=0.01)

In [55]:
# One pass over the training documents, with a weight update after every single document.
for doc,y in zip(indices,labels):
    # One-hot encode the document: one row per token, with a 1 in the column of its vocabulary index.
    x = torch.zeros(len(doc),len(vocab))
    x[range(len(doc)),doc]=1
    # Add a leading batch dimension of size 1 (rnn was created with batch_first=True).
    x = x.expand((1,-1,len(vocab)))
    
    # Clear the gradients accumulated for the previous document.
    optimizer.zero_grad()
    
    # Start from an all-zero hidden state and classify from the final hidden state.
    h = torch.zeros(1,1,100)
    output,hidden = rnn(x,h)
    preds = classifier(hidden.view(1,100))
    
    # The string label is converted to its integer class through labels_index.
    loss = crit(preds,torch.LongTensor([labels_index[y]]))
    loss.backward()
    
    optimizer.step()
    
    # Loss of this single document.
    print(loss.item())
    

0.5834324359893799
0.5777077078819275
0.5672720670700073
0.81904137134552
0.572853684425354
0.5507466197013855
0.562406063079834
0.8033809661865234
0.8111679553985596
0.5684423446655273
0.7959633469581604
0.7790740132331848
0.7386037707328796
0.7636117339134216
0.6128301024436951
0.8154166340827942
0.6816398501396179
0.6763375401496887
0.6631404161453247
0.7713841795921326
0.671399712562561
0.7575178146362305
0.6831709146499634
0.7249537706375122
0.6409131288528442
0.6898464560508728
0.6790663003921509
0.7523068189620972
0.7011736035346985
0.6783504486083984
0.7271554470062256
0.6285293102264404
0.7757940888404846
0.5878118276596069
0.7632633447647095
0.7834609746932983
0.7288427352905273
0.67417973279953
0.7026206851005554
0.7295390963554382
0.7333039045333862
0.6966962814331055
0.7047434449195862
0.6778765916824341
0.7618573307991028
0.6927838921546936
0.6623178720474243
0.6760919094085693
0.7401516437530518
0.6507338881492615
0.6960970163345337
0.7483596205711365
0.71957927942276
0.

**Why loop over each document?**

    different documents have different lengths

## Batch-wise processing

**Why do we need padding?**
- different documents have different sequence lengths
- so flow will be as follows:<br>
      for doc in documents:
          for token in doc:
              pass to RNNCell
- the approach above is slow, because every document and every token is processed one at a time
- to process documents batch-wise, we make all documents the same length

    
    This is where padding comes in: every document shorter than the maximum length is extended with 0's until it reaches that length

In [65]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [66]:
sequence_lengths = [len(d) for d in indices]

In [67]:
# Number of tokens in the longest document of t.
max_length = max(len(doc) for doc in t)
max_length

372

In [68]:
# Right-pad every document with 0 (the index reserved for 'pad' in the vocabulary)
# until it holds max_length entries. The length test has to look at the individual
# document: comparing len(indices) — the number of documents — against max_length
# would skip padding entirely once the corpus holds max_length or more documents.
for i in range(len(indices)):
    if len(indices[i])<max_length:
        indices[i]+=[0]*(max_length-len(indices[i]))

In [69]:
X = torch.LongTensor(indices)
X.shape

torch.Size([100, 372])

In [70]:
Y = torch.LongTensor([labels_index[i] for i in labels])

In [80]:
class Sentiment(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier over the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table. When None, the size
            of the notebook-level ``vocab`` is used.
        emb_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        num_classes: number of scores produced per sequence.
    """
    def __init__(self, vocab_size=None, emb_dim=200, hidden_dim=100, num_classes=2):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(vocab)
        # Remembered so forward() reshapes with the configured size instead of a literal.
        self.hidden_dim = hidden_dim
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, h=None):
        """Score a batch of token-index sequences.

        Args:
            x: LongTensor of token indices, shape (batch, seq_len).
            h: initial hidden state, shape (1, batch, hidden_dim). When None,
                nn.RNN starts from an all-zero state.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised class scores.
        """
        x = self.emb(x)
        output, hidden = self.rnn(x, h)
        # hidden is (1, batch, hidden_dim); drop the layer dimension before classifying.
        return self.classifier(hidden.view(-1, self.hidden_dim))

In [81]:
model = Sentiment()

In [82]:
crit = nn.CrossEntropyLoss()

In [83]:
optimizer = optim.SGD(model.parameters(),lr=0.1)

In [84]:
bs=32

In [85]:
# Ten passes over the training tensors in consecutive mini-batches of bs rows (no shuffling).
for epoch in range(10):
    for i in range(0,X.shape[0],bs):
        xb = X[i:i+bs]
        yb = Y[i:i+bs]
        # Fresh all-zero initial hidden state sized to this batch; slicing can make
        # the last batch smaller than bs, hence xb.shape[0] rather than bs.
        h = torch.zeros(1,xb.shape[0],100)
        
        optimizer.zero_grad()
        preds = model(xb,h)
        loss = crit(preds,yb)
        loss.backward()
        optimizer.step()
        # Loss of this mini-batch (printed once per batch, not once per epoch).
        print(loss.item())
        

0.6912732124328613
0.6942989230155945
0.8087497353553772
1.220121145248413
2.2280969619750977
0.8051763772964478
0.702520489692688
0.6217736005783081
0.9866464138031006
0.8311755061149597
0.649505078792572
0.5851980447769165
0.8783285021781921
0.7267104387283325
0.6451024413108826
0.5826385021209717
0.8621660470962524
0.703143835067749
0.6480700373649597
0.5837917327880859
0.8536221981048584
0.6915607452392578
0.6502559185028076
0.5854310393333435
0.8465548753738403
0.6840533018112183
0.6515491008758545
0.5869928598403931
0.8407400846481323
0.6791639924049377
0.6526533961296082
0.5888698101043701
0.833543062210083
0.6746482849121094
0.6530837416648865
0.590402364730835
0.8273991346359253
0.6716400980949402
0.6534587740898132
0.5920885801315308


In [86]:
# Same cleaning stages, applied to the test reviews: HTML removal -> punctuation
# removal -> lower-cased tokenisation -> stop-word removal -> lemmas -> stems.
# Note that this rebinds t, which held the training tokens until this cell.
t = (
    test['review']
    .apply(remove_html)
    .apply(remove_punctuation)
    .apply(lambda x: tokenizer.tokenize(x.lower()))
    .apply(remove_stopwords)
    .apply(word_lemmatizer)
    .apply(word_stemmer)
)

In [90]:
# One index list per document in t, looked up in the existing vocab;
# tokens that are not in it map to 'unk'.
indices = [[vocab.get(token, vocab['unk']) for token in doc] for doc in t]

In [91]:
# Number of tokens in the longest document of t.
max_length = max(len(doc) for doc in t)
max_length

450

In [232]:
index_num = 43

In [233]:
T = torch.LongTensor([indices[index_num]])

In [234]:
TY = torch.LongTensor([labels_index[i] for i in test.sentiment])

In [235]:
h = torch.zeros(1,T.shape[0],100)

In [236]:
# Predicted class for the selected review. softmax is given an explicit dim:
# calling it without one relies on a deprecated implicit choice, and dim=1
# normalises over the class scores of each row. argmax(dim=1) then returns one
# class index per row instead of an index into the flattened tensor.
with torch.no_grad():
    print(F.softmax(model(T,h), dim=1).argmax(dim=1))

tensor(1)


tensor([[0.5665, 0.4335]], grad_fn=<SoftmaxBackward>)