In [1]:
import warnings; warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [3]:
from pathlib import Path
import matplotlib.pyplot as plt

In [4]:
import numpy as np
import pandas as pd

In [5]:
path = Path('C:/Users/sappusamy/Documents/SriWK/datasets/SPAM.csv')

In [6]:
dataset = pd.read_csv(path)

In [10]:
dataset.columns

Index(['Category', 'Message'], dtype='object')

In [11]:
dataset.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [21]:
ham = dataset[dataset.Category=='ham']

In [23]:
spam = dataset[dataset.Category=='spam']

In [32]:
train = ham[:50]
train = train.append(spam[:50])
train = train.sample(frac=1,random_state=42).reset_index(drop=True)

In [34]:
test = ham[50:100]
test = test.append(spam[50:100])
test = test.sample(frac=1,random_state=42).reset_index(drop=True)

## NLP: Cleaning & Preprocessing text
- Remove HTML
- Tokenization + Remove punctuation
- Remove stopwords
- Lemmatization or stemming

In [37]:
text = train.Message[0]

In [38]:
stop_words = stopwords.words('english')

In [39]:
def remove_html(text):
    return BeautifulSoup(text,'lxml').get_text()

In [40]:
def remove_punctuation(text):
    return "".join([c for c in text if c not in string.punctuation])

In [41]:
def remove_stopwords(text):
    return [w for w in text if w not in stop_words]

In [42]:
def word_lemmatizer(text):
    return [lemmatizer.lemmatize(i) for i in text]

In [43]:
def word_stemmer(text):
    return [stemmer.stem(i) for i in text]

In [44]:
tokenizer = RegexpTokenizer(r'\w+')

In [45]:
lemmatizer = WordNetLemmatizer()

In [46]:
stemmer = PorterStemmer()

In [47]:
t = train['Message'].apply(remove_html)

t = t.apply(remove_punctuation)

t = t.apply(lambda x: tokenizer.tokenize(x.lower()))

t = t.apply(remove_stopwords)

t = t.apply(word_lemmatizer)

t = t.apply(word_stemmer)

In [51]:
labels = train['Category']

set(labels)

{'ham', 'spam'}

In [52]:
labels_index = {'ham':0,'spam':1}

#### Represent each word in ONE-HOT encoding
- as of now, for reducing vector length of input we set min frequency of token to be **SOME INT VALUE**

In [53]:
def token_frequency(l):
    count = {}
    for tokens in l:
        for token in tokens:
            if token not in count:
                count[token]=1
            else:
                count[token]+=1
    return count

In [54]:
tf = token_frequency(t)

In [55]:
sorted(tf.items(),key=lambda x:x[1],reverse=True)[:10]

[('u', 23),
 ('call', 21),
 ('2', 17),
 ('go', 16),
 ('free', 16),
 ('ur', 14),
 ('mobil', 13),
 ('text', 12),
 ('txt', 11),
 ('claim', 11)]

In [60]:
min_freq = 1

In [61]:
def build_vocabulary(tf):
    v = {}
    v['pad']=0
    v['unk']=1
    for token in tf:
        if tf[token]>=min_freq:
            v[token] = len(v)
    return v

In [62]:
vocab = build_vocabulary(tf)

In [63]:
len(vocab)

749

## Let's convert tokens to index and make vector for each word in a review

In [64]:
tokens = t[0]
tokens[:10],len(tokens)

(['500',
  'new',
  'mobil',
  '2004',
  'must',
  'go',
  'txt',
  'nokia',
  '89545',
  'collect'],
 15)

In [65]:
indices = [vocab[token] if token in vocab else vocab['unk'] for token in tokens]
len(indices)

15

## Let's convert these indices to one-hot vectors
- shape of input will be sequence_length * vocab_size

In [66]:
x = torch.zeros(len(indices),len(vocab))

In [67]:
x[range(len(indices)),indices]=1

In [68]:
x,x.shape

(tensor([[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]), torch.Size([15, 749]))

## Equation of RNN:
\begin{equation*}
h_t = tanh( W_{ih}X_t + b_{ih} + W_{hh}h_{t-1} + b_{hh})
\end{equation*}

In [69]:
rnn_cell = nn.RNNCell(len(vocab),100)

In [70]:
for name,p in rnn_cell.named_parameters():
    print(name,p.shape)

weight_ih torch.Size([100, 749])
weight_hh torch.Size([100, 100])
bias_ih torch.Size([100])
bias_hh torch.Size([100])


In [71]:
h = torch.zeros(1,100)

In [72]:
h

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]])

In [73]:
for i in range(len(indices)):
    h = rnn_cell(x[i].view(1,-1),h)

In [74]:
print("**value of hidden vector after iterating over all input time steps**")
h

**value of hidden vector after iterating over all input time steps**


tensor([[-4.4142e-02,  9.4197e-02, -1.6878e-01,  2.6354e-01, -1.3140e-01,
         -1.4577e-02, -1.4490e-02,  3.5777e-02,  2.0831e-01, -1.4552e-01,
          9.4687e-02, -7.8661e-02, -7.1842e-02,  3.2611e-02, -2.0414e-01,
          1.8936e-02,  1.5348e-01,  9.2606e-03, -8.3286e-02, -6.1743e-02,
         -9.1368e-03,  1.8880e-01, -7.9705e-02,  2.1991e-01,  1.6692e-01,
          1.0081e-01,  2.4821e-02, -1.1609e-01, -1.7894e-01, -1.9801e-02,
         -4.5523e-01,  1.4640e-01,  1.5998e-01, -1.4366e-02,  1.9816e-01,
         -1.2855e-01, -6.1894e-02,  1.4198e-01, -3.9010e-02,  2.3670e-01,
          5.8060e-02,  1.4338e-01, -2.4463e-02, -1.0425e-01,  2.8902e-02,
          8.1776e-02,  1.4509e-01,  1.9462e-01, -5.3610e-02, -6.8339e-02,
          3.9568e-02, -1.8148e-02, -5.5171e-02, -1.2315e-02, -1.8912e-01,
          1.4017e-01, -2.3654e-01,  1.2702e-01,  8.6424e-02,  1.0395e-01,
         -9.0624e-02, -7.0265e-02, -6.5712e-02, -1.3446e-01, -1.1549e-01,
          2.5833e-01,  9.5647e-02, -6.

In RNNCell,
    
    inputs are looped over each time steps

<a href="https://pytorch.org/docs/stable/nn.html?highlight=rnncell#torch.nn.RNNCell">PyTorch RNNCell reference</a>

## RNN for 1 training example

In [75]:
rnn = nn.RNN(len(vocab),100,batch_first=True)

In [76]:
indices = [vocab[token] if token in vocab else vocab['unk'] for token in tokens]

In [77]:
x = torch.zeros(len(indices),len(vocab))

x[range(len(indices)),indices]=1


In [78]:
x = x.expand((1,-1,len(vocab)))
x,x.shape

(tensor([[[0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]), torch.Size([1, 15, 749]))

**Shape of h:-**
    
    h = (num_layers*num_directions, bacth_size, hidden_units)
    
    where num_layers parameter take int value as inputs which build ups STACKED RNN

In [79]:
h = torch.zeros(1,1,100)

In [80]:
output,hidden = rnn(x,h)

In [81]:
output.shape,hidden.shape

(torch.Size([1, 15, 100]), torch.Size([1, 1, 100]))

**output** - output of all time steps of all batches

**hidden** - output of last time step of all batches

In [82]:
classifier = nn.Linear(100,2)

In [83]:
preds = classifier(hidden.view(1,100))
preds

tensor([[-0.0198,  0.0784]], grad_fn=<AddmmBackward>)

## RNN for m training examples

In [84]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [85]:
crit = nn.CrossEntropyLoss()

In [86]:
optimizer = optim.SGD([{'params':rnn.parameters()},{'params':classifier.parameters()}],lr=0.01)

In [87]:
for doc,y in zip(indices,labels):
    x = torch.zeros(len(doc),len(vocab))
    x[range(len(doc)),doc]=1
    x = x.expand((1,-1,len(vocab)))
    
    optimizer.zero_grad()
    
    h = torch.zeros(1,1,100)
    output,hidden = rnn(x,h)
    preds = classifier(hidden.view(1,100))
    
    loss = crit(preds,torch.LongTensor([labels_index[y]]))
    loss.backward()
    
    optimizer.step()
    
    print(loss.item())
    

0.6452575325965881
0.6073843240737915
0.5250005125999451
0.8167086839675903
0.8078009486198425
0.7634955644607544
0.7709652781486511
0.6961570978164673
0.7559258937835693
0.7491001486778259
0.6903864145278931
0.7111412286758423
0.7321428656578064
0.6743751168251038
0.7143977880477905
0.6818093061447144
0.7547265291213989
0.6565196514129639
0.7174919247627258
0.7241765260696411
0.7170804738998413
0.668822169303894
0.7855167984962463
0.7120293378829956
0.7420763969421387
0.6734651327133179
0.6923825144767761
0.6842448115348816
0.6697331070899963
0.7355749011039734
0.6599501967430115
0.6665076613426208
0.7174286842346191
0.6172147989273071
0.731817901134491
0.6726245284080505
0.6945776343345642
0.7054689526557922
0.682533860206604
0.6886274814605713
0.6347139477729797
0.61833655834198
0.6650079488754272
0.8082593679428101
0.6154588460922241
0.5872073769569397
0.7802671790122986
0.613354504108429
0.8168739676475525
0.6534721851348877
0.6413669586181641
0.6010854840278625
0.5520371794700623

**Why loop over each document?**

    different documents are of differnt lengths

## Batch-wise processing

**Why do we need padding?**
- sequence length of different documents will be of different length
- so flow will be as follows:<br>
      for doc in documents:
          for token in doc:
              pass to RNNCell
- the above methods has huge complexity
- in order to process in batch-wise, we make all documents length to be of same length

    
    Thus padding comes in, which means pads 0's to documents which has length less than max length

In [88]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [89]:
sequence_lengths = [len(d) for d in indices]

In [90]:
max_length = len(max(t,key=lambda x:len(x)))
max_length

26

In [91]:
X = torch.zeros(len(indices),max_length,len(vocab))

In [92]:
X.shape

torch.Size([100, 26, 749])

In [93]:
for i in range(len(indices)):
    X[[i],range(len(indices[i])),indices[i]]=1

In [94]:
Y = torch.LongTensor([labels_index[i] for i in labels])

In [95]:
class Sentiment(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.RNN(len(vocab),100,batch_first=True)
        self.classifier = nn.Linear(100,2)
        
    def forward(self,x,h):
        output,hidden = self.rnn(x,h)
        return self.classifier(hidden.view(-1,100))

In [96]:
model = Sentiment()

In [97]:
crit = nn.CrossEntropyLoss()

In [98]:
optimizer = optim.SGD(model.parameters(),lr=0.1)

In [99]:
bs=16

In [100]:
for epoch in range(2):
    for i in range(0,X.shape[0],bs):
        xb = X[i:i+bs]
        yb = Y[i:i+bs]
        h = torch.zeros(1,xb.shape[0],100)
        
        optimizer.zero_grad()
        preds = model(xb,h)
        loss = crit(preds,yb)
        loss.backward()
        optimizer.step()
        print(loss.item())
        

0.7189193367958069
0.6982823610305786
0.695207953453064
0.6919353604316711
0.7174856662750244
0.6895442605018616
0.6674919128417969
0.7306748628616333
0.7034782767295837
0.7022602558135986
0.6942461133003235
0.7063891291618347
0.6883792281150818
0.6624077558517456


## Test

In [102]:
t = test['Message'].apply(remove_html)

t = t.apply(remove_punctuation)

t = t.apply(lambda x: tokenizer.tokenize(x.lower()))

t = t.apply(remove_stopwords)

t = t.apply(word_lemmatizer)

t = t.apply(word_stemmer)

In [103]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [104]:
max_length = len(max(t,key=lambda x:len(x)))
max_length

26

In [107]:
TY = torch.LongTensor([labels_index[i] for i in test.Category])

In [108]:
predictions=[]
for index_num in range(len(test)):

    T = torch.zeros(len(indices[index_num]),len(vocab))
    T[range(len(indices[index_num])),indices[index_num]]=1
    T = T.expand((1,-1,len(vocab)))

    h = torch.zeros(1,T.shape[0],100)
    with torch.no_grad():
        predictions.append(F.softmax(model(T,h)).argmax().item())

In [109]:
predictions = torch.LongTensor(predictions)

In [110]:
predictions

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1])

In [111]:
"Accuracy:",(predictions==TY).float().mean()

('Accuracy:', tensor(0.5000))

In [112]:
"Number of positives predicted",(predictions==1).sum()

('Number of positives predicted', tensor(100))

In [113]:
"Number of negatives predicted",(predictions==0).sum()

('Number of negatives predicted', tensor(0))

## Use nn.Embedding

In [147]:
t = train['Message'].apply(remove_html)

t = t.apply(remove_punctuation)

t = t.apply(lambda x: tokenizer.tokenize(x.lower()))

t = t.apply(remove_stopwords)

t = t.apply(word_lemmatizer)

t = t.apply(word_stemmer)

In [148]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

In [149]:
max_length = len(max(t,key=lambda x:len(x)))
max_length

26

In [150]:
for i in range(len(indices)):
    if len(indices[i])<max_length:
        indices[i]+=[0]*(max_length-len(indices[i]))

In [151]:
X = torch.LongTensor(indices)
X.shape

torch.Size([100, 26])

In [152]:
Y = torch.LongTensor([labels_index[i] for i in labels])

In [153]:
class Sentiment(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(len(vocab),200,padding_idx=0)
        self.rnn = nn.RNN(200,100,batch_first=True)
        self.classifier = nn.Linear(100,2)
        
    def forward(self,x,h):
        x = self.emb(x)
        output,hidden = self.rnn(x,h)
        return self.classifier(hidden.view(-1,100))

In [154]:
model = Sentiment()

crit = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(),lr=0.1)

bs=32

for epoch in range(10):
    for i in range(0,X.shape[0],bs):
        xb = X[i:i+bs]
        yb = Y[i:i+bs]
        h = torch.zeros(1,xb.shape[0],100)
        
        optimizer.zero_grad()
        preds = model(xb,h)
        loss = crit(preds,yb)
        loss.backward()
        optimizer.step()
        print(loss.item())
        

0.6892282366752625
0.6740912795066833
0.7422855496406555
0.7331304550170898
0.701262354850769
0.6843129396438599
0.7027245163917542
0.6879247426986694
0.7143274545669556
0.6926295757293701
0.6863281726837158
0.6673398017883301
0.7220810651779175
0.6965189576148987
0.6788063645362854
0.6574094295501709
0.7253758907318115
0.697486400604248
0.6749976277351379
0.6522319912910461
0.7261216640472412
0.6970082521438599
0.6728103756904602
0.6491697430610657
0.7256330847740173
0.6959238648414612
0.6713466048240662
0.6470130681991577
0.7245992422103882
0.6946271061897278
0.6702021956443787
0.6451825499534607
0.7233366966247559
0.6932797431945801
0.6691867709159851
0.6433745622634888
0.7219727635383606
0.6919354796409607
0.6682024598121643
0.641399085521698


## Test

In [155]:
t = test['Message'].apply(remove_html)

t = t.apply(remove_punctuation)

t = t.apply(lambda x: tokenizer.tokenize(x.lower()))

t = t.apply(remove_stopwords)

t = t.apply(word_lemmatizer)

t = t.apply(word_stemmer)

In [156]:
indices = [[vocab[token] if token in vocab else vocab['unk'] for token in doc] for doc in t]

max_length = len(max(t,key=lambda x:len(x)))
max_length

26

In [157]:
TY = torch.LongTensor([labels_index[i] for i in test.Category])

In [158]:
predictions = []
for index_num in range(len(test)):
    T = torch.LongTensor([indices[index_num]])
    h = torch.zeros(1,T.shape[0],100)
    with torch.no_grad():
        predictions.append(F.softmax(model(T,h)).argmax().item())

In [159]:
predictions = torch.LongTensor(predictions)

In [160]:
(predictions==TY).float().mean()

tensor(0.6200)

In [161]:
tp = ((TY==1)*(predictions==1)).sum().item()

tn = ((TY==0)*(predictions==0)).sum().item()

fn = ((TY==1)*(predictions==0)).sum().item()

fp = ((TY==0)*(predictions==1)).sum().item()
tp,tn,fp,tn

(40, 22, 28, 22)

#### Postive label - measures

In [162]:
print("Precision:",tp/(tp+fp))

print("Recall:",tp/(tp+fn))

Precision: 0.5882352941176471
Recall: 0.8


#### Negative label - measures

In [163]:
print("Precision:",tn/(tn+fn))

print("Recall:",tn/(tn+fp))

Precision: 0.6875
Recall: 0.44


In [164]:
model.rnn.weight_hh_l0.grad

tensor([[ 4.6326e-03,  1.5805e-03, -1.4267e-03,  ...,  8.2292e-04,
          1.9990e-04, -1.5737e-03],
        [-6.7638e-03, -6.2903e-03,  4.6110e-03,  ..., -4.1842e-03,
          3.7333e-04,  5.3610e-03],
        [-4.1368e-03, -3.2653e-03,  5.4916e-03,  ..., -1.7893e-03,
         -2.3519e-03,  1.1520e-03],
        ...,
        [-3.7088e-03, -1.7957e-03,  3.3921e-04,  ..., -4.3912e-03,
          1.9419e-04,  3.0356e-03],
        [ 1.9071e-03,  1.0396e-03, -3.5588e-04,  ...,  8.1504e-04,
         -3.8207e-04, -1.1678e-03],
        [-9.9614e-04, -8.3925e-04,  5.0913e-04,  ...,  1.4241e-04,
          2.8769e-06,  4.0479e-04]])

In [165]:
model.rnn.weight_ih_l0.grad

tensor([[-5.4073e-04,  1.8684e-04,  1.5433e-04,  ...,  2.6987e-04,
          7.6458e-04,  1.3815e-04],
        [ 4.5654e-04, -3.4279e-04, -4.9347e-05,  ..., -2.4360e-04,
         -4.8947e-04,  1.4968e-04],
        [-5.1119e-05, -3.0856e-04,  9.5631e-04,  ..., -2.2593e-04,
          8.7920e-04,  6.2837e-04],
        ...,
        [ 6.5464e-04, -4.3197e-04, -2.6080e-04,  ..., -2.6342e-04,
         -8.8494e-04,  2.9544e-05],
        [-2.3705e-04,  2.8603e-04,  2.5581e-04,  ...,  3.4914e-05,
          5.2554e-04, -4.6989e-05],
        [ 9.4197e-05,  1.0290e-04, -3.7651e-04,  ...,  8.6780e-05,
         -6.4274e-04, -3.7865e-04]])