# A notebook to build a NLP model from scratch

In [None]:
## Run this if on colab 
# !pip install fastai

In [1]:
from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/Users/ramamurthi/.fastai/data/human_numbers/train.txt'),Path('/Users/ramamurthi/.fastai/data/human_numbers/valid.txt')]

## Combining train and valid for this particular usecase

In [63]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
print("Printing Lines")
print(lines[:100])

text = ' . '.join(l.strip() for l in lines)
print("Printing TEXT")
print(text[:100])

Printing Lines
['one \n', 'two \n', 'three \n', 'four \n', 'five \n', 'six \n', 'seven \n', 'eight \n', 'nine \n', 'ten \n', 'eleven \n', 'twelve \n', 'thirteen \n', 'fourteen \n', 'fifteen \n', 'sixteen \n', 'seventeen \n', 'eighteen \n', 'nineteen \n', 'twenty \n', 'twenty one \n', 'twenty two \n', 'twenty three \n', 'twenty four \n', 'twenty five \n', 'twenty six \n', 'twenty seven \n', 'twenty eight \n', 'twenty nine \n', 'thirty \n', 'thirty one \n', 'thirty two \n', 'thirty three \n', 'thirty four \n', 'thirty five \n', 'thirty six \n', 'thirty seven \n', 'thirty eight \n', 'thirty nine \n', 'forty \n', 'forty one \n', 'forty two \n', 'forty three \n', 'forty four \n', 'forty five \n', 'forty six \n', 'forty seven \n', 'forty eight \n', 'forty nine \n', 'fifty \n', 'fifty one \n', 'fifty two \n', 'fifty three \n', 'fifty four \n', 'fifty five \n', 'fifty six \n', 'fifty seven \n', 'fifty eight \n', 'fifty nine \n', 'sixty \n', 'sixty one \n', 'sixty two \n', 'sixty three \n', 'si

## Splitting the items by ' '  and assinging tokens to each and every word

In [61]:
tokens = text.split(' ')
tokens[1000:1020]
print(tokens)

['.',
 'two',
 'hundred',
 'fifty',
 'seven',
 '.',
 'two',
 'hundred',
 'fifty',
 'eight',
 '.',
 'two',
 'hundred',
 'fifty',
 'nine',
 '.',
 'two',
 'hundred',
 'sixty',
 '.']

## Getting Vocabs and Nums from tokens (Tokenization and Numericalization steps)

In [64]:
vocab = L(*tokens).unique()
print("Vocab",vocab[:10])

word2idx = {i:w for w,i in enumerate(vocab)}
nums = [i for i in tokens]
nums = L(word2idx[i] for i in tokens)
nums

Vocab ['one', '.', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']


(#63095) [0,1,2,1,3,1,4,1,5,1...]

## creating x, y tuples for train test examples

In [65]:

token_seq = L((tokens[i:i+3] , tokens[i+3]) for i in range(0,len(tokens)-4,3))
print(token_seq[1:5])
seqs = L((tensor(nums[i:i+3]) , tensor(nums[i+3])) for i in range(0,len(nums)-4,3))
print(seqs[1:5])

[(['.', 'three', '.'], 'four'), (['four', '.', 'five'], '.'), (['.', 'six', '.'], 'seven'), (['seven', '.', 'eight'], '.')]
[(tensor([1, 3, 1]), tensor(4)), (tensor([4, 1, 5]), tensor(1)), (tensor([1, 6, 1]), tensor(7)), (tensor([7, 1, 8]), tensor(1))]


## Creating DataLoaders using the x and y created above . Train , Valid is defined by using a cut of 80% on the whole dataset

In [66]:
bs = 64
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(seqs[:cut]  ## train 
                                ,seqs[cut:]  ## valid
                                ,bs = 64 
                                ,shuffle = False)

## Peak into the Datasets
x, y = first(dls.train)
print(x.shape , y.shape)
print(x[:3] , y[:3])


torch.Size([64, 3]) torch.Size([64])
tensor([[0, 1, 2],
        [1, 3, 1],
        [4, 1, 5]]) tensor([1, 4, 1])


In [67]:
## Creating Language Model1

class LMModel(Module):
    def __init__(self , vocab_sz , n_hidden):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.h_h = nn.Embedding(n_hidden , n_hidden)
        self.h_o = nn.Embedding(n_hidden , vocab_sz)
    
    def forward(self,input_x):
        
        ## Processing Unit for one word
        h = self.i_h(x[:,0])  # (bs,vocab_sz) -> (bs,n_hidden)
        h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
        h = F.ReLU(h) # Activation Function 
        
        ## Processing Unit for word 2
        h = h + self.i_h(x[:,1])
        h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
        h = F.ReLU(h) # Activation Function 
        
        ## Processing Unit for word 3
        h = h + self.i_h(x[:,1])
        h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
        h = F.ReLU(h) # Activation Function 
        
        ## Final Conversion to Vocab size for softmax to handle
        h = self.h_o(h)

        return h

In [68]:
learn = Learner(dls  
            ,LMModel(len(vocab),64) 
            ,loss_func=F.cross_entropy 
            ,metrics = accuracy )
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

## Baseline Model which is basically predicting the word with highest occurance irrespective of the order

In [74]:
n,counts = 0,torch.zeros(len(vocab))
for x,y in dls.train:
    n += y.shape[0] ## Total Number of values of y surfed
    for i in range_of(vocab): ## Running over all the indexes of vocab (i)
        counts[i] += (y==i).long().sum() ## Addign the count of occurance of a particular i for every y and i
idx = torch.argmax(counts)  ## Checking the index of max occurance
idx , vocab[idx.item()] , counts[idx].item()/n

(tensor(1), '.', 0.1598878816793893)

## Simplyfying the model with creation of unit  

In [76]:
## Simplified language model

class LMModel2(Module):
    def __init__(self , vocab_sz , n_hidden):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.h_h = nn.Embedding(n_hidden , n_hidden)
        self.h_o = nn.Embedding(n_hidden , vocab_sz)
    
    def forward(self,input_x):
        
        h = 0
        num_untis = 3
        
        ## Repeat for 1->n-1
        for i in range(num_untis):
            h = h + self.i_h(x[:,num_untis])
            h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
            h = F.ReLU(h) # Activation Function 
        
        return self.h_o(h) ## Final Conversion to Vocab size for softmax to handle

# Improving RNNs

Two main drawbacks of the simplified model created above , 
1. We are initializing the hidden state as zero after every batch , throwing away info we had so far about the sentences . 
    - This can be solved by just initializing h = 0 in __init__ rather than in forward . 
2. But if we do this , this creates another problem . Suppose if there are 10000 layers in total , when doing backprop for the 10000th entry , we have to calculate gradients for all the 10000 layers (all the way back to first layer) since we are reusing h from the previous result . 
    - To solve this , we'll keep the back prop to the last 3 layers of an entry and remove requiers_grad() from the previous layers when doing back_prop

In [77]:
## Simplified language model

class LMModel3(Module):
    def __init__(self , vocab_sz , n_hidden):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.h_h = nn.Embedding(n_hidden , n_hidden)
        self.h_o = nn.Embedding(n_hidden , vocab_sz)
        self.h = 0
    
    def forward(self,input_x):
        
        seq_length = 3
        
        ## Repeat for 1->n-1
        for i in range(seq_length):
            h = h + self.i_h(x[:,seq_length])
            h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
            h = F.ReLU(h) # Activation Function 
        out = self.h_o(h) ## Final Conversion to Vocab size for softmax to handle
        
        # Note : We are Detaching gradients from h only after the computation of output , Which means that for this current iteration of out , gradients will still be calculated for the for seq_length layers . 
        
        self.h = self.h.detach() ## Remove all the gradients associated with h and just keeps the value (Sort of acting like a fixed bias)
        return out
    
    def reset(self):
        self.h = 0

By this way , The gradients are computed only for last 3 layers / seq_length defined inside the forward function . This approach is called back_prop through time 

In [83]:
## Splitting the data into bacthes and sequences 
m = len(seqs)//bs
print(m , bs , len(seqs))
seqs

## m -> no of batches by this logic
# First Batch - (0 , m ,2m ,3m ,... ,(bs-1)m )
# Second Batch - (1 , m+1 ,2m+1 ,3m+1 ,... ,(bs-1)m+1 )
# ..
# mth bacth (m-1 , 2m -1 , 3m -1 , ..  )

## Function to create the dataset desctribe above :

def group_chunks( ds , bs):
    m = len(ds)//bs
    new_ds = L()
    for i in range(m):
        new_ds += L(ds[(j*m + i)] for j in range(bs))
    return  new_ds

## We don't care much about the last batch in this case , So we just drop the last bacth since it doesn't match with the batch_size . 
        
bs = 64
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(seqs[:cut]  ## train 
                                ,seqs[cut:]  ## valid
                                ,bs = 64 
                                ,drop_last = True ## Dropping the end batch for sake of simplicity
                                ,shuffle = False)




328 64 21031


(#21031) [(tensor([0, 1, 2]), tensor(1)),(tensor([1, 3, 1]), tensor(4)),(tensor([4, 1, 5]), tensor(1)),(tensor([1, 6, 1]), tensor(7)),(tensor([7, 1, 8]), tensor(1)),(tensor([1, 9, 1]), tensor(10)),(tensor([10,  1, 11]), tensor(1)),(tensor([ 1, 12,  1]), tensor(13)),(tensor([13,  1, 14]), tensor(1)),(tensor([ 1, 15,  1]), tensor(16))...]

In [None]:
learn = Learner(dls  
            ,LMModel3(len(vocab),64) 
            ,loss_func=F.cross_entropy 
            ,metrics = accuracy
            ,cbs = ModelResetter ) ## Used for callbacks (reset) function after every epoch and also at the start of validation
learn.fit_one_cycle(4,1e-3)

So Far , We have used a framework for predicting a word given it's previous 3 words .We could also predict the next word at the end of every unit and use this signal 

Old Framework : tensor[1,2,3] -> tensor[4] <br>
New framework : tensor[1,2,3] -> tensor[2,3,4]

In [99]:
seq_length = 16
input_list = [tensor(nums[i:i+seq_length]) for i in range(0,len(nums)-seq_length -1 , seq_length)]
output_list = [tensor(nums[i+1:i+seq_length+1]) for i in range(0,len(nums)-seq_length -1 , seq_length)]

len(output_list) , len(input_list)
seqs = L([(input_list[i],output_list[i]) for i in range(len(input_list))])
seqs[:10]

cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(seqs[:cut] 
                            ,seqs[cut:]
                            ,bs = bs
                            ,drop_last = True
                            ,shuffle = False
                            )



In [114]:
seqs[0]
[L(vocab[i.item()] for i in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

## New Freamwork Model 

In [115]:
## Simplified language model

class LMModel4(Module):
    def __init__(self , vocab_sz , n_hidden):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.h_h = nn.Linear(n_hidden , n_hidden)
        self.h_o = nn.Linear(n_hidden , vocab_sz)
        self.h = 0
    
    def forward(self,input_x):
        
        # seq_length = 3
        outs = []
        
        ## Repeat for 1->n-1
        for i in range(seq_length):
            h = h + self.i_h(x[:,i])
            h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
            h = F.relu(h) # Activation Function 
            outs.append(self.h_o(self.h))  ## Appending outputs at each layer
        
        # Note : We are Detaching gradients from h only after the computation of output , Which means that for this current iteration of out , gradients will still be calculated for the for seq_length layers . 
        
        self.h = self.h.detach() ## Remove all the gradients associated with h and just keeps the value (Sort of acting like a fixed bias) after each seq_length

        return torch.stack(outs,dim=1)
    
    def reset(self):
        self.h = 0

In [116]:
def flattended_loss(inp , targ):
    F.cross_entropy(inp.view(-1,len(vocab)) , targ.view(-1))

In [None]:
learn = Learner(dls
                ,LMModel(len(vocab),64)
                ,loss_func = flattended_loss
                ,metrics  = accuracy
                ,cbs = ModelResetter
                )
learn.fit_one_cycle(15,3e-3)

# Multilayer RNNs

We pass activations from our RNN to second RNN (Which almost looks parallel to the first RNN ) which then provides the output required . 

In [None]:
## Multilayerd language model

class LMModel5(Module):
    def __init__(self , vocab_sz , n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.rnn = nn.RNN(n_hidden , n_hidden , n_layers , batch_first = True)
        self.h_o = nn.Linear(n_hidden , vocab_sz)
        self.h = torch.zeros(n_layers , bs , n_hidden)
    
    def forward(self,input_x):
        
        res,h = self.rnn(self.i_h(x) , self.h)
        self.h = h.detach()
        return self.h_o(res)

    
    def reset(self):
        self.h = 0

## LSTM 

What is the need for LSTM ? 

Ex1 : "Henry has a dog and he likes his dog very much"
Ex2 : "Sophie has a dog and she likes her dog very much"

RNN needs to remember The association between  the name with he/she and his/her.

RNNs are really bad at retaining memory of what happened earlier in the sentence , which is a motivation to have another hidden state (Called cell state) in LSTM

## Build LSTM from scratch

## Synatx to replace the loops with LSTM - nn.LSTM(n_hidden_layers , n_hidden , n_layers , batch_first = True)

In [None]:
## Multilayerd language model

class LMModel6(Module):
    def __init__(self , vocab_sz , n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz , n_hidden) ## Embedding Layer
        self.rnn = nn.LSTM(n_hidden , n_hidden , n_layers , batch_first = True) ## LSTM
        self.h_o = nn.Linear(n_hidden , vocab_sz)
        self.h = [torch.zeros(n_layers , bs , n_hidden) for _ in range(2)] ## Hidden states
    
    def forward(self,input_x):
        
        res,h = self.rnn(self.i_h(x) , self.h)
        self.h = [h_.detach() for h_ in h] ## Since it's for dim 2
        return self.h_o(res)

    def reset(self):
        self.h = 0

learn = Learner(dls  
            ,LMModel6(len(vocab),64 , 2) ## Inclusion of n_layers
            ,loss_func= CrossEntropyLossFlat() ## Similar to what we had defined above
            ,metrics = accuracy
            ,cbs = ModelResetter ) ## Used for callbacks (reset) function after every epoch and also at the start of validation

learn.fit_one_cycle(15,1e-2)

## Overfitting in LSTMs 

### 3 main ways to avoid overfitting using Regularization techniques
1. Using dropouts (randomly deleting acivations of nodes using a probability)
2. Activation Regularization -> loss += alpha * (acivations).pow(2).mean()
3. Temporal Activaton Regularization -> loss += beta * ((acivations[:,1:]) - (acivations[:,:-1]).pow(2).mean()
