# A notebook to build a NLP model from scratch

In [1]:
from fastai.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/Users/ramamurthi/.fastai/data/human_numbers/train.txt'),Path('/Users/ramamurthi/.fastai/data/human_numbers/valid.txt')]

## Combining train and valid for this particular usecase

In [63]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
print("Printing Lines")
print(lines[:100])

text = ' . '.join(l.strip() for l in lines)
print("Printing TEXT")
print(text[:100])

Printing Lines
['one \n', 'two \n', 'three \n', 'four \n', 'five \n', 'six \n', 'seven \n', 'eight \n', 'nine \n', 'ten \n', 'eleven \n', 'twelve \n', 'thirteen \n', 'fourteen \n', 'fifteen \n', 'sixteen \n', 'seventeen \n', 'eighteen \n', 'nineteen \n', 'twenty \n', 'twenty one \n', 'twenty two \n', 'twenty three \n', 'twenty four \n', 'twenty five \n', 'twenty six \n', 'twenty seven \n', 'twenty eight \n', 'twenty nine \n', 'thirty \n', 'thirty one \n', 'thirty two \n', 'thirty three \n', 'thirty four \n', 'thirty five \n', 'thirty six \n', 'thirty seven \n', 'thirty eight \n', 'thirty nine \n', 'forty \n', 'forty one \n', 'forty two \n', 'forty three \n', 'forty four \n', 'forty five \n', 'forty six \n', 'forty seven \n', 'forty eight \n', 'forty nine \n', 'fifty \n', 'fifty one \n', 'fifty two \n', 'fifty three \n', 'fifty four \n', 'fifty five \n', 'fifty six \n', 'fifty seven \n', 'fifty eight \n', 'fifty nine \n', 'sixty \n', 'sixty one \n', 'sixty two \n', 'sixty three \n', 'si

## Splitting the items by ' '  and assinging tokens to each and every word

In [61]:
tokens = text.split(' ')
tokens[1000:1020]
print(tokens)

['.',
 'two',
 'hundred',
 'fifty',
 'seven',
 '.',
 'two',
 'hundred',
 'fifty',
 'eight',
 '.',
 'two',
 'hundred',
 'fifty',
 'nine',
 '.',
 'two',
 'hundred',
 'sixty',
 '.']

## Getting Vocabs and Nums from tokens (Tokenization and Numericalization steps)

In [64]:
vocab = L(*tokens).unique()
print("Vocab",vocab[:10])

word2idx = {i:w for w,i in enumerate(vocab)}
nums = [i for i in tokens]
nums = L(word2idx[i] for i in tokens)
nums

Vocab ['one', '.', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']


(#63095) [0,1,2,1,3,1,4,1,5,1...]

## creating x, y tuples for train test examples

In [65]:

token_seq = L((tokens[i:i+3] , tokens[i+3]) for i in range(0,len(tokens)-4,3))
print(token_seq[1:5])
seqs = L((tensor(nums[i:i+3]) , tensor(nums[i+3])) for i in range(0,len(nums)-4,3))
print(seqs[1:5])

[(['.', 'three', '.'], 'four'), (['four', '.', 'five'], '.'), (['.', 'six', '.'], 'seven'), (['seven', '.', 'eight'], '.')]
[(tensor([1, 3, 1]), tensor(4)), (tensor([4, 1, 5]), tensor(1)), (tensor([1, 6, 1]), tensor(7)), (tensor([7, 1, 8]), tensor(1))]


## Creating DataLoaders using the x and y created above . Train , Valid is defined by using a cut of 80% on the whole dataset

In [66]:
bs = 64
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(seqs[:cut]  ## train 
                                ,seqs[cut:]  ## valid
                                ,bs = 64 
                                ,shuffle = False)

## Peak into the Datasets
x, y = first(dls.train)
print(x.shape , y.shape)
print(x[:3] , y[:3])


torch.Size([64, 3]) torch.Size([64])
tensor([[0, 1, 2],
        [1, 3, 1],
        [4, 1, 5]]) tensor([1, 4, 1])


In [67]:
## Creating Language Model1

class LMModel(Module):
    def __init__(self , vocab_sz , n_hidden):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.h_h = nn.Embedding(n_hidden , n_hidden)
        self.h_o = nn.Embedding(n_hidden , vocab_sz)
    
    def forward(self,input_x):
        
        ## Processing Unit for one word
        h = self.i_h(x[:,0])  # (bs,vocab_sz) -> (bs,n_hidden)
        h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
        h = F.ReLU(h) # Activation Function 
        
        ## Processing Unit for word 2
        h = h + self.i_h(x[:,1])
        h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
        h = F.ReLU(h) # Activation Function 
        
        ## Processing Unit for word 3
        h = h + self.i_h(x[:,1])
        h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
        h = F.ReLU(h) # Activation Function 
        
        ## Final Conversion to Vocab size for softmax to handle
        h = self.h_o(h)

        return h

In [68]:
learn = Learner(dls  
            ,LMModel(len(vocab),64) 
            ,loss_func=F.cross_entropy 
            ,metrics = accuracy )
learn.fit_one_cycle(4,1e-3)

epoch,train_loss,valid_loss,accuracy,time


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

## Baseline Model which is basically predicting the word with highest occurance irrespective of the order

In [74]:
n,counts = 0,torch.zeros(len(vocab))
for x,y in dls.train:
    n += y.shape[0] ## Total Number of values of y surfed
    for i in range_of(vocab): ## Running over all the indexes of vocab (i)
        counts[i] += (y==i).long().sum() ## Addign the count of occurance of a particular i for every y and i
idx = torch.argmax(counts)  ## Checking the index of max occurance
idx , vocab[idx.item()] , counts[idx].item()/n

(tensor(1), '.', 0.1598878816793893)

## Simplyfying the model with creation of unit  

In [76]:
## Simplified language model

class LMModel2(Module):
    def __init__(self , vocab_sz , n_hidden):
        self.i_h = nn.Embedding(vocab_sz , n_hidden)
        self.h_h = nn.Embedding(n_hidden , n_hidden)
        self.h_o = nn.Embedding(n_hidden , vocab_sz)
    
    def forward(self,input_x):
        
        h = 0
        num_untis = 3
        
        ## Repeat for 1->n-1
        for i in range(num_untis):
            h = h + self.i_h(x[:,num_untis])
            h = self.h_h(h)  # (bs,n_hidden) -> (bs,n_hidden)
            h = F.ReLU(h) # Activation Function 
        
        return self.h_o(h) ## Final Conversion to Vocab size for softmax to handle

# Improving RNNs