<a href="https://colab.research.google.com/github/steimel60/ML/blob/main/DeepLearning/LanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.text.all import *

[K     |████████████████████████████████| 719 kB 5.0 MB/s 
[K     |████████████████████████████████| 197 kB 49.4 MB/s 
[K     |████████████████████████████████| 4.2 MB 48.2 MB/s 
[K     |████████████████████████████████| 346 kB 59.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 54.7 MB/s 
[K     |████████████████████████████████| 60 kB 7.2 MB/s 
[K     |████████████████████████████████| 212 kB 64.1 MB/s 
[K     |████████████████████████████████| 86 kB 3.4 MB/s 
[K     |████████████████████████████████| 1.1 MB 52.4 MB/s 
[K     |████████████████████████████████| 86 kB 4.7 MB/s 
[K     |████████████████████████████████| 140 kB 69.4 MB/s 
[K     |████████████████████████████████| 596 kB 53.5 MB/s 
[K     |████████████████████████████████| 127 kB 61.6 MB/s 
[K     |████████████████████████████████| 144 kB 60.9 MB/s 
[K     |████████████████████████████████| 271 kB 84.4 MB/s 
[K     |████████████████████████████████| 94 kB 1.1 MB/s 
[K     |████████████████████████

In [2]:
# Download and extract the HUMAN_NUMBERS dataset (numbers written out as English words),
# then list what the extracted folder contains.
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/root/.fastai/data/human_numbers/valid.txt'),Path('/root/.fastai/data/human_numbers/train.txt')]

In [3]:
# Gather the raw lines of both files into one list: training file first, then validation.
lines = L()
for fname in ('train.txt', 'valid.txt'):
  with open(path/fname) as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [4]:
# Strip the trailing whitespace/newline from every line, then join everything into a
# single stream in which ' . ' marks the boundary between consecutive numbers.
stripped = [line.strip() for line in lines]
text = ' . '.join(stripped)
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [5]:
# Tokenize: split the stream on single spaces, so every word and every '.' separator
# becomes its own token.
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [6]:
# Build the vocabulary: the list of distinct tokens.
# A token's position in this list is what later serves as its integer id.
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [7]:
# Numericalize: look up each token's position in the vocab and replace the token with it.
word2idx = {word: idx for idx, word in enumerate(vocab)}
nums = L(word2idx[tok] for tok in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [8]:
# Each sample pairs 3 consecutive tokens (the input) with the token that follows them (the target).
# The last valid start index is len(tokens)-4, so the exclusive range stop must be len(tokens)-3;
# stopping at len(tokens)-4 would drop the final sample whenever it falls exactly on the stride.
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-3,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [9]:
# Same samples as token ids: a tensor of 3 ids as the input, the following id as the target.
# The last valid start index is len(nums)-4, so the exclusive range stop must be len(nums)-3;
# stopping at len(nums)-4 would drop the final sample whenever it falls exactly on the stride.
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-3,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [10]:
# Batch the samples: the first 80% become the training set, the remainder the validation set.
# shuffle=False is passed so the loaders are built without shuffling.
bs = 64
cut = int(0.8 * len(seqs))
train_seqs, valid_seqs = seqs[:cut], seqs[cut:]
dls = DataLoaders.from_dsets(train_seqs, valid_seqs, bs=bs, shuffle=False)

In [11]:
#Build our model
class LMModel1(Module):
  """Next-token predictor over three input tokens, with every step written out by hand.

  The same embedding (`i_h`) and the same hidden layer (`h_h`) are reused for each of
  the three tokens; `h_o` maps the final hidden state to one score per vocab entry.
  """
  def __init__(self, vocab_sz, n_hidden):
    # Layers are created in the order embedding -> hidden -> output.
    self.i_h = nn.Embedding(vocab_sz, n_hidden)  # input  -> hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)     # hidden -> hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)     # hidden -> output

  def forward(self, x):
    """Return output scores for `x`, which is indexed as x[:,0], x[:,1], x[:,2] (one column per token)."""
    first, second, third = x[:,0], x[:,1], x[:,2]
    # Token 1 starts the hidden state; tokens 2 and 3 are each added to the running
    # state before it goes through the shared hidden layer again.
    state = F.relu(self.h_h(self.i_h(first)))
    state = F.relu(self.h_h(state + self.i_h(second)))
    state = F.relu(self.h_h(state + self.i_h(third)))
    return self.h_o(state)

In [12]:
# Train the hand-unrolled model: 64 hidden units, 4 epochs of one-cycle training at 1e-3.
model = LMModel1(len(vocab), 64)
learn = Learner(dls, model, loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.824297,1.970941,0.467554,00:01
1,1.386973,1.823242,0.467554,00:01
2,1.417556,1.654498,0.494414,00:01
3,1.37644,1.650849,0.494414,00:01


In [15]:
# Baseline check: how often does the single most frequent target occur in the validation set?
# A model that always predicted that token would score exactly this accuracy.
n,counts = 0, torch.zeros(len(vocab))
for x,y in dls.valid:
  n += y.shape[0]
  # bincount tallies every token id of the batch in one call (instead of looping over the
  # vocab in Python); minlength pads the tally to vocab size, .to() puts it next to `counts`.
  counts += torch.bincount(y, minlength=len(vocab)).to(counts.device)
idx = torch.argmax(counts)
# In the recorded run the most common target is index 29, "thousand"; always predicting it
# would give an accuracy of about 15%.
idx, vocab[idx.item()], counts[idx].item()/n

(tensor(29), 'thousand', 0.15165200855716662)

In [16]:
#A better model
class LMModel2(Module):
  """Recurrent next-token predictor: one shared update step applied to every input token.

  `i_h` embeds a token, `h_h` updates the hidden state, and `h_o` maps the final hidden
  state to one score per vocab entry. The number of steps follows the input, so the
  model accepts sequences of any length, not only three tokens.
  """
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden) #layer 1 - input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)    #layer 2 - hidden to hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)    #layer 3 - hidden to output

  def forward(self, x):
    """Return output scores for `x`, read column by column as x[:,i] (one column per token)."""
    h = 0
    # This loop is what makes the model an RNN - recurrent neural network.
    # Iterating over x.shape[1] instead of a hard-coded 3 consumes every token supplied;
    # for 3-token inputs the computation is unchanged.
    for i in range(x.shape[1]):
      h = h + self.i_h(x[:,i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)

In [17]:
# Same training setup with the loop-based model. Results should be similar to the unrolled
# model's rather than identical: the weights are randomly initialised and no seed is set.
model = LMModel2(len(vocab), 64)
learn = Learner(dls, model, loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.829873,2.037933,0.457333,00:02
1,1.408025,1.852531,0.473259,00:01
2,1.451901,1.703711,0.48348,00:01
3,1.411282,1.695291,0.459235,00:01
