<a href="https://colab.research.google.com/github/steimel60/ML/blob/main/DeepLearning/LanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.text.all import *

[?25l[K     |▌                               | 10 kB 28.9 MB/s eta 0:00:01[K     |█                               | 20 kB 23.8 MB/s eta 0:00:01[K     |█▍                              | 30 kB 17.6 MB/s eta 0:00:01[K     |█▉                              | 40 kB 15.3 MB/s eta 0:00:01[K     |██▎                             | 51 kB 7.1 MB/s eta 0:00:01[K     |██▊                             | 61 kB 8.4 MB/s eta 0:00:01[K     |███▏                            | 71 kB 7.9 MB/s eta 0:00:01[K     |███▋                            | 81 kB 8.8 MB/s eta 0:00:01[K     |████                            | 92 kB 9.7 MB/s eta 0:00:01[K     |████▌                           | 102 kB 8.1 MB/s eta 0:00:01[K     |█████                           | 112 kB 8.1 MB/s eta 0:00:01[K     |█████▌                          | 122 kB 8.1 MB/s eta 0:00:01[K     |██████                          | 133 kB 8.1 MB/s eta 0:00:01[K     |██████▍                         | 143 kB 8.1 MB/s eta 0:00:01[K 

In [2]:
#Download numbers in english
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/root/.fastai/data/human_numbers/valid.txt'),Path('/root/.fastai/data/human_numbers/train.txt')]

In [3]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [4]:
#Concatenate into one stream
text = ' . '.join([l.strip() for l in lines])
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [5]:
#Tokenize text data
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [6]:
#Numericalize tokens
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [7]:
#Convert tokens into nums
word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [8]:
#targets in our example will be every sequence of 3 words
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [9]:
#But we want it in tensors
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [10]:
#Create batches
bs = 64
cut = int(len(seqs) * .8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [11]:
#Build our model
class LMModel1(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden) #layer 1 - input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)    #layer 2 - hidden to hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)    #layer 3 - hidden to output
  
  def forward(self, x):
    h = F.relu(self.h_h(self.i_h(x[:,0])))
    h = h + self.i_h(x[:,1])
    h = F.relu(self.h_h(h))
    h = h + self.i_h(x[:,2])
    h = F.relu(self.h_h(h))
    return self.h_o(h)

In [12]:
#Build and train our learner
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3) 

epoch,train_loss,valid_loss,accuracy,time
0,1.824297,1.970941,0.467554,00:01
1,1.386973,1.823242,0.467554,00:01
2,1.417556,1.654498,0.494414,00:02
3,1.37644,1.650849,0.494414,00:02


In [13]:
n,counts = 0, torch.zeros(len(vocab))
for x,y in dls.valid:
  n += y.shape[0]
  for i in range_of(vocab): counts[i] += (y==i).long().sum()
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n #most common index is at index 29, is "thousand", and always prediciting this would give us accuracy of 15%

(tensor(29), 'thousand', 0.15165200855716662)

In [14]:
#A better model
class LMModel2(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden) #layer 1 - input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)    #layer 2 - hidden to hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)    #layer 3 - hidden to output
  
  def forward(self, x):
    h = 0
    for i in range(3): #This loop is what makes this model an RNN - recurrent neural network
      h = h+ self.i_h(x[:,i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)

In [15]:
#We should get the same results
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3) 

epoch,train_loss,valid_loss,accuracy,time
0,1.816274,1.964143,0.460185,00:01
1,1.423805,1.739964,0.473259,00:02
2,1.430327,1.685172,0.485382,00:03
3,1.38839,1.657033,0.470406,00:02


##Adding state to RNN

In [16]:
#This model is stateful, it remembers the activations between calls to forward
class LMModel3(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0
  
  def forward(self, x):
    for i in range(3):
      self.h = self.h + self.i_h(x[:,i])
      self.h = F.relu(self.h_h(self.h))
    out = self.h_o(self.h)
    self.h = self.h.detach() #remove gradient history
    return out

  def reset(self): self.h = 0

In [17]:
#divide samples into groups
m = len(seqs)//bs
m, bs, len(seqs)

(328, 64, 21031)

In [18]:
#Above but reindexes the groups
def group_chunks(ds, bs):
  m = len(ds)//bs
  new_ds = L()
  for i in range(m): new_ds = L(ds[i + m*j] for j in range(bs))
  return new_ds

In [19]:
#Drop last batch that isnt size 64 and dont shuffle
cut = int(len(seqs)*.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False
)

In [20]:
#Build learner
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10,3e-3) #Better results from a little history

epoch,train_loss,valid_loss,accuracy,time
0,3.327572,3.323006,0.046875,00:00
1,3.324659,3.301501,0.046875,00:00
2,3.309127,3.256275,0.046875,00:00
3,3.278579,3.200846,0.0625,00:00
4,3.238185,3.142714,0.09375,00:00
5,3.191905,3.086158,0.125,00:00
6,3.143015,3.038446,0.15625,00:00
7,3.094622,3.004031,0.15625,00:00
8,3.04945,2.985095,0.15625,00:00
9,3.009567,2.979559,0.15625,00:00


##Sending more signal can help too

We can do this by predicting the next word after every single word instead of every three words

In [21]:
sl = 16 #sequence length
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+1+sl])) for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs)*.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False
)

In [22]:
#Each element in seqs has 2 list with the second the same as the first but offset by 1
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [23]:
#Adjust model to output prediction after every word
class LMModel4(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0
  
  def forward(self, x):
    outs = []
    for i in range(sl):
      self.h = self.h + self.i_h(x[:,i])
      self.h = F.relu(self.h_h(self.h))
      outs.append(self.h_o(self.h))
    self.h = self.h.detach() #remove gradient history
    return torch.stack(outs, dim=1)

  def reset(self): self.h = 0

Our targets are shape bs x sl but the model returns bs x sl x vocab_sz so we need to flatten before calling loss func

In [24]:
def loss_func(inp, targ):
  return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [25]:
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.515865,3.534737,0.016602,00:00
1,3.513115,3.512919,0.026367,00:00
2,3.504191,3.46276,0.02832,00:00
3,3.485763,3.390942,0.02832,00:00
4,3.458187,3.311083,0.042969,00:00
5,3.423964,3.230036,0.111328,00:00
6,3.385307,3.150996,0.192383,00:00
7,3.34378,3.07685,0.228516,00:00
8,3.300672,3.009414,0.261719,00:00
9,3.257162,2.951045,0.262695,00:00


##Save time with PyTorch

In [26]:
class LMModel5(Module):
  def __init__(self, vocab_sz, n_hidden, n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = torch.zeros(n_layers, bs, n_hidden)
  
  def forward(self, x):
    res,h = self.rnn(self.i_h(x), self.h)
    self.h = h.detach()
    return self.h_o(res)

  def reset(self): self.h.zero_()

In [27]:
learn = Learner(dls, LMModel5(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.477326,3.449765,0.017578,00:00
1,3.47243,3.411843,0.018555,00:00
2,3.456496,3.323555,0.058594,00:00
3,3.423368,3.191411,0.326172,00:00
4,3.373218,3.037416,0.411133,00:00
5,3.310364,2.879385,0.443359,00:00
6,3.239548,2.730469,0.439453,00:00
7,3.164955,2.601913,0.442383,00:00
8,3.090265,2.499048,0.445312,00:00
9,3.018275,2.421227,0.444336,00:00


##Exploding or Disappearing activations (quickly approaching +- infinity) are likely whats giving us worse results

#We can use LTSM (Long short-term memory) to help this

In [28]:
class LTSMCell(Module):
  def __init__(self, ni, nh):
    self.forget_gate = nn.Linear(ni + nh, nh)
    self.input_gate  = nn.Linear(ni + nh, nh)
    self.cell_gate   = nn.Linear(ni + nh, nh)
    self.output_gate = nn.Linear(ni + nh, nh)

  def forward(self, input, state):
    h,c = state
    h = torch.stack([h, input], dim=1)
    forget = torch.sigmoid(self.forget_gate(h))
    c = c*forget
    inp = torch.sigmoid(self.input_gate(h))
    cell = torch.tanh(self.cell_gate(h))
    c = c+inp*cell
    out = torch.sigmoid(self.output_gate(h))
    h = out * torch.tanh(c)
    return h, (h,c)


In [29]:
#We can optimize the class above
class LTSMCell(Module):
  def __init__(self, ni, nh):
    self.ih = nn.Linear(ni,4*nh)
    self.hh = nn.Linear(nh,4*nh)
  
  def forward(self, input, state):
    h,c = state
    gates = (self.ih(input)+self.hh(h)).chunk(4,1)
    ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])
    cellgate = gates[3].tanh()
    c = (forgetgate*c) + (ingate*cellgate)
    h = outgate*c.tanh()
    return h, (h,c)

In [36]:
#update model to use lstm
class LMModel6(Module):
  def __init__(self, vocab_sz, n_hidden, n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
  
  def forward(self, x):
    res,h = self.rnn(self.i_h(x), self.h)
    self.h = [h_.detach() for h_ in h]
    return self.h_o(res)

  def reset(self):
    for h in self.h: h.zero_()

In [37]:
learn = Learner(dls, LMModel6(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,3.413969,3.410167,0.036133,00:00
1,3.411525,3.392582,0.036133,00:00
2,3.403557,3.34797,0.036133,00:00
3,3.385589,3.242154,0.140625,00:00
4,3.346944,3.061805,0.141602,00:00
5,3.278336,3.003152,0.146484,00:00
6,3.214273,2.95665,0.186523,00:00
7,3.158749,2.899602,0.253906,00:00
8,3.110103,2.842013,0.213867,00:00
9,3.066505,2.79556,0.272461,00:00
