<a href="https://colab.research.google.com/github/steimel60/ML/blob/main/DeepLearning/LanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()
from fastbook import *
from fastai.text.all import *

[K     |████████████████████████████████| 719 kB 18.7 MB/s 
[K     |████████████████████████████████| 1.2 MB 43.1 MB/s 
[K     |████████████████████████████████| 197 kB 48.4 MB/s 
[K     |████████████████████████████████| 346 kB 26.3 MB/s 
[K     |████████████████████████████████| 4.2 MB 46.1 MB/s 
[K     |████████████████████████████████| 60 kB 9.8 MB/s 
[K     |████████████████████████████████| 86 kB 6.8 MB/s 
[K     |████████████████████████████████| 140 kB 71.8 MB/s 
[K     |████████████████████████████████| 1.1 MB 60.5 MB/s 
[K     |████████████████████████████████| 212 kB 58.8 MB/s 
[K     |████████████████████████████████| 86 kB 6.8 MB/s 
[K     |████████████████████████████████| 596 kB 59.7 MB/s 
[K     |████████████████████████████████| 127 kB 75.8 MB/s 
[K     |████████████████████████████████| 94 kB 1.0 MB/s 
[K     |████████████████████████████████| 271 kB 72.0 MB/s 
[K     |████████████████████████████████| 144 kB 77.2 MB/s 
[K     |███████████████████████

In [2]:
#Download numbers in english
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/root/.fastai/data/human_numbers/train.txt'),Path('/root/.fastai/data/human_numbers/valid.txt')]

In [3]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [4]:
#Concatenate into one stream
text = ' . '.join([l.strip() for l in lines])
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [5]:
#Tokenize text data
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [6]:
#Numericalize tokens
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [7]:
#Convert tokens into nums
word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

In [8]:
#targets in our example will be every sequence of 3 words
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [9]:
#But we want it in tensors
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [10]:
#Create batches
bs = 64
cut = int(len(seqs) * .8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

In [11]:
#Build our model
class LMModel1(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden) #layer 1 - input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)    #layer 2 - hidden to hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)    #layer 3 - hidden to output
  
  def forward(self, x):
    h = F.relu(self.h_h(self.i_h(x[:,0])))
    h = h + self.i_h(x[:,1])
    h = F.relu(self.h_h(h))
    h = h + self.i_h(x[:,2])
    h = F.relu(self.h_h(h))
    return self.h_o(h)

In [12]:
#Build and train our learner
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3) 

epoch,train_loss,valid_loss,accuracy,time
0,1.824297,1.970941,0.467554,00:01
1,1.386973,1.823242,0.467554,00:02
2,1.417556,1.654498,0.494414,00:02
3,1.37644,1.650849,0.494414,00:02


In [13]:
n,counts = 0, torch.zeros(len(vocab))
for x,y in dls.valid:
  n += y.shape[0]
  for i in range_of(vocab): counts[i] += (y==i).long().sum()
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n #most common index is at index 29, is "thousand", and always prediciting this would give us accuracy of 15%

(tensor(29), 'thousand', 0.15165200855716662)

In [14]:
#A better model
class LMModel2(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden) #layer 1 - input to hidden
    self.h_h = nn.Linear(n_hidden, n_hidden)    #layer 2 - hidden to hidden
    self.h_o = nn.Linear(n_hidden, vocab_sz)    #layer 3 - hidden to output
  
  def forward(self, x):
    h = 0
    for i in range(3): #This loop is what makes this model an RNN - recurrent neural network
      h = h+ self.i_h(x[:,i])
      h = F.relu(self.h_h(h))
    return self.h_o(h)

In [15]:
#We should get the same results
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3) 

epoch,train_loss,valid_loss,accuracy,time
0,1.816274,1.964143,0.460185,00:02
1,1.423805,1.739964,0.473259,00:01
2,1.430327,1.685172,0.485382,00:01
3,1.38839,1.657033,0.470406,00:01


##Adding state to RNN

In [16]:
#This model is stateful, it remembers the activations between calls to forward
class LMModel3(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0
  
  def forward(self, x):
    for i in range(3):
      self.h = self.h + self.i_h(x[:,i])
      self.h = F.relu(self.h_h(self.h))
    out = self.h_o(self.h)
    self.h = self.h.detach() #remove gradient history
    return out

  def reset(self): self.h = 0

In [17]:
#divide samples into groups
m = len(seqs)//bs
m, bs, len(seqs)

(328, 64, 21031)

In [21]:
#Above but reindexes the groups
def group_chunks(ds, bs):
  m = len(ds)//bs
  new_ds = L()
  for i in range(m): new_ds = L(ds[i + m*j] for j in range(bs))
  return new_ds

In [22]:
#Drop last batch that isnt size 64 and dont shuffle
cut = int(len(seqs)*.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False
)

In [23]:
#Build learner
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10,3e-3) #Better results from a little history

epoch,train_loss,valid_loss,accuracy,time
0,3.467223,3.426226,0.046875,00:00
1,3.464298,3.403263,0.109375,00:00
2,3.448774,3.355431,0.15625,00:00
3,3.418672,3.299966,0.203125,00:00
4,3.379746,3.244526,0.34375,00:00
5,3.336154,3.194137,0.28125,00:00
6,3.291062,3.150886,0.25,00:00
7,3.246963,3.119181,0.25,00:00
8,3.206102,3.10147,0.265625,00:00
9,3.170145,3.09629,0.265625,00:00


##Sending more signal can help too

We can do this by predicting the next word after every single word instead of every three words

In [26]:
sl = 16 #sequence length
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+1+sl])) for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs)*.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False
)

In [27]:
#Each element in seqs has 2 list with the second the same as the first but offset by 1
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [28]:
#Adjust model to output prediction after every word
class LMModel4(Module):
  def __init__(self, vocab_sz, n_hidden):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.h_h = nn.Linear(n_hidden, n_hidden)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = 0
  
  def forward(self, x):
    outs = []
    for i in range(sl):
      self.h = self.h + self.i_h(x[:,i])
      self.h = F.relu(self.h_h(self.h))
      outs.append(self.h_o(self.h))
    self.h = self.h.detach() #remove gradient history
    return torch.stack(outs, dim=1)

  def reset(self): self.h = 0

Our targets are shape bs x sl but the model returns bs x sl x vocab_sz so we need to flatten before calling loss func

In [29]:
def loss_func(inp, targ):
  return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [30]:
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15,3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.452138,3.490425,0.052734,00:00
1,3.449492,3.47195,0.057617,00:00
2,3.440877,3.42956,0.05957,00:00
3,3.422944,3.367302,0.081055,00:00
4,3.395682,3.295859,0.123047,00:00
5,3.361108,3.220869,0.188477,00:00
6,3.321131,3.143911,0.199219,00:00
7,3.277176,3.066892,0.264648,00:00
8,3.230442,2.99327,0.310547,00:00
9,3.182226,2.926794,0.316406,00:00


##Save time with PyTorch

In [31]:
class LMModel5(Module):
  def __init__(self, vocab_sz, n_hidden, n_layers):
    self.i_h = nn.Embedding(vocab_sz, n_hidden)
    self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
    self.h_o = nn.Linear(n_hidden, vocab_sz)
    self.h = torch.zeros(n_layers, bs, n_hidden)
  
  def forward(self, x):
    res,h = self.rnn(self.i_h(x), self.h)
    self.h = h.detach()
    return self.h_o(res)

  def reset(self): self.h.zero_()

In [32]:
learn = Learner(dls, LMModel5(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(), metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.432505,3.429802,0.023438,00:00
1,3.42824,3.397266,0.043945,00:00
2,3.414346,3.321186,0.132812,00:00
3,3.385343,3.205534,0.237305,00:00
4,3.341136,3.067815,0.390625,00:00
5,3.285249,2.922424,0.411133,00:00
6,3.221617,2.780877,0.429688,00:00
7,3.153739,2.654009,0.44043,00:00
8,3.084776,2.548518,0.44043,00:00
9,3.017323,2.466366,0.441406,00:00
