# Building a trigram language model of the same architecture as in the video.
### Train a trigram language model: take 2 characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss: did it improve over a bigram model? 

In [37]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
from string import ascii_lowercase
%matplotlib inline
# Read the training corpus, one name per line. The context manager closes the
# file handle as soon as the contents have been read instead of leaving it
# open for the lifetime of the kernel.
with open('../names.txt', 'r') as f:
    words = f.read().splitlines()

In [56]:
# Character vocabulary: the boundary marker '.' comes first (index 0),
# followed by every distinct character found in the data, in sorted order.
chars = ['.'] + sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = {idx: ch for ch, idx in stoi.items()}

# Every ordered two-character context over '.' plus a-z (27 * 27 = 729 pairs).
# The first character varies slowest, so a pair's index is
# 27 * (position of first char) + (position of second char) in `alphabet`.
alphabet = '.' + ascii_lowercase
char_pairs = [first + second for first in alphabet for second in alphabet]

ptoi = {pair: idx for idx, pair in enumerate(char_pairs)}
itop = dict(enumerate(char_pairs))

In [65]:
# Build the training set. Each word is padded with two leading '.' and one
# trailing '.'; every window of three consecutive characters then yields one
# example: the index of the first two characters (the context pair) is the
# input and the index of the third character is the target.
xs, ys = [], []
for w in words:
    padded = '..' + w + '.'
    for c1, c2, c3 in zip(padded, padded[1:], padded[2:]):
        xs.append(ptoi[c1 + c2])
        ys.append(stoi[c3])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

number of examples:  228146


In [66]:
# Weight matrix of the single linear layer: one row of 27 next-character
# logits for each of the 729 two-character contexts, drawn from a standard
# normal distribution and tracked by autograd.
W = torch.randn(729, 27, requires_grad=True)

In [97]:
# gradient descent.
# The one-hot encoding of the inputs and the row index used to pick out the
# target probabilities do not depend on W, so build them once instead of
# re-allocating a (num, 729) float tensor on every iteration.
xenc = F.one_hot(xs, num_classes=729).float()
rows = torch.arange(num)

for k in range(10):
    # forward pass: softmax over the logits, then the mean negative
    # log-likelihood of the targets plus an L2 penalty on the weights.
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[rows, ys].log().mean() + 0.01 * (W**2).mean()
    print(f'trial {k}: loss = {loss.item()}')

    # backward pass: clear the old gradient, then backpropagate.
    W.grad = None
    loss.backward()

    # update: step against the gradient. Doing it under no_grad keeps the
    # in-place change out of the autograd graph without going through .data.
    with torch.no_grad():
        W -= 100 * W.grad

trial 0: loss = 2.333828926086426
trial 1: loss = 2.3331234455108643
trial 2: loss = 2.332425355911255
trial 3: loss = 2.331735372543335
trial 4: loss = 2.331052780151367
trial 5: loss = 2.3303778171539307
trial 6: loss = 2.329710006713867
trial 7: loss = 2.3290493488311768
trial 8: loss = 2.3283963203430176
trial 9: loss = 2.3277499675750732


In [98]:
# Sample 50 names from the trained model.
# The generator is created here so this cell does not rely on a `g` defined
# somewhere else; the fixed seed makes the draws repeatable for a given W.
g = torch.Generator().manual_seed(2147483647)

# Sampling is inference only, so no autograd graph needs to be recorded.
with torch.no_grad():
    for i in range(50):
        # Every name starts from the padded context '..'.
        out = ['.', '.']
        while True:
            # Distribution over the next character given the last two.
            ix = ptoi[''.join(out[-2:])]
            xenc = F.one_hot(torch.tensor([ix]), num_classes=729).float()
            logits = xenc @ W
            counts = logits.exp()
            p = counts / counts.sum(1, keepdims=True)

            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            # Index 0 is the '.' marker, which ends the name.
            if ix == 0:
                break

        # Drop the two leading pad characters and the trailing end marker.
        print(''.join(out[2:-1]))

ra
daqtana
eveni
ro
violinnanifpsuleignfbitenan
ju
vieli
emrnkbyjxwkbellettenerol
balyn
shws
zomazmsjuaylengglnvirkgpzsoxc
samacmhhus
rack
re
lor
mael
bellani
finsuukjan
becania
hae
jeven
karrineth
da
sh
sahlnzrinarlfur
samya
don
kha
yessaun
na
driyah
morig
kqlnozcptkmguhana
gqw
ad
kaitps
cairairayan
mee
avyn
an
jan
hhpxzytruwabers
asterya
ron
qus
tell
habreel
al
henzwefzdp
emillah


We now have 729 × 27 = 19,683 values to determine. In the limit this will be more accurate, but it takes longer to train well because each cell is seen less often per training iteration, which means that the initial values are stickier. That is why the results are so bad right now.  

Update after a few hundred iterations: the loss is now lower than for the previous model. Nice.  

Now let's calculate explicitly, from trigram counts, what the loss should be.

In [119]:
# Count-based trigram model: N[pair, ch] is the number of times character
# `ch` follows the two-character context `pair` in the data.
N = torch.zeros(729, 27, dtype=torch.float32)
for w in words:
    padded = '..' + w + '.'
    for c1, c2, c3 in zip(padded, padded[1:], padded[2:]):
        N[ptoi[c1 + c2], stoi[c3]] += 1

# L1-normalise each row so that it becomes a distribution over the next
# character; rows for contexts that never occur stay all-zero.
P = F.normalize(N, p=1, dim=1)

In [121]:
# Average negative log-likelihood of the data under the count-based model P.
# The names log_likelihood, nll and n are echoed by the f-strings below.
log_likelihood = 0.0
n = 0

for w in words:
    padded = '..' + w + '.'
    for c1, c2, c3 in zip(padded, padded[1:], padded[2:]):
        # Log-probability the model assigns to c3 following the pair c1 c2.
        log_likelihood += torch.log(P[ptoi[c1 + c2], stoi[c3]])
        n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n=}')

log_likelihood=tensor(-498647.7812)
nll=tensor(498647.7812)
nll/n=tensor(2.1857)


pretty good!