In [None]:
# |export
#|default_exp p00_makemore


In [None]:
# |export
# this file is based on https://github.com/fastai/course22/blob/master/05-linear-model-and-neural-net-from-scratch.ipynb
import matplotlib
import matplotlib.pyplot as plt
plt.ion()
import os
import time
import pathlib
import argparse
import torch
import torch.nn.functional as F
from torch import tensor
from matplotlib.pyplot import plot, imshow, tight_layout, xlabel, ylabel, title, subplot, subplot2grid, grid, text, legend, figure, gcf, xlim, ylim



In [None]:
# |export
# timestamp taken when this cell runs
start_time=time.time()
# debug switch (not read by any of the cells visible in this notebook)
debug=True
# provenance of the generated code: commit, repository and generation time
_code_git_version="2765409fac4cfde8033da38c45dfd6358b1bc86c"
_code_repository="https://github.com/plops/cl-py-generator/tree/master/example/97_makemore/source/"
_code_generation_time="14:23:00 of Friday, 2024-05-10 (GMT+1)"


In [None]:
# |export
# command line options
parser=argparse.ArgumentParser()
parser.add_argument("-v", "--verbose", help="enable verbose output", action="store_true")
# parse_known_args() instead of parse_args(): inside a Jupyter kernel sys.argv
# carries the kernel's own options (e.g. -f <connection file>), and parse_args()
# terminates the interpreter on options it does not know. Unknown options are
# collected separately and ignored here.
args, _unknown_args=parser.parse_known_args()


In [None]:
# |export
# read the names, one entry per line; the with-block closes the file handle
# as soon as the content has been read
# NOTE(review): absolute path into a local checkout -- adjust when running elsewhere
with open("/home/martin/stage/cl-py-generator/example/97_makemore/source/names.txt", "r") as names_file:
    words=names_file.read().splitlines()
words[:10]


In [None]:
# |export
# length of the shortest entry in words
min(map(len, words))


In [None]:
# |export
# length of the longest entry in words
max(map(len, words))


In [None]:
# |export
# collect statistics for pairs of characters
# (this cell is exported as well, because the exported cell that sorts the
# statistics reads b)
b={}
for w in words:
    # <S> and <E> are placed before the first and after the last character of a word
    chs=((["<S>"])+(list(w))+(["<E>"]))
    # pair every symbol with its successor and count how often each pair occurs
    for bigram in zip(chs, chs[1:]):
        b[bigram]=((b.get(bigram, 0))+(1))


In [None]:
# |export
# bigram statistics ordered by count, largest count first
sorted(b.items(), key=lambda pair_and_count: -pair_and_count[1])


In [None]:
# |export
# every distinct character that occurs in words, in sorted order
character_set = sorted({ch for word in words for ch in word})
len(character_set)


In [None]:
# |export
# character -> index; the characters are numbered from 1 in sorted order
stoi = dict(zip(character_set, range(1, len(character_set) + 1)))
# index 0 belongs to the "." token
stoi["."] = 0
stoi


In [None]:
# |export
# reverse mapping: index -> character
itos = dict((index, symbol) for symbol, index in stoi.items())


In [None]:
# |export
# bigram counts as a square matrix: row = first character, column = second character
number_tokens = len(stoi)
N = torch.zeros((number_tokens, number_tokens), dtype=torch.int32)
for word in words:
    # "." is placed before the first and after the last character of a word
    symbols = [".", *word, "."]
    for first, second in zip(symbols[:-1], symbols[1:]):
        N[stoi[first], stoi[second]] += 1


In [None]:
# quick look at the count matrix as an image
plt.imshow(N)


In [None]:
# annotated heat map: every cell shows the character pair and its count
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")
for row in range(number_tokens):
    for col in range(number_tokens):
        pair_label = itos[row] + itos[col]
        # pair above the cell centre, count below it
        plt.text(col, row, pair_label, ha="center", va="bottom", color="gray")
        plt.text(col, row, N[row, col].item(), ha="center", va="top", color="gray")
plt.axis("off")


In [None]:
# row 0 of the count matrix (counts of what follows "."), scaled to sum to 1
first_row_counts = N[0].float()
p = first_row_counts / first_row_counts.sum()
p


In [None]:
# toy distribution over three outcomes, drawn from a seeded generator
g = torch.Generator()
g.manual_seed(2147483647)
raw_weights = torch.rand(3, generator=g)
# scale the weights so that they sum to 1
p = raw_weights / raw_weights.sum()


In [None]:
# draw 20 indices according to p, with replacement, using the generator g
p.multinomial(num_samples=20, replacement=True, generator=g)


In [None]:
# broadcasting rules: https://pytorch.org/docs/stable/notes/broadcasting.html
# add one to every count (model smoothing) so that the matrix contains no zeros
smoothed_counts = (N + 1).float()
# divide every row by its own sum; keepdim=True keeps the sums as a column
# so that the division broadcasts along each row
row_sums = smoothed_counts.sum(dim=1, keepdim=True)
P = smoothed_counts / row_sums


In [None]:
# score a list of words under the bigram probabilities in P
log_likelihood = 0.0
n = 0
for w in ["andrej"]:
    chs = [".", *w, "."]
    for ch1, ch2 in zip(chs[:-1], chs[1:]):
        prob = P[stoi[ch1], stoi[ch2]]
        logprob = prob.log()
        log_likelihood += logprob
        n += 1
        # any pair with a probability above 4% does better than a random guess
        print(f"{ch1}{ch2}: {prob:.4f} {logprob:.4f}")
print(f"{log_likelihood=}")
# we are interested in the product of all probabilities; that would be a very
# small number, so the logs are summed instead
# report the negative log likelihood; the lowest value it can reach is 0
nll = -log_likelihood
print(f"{nll=}")
# the normalized (per bigram) negative log likelihood is the number we use
# the normalized log likelihood of the training model is 2.454
print(f"{nll/n:.3f}")


In [None]:
# training pairs (current character -> next character) taken from the first word
input_ids = []
target_ids = []
for word in words[:1]:
    symbols = [".", *word, "."]
    for ch1, ch2 in zip(symbols[:-1], symbols[1:]):
        ix_in, ix_out = stoi[ch1], stoi[ch2]
        print(ch1, ch2)
        input_ids.append(ix_in)
        target_ids.append(ix_out)
xs = tensor(input_ids)
ys = tensor(target_ids)
# one-hot encode the input indices and convert the result to float
xenc = F.one_hot(xs, num_classes=number_tokens).float()
imshow(xenc)


In [None]:
# weight matrix drawn from a seeded generator; gradients are tracked for it
g=torch.Generator().manual_seed(2147483647)
# sized from number_tokens (the value the one-hot encoding uses as num_classes)
# instead of a literal 27, so both stay in step if the character set changes
W=torch.randn((number_tokens,number_tokens,), generator=g, requires_grad=True)


In [None]:
# forward pass
# for the 5 examples of the first word the product is 5x27 @ 27x27 = 5x27
# 27 neurons evaluated on each of the inputs
# (xenc @ W)[3,13] is the firing rate of neuron 13 for input 3: the dot product of
# column 13 of W with row 3 of xenc
# the raw outputs are called logits; we exponentiate them: negative numbers end up
# in 0..1, positive numbers end up >1
# the exponentiated values are interpreted as something equivalent to counts
# (positive numbers), like the entries of the N matrix
# converting logits to probabilities this way is called softmax
# the closer the values in W are to each other, the closer the probabilities are to equal
# you can regularize by forcing W to be closer to zero ... W**2 term in loss
logits=((xenc)@(W))
counts=logits.exp()
probs=((counts)/(counts.sum(1, keepdim=True)))
# mean negative log probability assigned to the correct next character;
# the row indices are derived from ys so the cell works for any number of examples
loss=((-1)*(probs[torch.arange(ys.nelement()),ys].log().mean()))
print(loss.item())


In [None]:
# backward pass
# drop the gradient of the previous step before computing a new one
W.grad = None
loss.backward()
# one gradient descent step with step size 0.10
W.data -= 0.10 * W.grad
# gradient descent gives exactly the same model as counting: sampling from it
# will be the same as sampling from the frequency counter
