In [None]:
# Load the dataset, one entry per line. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open("names.txt", "r") as f:
    words = f.read().splitlines()

In [None]:
# Peek at the first ten entries to sanity-check the load.
words[:10]

In [None]:
# Total number of entries in the dataset.
len(words)

In [None]:
# Length of the longest entry in the dataset.
max(map(len, words))

## Bigram
Given a character, we want to predict the next character in the sequence. A bigram model always looks only at the single previous character to predict the next one.

In [None]:
# Tally how often each adjacent pair of symbols occurs across all words,
# with "<S>" / "<E>" marking the start / end of every word.
b = {}
for word in words:
    symbols = ["<S>", *word, "<E>"]
    for pair in zip(symbols, symbols[1:]):
        b[pair] = b.get(pair, 0) + 1


In [None]:
# Bigrams ordered from most to least frequent (ties keep their insertion order,
# because Python's sort stays stable under reverse=True).
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
import torch

In [None]:
# Bigram count matrix: row = previous character, column = next character.
# 27 = 26 letters + the single "." boundary token (index 0), which stands in for
# both word start and word end. Assumes the dataset contains only the 26
# lowercase letters -- TODO confirm against names.txt.
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Character <-> index lookup tables. Index 0 is reserved for the "." boundary
# token; the characters found in the dataset take indices 1.. in sorted order.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
itos = {idx: ch for ch, idx in stoi.items()}

In [None]:
# Fill the count matrix: N[i, j] = number of times character j follows
# character i, with "." (index 0) padding both ends of every word.
N.zero_()  # reset first so re-running this cell does not double-count
for w in words:
    chs = ["."] + list(w) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Heatmap of the bigram count matrix. Every cell is annotated with its
# character pair (drawn above the cell centre) and its count (drawn below).
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")
label_style = dict(ha="center", color="gray")
for row in range(27):
    first = itos[row]
    for col in range(27):
        plt.text(col, row, first + itos[col], va="bottom", **label_style)
        plt.text(col, row, N[row, col].item(), va="top", **label_style)
plt.axis("off");

In [None]:
# Row 0 holds the counts of each character following the "." token,
# i.e. how often each character starts a word.
N[0]

In [None]:
# Normalise the first-character counts into a probability distribution.
first_row = N[0].float()
p = first_row / first_row.sum()
p

In [None]:
# Draw one character index from p with a fixed seed so the result is reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
sample = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
ix = sample.item()
itos[ix]


In [None]:
# Row-normalised transition probabilities. Adding 1 to every count is add-one
# smoothing: no bigram gets probability 0, so the negative log likelihood
# can never become infinite.
smoothed_counts = (N + 1).float()
P = smoothed_counts / smoothed_counts.sum(dim=1, keepdim=True)
P[0]

In [None]:
# Sample ten names from the bigram model, using a fixed seed for reproducibility.
g = torch.Generator().manual_seed(2147483647)

for name_idx in range(10):
    out = []
    ix = 0  # every name starts from the "." boundary token
    finished = False
    while not finished:
        # Pick the next character from the row of the current character.
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        finished = ix == 0  # drawing "." again marks the end of the name

    print("".join(out))

In [None]:
# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# likelihood = product of all probabilities
# log(a*b*c) = log(a) + log(b) + log(c)

In [None]:
# Evaluate the model: sum the log-probability that P assigns to every
# (previous char, next char) transition, then report the negative log
# likelihood and its per-transition average (lower is better).
log_likelihood = 0.0
n = 0

for w in ["andrejq"]: #words:  (swap in `words` to score the whole dataset)
    chs = ["."] + list(w) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        # Evaluation is read-only: the count matrix N is deliberately NOT
        # incremented here, otherwise each run of this cell would corrupt the counts.
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f"{ch1}{ch2}: {prob:.4f} {logprob:.4f}")

print(f"{log_likelihood=}")
nll = -log_likelihood
print(f"{nll=}")
print(f"{nll/n}")