<a href="https://colab.research.google.com/github/rtedwards/nn-zero-to-hero/blob/main/notebooks/1_makemore_bigrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.notebook import tqdm

In [None]:
# Load the dataset, one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open("names.txt", "r") as f:
    words = f.read().splitlines()

In [None]:
# peek at the first ten names to sanity-check the load
words[:10]

In [None]:
# total number of names in the dataset
len(words)

In [None]:
# length of the longest name in the dataset
len(max(words, key=len))

## Bigram
Given a character, we want to predict the next character in the sequence. A bigram model always looks only at the previous character to predict the next one.

In [None]:
# Count every adjacent character pair, with "<S>" / "<E>" marking the start
# and end of each word. Keys are (first, second) tuples, values are counts.
b = {}
for word in words:
    tokens = ["<S>", *word, "<E>"]
    for pair in zip(tokens, tokens[1:]):
        b[pair] = b.get(pair, 0) + 1


In [None]:
# bigrams ordered from most to least frequent (ties keep insertion order)
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
import torch

In [None]:
N = torch.zeros((27, 27), dtype=torch.int32) # 26 letters + 1 special "." token that marks both start and end of a word

In [None]:
# Vocabulary lookup tables. Characters are numbered from 1 in sorted order;
# index 0 is reserved for the "." boundary token. itos is the inverse of stoi.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
itos = dict(zip(stoi.values(), stoi.keys()))

In [None]:
# Fill N so that N[i, j] is the number of times character j follows
# character i, with "." (index 0) padding both ends of every word.
# Reset first so re-running this cell does not double the counts.
N.zero_()
for w in words:
    chs = ["."] + list(w) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,16))
plt.imshow(N, cmap="Blues")
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray")
plt.axis("off");

In [None]:
# row 0 holds the counts of bigrams whose first token is "." (index 0),
# i.e. how often each character starts a word
N[0]

In [None]:
# normalize the first row of counts into a probability distribution
row0_counts = N[0].float()
p = row0_counts / row0_counts.sum()
p

In [None]:
# draw a single index from the distribution p with a fixed seed so the
# result is reproducible, then map it back to its character
g = torch.Generator().manual_seed(2147483647)
sample = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
ix = sample.item()
itos[ix]


In [None]:
# Row-normalized transition probabilities. The +1 is add-one smoothing:
# no bigram gets probability zero, so the negative log-likelihood stays finite.
P = (N + 1).float()
P = P / P.sum(dim=1, keepdim=True)  # keepdim so each row is divided by its own total
P[0]

In [None]:
g = torch.Generator().manual_seed(2147483647)

# Generate 10 names from the count-based model: start at "." (index 0) and
# keep sampling the next character from the current character's row of P
# until "." is drawn again.
for _ in range(10):
    out = []
    ix = 0
    while True:
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0: # end token '.'
            break

    print("".join(out))

## Gradient-Based Model

In [None]:
# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# likelihood = product of all probabilities
# log(a*b*c) = log(a) + log(b) + log(c)

In [None]:
# Evaluate the count-based model: sum the log-probabilities that P assigns to
# every bigram of the evaluated word(s) and report the (average) negative
# log-likelihood. This cell only reads P; it must not touch the count matrix N,
# otherwise evaluating a word would silently alter the training counts.
log_likelihood = 0.0
n = 0

for w in ["andrejq"]: #words:
    chs = ["."] + list(w) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f"{ch1}{ch2}: {prob:.4f} {logprob:.4f}")

print(f"{log_likelihood=}")
nll = -log_likelihood
print(f"{nll=}")
print(f"{nll/n}")  # average negative log-likelihood per bigram

In [None]:
# Build the (input, target) index pairs for the bigrams of the first word only:
# xs holds the index of each character, ys the index of the character after it.
xs, ys = [], []

for word in words[:1]:
    padded = "." + word + "."
    for cur_ch, next_ch in zip(padded, padded[1:]):
        cur_ix, next_ix = stoi[cur_ch], stoi[next_ch]
        print(cur_ch, next_ch)
        xs.append(cur_ix)
        ys.append(next_ix)

xs, ys = torch.tensor(xs), torch.tensor(ys)

In [None]:
# create the training set of trigrams (x,y,z)
# Stored under tri_* names so this cell does not overwrite the bigram xs/ys:
# a word yields one fewer trigram than bigrams, and the cells below index
# xs/ys assuming the bigram examples.
tri_xs, tri_ys, tri_zs = [], [], []

for w in words[:1]:
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        print(ch1, ch2, ch3)
        tri_xs.append(ix1)
        tri_ys.append(ix2)
        tri_zs.append(ix3)

tri_xs = torch.tensor(tri_xs)
tri_ys = torch.tensor(tri_ys)
tri_zs = torch.tensor(tri_zs)

In [None]:
# one-hot encode the input indices (27 classes) and cast to float so the
# result can be multiplied with a float weight matrix
one_hot_int = F.one_hot(xs, num_classes=27)
xenc = one_hot_int.float()
xenc

In [None]:
# visualize the one-hot encoding: each row is one example and has a single
# "on" entry, at the column given by that example's character index
plt.imshow(xenc)

In [None]:
# 27 neurons, each with 27 input weights, drawn from a standard normal
# NOTE(review): no generator is passed here, so the values differ between runs
W = torch.randn((27, 27))
xenc @ W # matrix multiplication
# (n, 27) @ (27, 27) -> (n, 27), where n is the number of rows in xenc

In [None]:
(xenc @ W)[3, 13] # activation of neuron index 13 for input index 3 (0-based: the 14th neuron on the 4th input)

In [None]:
# the same activation computed by hand: elementwise product of input row 3
# with weight column 13, summed (a dot product)
(xenc[3] * W[:, 13]).sum()

In [None]:
# treat the layer output as log-counts and normalize each row into probabilities
logits = xenc @ W                             # log-counts
counts = torch.exp(logits)                    # positive "counts", playing the role of N
row_totals = counts.sum(dim=1, keepdim=True)  # one total per example
probs = counts / row_totals

In [None]:
# sanity check: every row of probs was divided by its own total, so this should be ~1
probs[0].sum()

### Summary

In [None]:
# input character indices of the current training examples
xs

In [None]:
# target (next-character) indices of the current training examples
ys

In [None]:
# Weight matrix for a single linear layer: 27 inputs feeding 27 neurons.
# A dedicated, seeded generator makes the random initialization reproducible.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g)

In [None]:
# Forward pass: one-hot inputs -> linear layer -> per-row normalization
xenc = F.one_hot(xs, num_classes=27).float()  # network input, one row per example
logits = xenc @ W                             # predicted log-counts
counts = torch.exp(logits)                    # counts, equivalent to N
probs = counts / counts.sum(dim=1, keepdim=True)  # next-character probabilities
# the exp + normalize steps together are called a "softmax"

**Softmax Activation Function**

A normalization function that turns a vector of real-valued scores (logits) into probabilities that sum to 1.

$$
\begin{alignat}{2}
\text{Output Layer}
& \longrightarrow
\text{Softmax Activation Function}
&& \longrightarrow
\text{Probabilities} \\
\begin{bmatrix}
1.3 \\
5.1 \\
2.2 \\
0.7 \\
1.1
\end{bmatrix}
& \longrightarrow
\frac{e^{z_i}}{\sum^K_{j=1} e^{z_j}}
&& \longrightarrow
\begin{bmatrix}
0.02 \\
0.90 \\
0.05 \\
0.01 \\
0.02
\end{bmatrix}
\end{alignat}
$$

In [None]:
# Walk through every training example and show what the untrained network
# predicts for it, then average the per-example negative log-likelihoods.
# The number of examples is read from xs rather than hard-coded, so the cell
# keeps working when the training set has a different length.
num_examples = xs.nelement()
nlls = torch.zeros(num_examples)
for i in range(num_examples):
    # i-th bigram:
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print("--------")
    print(f"bigram example {i+1}: '{itos[x]}{itos[y]}' (indexes {x},{y})")
    print(f"input to the neural net: {x}")
    print("output probabilities from the neural net:", probs[i])
    print(f"label (actual next character): {y} ({itos[y]})")
    p = probs[i, y]  # probability the net assigned to the correct next character
    logp = torch.log(p)
    nll = -logp
    nlls[i] = nll
    print(f"probability assigned by the net to the correct character: {p.item()}")
    print(f"log-likelihood: {logp.item()}")
    print(f"negative log-likelihood: {nll.item()}")


print("========")
print(f"average negative log-likelihood, i.e. loss = {nlls.mean().item()}")

### Full Model

In [None]:
# create the training set of bigrams (x,y) from every word
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# randomly initialize 27 neurons' weights
# each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

# hyperparameters
num_steps = 1000      # gradient descent iterations
learning_rate = 50    # step size
reg_strength = 0.01   # weight of the L2 penalty that pulls W toward zero

# The one-hot encoding depends only on xs, not on W, so build it once
# instead of on every iteration.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding

for k in tqdm(range(num_steps)):
    # Forward Pass
    logits = xenc @ W     # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() # mean NLL of the probabilities assigned to the true next characters
    loss = loss + reg_strength * (W**2).mean() # add regularization

    # Backward Pass
    W.grad = None # more efficient than zeroing
    loss.backward() # fills in W.grad

    # Update: modify W in place without recording the step in the autograd graph
    with torch.no_grad():
        W -= learning_rate * W.grad

# loss is already a scalar; .item() prints the plain number rather than a tensor repr
print(f"Loss: {loss.item()}")

### Inference

In [None]:
g = torch.Generator().manual_seed(2147483647)

# Sample 20 names from the trained network. The row of next-character
# probabilities comes from a forward pass through W (the count-based model
# read it from P[ix] instead). W requires grad, so the forward passes run under
# no_grad to avoid building an autograd graph that is never used.
with torch.no_grad():
    for i in range(20):
        out = []
        ix = 0
        while True:
            xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
            logits = xenc @ W # predict log-counts
            counts = logits.exp() # counts, equivalent to N
            p = counts / counts.sum(1, keepdim=True)

            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0: # if end of word '.'
                break
        print("".join(out))