In [None]:
import torch.nn.functional as F

In [None]:
# Load the data set: one entry per line of names.txt.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open for the lifetime of the kernel.
with open("names.txt") as names_file:
    words = names_file.read().splitlines()

In [None]:
# Peek at the first ten entries of `words`.
words[:10]

In [None]:
# Total number of entries in `words`.
len(words)

In [None]:
# Length of the shortest word.
min(map(len, words))

In [None]:
# Length of the longest word.
max(map(len, words))

In [None]:
# Bigram language model, first attempt: tally every adjacent pair of tokens
# in a plain dict keyed by (first, second).

b = {}
for name in words:
    # Wrap the word in explicit start/end tokens so word boundaries are counted too.
    tokens = ["<S>", *name, "<E>"]
    for pair in zip(tokens, tokens[1:]):
        if pair in b:
            b[pair] += 1
        else:
            b[pair] = 1

In [None]:
# Bigrams ordered from most to least frequent (the sort is stable, so ties keep
# their insertion order).
sorted(b.items(), key=lambda item: item[1], reverse=True)

In [None]:
import torch

In [None]:
# 27 x 27 grid of integer bigram counts, all zero to start with.
N = torch.zeros(27, 27, dtype=torch.int32)

In [None]:
# Vocabulary: every distinct character that occurs in the data set, sorted.
chars = sorted(set("".join(words)))
# Characters are numbered from 1; index 0 is reserved for the "." boundary token.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi["."] = 0
# Reverse lookup: index -> character.
itos = dict((index, char) for char, index in stoi.items())

In [None]:
# bigram language model: N[i, j] counts how often character j follows character i.
# Clear any previous tallies first so that re-running this cell does not
# double-count (N is allocated in a different cell).
N.zero_()
for w in words:
    chs = ["."] + list(w) + ["."]  # start & end markers on a word
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

# Draw the count matrix as a heat map and annotate every cell with its
# two-character bigram and its raw count.
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")

for row in range(27):
    for col in range(27):
        label = itos[row] + itos[col]
        count = N[row, col].item()
        # plt.text takes (x, y), i.e. (column, row).
        plt.text(col, row, label, ha="center", va="bottom", color="gray")
        plt.text(col, row, count, ha="center", va="top", color="gray")
plt.axis("off")

# Row 0 holds the counts for characters that start a word, column 0 holds the counts for characters that end a word, and every other cell is a transition between two characters inside a word.

In [None]:
# Turn the counts in row 0 of N into a probability distribution.
first_row = N[0].float()
p = first_row / first_row.sum()
p

In [None]:
# Draw a single index from `p` with a seeded generator and show its character.
g = torch.Generator()
g.manual_seed(2147483647)
drawn = torch.multinomial(p, num_samples=1, replacement=True, generator=g)
ix = drawn.item()
itos[ix]

In [None]:
# Add one to every count so no bigram has zero probability, then normalise
# each row so it sums to 1.
smoothed = (N + 1).float()
P = smoothed / smoothed.sum(dim=1, keepdim=True)

In [None]:
# Generate five words by walking the bigram table P one character at a time.
g = torch.Generator()
g.manual_seed(2147483647)

for _ in range(5):
    out = []
    ix = 0  # index 0 is the "." boundary token, so every word starts there
    finished = False

    while not finished:
        # Row of P for the current character = distribution over the next one.
        # (A uniform torch.ones(27) / 27.0 here would be the "no model" baseline.)
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        # Drawing the boundary token again ends the word.
        finished = ix == 0

    print("".join(out))

In [None]:
"""
Goal: maximise the likelihood of the data w.r.t. model parameters (statistical modelling) - take product of probabilities
a * b * c

equivalent to maximising the log likelihood (because log is monotonic) - take sum of log probabilities
log(a * b * c) -> log(a) + log(b) + log(c)

equivalent to minimising the negative log likelihood
equivalent to minimising the average negative log likelihood

"""

In [None]:
# Evaluate the quality of the bigram model: sum log P[prev, next] over every
# bigram in the data set and report the (average) negative log likelihood.
log_likelihood = 0.0
n = 0

for w in words:
    chs = ["."] + list(w) + ["."]  # start & end markers on a word
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1

print(f"{log_likelihood=}")
# negative log likelihood (lower is better)
nll = -log_likelihood
nil = nll  # old misspelled name, kept so anything still referring to it keeps working
print(f"{nll=}")
# average negative log likelihood per bigram
print(f"{nll / n}")

In [None]:
"""
the nn problem:


"""

In [None]:
# Training set for the bigram task, built from the first word only.
# xs: index of the current character (input)
# ys: index of the character that follows it (target)

xs, ys = [], []

for name in words[:1]:
    padded = "." + name + "."
    for prev_ch, next_ch in zip(padded, padded[1:]):
        xs.append(stoi[prev_ch])
        ys.append(stoi[next_ch])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

"""
xs & ys are integer indices for characters. Fed in as raw integers, they would
suggest an ordering/magnitude between characters that does not exist, so we
one-hot encode each index into a 27-dimensional vector before giving it to the NN.
"""

In [None]:
# Input character indices.
xs

In [None]:
# Target (next-character) indices.
ys

In [None]:
# One-hot encode the input indices, then cast to float (nns like floats not ints).
onehot = F.one_hot(xs, num_classes=27)
xenc = onehot.float()
# Show the encoding as an image: one row per example, one column per class.
plt.imshow(xenc)

In [None]:
"""
For input word, say, "emma." (5 chars), we make 5 independent forward passes through the network. one per character.

- OHE triggers one of the 27 neurons at a time. so 'e' triggers neuron 5 of the 27 (0-indexed).
- The weight multiplication of 27 weights in that neuron is just how 'e' is interacting with all the possible 27 outputs.
- So essentially, matmul acts as a switch. Given a char input, it selects all the 27 weights of that neuron as-is.
"""

# One-hot inputs as floats (nns like floats not ints).
xenc = F.one_hot(xs, num_classes=27).float()
g = torch.Generator()
g.manual_seed(2147483647 + 1)
# First layer of the net: 27 neurons, each with 27 randomly initialised weights.
W = torch.randn(27, 27, generator=g)
# Each one-hot row picks out the matching row of W.
torch.matmul(xenc, W)

In [None]:
"""
LOGITS — Quick Reference

What are they?
Raw, unnormalized scores output by a neural network before converting to probabilities.

The flow:
input → network → LOGITS → softmax → probabilities
                  (any real number)   (0 to 1, sum=1)

Why "logits"?
From logistic regression — "logit" = log-odds. Just ML jargon for "pre-softmax values."

Key properties:
- Can be any real number (negative, positive, large, small)
- Not probabilities — don't sum to 1
- Only relative differences matter (adding a constant to all logits doesn't change final probabilities)

Why use them?
- Unconstrained optimization — model can freely push scores up/down during training
- Softmax handles normalization — separation of concerns
- Cleaner gradients — no boundary issues near 0 or 1

Example:
logits = [0.1, -0.3, 0.9, 0.4, -0.2]   <- raw network output
probs  = softmax(logits)               <- [0.17, 0.11, 0.37, 0.23, 0.12]


LOG-ODDS — Quick Reference

Odds
----
Another way to express probability.

odds = probability of event / probability of NOT event
     = p / (1 - p)

Example: 70% chance of rain
odds = 0.70 / 0.30 = 2.33
Meaning: rain is 2.33 times more likely than no rain ("2.33 to 1 odds")


Log-odds (the "logit")
----------------------
Just take the logarithm of the odds:

log-odds = log(p / (1 - p))

Why bother?
- Probability is bounded: 0 to 1
- Odds is bounded: 0 to infinity
- Log-odds is unbounded: -infinity to +infinity

That's the magic. Log-odds can be any real number, just like neural network logits.


The connection
--------------
p = 0.50  ->  odds = 1.0   ->  log-odds = 0
p = 0.70  ->  odds = 2.33  ->  log-odds = +0.85
p = 0.30  ->  odds = 0.43  ->  log-odds = -0.85
p = 0.99  ->  odds = 99    ->  log-odds = +4.6
p = 0.01  ->  odds = 0.01  ->  log-odds = -4.6

Notice:
- 50/50 sits at zero
- More likely = positive
- Less likely = negative
- Symmetric and unbounded

That's why neural network outputs are called "logits" — they live in this same unconstrained space.

"""

In [None]:
# Treat the layer output as log-counts, exponentiate to get positive "counts"
# (the analogue of the N matrix built earlier), then normalise each row.
logits = torch.matmul(xenc, W)
counts = torch.exp(logits)
row_totals = counts.sum(dim=1, keepdim=True)
probs = counts / row_totals

# last 2 lines are softmax: for a given linear output convert to probabilities
# softmax walks the progression below from right to left (logits back to probabilities):
# probability → odds → log-odds (logit)
# [0 to 1]      [0 to ∞]   [-∞ to +∞]
"""
LOGITS TO PROBABILITIES — The Full Pipeline

The trifecta
------------
probability -> odds -> log-odds (logit)
[0 to 1]      [0 to inf]   [-inf to +inf]

Each step "unlocks" more of the number line.
They're all just different ways of saying the same thing — fully convertible.


Going backwards
---------------
logits -> exp() -> odds -> normalize -> probabilities

Exponentiate log-odds to get odds.
Normalize odds to get probabilities.


Softmax does both steps in one
------------------------------
softmax(logits) = exp(logits) / sum(exp(logits))
                      |              |
                 get odds      normalize them

So when you call softmax, you're really doing:
1. Exponentiate to get back to odds-space
2. Divide by the total so they sum to 1

That's the whole trick. Softmax = "exp then normalize."
"""

In [None]:
# Walk through every training example and show how its negative log likelihood
# is obtained from the network's output probabilities.
# The example count comes from `xs` rather than a hard-coded 5, so this cell
# keeps working when the training set is built from a different word.
num_examples = xs.nelement()
nlls = torch.zeros(num_examples)
for i in range(num_examples):
    x = xs[i].item()
    y = ys[i].item()

    print("-----------------")

    print(f"bigram example {i + 1}: {itos[x]}{itos[y]} (indexes {x}, {y})")
    print("input to the neural net:", x)
    print("output probabilities from the neural net:", probs[i])
    print("label (actual next character):", y)
    # Probability the net put on the character that actually came next.
    p = probs[i, y]
    print("probability assigned by the net to the correct character:", p.item())
    logp = torch.log(p)
    print("log likelihood:", logp.item())
    nll = -logp
    print("negative log likelihood:", nll.item())
    nlls[i] = nll

print("==============")
print("avg negative log likelihood", nlls.mean().item())

In [None]:
# optimisation now

In [None]:
# Seeded weight matrix for the single layer (27 neurons, 27 weights each);
# requires_grad lets autograd collect d(loss)/dW.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# forward pass

xenc = F.one_hot(xs, num_classes=27).float()  # nns like floats not ints
logits = xenc @ W  # log-counts
counts = logits.exp()  # equivalent to the N-matrix that we had created earlier
probs = counts / counts.sum(1, keepdims=True)
# Pick, for every example, the probability given to its true next character and
# average the negative logs. The row index is sized from `xs` instead of a
# hard-coded 5 so the loss stays correct for any number of examples.
loss = -probs[torch.arange(xs.nelement()), ys].log().mean()

In [None]:
"""
there is also some math on regularisation

REGULARIZATION — The Math

The problem
-----------
Model overfits = memorizes training data, fails on new data.
Solution: penalize complexity (big weights).

Without regularization
----------------------
loss = prediction_loss

With regularization (L2 / weight decay)
---------------------------------------
loss = prediction_loss + lambda * sum(W^2)

Where:
- W = all your weights
- W^2 = square each weight (makes all positive, penalizes big ones more)
- sum(W^2) = add them all up
- lambda = how much you care about keeping weights small (you tune this)

Concrete example
----------------
W = [0.5, -2.0, 0.1, 3.0]
W^2 = [0.25, 4.0, 0.01, 9.0]
sum(W^2) = 13.26

If lambda = 0.01:
regularization penalty = 0.01 * 13.26 = 0.1326

If prediction loss = 2.5:
total loss = 2.5 + 0.1326 = 2.6326

The gradient (backprop)
-----------------------
For any weight w, the regularization term w^2 contributes:

d/dw (lambda * w^2) = 2 * lambda * w

So gradient has two parts:
gradient = gradient_from_prediction + 2 * lambda * w

Big weights get pushed harder toward zero (push is proportional to w).

Why "weight decay"?
-------------------
Weights decay toward zero a little bit each step:

w_new = w_old - learning_rate * pred_gradient - learning_rate * 2 * lambda * w_old
                    ^                       ^
             fit the data            decay toward zero

Same math, different name.

The effect
----------
- Without: weights can grow large, model overfits
- With: weights stay small unless truly needed, model generalizes better
"""

In [None]:
# Show the current loss as a plain Python number.
print(loss.item())

In [None]:
# backward pass

# Drop any gradient left over from a previous step before computing a new one.
W.grad = None
# Populate W.grad with d(loss)/dW.
loss.backward()

In [None]:
# One gradient-descent step with learning rate 0.1 (updates W outside autograd).
W.data -= 0.1 * W.grad

In [None]:
# more final

In [None]:
# Build the (input, target) index pairs from every word in the data set.
xs, ys = [], []

for name in words:
    padded = "." + name + "."
    for prev_ch, next_ch in zip(padded, padded[1:]):
        xs.append(stoi[prev_ch])
        ys.append(stoi[next_ch])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

print("number of examples", num)

# Fresh, seeded weights for the single layer (27 neurons, 27 weights each).
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [None]:
# Gradient descent over the full training set.
# The one-hot inputs never change between steps, so encode them once up front
# instead of rebuilding the same tensor on each of the 100 iterations.
xenc = F.one_hot(xs, num_classes=27).float()  # nns like floats not ints

for k in range(100):
    # forward pass
    logits = xenc @ W  # log-counts
    counts = logits.exp()  # equivalent to the N-matrix that we had created earlier
    probs = counts / counts.sum(1, keepdims=True)
    # mean negative log probability of the true next character
    loss = -probs[torch.arange(num), ys].log().mean()

    print(loss.item())

    # backward pass
    W.grad = None  # clear the previous step's gradient
    loss.backward()

    # update: step against the gradient with learning rate 50
    W.data += -50 * W.grad

In [244]:
# sample the nn: same walk as sampling from the count table, except that the
# next-character distribution now comes from a forward pass through W
# (previously it was simply p = P[ix]).

g = torch.Generator()
g.manual_seed(2147483647)

for _ in range(5):
    out = []
    ix = 0  # start from the "." boundary token
    finished = False

    while not finished:
        # Forward pass for a single character: one-hot -> log-counts -> softmax.
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W  # predicts log-counts
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims=True)

        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])

        # Drawing the boundary token again ends the word.
        finished = ix == 0

    print("".join(out))

# ps: output remains very very similar cuz this bigram NN is pretty much the same as what we trained earlier.

cexze.
momasurailezityha.
konimittain.
llayn.
ka.
