In [1]:
path = "/content/drive/MyDrive/Colab Notebooks/AndrejKarpathy_NN_Hero/names.txt"

In [2]:
import numpy as np
import torch
import torch.nn.functional as F

In [3]:
words = open(path, mode='r').read().splitlines()

In [4]:
chars = ['.'] + list("abcdefghijklmnopqrstuvwxyz")
stoi = {c: i for i, c in enumerate(chars)}
itos = {i: c for c, i in stoi.items()}

# Quadgram

In [63]:
# Create a training data set

x1s = []
x2s = []
x3s = [] # Can use a 2d Array with 3 Sub Arrays
ys = []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3, ch4 in zip(chs, chs[1:], chs[2:], chs[3:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        ix4 = stoi[ch4]

        x1s.append(ix1)
        x2s.append(ix2)
        x3s.append(ix3)
        ys.append(ix4)

x1s = torch.tensor(x1s)
x2s = torch.tensor(x2s)
x3s = torch.tensor(x3s)
ys = torch.tensor(ys)
print(f"Total Examples: {ys.nelement()}")

x1enc = F.one_hot(x1s, num_classes=27).float()
x2enc = F.one_hot(x2s, num_classes=27).float()
x3enc = F.one_hot(x3s, num_classes=27).float()
xenc = torch.cat([x1enc, x2enc, x3enc], dim=1)

Total Examples: 164080


In [64]:
# Create the network

g = torch.Generator().manual_seed(2147483647)
W = torch.randn((81, 27), generator=g, requires_grad=True)

# Produce Output
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), ys].log().mean()

print(f'Loss: {loss.item()}')

Loss: 4.368551731109619


In [74]:
# Train and Optimize the Network: Gradient Descent
n = ys.nelement()

for k in range(300):
    # Forward Pass
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), ys].log().mean()

    # Backward Pass
    W.grad = None
    loss.backward()

    # Nudge Weights
    W.data += -1 * W.grad
print(f'Loss: {loss.item()}')

Loss: 2.262303352355957


In [32]:
# Minimum loss of 0.0055 is achieved

In [60]:
# Sampling

g = torch.Generator().manual_seed(2147483647)

for k in range(20):
    idx = [0, 0, 0]
    wrd = '.'
    while True:
        x1enc = F.one_hot(torch.tensor([idx[0]]), num_classes=27).float()
        x2enc = F.one_hot(torch.tensor([idx[1]]), num_classes=27).float()
        x3enc = F.one_hot(torch.tensor(idx[2]), num_classes=27).float().unsqueeze(0)
        # Wrap idx[0] in [] - this will produce [1, 27], otherwise it will be [27] and we will have to .unsqueeze it
        xenc = torch.cat([x1enc, x2enc, x3enc], dim=1)
        logits = xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        ix_next = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        wrd += itos[ix_next]
        if ix_next == 0:
            break
        idx = idx[1:] + [ix_next]
    print(wrd)

..
.uoidvdvpkpqga.
.uyzftywedvzqfuybftmozckbxgtebwdwdqgaadvdaibedmczphvdaikrwlmtyasdjpicaycfwebfdvvtumpzyfmd.
.uojfpffybbgjqbmryqdhoidayaebfjmpypftxd.
.nojdmbudmfyfvwpwgapsxhzjiiussszxhpdgaiffibiauilzlglplglmvyapejkrolllllllmpyxywejikuycftvqdmpzhvpchvckmckr.
.ytyhwdqztxa.
.nzyuisznlyiquiqzavwpocbgdvqmbyayawdw.
.n.
..
.nzzgkfqvxlptqmgycaleevvdvzuipewffmbbxqqgrjxbtearchwdmuzhiouycycdwckdhalglplpgpebidzpulsciqeaydwzuivzpihulgibdhvkdokgtilwlxsstzzgiiuisznzipiqdapbgdwvdwfdmhukbxdvjvpkmykdtnajbfjmbkdqfvwdkbdduihzpuisidzbgpfdvhwbuhstvwtxd.
.n.
.uy.
.htyudtfjtpa.
.ynzfisqnlyppggjikullovdmszdfvkbxdqzmfvwxbt.
..
.n.
.hzoigaibudmszxivwdb.
.uoisuiogtkvnvqgabmbqdvrqfalaslylgalelgjpqiardyzq.
.nsgirqdmhvipagkfdvzainfqbadmfulmskdtcvjtxq.
..


# Bigram & Trigram - Train, Test Split

## BIGRAM

In [14]:
# BIGRAM

xsb = []
ysb = []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xsb.append(ix1)
        ysb.append(ix2)

xsb = torch.tensor(xsb)
ysb = torch.tensor(ysb)
n = ysb.nelement()
print(f"Total Examples: {n}")

xbenc = F.one_hot(xsb, num_classes=27).float()

Total Examples: 228146


In [15]:
# Split the data
from sklearn.model_selection import train_test_split

# Step 1: Split into train (80%) and temp (20% → dev+test)
xb_train, xb_temp, yb_train, yb_temp = train_test_split(xbenc, ysb, test_size=0.2, random_state=42)

# Step 2: Split temp into dev (10%) and test (10%)
xb_dev, xb_test, yb_dev, yb_test = train_test_split(xb_temp, yb_temp, test_size=0.5, random_state=42)

# Confirm shapes
print(xb_train.shape, yb_train.shape)  # ~80%
print(xb_dev.shape, yb_dev.shape)      # ~10%
print(xb_test.shape, yb_test.shape)    # ~10%

torch.Size([182516, 27]) torch.Size([182516])
torch.Size([22815, 27]) torch.Size([22815])
torch.Size([22815, 27]) torch.Size([22815])


In [16]:
# Create the network

g = torch.Generator().manual_seed(2147483647)
Wb = torch.randn((27, 27), generator=g, requires_grad=True)

nb_train = yb_train.nelement()
# Produce Output
logitsb = xb_train @ Wb
countsb = logitsb.exp()
probsb = countsb / countsb.sum(1, keepdims=True)
lossb = -probsb[torch.arange(nb_train), yb_train].log().mean()

print(f'Loss: {lossb.item()}')

Loss: 3.759887933731079


In [21]:
# Train and Optimize the Network: Gradient Descent

for k in range(500):
    # Forward Pass
    logitsb = xb_train @ Wb
    countsb = logitsb.exp()
    probsb = countsb / countsb.sum(1, keepdims=True)
    lossb = -probsb[torch.arange(nb_train), yb_train].log().mean()

    # Backward Pass
    Wb.grad = None
    lossb.backward()

    # Nudge Weights
    Wb.data += -1 * Wb.grad
print(f'Loss: {lossb.item()}')

Loss: 2.506399631500244


In [30]:
# Evalutate the model on dev, test dataset

# Dev data
g = torch.Generator().manual_seed(2147483647)

logitsb_dev = xb_dev @ Wb
countsb_dev = logitsb_dev.exp()
probsb_dev = countsb_dev / countsb_dev.sum(1, keepdims=True)

# Loss of Dev Data on NN
nb_dev = yb_dev.nelement()
lossb_dev = -probsb_dev[torch.arange(nb_dev), yb_dev].log().mean().item()

print(f"Loss: {lossb_dev:.4f}")

yb_dev_pred = torch.multinomial(probsb_dev, num_samples=1, replacement=True, generator=g).squeeze()

# Accuracy Test
accuracyb_dev = (yb_dev_pred == yb_dev).float().mean() * 100
print(f"Bigram Dev accuracy: {accuracyb_dev:.4f}%")

Loss: 2.5120
Bigram Dev accuracy: 12.2814%


In [32]:
# Evalutate the model on test dataset

# Dev data
g = torch.Generator().manual_seed(2147483647)

logitsb_test = xb_test @ Wb
countsb_test = logitsb_test.exp()
probsb_test = countsb_test / countsb_test.sum(1, keepdims=True)

# Loss of Dev Data on NN
nb_test = yb_test.nelement()
lossb_test = -probsb_test[torch.arange(nb_test), yb_test].log().mean().item()

print(f"Loss: {lossb_test:.4f}")

yb_test_pred = torch.multinomial(probsb_test, num_samples=1, replacement=True, generator=g).squeeze()

# Accuracy Test
accuracyb_test = (yb_test_pred == yb_test).float().mean() * 100
print(f"Bigram Dev accuracy: {accuracyb_test:.4f}%")

Loss: 2.5052
Bigram Dev accuracy: 12.1543%


## TRIGRAM

In [5]:
# Preparing the data
xt = []
yt = []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xt.append([ix1, ix2])  # input: two chars
        yt.append(ix3)         # output: next char

xt = torch.tensor(xt)
yt = torch.tensor(yt)
nt = yt.nelement()
print(f"Total Trigram Examples: {nt}")

# One-hot encode each of the two input indices, then concatenate to form input of shape [nt, 54]
x1enc = F.one_hot(xt[:, 0], num_classes=27)
x2enc = F.one_hot(xt[:, 1], num_classes=27)
xtenc = torch.cat([x1enc, x2enc], dim=1).float()

Total Trigram Examples: 196113


In [6]:
# Splitting the data
from sklearn.model_selection import train_test_split

xt_train, xt_temp, yt_train, yt_temp = train_test_split(xtenc, yt, test_size=0.2, random_state=42)
xt_dev, xt_test, yt_dev, yt_test = train_test_split(xt_temp, yt_temp, test_size=0.5, random_state=42)

print(xt_train.shape, yt_train.shape)
print(xt_dev.shape, yt_dev.shape)
print(xt_test.shape, yt_test.shape)

torch.Size([156890, 54]) torch.Size([156890])
torch.Size([19611, 54]) torch.Size([19611])
torch.Size([19612, 54]) torch.Size([19612])


In [7]:
# Create network
g = torch.Generator().manual_seed(2147483647)
Wt = torch.randn((54, 27), generator=g, requires_grad=True)

nt_train = yt_train.nelement()

# Training Loop
for k in range(500):
    # Forward pass
    logitst = xt_train @ Wt
    countst = logitst.exp()
    probst = countst / countst.sum(1, keepdims=True)
    losst = -probst[torch.arange(nt_train), yt_train].log().mean()

    # Backward pass
    Wt.grad = None
    losst.backward()

    # Update weights
    Wt.data += -1 * Wt.grad

print(f'Training Loss: {losst.item():.4f}')

Training Loss: 2.5139


In [8]:
# Training Loop
for k in range(500):
    # Forward pass
    logitst = xt_train @ Wt
    countst = logitst.exp()
    probst = countst / countst.sum(1, keepdims=True)
    losst = -probst[torch.arange(nt_train), yt_train].log().mean()

    # Backward pass
    Wt.grad = None
    losst.backward()

    # Update weights
    Wt.data += -1 * Wt.grad

print(f'Training Loss: {losst.item():.4f}')

Training Loss: 2.3840


In [9]:
# Dev Data Evaluation
nt_dev = yt_dev.nelement()
logitst_dev = xt_dev @ Wt
countst_dev = logitst_dev.exp()
probst_dev = countst_dev / countst_dev.sum(1, keepdim=True)
losst_dev = -probst_dev[torch.arange(nt_dev), yt_dev].log().mean().item()
print(f"Trigram Dev Loss: {losst_dev:.4f}")

# Accuracy
g = torch.Generator().manual_seed(2147483647)
yt_dev_pred = torch.multinomial(probst_dev, num_samples=1, replacement=True, generator=g).squeeze()
accuracyt_dev = (yt_dev_pred == yt_dev).float().mean() * 100
print(f"Trigram Dev Accuracy: {accuracyt_dev:.4f}%")

Trigram Dev Loss: 2.3784
Trigram Dev Accuracy: 16.0522%


In [10]:
# Test Data Evaluation
nt_test = yt_test.nelement()
logitst_test = xt_test @ Wt
countst_test = logitst_test.exp()
probst_test = countst_test / countst_test.sum(1, keepdim=True)
losst_test = -probst_test[torch.arange(nt_test), yt_test].log().mean().item()
print(f"Trigram Test Loss: {losst_test:.4f}")

yt_test_pred = torch.multinomial(probst_test, num_samples=1, replacement=True, generator=g).squeeze()
accuracyt_test = (yt_test_pred == yt_test).float().mean() * 100
print(f"Trigram Test Accuracy: {accuracyt_test:.4f}%")

Trigram Test Loss: 2.3815
Trigram Test Accuracy: 15.4548%


In [11]:
# For the same given data, Trigram produces more accurate results, when compared to bigram

# 3. Regularization of Trigram Model

In [13]:
# Define regularization strengths to try
lambdas = [0.0, 0.01, 0.1, 1.0, 10.0]
best_lambda = None
best_dev_loss = float('inf')
best_Wt = None

g = torch.Generator().manual_seed(2147483647)

for lam in lambdas:
    # Reset weights
    Wt = torch.randn((54, 27), generator=g, requires_grad=True)

    # Train the model
    for k in range(500):
        # Forward pass
        logitst = xt_train @ Wt
        countst = logitst.exp()
        probst = countst / countst.sum(1, keepdims=True)
        losst = -probst[torch.arange(yt_train.nelement()), yt_train].log().mean()

        # Add L2 regularization
        losst += lam * Wt.pow(2).mean()

        # Backward pass
        Wt.grad = None
        losst.backward()
        Wt.data += -1 * Wt.grad

    # Evaluate on dev set
    logitst_dev = xt_dev @ Wt
    countst_dev = logitst_dev.exp()
    probst_dev = countst_dev / countst_dev.sum(1, keepdim=True)
    losst_dev = -probst_dev[torch.arange(yt_dev.nelement()), yt_dev].log().mean().item()

    print(f"Lambda: {lam:.3f}, Dev Loss: {losst_dev:.4f}")

    if losst_dev < best_dev_loss:
        best_dev_loss = losst_dev
        best_lambda = lam
        best_Wt = Wt.detach().clone()

print(f"\n✅ Best Lambda: {best_lambda}, with Dev Loss: {best_dev_loss:.4f}")

Lambda: 0.000, Dev Loss: 2.5109
Lambda: 0.010, Dev Loss: 2.5183
Lambda: 0.100, Dev Loss: 2.4818
Lambda: 1.000, Dev Loss: 2.4699
Lambda: 10.000, Dev Loss: 2.7915

✅ Best Lambda: 1.0, with Dev Loss: 2.4699


In [14]:
# Train model with best lambda and evaluate on test data

g = torch.Generator().manual_seed(2147483647)
Wt = torch.randn((54, 27), generator=g, requires_grad=True)

# Train the model
for k in range(500):
    # Forward pass
    logitst = xt_train @ Wt
    countst = logitst.exp()
    probst = countst / countst.sum(1, keepdims=True)
    losst = -probst[torch.arange(yt_train.nelement()), yt_train].log().mean()

    # Add L2 regularization
    losst += best_lambda * Wt.pow(2).mean()

    # Backward pass
    Wt.grad = None
    losst.backward()
    Wt.data += -1 * Wt.grad

logitst = xt_train @ Wt
countst = logitst.exp()
probst = countst / countst.sum(1, keepdims=True)
losst = -probst[torch.arange(yt_train.nelement()), yt_train].log().mean().item()
print(f'Train Loss: {losst:.4f}')

# Evaluate on dev set
logitst_dev = xt_dev @ Wt
countst_dev = logitst_dev.exp()
probst_dev = countst_dev / countst_dev.sum(1, keepdim=True)
losst_dev = -probst_dev[torch.arange(yt_dev.nelement()), yt_dev].log().mean().item()
print(f"Dev Loss: {losst_dev:.4f}")

# Evaluate on test set
logitst_test = xt_test @ Wt
countst_test = logitst_test.exp()
probst_test = countst_test / countst_test.sum(1, keepdim=True)
losst_test = -probst_test[torch.arange(yt_test.nelement()), yt_test].log().mean().item()
print(f"Dev Loss: {losst_test:.4f}")

Train Loss: 2.4721
Dev Loss: 2.4700
Dev Loss: 2.4694


# Using F.cross_entropy for loss

In [15]:
# Train model with best lambda and evaluate on test data

g = torch.Generator().manual_seed(2147483647)
Wt = torch.randn((54, 27), generator=g, requires_grad=True)

# Train the model
for k in range(500):
    losst = F.cross_entropy(xt_train @ Wt, yt_train)
    losst += best_lambda * Wt.pow(2).mean()

    Wt.grad = None
    losst.backward()
    Wt.data += -1 * Wt.grad

# Evaluate on train set
losst = F.cross_entropy(xt_train @ Wt, yt_train).item()
print(f'Train Loss: {losst:.4f}')

# Evaluate on dev set
losst_dev = F.cross_entropy(xt_dev @ Wt, yt_dev).item()
print(f"Dev Loss: {losst_dev:.4f}")

# Evaluate on test set
losst_test = F.cross_entropy(xt_test @ Wt, yt_test).item()
print(f"Test Loss: {losst_test:.4f}")

Train Loss: 2.4721
Dev Loss: 2.4700
Test Loss: 2.4694


In [17]:
# Using F.cross_entropy is preferred as it is more efficient, faster, and less prone to errors.
# Also takes care of any floating point overflow or underflow

# F.cross_entropy takes care of:
    # Softmax of logits
    # Log
    # NLL
    # Mean

In [19]:
## SUMMARY

# 1. Generate datasets using words
# 2. Encode input dataset (and concat incase of trigram or more)
# 3. Generate random weights for 27 Neurons
# 4. Train the Model using loss-back propogation
    # Use cross_entropy(logits) to calculate loss
    # Add Regularization to limit Weight growth
# 5. Evaluate on dev, test datasets
# 6. Generate more samples using multinomial

# Graphic of Matrix Multiplication in Bigram and Trigram

## Bigram

In [20]:
# When we are doing the matrix maultiplication
    # only 1 value of the one-hot encoded vector is non-zero
    # So, although the input vector gets multiplied with all the weights of all Neurons
    # Only 1 weight from each Neuron is selected as all other weights from the Neurons become 0 after the multiplication
# So X @ W in a Bigram model simply indexes into a row of the Network
    # (ith index weight of each Neuron, where i is the index of the non-zero element in input vector)

In [21]:
# Similarly in trigram
    # only 2 indexes are non-zero out of 54
    # So the output is basically the sum of the weights of each neuron in the 2 non-zero index rows.

For simplicity let's consider our alphabet only has 3 characters, so including special sequence. 4 characters

Matrix Multiplication:
X @ W

|         @    | Neuron 0 | Neuron 1 | Neuron 2 | Neuron 3 |
| ----------- | -------- | -------- | -------- | -------- |
| **Input 0** | `W₀₀`    | `W₀₁`    | `W₀₂`    | `W₀₃`    |
| **Input 1** | `W₁₀`    | `W₁₁`    | `W₁₂`    | `W₁₃`    |
| **Input 2** | `W₂₀`    | `W₂₁`    | `W₂₂`    | `W₂₃`    |
| **Input 3** | `W₃₀`    | `W₃₁`    | `W₃₂`    | `W₃₃`    |

Y \= y0 + y1 + y2 + y3
<br> = (x0\*w00 , x0\*w01 , x0\*w02 , x0\*w03)
<br> \+ (x1\*w10 , x1\*w11 , x1\*w12 , x1\*w13)
<br> \+ (x2\*w20 , x2\*w21 , x2\*w22 , x2\*w23)
<br> \+ (x3\*w30 , x3\*w31 , x3\*w32 , x3\*w33)

For x0, x1, x2 = 0 & x3=1

Y = (x3\*w30 , x3\*w31 , x3\*w32 , x3\*w33)
<br> = [w30, w31, w32, w33]
<br> = Row i of W (where i is the index of non-zero element in Input Vector

Y = W[i]

## Trigram

In [22]:
# Similarly, 2 inputs are non zero. So the output will be the sum of individuals weights of the non-zero index's rows.