Reference Paper: [A Neural Probabilistic Language Model - Bengio et al.](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)
<br>
## Graphic of an MLP:
![MLP](https://i.postimg.cc/4d03m46Q/Screenshot-2025-06-09-022351.png)

In [27]:
import torch
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
import torch.nn.functional as F

# Exercise 1: Get a loss < 2.2 (Parameter & Hyperparameter tuning)

## Creating Dataset

In [1]:
# Collecting words: one entry per line of the names file.
# NOTE(review): this looks like a Colab Google Drive mount path; it will not
# resolve in other environments -- confirm before re-running elsewhere.
path = '/content/drive/MyDrive/Colab Notebooks/AndrejKarpathy_NN_Hero/names.txt'
with open(path, 'r') as f:  # context manager closes the handle once the text is read
    words = f.read().splitlines()

# Mapping: every distinct character gets an integer index starting at 1
# (in sorted order); index 0 is reserved for the '.' marker.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {i: s for s, i in stoi.items()}

In [29]:
block_size = 4  # context length: how many preceding character indices form one input row
X, Y = [], []

# For each word, left-pad its index sequence (terminated by '.') with
# block_size zeros, then slide a window over it: every window of block_size
# indices is an input row and the index right after the window is its target.
for w in words:
    padded = [0] * block_size + [stoi[ch] for ch in w + '.']
    for pos in range(len(w) + 1):
        X.append(padded[pos:pos + block_size])
        Y.append(padded[pos + block_size])

X = torch.tensor(X)
Y = torch.tensor(Y)

# Split the dataset (train: 80%, dev: 10%, test: 10%).
# First carve off 80% for training, then halve the remainder into dev and test.
rs = 2147483647
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y,
    train_size=0.8, random_state=rs, shuffle=True,
)
X_dev, X_test, Y_dev, Y_test = train_test_split(
    X_temp, Y_temp,
    train_size=0.5, random_state=rs, shuffle=True,
)

## Creating MLP Network

In [59]:
# Initializing parameters (plain standard-normal draws, no scaling)
g = torch.Generator().manual_seed(2147483647)
n_emb = 4    # width of the embedding vector looked up for each character index
n_h = 200    # number of neurons in the hidden layer

# Values are drawn from g in the order C, w1, b1, w2, b2.
C = torch.randn(27, n_emb, generator=g).requires_grad_()                 # embedding lookup table (27 rows)
w1 = torch.randn(n_emb * block_size, n_h, generator=g).requires_grad_()  # hidden layer weights
b1 = torch.randn(n_h, generator=g).requires_grad_()                      # hidden layer biases
w2 = torch.randn(n_h, 27, generator=g).requires_grad_()                  # output layer weights
b2 = torch.randn(27, generator=g).requires_grad_()                       # output layer biases

# Every tensor the training loop updates.
params = [C, w1, b1, w2, b2]

In [60]:
# Total number of scalar entries across all parameter tensors
sum(p.numel() for p in params)

8935

## Tuning

In [61]:
# Minibatch SGD on the training split with a learning rate that decays
# exponentially from 1 to 0.001 over the run.
g = torch.Generator().manual_seed(2147483647)
itrns = 200000

# learning rates
lri = torch.linspace(0, -3, itrns)  # exponents: from 0 to -3 in itrns steps
lre = 10**lri  # from 10**0=1 to 10**-3=0.001 in itrns steps

for i in range(itrns):
    # minibatch: 64 random row indices into the training split
    ix = torch.randint(low=0, high=X_train.shape[0], size=(64, ), generator=g)

    # forward pass: embed -> flatten the context -> tanh hidden layer -> logits
    emb = C[X_train[ix]]
    h = torch.tanh(emb.view(-1, block_size * n_emb) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, Y_train[ix])

    # backward pass: clear stale gradients, then backpropagate
    for p in params:
        p.grad = None
    loss.backward()

    # nudge: in-place SGD step. no_grad() keeps the update out of the autograd
    # graph without going through `.data`, which bypasses autograd's
    # in-place modification tracking.
    lr = lre[i]
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad

    # progress report every 10000 iterations (integer modulo; no float comparison)
    if i % 10000 == 0:
        print(f'Loss on iteration {i}: {loss.item()} | Learning Rate: {lr}')

# Loss of the final minibatch only, not of the whole training split.
print(loss.item())

Loss on iteration 0: 25.550077438354492 | Learning Rate: 1.0
Loss on iteration 10000: 2.64593243598938 | Learning Rate: 0.7079445719718933
Loss on iteration 20000: 2.6186728477478027 | Learning Rate: 0.5011854767799377
Loss on iteration 30000: 2.4271085262298584 | Learning Rate: 0.35481154918670654
Loss on iteration 40000: 2.0755672454833984 | Learning Rate: 0.25118690729141235
Loss on iteration 50000: 2.2218549251556396 | Learning Rate: 0.1778264045715332
Loss on iteration 60000: 2.1787593364715576 | Learning Rate: 0.12589123845100403
Loss on iteration 70000: 1.892807126045227 | Learning Rate: 0.08912400156259537
Loss on iteration 80000: 2.1421315670013428 | Learning Rate: 0.063094861805439
Loss on iteration 90000: 2.2556822299957275 | Learning Rate: 0.04466765746474266
Loss on iteration 100000: 2.213229179382324 | Learning Rate: 0.031622231006622314
Loss on iteration 110000: 1.9161920547485352 | Learning Rate: 0.022386789321899414
Loss on iteration 120000: 2.223454475402832 | Learnin

In [62]:
# Loss for Dev dataset

# Evaluation only: no_grad() avoids recording an autograd graph for a forward
# pass over the entire split, which nothing ever backpropagates through.
with torch.no_grad():
    emb = C[X_dev]
    h = torch.tanh(emb.view(-1, block_size * n_emb) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, Y_dev)
loss.item()

2.1384639739990234

In [63]:
# Best loss achieved with 100 Neurons = 2.20
# Let's increase number of neurons to 200, and keep everything else same:

# Other methods to optimize:
    # the best way is to increase the number of neurons in the hidden layer (100 -> 300)
    # or we can increase the dimensionality of the embedding (2 -> 10)
    # or we can change the number of characters fed in as context (3 -> 4, 5)
    # or we can run more training iterations with a decaying learning rate
    # or we can change the batch size (32 -> 64)

In [64]:
# Best loss achieved with 200 Neurons = 2.13846

In [65]:
# Loss for Test dataset

# Evaluation only: no_grad() avoids recording an autograd graph for a forward
# pass over the entire split, which nothing ever backpropagates through.
with torch.no_grad():
    emb = C[X_test]
    h = torch.tanh(emb.view(-1, block_size * n_emb) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, Y_test)
loss.item()

2.1387529373168945

In [66]:
# The model shows nearly identical loss on the dev and test sets (about 2.14), in line with the
# minibatch losses seen during training, which suggests it has not overfit the training data.
# Let's stick with it for now.

In [70]:
# SAMPLING: generate 20 names by repeatedly drawing the next character from
# the model's predicted distribution until the '.' marker (index 0) is drawn.

g = torch.Generator().manual_seed(2147483647 + 10)

# Inference only: no autograd graph is needed for any of these forward passes.
with torch.no_grad():
    for _ in range(20):
        out = []
        context = [0] * block_size  # start from an all-zero context

        while True:
            emb = C[torch.tensor([context])]
            h = torch.tanh(emb.view(1, -1) @ w1 + b1)
            logits = h @ w2 + b2
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]  # slide the window forward by one character
            if ix == 0:
                break
            out.append(ix)
        print(''.join(itos[i] for i in out))

morachmyah
seelv
dhryah
rener
jendrie
caden
daelin
shivoel
edeliennanar
kayzim
mara
noshur
rishianna
kinton
kynlee
pair
ubemavd
ryyah
faeha
kayshawn


In [68]:
# The outputs look very name-like.
# Goal achieved: the target was a loss below 2.2; we got about 2.14 on both dev and test.

# Exercise 2: Initializing with close to uniform weights

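We can scale the initial weights down so that the initial logits are close to zero and the predicted distribution over characters starts out nearly uniform.
Multiply the weights by 0.01 (and start the biases at zero).

<u>Make sure to call `requires_grad_()` after the multiplication: multiplying a tensor that already has `requires_grad=True` yields a non-leaf tensor, and autograd does not populate `.grad` on non-leaf tensors.</u>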

In [77]:
# Initializing parameters: weights scaled down by 0.01, biases at zero
g = torch.Generator().manual_seed(2147483647)
n_emb = 4    # width of the embedding vector looked up for each character index
n_h = 200    # number of neurons in the hidden layer

init_scale = 0.01

# Scale first, then mark as trainable, so each tensor is a leaf that receives .grad.
# Random values are drawn from g in the order Cx, w1x, w2x.
Cx = torch.randn(27, n_emb, generator=g).mul(init_scale).requires_grad_()                  # embedding lookup table
w1x = torch.randn(n_emb * block_size, n_h, generator=g).mul(init_scale).requires_grad_()   # hidden layer weights
b1x = torch.zeros(n_h).requires_grad_()                                                    # hidden layer biases
w2x = torch.randn(n_h, 27, generator=g).mul(init_scale).requires_grad_()                   # output layer weights
b2x = torch.zeros(27).requires_grad_()                                                     # output layer biases

# Every tensor the training loop updates.
paramsx = [Cx, w1x, b1x, w2x, b2x]

In [78]:
# Minibatch SGD on the training split for the scaled-initialization parameters,
# with a learning rate that decays exponentially from 1 to 0.001 over the run.
g = torch.Generator().manual_seed(2147483647)
itrns = 200000

# learning rates
lri = torch.linspace(0, -3, itrns)  # exponents: from 0 to -3 in itrns steps
lre = 10**lri  # from 10**0=1 to 10**-3=0.001 in itrns steps

for i in range(itrns):
    # minibatch: 64 random row indices into the training split
    ix = torch.randint(low=0, high=X_train.shape[0], size=(64, ), generator=g)

    # forward pass: embed -> flatten the context -> tanh hidden layer -> logits
    emb = Cx[X_train[ix]]
    h = torch.tanh(emb.view(-1, block_size * n_emb) @ w1x + b1x)
    logits = h @ w2x + b2x
    loss = F.cross_entropy(logits, Y_train[ix])

    # backward pass: clear stale gradients, then backpropagate
    for p in paramsx:
        p.grad = None
    loss.backward()

    # nudge: in-place SGD step. no_grad() keeps the update out of the autograd
    # graph without going through `.data`, which bypasses autograd's
    # in-place modification tracking.
    lr = lre[i]
    with torch.no_grad():
        for p in paramsx:
            p -= lr * p.grad

    # progress report every 10000 iterations (integer modulo; no float comparison)
    if i % 10000 == 0:
        print(f'Loss on iteration {i}: {loss.item()} | Learning Rate: {lr}')

# Loss of the final minibatch only, not of the whole training split.
print(loss.item())

Loss on iteration 0: 3.295844316482544 | Learning Rate: 1.0
Loss on iteration 10000: 2.3734588623046875 | Learning Rate: 0.7079445719718933
Loss on iteration 20000: 2.461765766143799 | Learning Rate: 0.5011854767799377
Loss on iteration 30000: 2.245591878890991 | Learning Rate: 0.35481154918670654
Loss on iteration 40000: 2.2268147468566895 | Learning Rate: 0.25118690729141235
Loss on iteration 50000: 2.2551960945129395 | Learning Rate: 0.1778264045715332
Loss on iteration 60000: 2.383298397064209 | Learning Rate: 0.12589123845100403
Loss on iteration 70000: 2.127276659011841 | Learning Rate: 0.08912400156259537
Loss on iteration 80000: 2.41489839553833 | Learning Rate: 0.063094861805439
Loss on iteration 90000: 2.2968294620513916 | Learning Rate: 0.04466765746474266
Loss on iteration 100000: 2.205369710922241 | Learning Rate: 0.031622231006622314
Loss on iteration 110000: 2.113295793533325 | Learning Rate: 0.022386789321899414
Loss on iteration 120000: 2.336944341659546 | Learning Rat

In [79]:
# Loss for Dev dataset

# Evaluation only: no_grad() avoids recording an autograd graph for a forward
# pass over the entire split, which nothing ever backpropagates through.
with torch.no_grad():
    emb = Cx[X_dev]
    h = torch.tanh(emb.view(-1, block_size * n_emb) @ w1x + b1x)
    logits = h @ w2x + b2x
    loss = F.cross_entropy(logits, Y_dev)
loss.item()

2.1891355514526367

In [80]:
# Loss for Test dataset

# Evaluation only: no_grad() avoids recording an autograd graph for a forward
# pass over the entire split, which nothing ever backpropagates through.
with torch.no_grad():
    emb = Cx[X_test]
    h = torch.tanh(emb.view(-1, block_size * n_emb) @ w1x + b1x)
    logits = h @ w2x + b2x
    loss = F.cross_entropy(logits, Y_test)
loss.item()

2.1833572387695312

In [81]:
# SAMPLING: generate 20 names by repeatedly drawing the next character from
# the model's predicted distribution until the '.' marker (index 0) is drawn.

g = torch.Generator().manual_seed(2147483647 + 10)

# Inference only: no autograd graph is needed for any of these forward passes.
with torch.no_grad():
    for _ in range(20):
        out = []
        context = [0] * block_size  # start from an all-zero context

        while True:
            emb = Cx[torch.tensor([context])]
            h = torch.tanh(emb.view(1, -1) @ w1x + b1x)
            logits = h @ w2x + b2x
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]  # slide the window forward by one character
            if ix == 0:
                break
            out.append(ix)
        print(''.join(itos[i] for i in out))

mora
kmyan
seel
ndhey
lorethantendra
graderedieli
jemi
jenleigh
esoraan
kayzion
kalin
shuberglairie
tricke
jeliphetton
kubelynder
yahulynn
yula
mustondrihal
salynn
uhazalel


In [82]:
# With the scaled-down (near-uniform) initialization we got almost the same loss as with the unscaled random initialization (about 2.19 vs 2.14 on dev).
# Good day, signing off