# MLP

Implement a character-level MLP language model, following [makemore part 2](https://www.youtube.com/watch?v=TCH_1BHY58I&ab_channel=AndrejKarpathy).

In [1]:
# Read the dataset: one name per line, with the line terminators stripped.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("names.txt", "r", encoding="utf-8") as file:
    names = file.read().splitlines()

len(names)

32033

## Step 1: Create the training data

Build the training pairs: each input is the previous `window_size` characters (the context window) and the target is the character that follows them.

In [19]:
window_size = 3

xchars, ychars = [], []

# Only the first three names are used so every (context --> target) pair can be printed.
# NOTE: no end-of-name '.' is appended in this cell, so '.' never appears as a target here.
for name in names[:3]:
    padded = ['.'] * window_size + list(name)
    for start in range(len(padded) - window_size):
        x = padded[start:start + window_size]
        y = padded[start + window_size]
        print(f"{x}-->{y}")
        xchars.append(x)
        ychars.append(y)

len(xchars), len(ychars)

['.', '.', '.']-->e
['.', '.', 'e']-->m
['.', 'e', 'm']-->m
['e', 'm', 'm']-->a
['.', '.', '.']-->o
['.', '.', 'o']-->l
['.', 'o', 'l']-->i
['o', 'l', 'i']-->v
['l', 'i', 'v']-->i
['i', 'v', 'i']-->a
['.', '.', '.']-->a
['.', '.', 'a']-->v
['.', 'a', 'v']-->a


(13, 13)

## Step 2: Map characters to integer indices

In [20]:
# Vocabulary: every distinct character that occurs in any name, in sorted order.
chars = sorted(set("".join(names)))
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [21]:
# Index 0 is reserved for the '.' padding token; the characters in `chars`
# take indices 1..len(chars) in order.
itos = dict(enumerate(['.'] + chars))
stoi = {ch: i for i, ch in itos.items()}

len(stoi), len(itos)

(27, 27)

In [22]:
# Encode the character windows and their targets as integer indices via `stoi`.
xs = [[stoi[ch] for ch in context] for context in xchars]
ys = [stoi[target] for target in ychars]

len(xs), len(ys)

(13, 13)

In [9]:
# Peek at the first five encoded context windows.
xs[:5]

[[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [0, 0, 0]]

In [10]:
# Peek at the first five encoded targets.
ys[:5]

[5, 13, 13, 1, 15]

In [23]:
import torch

# Build int64 index tensors directly. `torch.Tensor(...)` is the legacy
# float32 constructor, so the previous `.long()` call round-tripped every
# index through floating point before casting.
xs = torch.tensor(xs, dtype=torch.long)
ys = torch.tensor(ys, dtype=torch.long)

xs.shape, ys.shape

(torch.Size([13, 3]), torch.Size([13]))

## Step 3: Create the MLP, define the parameters

The parameters in question:
- Embedding lookup table.
- Parameters in the MLP (Weights, Biases)

In [40]:
# Seeded generator: every random parameter below is drawn from it.
g = torch.Generator().manual_seed(2147483647)
emb_dim = 2

# Embedding lookup table with one `emb_dim`-sized row per entry of `stoi`.
emb_dict = torch.randn(len(stoi), emb_dim, dtype=torch.float32, generator=g)
emb_dict.shape

torch.Size([27, 2])

In [41]:
# Hidden layer: takes the flattened window of embeddings (emb_dim * window_size values).
hidden_dim = 100
in_features = emb_dim * window_size
W1 = torch.randn(in_features, hidden_dim, dtype=torch.float32, generator=g)
b1 = torch.randn(hidden_dim, dtype=torch.float32, generator=g)

# Output layer: one column per entry of `stoi`.
n_classes = len(stoi)
W2 = torch.randn(hidden_dim, n_classes, dtype=torch.float32, generator=g)
b2 = torch.randn(n_classes, dtype=torch.float32, generator=g)

W1.shape, b1.shape, W2.shape, b2.shape

(torch.Size([6, 100]),
 torch.Size([100]),
 torch.Size([100, 27]),
 torch.Size([27]))

In [42]:
# Collect every tensor that should be updated and switch on gradient tracking for each.
params = [emb_dict, W1, b1, W2, b2]

for param in params:
    param.requires_grad_(True)

## Step 4: Create the forward pass

Flatten the input and do the forward pass

In [58]:
# Clear any gradients left over from a previous run of this cell.
for param in params:
    param.grad = None

# Look up the embedding of every index in each window, then flatten each
# window's embeddings into a single row.
inp = emb_dict[xs].view(len(xs), -1)

# Hidden layer with tanh non-linearity.
o1 = inp @ W1 + b1
h1 = o1.tanh()

# Output logits, normalised into a distribution over the entries of `stoi`.
logits = h1 @ W2 + b2
probs = logits.softmax(dim=1)
probs.shape

torch.Size([13, 27])

In [59]:
# Sanity check: one row of the softmax output should sum to 1.
probs[2].sum()

tensor(1.0000, grad_fn=<SumBackward0>)

## Step 5: Compute NLL and do backward

In [60]:
import torch.nn.functional as F

# cross_entropy is given the raw logits (not `probs`) together with the integer targets.
nll = F.cross_entropy(logits, ys)
nll

tensor(7.0138, grad_fn=<NllLossBackward0>)

In [61]:
# Backpropagate the loss to populate `.grad` on the tensors that require gradients.
nll.backward()

In [62]:
# One gradient-descent step with a fixed step size of 0.1.
# The in-place update runs under torch.no_grad() rather than writing through
# `.data`, which modifies the tensor behind autograd's back.
with torch.no_grad():
    for param in params:
        param -= 0.1 * param.grad

## Step 6: Complete the loop for smaller examples

In [87]:
window_size = 3

xs, ys = [], []

# Use the first five names. Each name is left-padded with `window_size` '.'
# tokens and followed by one '.' so the end of the name is also a target.
for name in names[:5]:
    ids = [stoi[c] for c in ['.'] * window_size + list(name) + ['.']]
    for start in range(len(ids) - window_size):
        xs.append(ids[start:start + window_size])
        ys.append(ids[start + window_size])

len(xs), len(ys)

(32, 32)

In [88]:
# Build int64 index tensors directly instead of going through the legacy
# float32 `torch.Tensor(...)` constructor followed by `.long()`.
xs = torch.tensor(xs, dtype=torch.long)
ys = torch.tensor(ys, dtype=torch.long)

xs.shape, ys.shape

(torch.Size([32, 3]), torch.Size([32]))

In [118]:
# Re-seed and re-draw the embedding table so this run starts from fresh parameters.
g = torch.Generator().manual_seed(2147483647)
emb_dim = 2

vocab_size = len(stoi)
emb_dict = torch.randn(vocab_size, emb_dim, dtype=torch.float32, generator=g)
emb_dict.shape

torch.Size([27, 2])

In [119]:
# First layer: (emb_dim * window_size) inputs -> hidden_dim units.
hidden_dim = 100
fan_in = emb_dim * window_size
W1 = torch.randn(fan_in, hidden_dim, dtype=torch.float32, generator=g)
b1 = torch.randn(hidden_dim, dtype=torch.float32, generator=g)

# Output layer: hidden_dim units -> len(stoi) logits.
fan_out = len(stoi)
W2 = torch.randn(hidden_dim, fan_out, dtype=torch.float32, generator=g)
b2 = torch.randn(fan_out, dtype=torch.float32, generator=g)

W1.shape, b1.shape, W2.shape, b2.shape

(torch.Size([6, 100]),
 torch.Size([100]),
 torch.Size([100, 27]),
 torch.Size([27]))

In [120]:
# Gather the freshly drawn tensors and enable gradient tracking on each of them.
params = [emb_dict, W1, b1, W2, b2]

for param in params:
    param.requires_grad_(True)

In [121]:
# Full-batch gradient descent on the small dataset: every iteration uses all of `xs`.
learning_rate = 0.1

for epoch in range(100):
    # Reset gradients from the previous iteration.
    for param in params:
        param.grad = None

    ## Forward pass
    inp = emb_dict[xs].view(len(xs), -1)
    h1 = torch.tanh(inp @ W1 + b1)
    logits = h1 @ W2 + b2

    ## Compute loss (cross_entropy takes the raw logits and integer targets)
    nll = F.cross_entropy(logits, ys)

    ## Backward
    nll.backward()

    ## Update params
    # Update in place under no_grad instead of writing through `.data`,
    # which modifies the tensor behind autograd's back.
    with torch.no_grad():
        for param in params:
            param -= learning_rate * param.grad

    print(f"Epoch: {epoch + 1} NLL: {nll.item()}")

Epoch: 1 NLL: 17.76971435546875
Epoch: 2 NLL: 13.656402587890625
Epoch: 3 NLL: 11.298770904541016
Epoch: 4 NLL: 9.452457427978516
Epoch: 5 NLL: 7.984264373779297
Epoch: 6 NLL: 6.891321182250977
Epoch: 7 NLL: 6.100014686584473
Epoch: 8 NLL: 5.452036380767822
Epoch: 9 NLL: 4.898151874542236
Epoch: 10 NLL: 4.414664268493652
Epoch: 11 NLL: 3.9858498573303223
Epoch: 12 NLL: 3.6028308868408203
Epoch: 13 NLL: 3.262141466140747
Epoch: 14 NLL: 2.961381196975708
Epoch: 15 NLL: 2.6982975006103516
Epoch: 16 NLL: 2.469712495803833
Epoch: 17 NLL: 2.2716610431671143
Epoch: 18 NLL: 2.1012837886810303
Epoch: 19 NLL: 1.9571774005889893
Epoch: 20 NLL: 1.8374857902526855
Epoch: 21 NLL: 1.7380964756011963
Epoch: 22 NLL: 1.6535117626190186
Epoch: 23 NLL: 1.5790901184082031
Epoch: 24 NLL: 1.5117672681808472
Epoch: 25 NLL: 1.4496049880981445
Epoch: 26 NLL: 1.391312599182129
Epoch: 27 NLL: 1.335992693901062
Epoch: 28 NLL: 1.2830532789230347
Epoch: 29 NLL: 1.232191801071167
Epoch: 30 NLL: 1.1833819150924683
Epo

## Now train on the complete dataset

### Step 1: Split the dataset on names

In [123]:
# Total number of names available before splitting.
len(names)

32033

In [124]:
from sklearn.model_selection import train_test_split

random_state = 42

# First hold out 20% of the names, then cut that held-out part in half
# to get the dev and test sets.
names_train, names_temp = train_test_split(
    names, test_size=0.2, random_state=random_state, shuffle=True
)
names_dev, names_test = train_test_split(
    names_temp, test_size=0.5, random_state=random_state, shuffle=True
)

len(names_train), len(names_dev), len(names_test)

(25626, 3203, 3204)

### Step 2: Create dataset

In [128]:
def create_dataset(names, window_size=3, vocab=None):
    """Turn a list of names into (context, next-character) index tensors.

    Each name is left-padded with `window_size` '.' tokens and followed by a
    single '.', so the first characters have a full context and the end of
    the name is itself a prediction target.

    Args:
        names: iterable of strings.
        window_size: number of preceding characters used as context (>= 0).
        vocab: optional character -> index mapping; it must contain '.'.
            Defaults to the notebook-level `stoi`.

    Returns:
        (xs, ys): `xs` is an int64 tensor of shape (N, window_size) holding the
        context indices, `ys` is an int64 tensor of shape (N,) holding the
        target indices. N is the total number of characters plus one per name.

    Raises:
        ValueError: if `window_size` is negative.
        KeyError: if a name contains a character missing from the mapping.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    char_to_ix = stoi if vocab is None else vocab

    xs, ys = [], []
    for name in names:
        # Encode the padded name once, then slide a window over the indices.
        padded = ['.'] * window_size + list(name) + ['.']
        ids = [char_to_ix[c] for c in padded]
        for start in range(len(ids) - window_size):
            xs.append(ids[start:start + window_size])
            ys.append(ids[start + window_size])

    # Build int64 tensors directly. The explicit reshape keeps `xs` two-dimensional
    # even when there are no examples (empty `names`) or window_size is 0.
    xs = torch.tensor(xs, dtype=torch.long).reshape(len(ys), window_size)
    ys = torch.tensor(ys, dtype=torch.long)

    return xs, ys

In [129]:
# Encode each split of names with create_dataset (default window size).
(x_train, y_train), (x_dev, y_dev), (x_test, y_test) = (
    create_dataset(split_names) for split_names in (names_train, names_dev, names_test)
)

In [130]:
# Shapes of the training inputs and targets.
x_train.shape, y_train.shape

(torch.Size([182497, 3]), torch.Size([182497]))

In [131]:
# Shapes of the dev inputs and targets.
x_dev.shape, y_dev.shape


(torch.Size([22882, 3]), torch.Size([22882]))

In [132]:
# Shapes of the test inputs and targets.
x_test.shape, y_test.shape


(torch.Size([22767, 3]), torch.Size([22767]))

### Step 3: Create the emb lookup table and the parameters of the NN

In [133]:
# Number of entries in the character -> index mapping (used below to size C, W2 and b2).
len(stoi)

27

In [199]:
# Hyperparams related to network arch
window_size = 3  # context length; together with emb_dim it sets the number of rows of W1
emb_dim = 10  # number of columns of the lookup table C
hidden_dim = 200  # number of columns of W1 / rows of W2

In [200]:
# Embedding lookup table: one `emb_dim`-sized row per entry of `stoi`,
# drawn from a freshly seeded generator.
g = torch.Generator().manual_seed(2147483647)

C = torch.randn(len(stoi), emb_dim, generator=g, dtype=torch.float32)

In [201]:
# NN parameters, all drawn from the generator `g` in the order W1, b1, W2, b2.

# Layer 1: flattened window of embeddings -> hidden units
flat_dim = emb_dim * window_size
W1 = torch.randn(flat_dim, hidden_dim, generator=g, dtype=torch.float32)
b1 = torch.randn(hidden_dim, generator=g, dtype=torch.float32)

# Layer 2: hidden units -> one logit per entry of `stoi`
out_dim = len(stoi)
W2 = torch.randn(hidden_dim, out_dim, generator=g, dtype=torch.float32)
b2 = torch.randn(out_dim, generator=g, dtype=torch.float32)

W1.shape, b1.shape, W2.shape, b2.shape


(torch.Size([30, 200]),
 torch.Size([200]),
 torch.Size([200, 27]),
 torch.Size([27]))

In [202]:
# Every tensor updated by the training loop; enable gradient tracking on each.
params = [C, W1, b1, W2, b2]

for p in params:
    p.requires_grad_(True)

### Step 4: Create the training loop

At every step we sample a random mini-batch and train only on it. The mini-batch gives an approximation of the full gradient, which we then use for one gradient-descent update.

In [203]:
batch_size = 128  # number of training examples sampled for each update
lr = 0.1  # initial learning rate (step size of the parameter update)
decay = 0.999999  # the learning rate is multiplied by this factor after every update

In [204]:
def _split_nll(x, y):
    """Return the mean cross-entropy of the current parameters on the split (x, y).

    Runs the same forward pass as the training step, without recording gradients.
    """
    with torch.no_grad():
        emb = C[x].view(x.shape[0], -1)
        hidden = torch.tanh(emb @ W1 + b1)
        return F.cross_entropy(hidden @ W2 + b2, y)


# Decay a working copy of the learning rate so the configured `lr` is left
# untouched and re-running this cell starts from the same rate.
curr_lr = lr

# NOTE: each "epoch" below is a single mini-batch update, not a full pass over x_train.
for epoch in range(200000):
    # Set grads as None
    for p in params:
        p.grad = None

    # Create a mini batch. The indices are drawn from the seeded generator `g`
    # so the sequence of batches is reproducible.
    ix = torch.randint(0, x_train.shape[0], (batch_size,), generator=g)
    x_batch = x_train[ix]
    y_batch = y_train[ix]

    # Forward pass
    inp = C[x_batch].view(batch_size, -1)
    h1 = torch.tanh(inp @ W1 + b1)
    logits = h1 @ W2 + b2

    # Loss computation
    nll = F.cross_entropy(logits, y_batch)

    # Backward and update weights, lr
    nll.backward()
    # Update in place under no_grad instead of writing through `.data`,
    # which modifies the tensor behind autograd's back.
    with torch.no_grad():
        for p in params:
            p -= curr_lr * p.grad
    curr_lr *= decay

    # Every 10000 updates, report the loss on the full train and dev sets.
    if epoch % 10000 == 0:
        train_nll = _split_nll(x_train, y_train)
        dev_nll = _split_nll(x_dev, y_dev)

        print(f"Epoch {epoch + 1}, Train NLL: {train_nll.item()}, Dev NLL: {dev_nll.item()}")


Epoch 1, Train NLL: 24.376293182373047, Dev NLL: 24.451757431030273
Epoch 10001, Train NLL: 2.3863210678100586, Dev NLL: 2.395778179168701
Epoch 20001, Train NLL: 2.3319787979125977, Dev NLL: 2.352963447570801
Epoch 30001, Train NLL: 2.2805376052856445, Dev NLL: 2.303104877471924
Epoch 40001, Train NLL: 2.271453380584717, Dev NLL: 2.297417640686035
Epoch 50001, Train NLL: 2.223081350326538, Dev NLL: 2.248016357421875
Epoch 60001, Train NLL: 2.20719313621521, Dev NLL: 2.2350494861602783
Epoch 70001, Train NLL: 2.194711923599243, Dev NLL: 2.227790594100952
Epoch 80001, Train NLL: 2.1956119537353516, Dev NLL: 2.2302663326263428
Epoch 90001, Train NLL: 2.1956751346588135, Dev NLL: 2.2314772605895996
Epoch 100001, Train NLL: 2.1695024967193604, Dev NLL: 2.2090086936950684
Epoch 110001, Train NLL: 2.1677730083465576, Dev NLL: 2.209225654602051
Epoch 120001, Train NLL: 2.1476802825927734, Dev NLL: 2.1932027339935303
Epoch 130001, Train NLL: 2.1489768028259277, Dev NLL: 2.192969560623169
Epoch

In [206]:
# Evaluate the trained parameters on each split, in the order train, dev, test.
# No gradients are needed, so the whole pass runs under no_grad.
split_nlls = []
with torch.no_grad():
    for x_split, y_split in ((x_train, y_train), (x_dev, y_dev), (x_test, y_test)):
        inp = C[x_split].view(x_split.shape[0], -1)
        h1 = torch.tanh(inp @ W1 + b1)
        logits = h1 @ W2 + b2
        split_nlls.append(F.cross_entropy(logits, y_split))

train_nll, dev_nll, test_nll = split_nlls

train_nll.item(), dev_nll.item(), test_nll.item()

(2.120701551437378, 2.174471616744995, 2.1914663314819336)

## Sampling from this model

In [220]:
# Re-seed so the sampled names are the same on every run of this cell.
g = torch.Generator().manual_seed(2147483647)

for sample_idx in range(20):
    # Start from a window made only of '.' tokens and grow the name one character at a time.
    curr_window = ['.'] * window_size
    sampled_chars = []

    while True:
        ix = torch.Tensor([stoi[ch] for ch in curr_window]).long()

        # Forward pass for this single window; no gradients are needed for sampling.
        with torch.no_grad():
            inp = C[ix].view(1, -1)
            h1 = torch.tanh(inp @ W1 + b1)
            probs = (h1 @ W2 + b2).softmax(dim=1)

        # Draw the next index from the predicted distribution.
        next_ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        next_char = itos[next_ix]

        # Sampling '.' ends the name.
        if next_char == '.':
            break

        sampled_chars.append(next_char)
        # Slide the window: append the new character, drop the oldest one.
        curr_window = (curr_window + [next_char])[1:]

    print("".join(sampled_chars))

cerie
moullura
carityn
mellistana
nalundk
katar
raciopellanso
gotti
mclie
cayo
kenteda
kaley
maside
anka
graylandse
hina
gavon
las
kashra
anesley


In [210]:
# NOTE(review): leftover inspection cell. `ix` is assigned by both the training
# loop and the sampling loop, so what is shown depends on which ran last — TODO confirm it is still needed.
ix

tensor([0, 0, 0])

In [211]:
# Shape of the embeddings looked up for the indices in `ix`.
C[ix].shape

torch.Size([3, 10])