In [1]:
import numpy as np
import torch
import torch.nn.functional as F

In [2]:
with open("../Makemore/names.txt") as f:
    names = f.readlines()

In [4]:
names = [name.strip() for name in names]

In [6]:
char_set = sorted(list({c for name in names for c in name}))
char_set.insert(0, '.')
len(char_set)

27

In [7]:
char_set

['.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [12]:
stoi = {char: i for i, char in enumerate(char_set)}
itos = {i: char for char, i in stoi.items()}

In [16]:
block_size = 3
X, Y = [], []
for name in names:
    name = '.' * block_size + name + '.'
    for i in range(len(name) - block_size):
        context = name[i: i+block_size]
        X.append([stoi[char] for char in context])
        Y.append(stoi[name[i+block_size]])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [17]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [259]:
train_idx, val_idx, test_idx = torch.utils.data.random_split(range(X.shape[0]), [0.8, 0.1, 0.1])
X_train, Y_train = X[train_idx], Y[train_idx]
X_val, Y_val = X[val_idx], Y[val_idx]
X_test, Y_test = X[test_idx], Y[test_idx]
print(f"Number of train samples: {X_train.shape[0]}")
print(f"Number of validation samples: {X_val.shape[0]}")
print(f"Number of test samples: {X_test.shape[0]}")

Number of train samples: 182517
Number of validation samples: 22815
Number of test samples: 22814


In [265]:
gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 20), generator=gen)
W1 = torch.randn((60, 300), generator=gen)
B1 = torch.randn(300, generator=gen)
W2 = torch.randn((300, 27), generator=gen)
B2 = torch.randn(27, generator=gen)
parameters = [C, W1, B1, W2, B2]

In [266]:
sum(p.nelement() for p in parameters)

26967

In [267]:
for p in parameters:
    p.requires_grad = True

In [268]:
C[X_train].shape

torch.Size([182517, 3, 20])

In [269]:
C.shape

torch.Size([27, 20])

In [270]:
for i in range(300001):
    ix = torch.randint(0, X_train.shape[0], (64, ))
    emb = C[X_train[ix]]
#     print(f"emb: {emb.shape}")
    out = torch.tanh(torch.matmul(emb.view(-1, W1.shape[0]), W1) + B1)
    logits = torch.matmul(out, W2) + B2
    loss = F.cross_entropy(logits, Y_train[ix])
    if i%5000 == 0:
        overall_train_loss(i)
    loss.backward()
    lr_schedule = {0: 0.1, 100000: 0.05, 200000: 0.01, 300000: 0.001}
    for threshold, item in lr_schedule.items():
        if i > threshold:
            lr = item
    for p in parameters:
        p.data += -lr * p.grad
        p.grad = None

Epoch 0 	 Loss: 33.08217239379883
Epoch 5000 	 Loss: 3.1133220195770264
Epoch 10000 	 Loss: 2.662336826324463
Epoch 15000 	 Loss: 2.437128782272339
Epoch 20000 	 Loss: 2.3415513038635254
Epoch 25000 	 Loss: 2.295254707336426
Epoch 30000 	 Loss: 2.266085624694824
Epoch 35000 	 Loss: 2.251458168029785
Epoch 40000 	 Loss: 2.222989559173584
Epoch 45000 	 Loss: 2.2297451496124268
Epoch 50000 	 Loss: 2.1771247386932373
Epoch 55000 	 Loss: 2.1993818283081055
Epoch 60000 	 Loss: 2.203444719314575
Epoch 65000 	 Loss: 2.146192789077759
Epoch 70000 	 Loss: 2.1651298999786377
Epoch 75000 	 Loss: 2.169393301010132
Epoch 80000 	 Loss: 2.213757038116455
Epoch 85000 	 Loss: 2.1542716026306152
Epoch 90000 	 Loss: 2.1260766983032227
Epoch 95000 	 Loss: 2.1293880939483643
Epoch 100000 	 Loss: 2.143028974533081
Epoch 105000 	 Loss: 2.069441556930542
Epoch 110000 	 Loss: 2.0641143321990967
Epoch 115000 	 Loss: 2.0550808906555176
Epoch 120000 	 Loss: 2.061725616455078
Epoch 125000 	 Loss: 2.0556840896606445

In [256]:
@torch.no_grad()
def overall_train_loss(epoch_num):
    out = torch.tanh(torch.matmul(C[X_train].view(-1, W1.shape[0]), W1) + B1)
    logits = torch.matmul(out, W2) + B2
    train_loss = F.cross_entropy(logits, Y_train)
    print(f"Epoch {epoch_num} \t Loss: {train_loss}")

In [271]:
with torch.no_grad():
    out = torch.tanh(torch.matmul(C[X_val].view(-1, W1.shape[0]), W1) + B1)
    logits = torch.matmul(out, W2) + B2
    val_loss = F.cross_entropy(logits, Y_val)
    print(f"Val loss: {val_loss}")

Val loss: 2.160008430480957


In [272]:
with torch.no_grad():
    out = torch.tanh(torch.matmul(C[X_test].view(-1, W1.shape[0]), W1) + B1)
    logits = torch.matmul(out, W2) + B2
    test_loss = F.cross_entropy(logits, Y_test)
    print(f"test loss: {test_loss}")

test loss: 2.1386842727661133


### E01: I did not get around to seeing what happens when you initialize all weights and biases to zero. Try this and train the neural net. You might think either that 1) the network trains just fine or 2) the network doesn't train at all, but actually it is 3) the network trains but only partially, and achieves a pretty bad final performance. Inspect the gradients and activations to figure out what is happening and why the network is only partially training, and what part is being trained exactly.


In [275]:
C_Z = torch.randn((27, 20))
W1_Z = torch.zeros((60, 300))
B1_Z = torch.zeros(300)
W2_Z = torch.zeros((300, 27))
B2_Z = torch.zeros(27)
parameters_Z = [C_Z, W1_Z, B1_Z, W2_Z, B2_Z]

In [276]:
for p in parameters_Z:
    p.requires_grad = True

In [282]:
for i in range(300001):
    ix = torch.randint(0, X_train.shape[0], (64, ))
    emb = C_Z[X_train[ix]]
#     print(f"emb: {emb.shape}")
    out = torch.tanh(torch.matmul(emb.view(-1, W1_Z.shape[0]), W1_Z) + B1_Z)
    logits = torch.matmul(out, W2_Z) + B2_Z
    loss = F.cross_entropy(logits, Y_train[ix])
    if i%5000 == 0:
        with torch.no_grad():
            out = torch.tanh(torch.matmul(C_Z[X_train].view(-1, W1_Z.shape[0]), W1_Z) + B1_Z)
            logits = torch.matmul(out, W2_Z) + B2_Z
            train_loss = F.cross_entropy(logits, Y_train)
            print(f"Epoch {i+1} \t Loss: {train_loss}")
    loss.backward()
    lr_schedule = {0: 0.1, 100000: 0.05, 200000: 0.01, 300000: 0.001}
    for threshold, item in lr_schedule.items():
        if i > threshold:
            lr = item
    for p in parameters:
        print(f"{lr=} {p.grad=}")
        p.data += -lr * p.grad
        p.grad = None

Epoch 1 	 Loss: 3.295837163925171
lr=0.01 p.grad=None


TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

### E02: BatchNorm, unlike other normalization layers like LayerNorm/GroupNorm etc. has the big advantage that after training, the batchnorm gamma/beta can be "folded into" the weights of the preceeding Linear layers, effectively erasing the need to forward it at test time. Set up a small 3-layer MLP with batchnorms, train the network, then "fold" the batchnorm gamma/beta into the preceeding Linear layer's W,b by creating a new W2, b2 and erasing the batch norm. Verify that this gives the same forward pass during inference. i.e. we see that the batchnorm is there just for stabilizing the training, and can be thrown out after training is done! pretty cool.
