# [Building Makemore, Part III: Activations, Gradients, & BatchNorm](https://www.youtube.com/watch?v=P6sfmUTpUmc)

In [1]:
# import basics
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# create words array
words = open('names.txt','r').read().splitlines()

In [3]:
# build dicts
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)

In [4]:
# build dataset
block_size = 3

def build_dataset(words):
    X, Y = [], []
    
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            
            context = context[1:] + [ix]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [5]:
# create train, validation, test sets
import random
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1]) # traaining
Xdv, Ydv = build_dataset(words[n1:n2]) # validation
Xts, Yts = build_dataset(words[n2:]) # test

In [61]:
# cleaner MLP
# hyperparameters
n_embed = 10
n_hidden = 200

# create parameter tensors
C = torch.randn((vocab_size, n_embed))
W1 = torch.randn((n_embed * block_size, n_hidden)) * (5/3) / (n_embed*block_size)**.5 # 5/3 because we use tanh and $\sqrt{n_embed*block_size} for the fan in
# b1 = torch.randn(n_hidden) * 0.01 we end up using a bias for the layer as a whole
W2 = torch.randn((n_hidden, vocab_size)) * 0.01
b2 = torch.randn(vocab_size) * 0

b_n_gain = torch.ones((1, n_hidden))
b_n_bias = torch.zeros((1, n_hidden))
b_n_mean_running = torch.zeros((1, n_hidden))
b_n_std_running = torch.ones((1, n_hidden))

# require_grad on all
parameters = [C, W1, W2, b2, b_n_gain, b_n_bias]
for p in parameters:
    p.requires_grad = True

In [62]:
# optimization
# hyperparameters
n_steps = 20000
batch_size = 32
lossi = []

for i in range(n_steps):
    # minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]
    
    # forward
    emb = C[Xb]
    emb_cat = emb.view(emb.shape[0], -1)
    h_preact = emb_cat @ W1
    b_n_mean_i = h_preact.mean(0, keepdim=True)
    b_n_std_i = h_preact.std(0, keepdim=True)
    h_preact = b_n_gain * (h_preact - b_n_mean_i) / b_n_std_i + b_n_bias # batch normalization
    h = torch.tanh(h_preact) 
    logits = h @ W2 + b2 # output of the network
    loss = F.cross_entropy(logits, Yb) # calc loss
    
    with torch.no_grad():
        b_n_mean_running = 0.999 * b_n_mean_running + 0.001 * b_n_mean_i 
        b_n_std_running = 0.999 * b_n_std_running + 0.001 * b_n_std_i 
    
    # backward 
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < n_steps / 2 else 0.01
    for p in parameters:
        p.data += lr * -p.grad
        
    if i % 1000 == 0:
        print(f'{i:7d}/{n_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/  20000: 3.2881
   1000/  20000: 2.5778
   2000/  20000: 2.4284
   3000/  20000: 2.4159
   4000/  20000: 2.0846
   5000/  20000: 1.9615
   6000/  20000: 2.2179
   7000/  20000: 2.2058
   8000/  20000: 2.4432
   9000/  20000: 2.0212
  10000/  20000: 2.6387
  11000/  20000: 2.0833
  12000/  20000: 2.4406
  13000/  20000: 2.1330
  14000/  20000: 2.1533
  15000/  20000: 2.1912
  16000/  20000: 2.2662
  17000/  20000: 2.4015
  18000/  20000: 1.9788
  19000/  20000: 2.1736


In [66]:
@torch.no_grad() # apparently this tells torch we won't be using gradients so it doesn't need to keep track of them here
def split_loss(split):
    x, y = {
        'train': (Xtr, Ytr),
        'val': (Xdv, Ydv),
        'test': (Xts, Yts),
    }[split] # this is gorgeous; I would not have thought of it
    
    emb = C[x]
    emb_cat = emb.view(emb.shape[0], -1)
    h_preact = emb_cat @ W1 + b1
    h_preact = b_n_gain * (h_preact - b_n_mean_running) / b_n_std_running + b_n_bias
    h = torch.tanh(emb_cat @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('val')
split_loss('test')

val 2.874155282974243
test 2.868072271347046


In [65]:
for _ in range(20):
    out = []
    context = [0] * block_size
    
    while True:
        # forward pass
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1,-1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits,dim=1)
         
        # sample from output of neural net
        ix = torch.multinomial(probs, num_samples=1).item()
        
        # shift context window
        context = context[1:] + [ix]
        out.append(ix)
        
        # leave if we predict '.'
        if ix == 0:
            break
    
    # print output
    print(''.join(itos[i] for i in out))

sdha.
jhashemmzibritdassnne.
shmottdlylskhmaryn.
zylenghgefnekres.
branchlustsholst.
jremtqngellam.
brandlavghnzq.
nambrckowynzerra.
jrqbnzxwmazamerzikckprissemwenderzikbashzonlektwhkbeggiganetlaem.
zakgengikhlltdwensshjobnakbohstinzhishbrttty.
ghdzjenevulllen.
dulylsvikketngissivmarmylennrqlknavinella.
jrya.
doeghaht.
mizzistamashbenglowgwqrdinghvikrim.
smaddmandravtygally.
dymrqikeqrdentqeshcycnjitghighangliden.
jhamtrbshbghmarghtrconnestnontzalss.
hrhcarthmarvhmarmikdhyamirghsisstphambrhhidhbanna.
kzimmed.


modern innovations (that we will learn about soon!) make it less important than before to ensure the NN is properly initialized.