# Setup

In [29]:
import torch
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [30]:
# Collecting Words
path = '/content/drive/MyDrive/Colab Notebooks/AndrejKarpathy_NN_Hero/names.txt'
words = open(path, 'r').read().splitlines()

# Mapping
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [31]:
block_size = 3 # number of characters as input
X, Y = [], []

for w in words:
    context = [0] * block_size
    for ch in w+'.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y)

# Split the dataset (train: 80%, dev: 10%, test:10%)
rs = 2147483647
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=0.8, random_state=rs, shuffle=True)
X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, train_size=0.5, random_state=rs, shuffle=True)

# Backprop Ninja

In [32]:
# Utility function to use later: to compare manual gradients with pytorch ones

def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt-t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [33]:
n_emb = 10
n_hidden = 64
vocab_size = 27

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_emb), generator=g)

# L1
w1 = torch.randn((n_emb * block_size, n_hidden), generator=g) * (5/3)/((n_emb * block_size)**0.5)
b1 = torch.randn((n_hidden), generator=g) * 0.01 # biases are useless now as we are using batchnorm layer, using it only for fun

# L2
w2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn((vocab_size), generator=g) * 0.1

# Batchnorm
bngain = torch.randn((1, n_hidden), generator=g) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden), generator=g) * 0.1

parameters = [C, w1, b1, w2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True
print(sum(p.nelement() for p in parameters)) # total params

4137


In [34]:
# create a mini batch

n = batch_size = 32
ix = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g)
Xb, Yb = X_train[ix], Y_train[ix]

<img src='https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fimg-blog.csdnimg.cn%2F8ea5b689eb984e51a3d0f5018bc2e7f5.png&f=1&nofb=1&ipt=46bbf325087fa4d91f99379ef97f2672f4761df26ac4f1751f027e5261bdc7d7' height='400px'>

In [35]:
# Forward Pass (expanded completely to make manual backprop easier to understand)

emb = C[Xb]
embcat = emb.view(emb.shape[0], -1) # concatenrate vector columns into a single column
eps = 1e-5

# Linear 1
hprebn = embcat @ w1 + b1 # hidden layer preactivation

# Batchnorm
bnmeani = (1/n)*hprebn.sum(0, keepdim=True) # mini-batch mean
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = (1/(n-1)) * bndiff2.sum(0, keepdim=True) # mini-batch variance # Bessel's Correction: We are using n-1 instead of n
bnvar_inv = (bnvar + eps)**-0.5
bnraw = bndiff * bnvar_inv # normalized
hpreact = bngain * bnraw + bnbias

# Non-linear layer
h = torch.tanh(hpreact)

# L2
logits = h @ w2 + b2 # output layer

# Applying cross entory (F.cross_entropy)
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits-logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum**-1
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

# Pytorch Backward pass
for p in parameters:
    p.grad = None

for t in [logprobs, probs, counts_sum_inv, counts_sum, counts, norm_logits, logit_maxes, logits, h, hpreact, bnraw, bnvar_inv, bnvar, bndiff2, bndiff, bnmeani, hprebn, embcat, emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.5251, grad_fn=<NegBackward0>)

## Manual Backprop

In [76]:
# Ex1: Manual backpropogation of each variable, as defined above
# Draw out the flow of variables, will help in understanding the derivatives

# d(loss)/d(logprobs)
# loss = - (a + b + c)/3
# d(loss)/da = -1/3 = -1/n
dlogprobs = torch.zeros_like(logprobs) # same shape as logprobs, each element will have it's own gradient
# -logprobs[range(n), Yb].mean() - but only some of the elements, 1 from each row is getting selected here, rest will be 0 grad
dlogprobs[range(n), Yb] = -1.0/n

# dloss/dprobs = d(logprobs)/d(probs) * d(loss)/d(logprobs)
dprobs = (1.0 / probs) * dlogprobs

# dcounts_sum_inv = dprobs/dcounts_sum_inv * dloss/dprobs
dcounts_sum_inv = (counts * dprobs).sum(1, keepdim=True) # to retain the shape of sum_inv
dcounts = counts_sum_inv * dprobs

dcounts_sum = -1 * counts_sum**-2 * dcounts_sum_inv
dcounts += torch.ones_like(counts) * dcounts_sum

# dcounts/dnorm_logits * dcounts = norm_logits.exp() * dcounts
dnorm_logits = counts * dcounts

dlogits = dnorm_logits.clone()
dlogit_maxes = (-dnorm_logits).sum(1, keepdim=True)
dlogits += F.one_hot(logits.max(1).indices, num_classes= logits.shape[1]) * dlogit_maxes

# logits = h @ w2 + b2
dh = dlogits @ w2.T
dw2 = h.T @ dlogits
db2 = dlogits.sum(0)

# h = torch.tanh(hpreact)
dhpreact = (1.0 - h**2) * dh

# hpreact = bngain * bnraw + bnbias
dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
dbnraw = (bngain * dhpreact)
dbnbias = dhpreact.sum(0, keepdim=True)

# bnraw = bndiff * bnvar_inv
dbndiff = bnvar_inv * dbnraw
dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)

# bnvar_inv = (bnvar + eps)**-0.5
dbnvar = (-0.5 * (bnvar+eps)**-1.5) * dbnvar_inv

# bnvar = (1/(n-1)) * bndiff2.sum(0, keepdim=True)
dbndiff2 = ((1.0/(n-1)) * torch.ones_like(bndiff2)) * dbnvar

# bndiff2 = bndiff**2
dbndiff += (2.0 *bndiff * dbndiff2)

# bndiff = hprebn - bnmeani
dhprebn = dbndiff.clone()
dbnmeani = -dbndiff.sum(0, keepdims=True)

# bnmeani = (1/n)*hprebn.sum(0, keepdim=True)
dhprebn += ((1/n) * torch.ones_like(hprebn)) * dbnmeani

# hprebn = embcat @ w1 + b1
dembcat = dhprebn @ w1.T
dw1 = embcat.T @ dhprebn
db1 = dhprebn.sum(0)

# embcat = emb.view(emb.shape[0], -1)
demb = dembcat.view(emb.shape)

# emb = C[Xb]
dC = torch.zeros_like(C)
for i in range(Xb.shape[0]):
    for j in range(Xb.shape[1]):
        ix = Xb[i, j]
        dC[ix] += demb[i, j]

# Comparing the differences
cmp('logprobs', dlogprobs, logprobs)
cmp('probs', dprobs, probs)
cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
cmp('counts_sum', dcounts_sum, counts_sum)
cmp('counts', dcounts, counts)
cmp('norm_logits', dnorm_logits, norm_logits)
cmp('logit_maxes', dlogit_maxes, logit_maxes)
cmp('logits', dlogits, logits)
cmp('h', dh, h)
cmp('w2', dw2, w2)
cmp('b2', db2, b2)
cmp('hpreact', dhpreact, hpreact)
cmp('bngain', dbngain, bngain)
cmp('bnraw', dbnraw, bnraw)
cmp('bnbias', dbnbias, bnbias)
cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
cmp('bnvar', dbnvar, bnvar)
cmp('bndiff2', dbndiff2, bndiff2)
cmp('bndiff', dbndiff, bndiff)
cmp('bnmeani', dbnmeani, bnmeani)
cmp('hprebn', dhprebn, hprebn)
cmp('embcat', dembcat, embcat)
cmp('w1', dw1, w1)
cmp('b1', db1, b1)
cmp('emb', demb, emb)
cmp('C', dC, C)

logprobs        | exact: True  | approximate: True  | maxdiff: 0.0
probs           | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum_inv  | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum      | exact: True  | approximate: True  | maxdiff: 0.0
counts          | exact: True  | approximate: True  | maxdiff: 0.0
norm_logits     | exact: True  | approximate: True  | maxdiff: 0.0
logit_maxes     | exact: True  | approximate: True  | maxdiff: 0.0
logits          | exact: True  | approximate: True  | maxdiff: 0.0
h               | exact: True  | approximate: True  | maxdiff: 0.0
w2              | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0
hpreact         | exact: False | approximate: True  | maxdiff: 4.656612873077393e-10
bngain          | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
bnraw           | exact: False | approximate: True  | maxdiff: 4.656612873077393e-10
bnbias  

In [79]:
# Exercise 4: putting it all together!
# Train the MLP neural net with your own backward pass

# init
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden))*0.1 + 1.0
bnbias = torch.randn((1, n_hidden))*0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad = True

# same optimization as last time
max_steps = 200000
batch_size = 32
n = batch_size # convenience
lossi = []

# use this context manager for efficiency once your backward pass is written (TODO)
# with torch.no_grad(): # Removed to allow gradient calculation

# kick off optimization
for i in range(max_steps):

    # minibatch construct
    ix = torch.randint(0, X_train.shape[0], (batch_size, ), generator=g)
    Xb, Yb = X_train[ix], Y_train[ix]
    emb = C[Xb]
    embcat = emb.view(emb.shape[0], -1) # concatenrate vector columns into a single column
    eps = 1e-5

    # Linear 1
    hprebn = embcat @ W1 + b1 # hidden layer preactivation # Corrected w1 to W1

    # Batchnorm
    bnmeani = (1/n)*hprebn.sum(0, keepdim=True) # mini-batch mean
    bndiff = hprebn - bnmeani
    bndiff2 = bndiff**2
    bnvar = (1/(n-1)) * bndiff2.sum(0, keepdim=True) # mini-batch variance # Bessel's Correction: We are using n-1 instead of n
    bnvar_inv = (bnvar + eps)**-0.5
    bnraw = bndiff * bnvar_inv # normalized
    hpreact = bngain * bnraw + bnbias

    # Non-linear layer
    h = torch.tanh(hpreact)

    # L2
    logits = h @ W2 + b2 # output layer # Corrected w2 to W2

    # Applying cross entory (F.cross_entropy)
    logit_maxes = logits.max(1, keepdim=True).values
    norm_logits = logits-logit_maxes # subtract max for numerical stability
    counts = norm_logits.exp()
    counts_sum = counts.sum(1, keepdim=True)
    counts_sum_inv = counts_sum**-1
    probs = counts * counts_sum_inv
    logprobs = probs.log()
    loss = -logprobs[range(n), Yb].mean()

    # backward pass
    for p in parameters:
        p.grad = None
    #loss.backward() # use this for correctness comparisons, delete it later!

    # manual backprop! #swole_doge_meme
    # -----------------
    dlogprobs = torch.zeros_like(logprobs)
    dlogprobs[range(n), Yb] = -1.0/n
    dprobs = (1.0 / probs) * dlogprobs
    dcounts_sum_inv = (counts * dprobs).sum(1, keepdim=True)
    dcounts = counts_sum_inv * dprobs
    dcounts_sum = -1 * counts_sum**-2 * dcounts_sum_inv
    dcounts += torch.ones_like(counts) * dcounts_sum
    dnorm_logits = counts * dcounts
    dlogits = dnorm_logits.clone()
    dlogit_maxes = (-dnorm_logits).sum(1, keepdim=True)
    dlogits += F.one_hot(logits.max(1).indices, num_classes= logits.shape[1]) * dlogit_maxes # Corrected dlogit_maxes to dlogits_maxes, Typo in original code

    dh = dlogits @ W2.T # Corrected w2 to W2
    dw2 = h.T @ dlogits # Corrected dw2 to dW2
    db2 = dlogits.sum(0)
    dhpreact = (1.0 - h**2) * dh
    dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
    dbnraw = (bngain * dhpreact)
    dbnbias = dhpreact.sum(0, keepdim=True)
    dbndiff = bnvar_inv * dbnraw
    dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
    dbnvar = (-0.5 * (bnvar+eps)**-1.5) * dbnvar_inv
    dbndiff2 = ((1.0/(n-1)) * torch.ones_like(bndiff2)) * dbnvar
    dbndiff += (2.0 *bndiff * dbndiff2)
    dhprebn = dbndiff.clone()
    dbnmeani = -dbndiff.sum(0, keepdims=True)
    dhprebn += ((1/n) * torch.ones_like(hprebn)) * dbnmeani

    dembcat = dhprebn @ W1.T # Corrected w1 to W1
    dw1 = embcat.T @ dhprebn # Corrected dw1 to dW1
    db1 = dhprebn.sum(0)
    demb = dembcat.view(emb.shape)
    dC = torch.zeros_like(C)
    for i in range(Xb.shape[0]):
        for j in range(Xb.shape[1]):
            ix = Xb[i, j]
            dC[ix] += demb[i, j]
    grads = [dC, dw1, db1, dw2, db2, dbngain, dbnbias] # Corrected dw1, dw2 to dW1, dW2
    # -----------------

    # update
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
    for p, grad in zip(parameters, grads):
    #p.data += -lr * p.grad # old way of cheems doge (using PyTorch grad from .backward())
        p.data += -lr * grad # new way of swole doge TODO: enable

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

12297


In [80]:
# calibrate the batch norm at the end of training

with torch.no_grad():
    # pass the training set through
    emb = C[X_train]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    # measure the mean/std over the entire training set
    bnmean = hpreact.mean(0, keepdim=True)
    bnvar = hpreact.var(0, keepdim=True, unbiased=True)

In [81]:
# DEV & TEST LOSS

@torch.no_grad() # decorator to disable grad tracking for any vars inside the function
def split_loss(split):
    x, y = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }[split]

    emb = C[x] # (N, block_size, n_embd)
    embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
    hpreact = embcat @ W1 + b1
    hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
    h = torch.tanh(hpreact) # (N, n_hidden)
    logits = h @ W2 + b2 # (N, vocab_size)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('val')
split_loss('test')

val 2.178706645965576
test 2.177915096282959


In [82]:
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):

    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        # ------------
        # forward pass:
        # Embedding
        emb = C[torch.tensor([context])] # (1,block_size,d)
        embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
        hpreact = embcat @ W1 + b1
        hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
        h = torch.tanh(hpreact) # (N, n_hidden)
        logits = h @ W2 + b2 # (N, vocab_size)
        # ------------
        # Sample
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break

    print(''.join(itos[i] for i in out))

mora.
gryanna.
elmadhayla.
redhasiendradge.
zehed.
elin.
shyonelle.
elieananaraelynn.
hokelin.
shervredhira.
sten.
joselynn.
novanna.
brence.
ruy.
julieh.
yuma.
maston.
azhianna.
yansunazalel.
