In [None]:
import torch
import random
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

: 

PyTorch will use its autograd engine for backprop.
 
But, `loss.backward()` abstracts a lot of things!

In [14]:
# Read in all the words (one name per line).
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
print(len(words))                  # number of names
print(max(len(w) for w in words))  # length of the longest name
print(words[:8])                   # peek at the first few entries

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [15]:
# Build the character vocabulary and the char <-> integer lookup tables.
# Index 0 is reserved for the '.' token; the sorted characters found in
# `words` get indices 1..len(chars).
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0  # added last, so it also appears last when `itos` is printed
itos = {idx: ch for ch, idx in stoi.items()}  # inverse mapping: int -> char
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [16]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.', and each character (terminator
  included) becomes one example whose input is the `block_size` preceding
  character indices. Contexts start out filled with index 0.

  Relies on the module-level `stoi` mapping and `block_size`.

  Returns:
    (X, Y): X is an integer tensor with one row of `block_size` indices per
    example; Y is a 1-D integer tensor holding the index of the character
    that follows each context. Both shapes are printed as a side effect.
  """
  inputs, targets = [], []

  for word in words:
    window = [0] * block_size
    for target in (stoi[ch] for ch in word + '.'):
      inputs.append(window)
      targets.append(target)
      # slide the window: drop the oldest index, append the newest
      # (this builds a new list, so rows already stored are not mutated)
      window = window[1:] + [target]

  inputs = torch.tensor(inputs)
  targets = torch.tensor(targets)
  print(inputs.shape, targets.shape)
  return inputs, targets

import random

# Deterministic shuffle, then an 80/10/10 train/dev/test split by word.
# NOTE: `random.shuffle` reorders `words` in place, so re-running this cell
# without first reloading `words` produces a different split.
random.seed(42)
random.shuffle(words)

num_words = len(words)
n1 = int(0.8*num_words)  # end of the training slice
n2 = int(0.9*num_words)  # end of the dev slice

train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]
Xtr,  Ytr  = build_dataset(train_words)  # 80%
Xdev, Ydev = build_dataset(dev_words)    # 10%
Xte,  Yte  = build_dataset(test_words)   # 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


Function we will use to compare manual and PyTorch gradients.

In [18]:
def cmp(s, dt, t):
  """Print how closely a hand-derived gradient matches autograd's.

  Args:
    s: label printed at the start of the report line.
    dt: manually computed gradient tensor.
    t: tensor whose `.grad` holds the gradient produced by autograd.

  Prints a single line reporting exact equality, `torch.allclose`
  agreement, and the largest absolute element-wise difference.
  """
  reference = t.grad
  is_exact = str(torch.all(dt == reference).item())
  is_close = str(torch.allclose(dt, reference))
  largest_gap = (dt - reference).abs().max().item()
  print(f'{s:15s} | exact: {is_exact:5s} | approximate: {is_close:5s} | maxdiff: {largest_gap}')

Model - 

* `MLP`
* Normalize weights by the square root of `fan_in`
* Do this for a few reasons:

1/ Loss will blow up if logits go to `extreme` values -
* Model spends the first few thousand iterations squashing weights to reasonable values
 
2/ Vanishing gradient -
* Large activations are squashed toward 1 or -1 when passed through `tanh`
* The backward pass of `tanh` is `self.grad += (1 - t**2) * out.grad`, where `t` is the `tanh` output
* So if `tanh` is 1 or -1, then the gradient vanishes b/c we multiply by zero 

In [26]:
# Model hyperparameters
n_embed = 10 # dimensionality of the character embedding vectors
n_hidden = 64 # neurons in hidden layer
g = torch.Generator().manual_seed(2147483647) # fixed seed so the random draws below are reproducible

# Embedding table: one n_embed-dimensional row per character.
# (Fix: this line used the undefined name `n_embd`, which raised NameError.)
C = torch.randn((vocab_size,n_embed),generator=g)
hidden_in=block_size*n_embed # fan-in of layer 1: the concatenated context embeddings

# Layer 1: weights scaled by (5/3)/sqrt(fan_in)
W1 = torch.randn((hidden_in,n_hidden),generator=g) * (5/3)/(hidden_in**0.5)
b1 = torch.randn(n_hidden,generator=g) * 0.01

# Layer 2:
W2 = torch.randn((n_hidden,vocab_size),generator=g) * 0.1
b2 = torch.randn(vocab_size,generator=g) * 0.1

# BN params
bngain=torch.ones(1,n_hidden) * 0.1 + 1 # every gain starts at 1.1
bnbias=torch.zeros(1,n_hidden) * 0.1    # all zeros (the `* 0.1` has no effect on zeros)

# Params
# (Fix: the list referenced the undefined names `bn_gain` / `bn_bias`; the
# tensors are called `bngain` / `bnbias` here and in the forward pass.)
parameters = [C,W1,b1,W2,b2,bngain,bnbias]
for p in parameters:
    p.requires_grad = True

# Batch: sample one random minibatch of training examples
batch_size = 32
n = batch_size # shorthand used by the forward pass
ix = torch.randint(0,Xtr.shape[0],(batch_size,),generator=g)
Xb, Yb = Xtr[ix], Ytr[ix]

Forward and backward pass -

* Explicit implementation of loss function (unbundle `F.cross_entropy()`)

In [27]:
# Forward pass written out one primitive operation at a time, so that every
# intermediate tensor has a name and can keep its own .grad after backward().
# Shapes in the comments assume the values set earlier in this notebook:
# batch n=32, block_size=3, n_embed=10, n_hidden=64, vocab_size=27.

# Embed the chars into vectors
emb=C[Xb] # index rows of the embedding table: (32, 3) ints -> (32, 3, 10)

# Concatenate the per-position embeddings of each example into one row
embcat=emb.view(emb.shape[0],-1) # (32, 30)

# Linear layer 1
hprebn=embcat @ W1 + b1 # hidden pre-activation before batch norm: (32, 64)

# Batch norm: standardize each hidden unit across the batch dimension
bnmeani=1/n*hprebn.sum(0,keepdim=True) # per-unit batch mean: (1, 64)
bndiff=hprebn - bnmeani # centered values
bndiff2=bndiff**2
bnvar=1/(n-1)*bndiff2.sum(0,keepdim=True) # Bessel's correction: Div by n-1
bnvar_inv=(bnvar+1e-5)**-0.5 # 1/sqrt(var + eps); the small eps keeps this finite when var is 0
bnraw=bndiff*bnvar_inv # standardized values

# Scale and shift with the batch-norm gain and bias (input to the non-linearity)
hpreact=bngain*bnraw+bnbias

# Non-linearity
h=torch.tanh(hpreact)

# Linear layer 2
logits = h @ W2 + b2 # (32, 27)

# Cross entropy loss (same as F.cross_entropy())
logit_maxes=logits.max(1,keepdim=True).values # row-wise max: (32, 1)
norm_logits=logits-logit_maxes # Subtract max for numerical stability
counts=norm_logits.exp()
counts_sum=counts.sum(1,keepdims=True) # row-wise normalizer: (32, 1)
counts_sum_inv=counts_sum**-1
probs=counts*counts_sum_inv # softmax probabilities; each row sums to 1
logprobs=probs.log()
loss=-logprobs[range(n),Yb].mean() # mean negative log-likelihood of the target characters

# Set grad to 0
for p in parameters: 
    p.grad = None
    
# Retain grad of all these intermediate tensors
# (they are non-leaf tensors, so autograd would not keep their .grad otherwise)
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, 
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
    t.retain_grad()

# Backward
loss.backward()
loss 

In [28]:
# Display the target character indices of the sampled minibatch
Yb

tensor([ 8, 14, 15, 22,  0, 19,  9, 14,  5,  1, 20,  3,  8, 14, 12,  0, 11,  0,
        26,  9, 25,  0,  1,  1,  7, 18,  9,  3,  5,  9,  0, 18])