In [20]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [21]:
# Read the corpus: one name per line of names.txt.
# The context manager closes the file handle as soon as the text is read,
# instead of leaving an open handle around for the garbage collector.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [22]:
# Number of names (lines) read from names.txt.
len(words)

32033

In [23]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(''.join(words)))
print(chars)
# Characters are numbered from 1; index 0 is reserved for the '.' marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)
vocab_size = len(itos)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


### build the dataset

In [24]:
# Device that build_dataset moves the dataset tensors to.
# NOTE(review): the parameter tensors created in the cells below are built
# without a device argument, so enabling the CUDA line presumably also
# requires moving them - confirm before switching.
device = torch.device('cpu')
# device = torch.device('cuda')

In [25]:
block_size = 3 # context length: number of preceding characters in each example
def build_dataset(words):
    """Turn a list of words into (context, next-character) index pairs.

    Each word is terminated with '.', and every character of the terminated
    word yields one example: the `block_size` character indices that precede
    it (left-padded with 0, the index of '.') and the index of the character
    itself.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): int64 tensors on `device`. X has shape (N, block_size) and
        Y has shape (N,), where N is the total number of examples. The dtype
        and shape also hold when `words` is empty (N == 0).

    Raises:
        KeyError: if a character is missing from `stoi`.
    """
    X, Y = [], []
    for w in words:
        context = [0]*block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # slide the window one character forward

    # Explicit dtype and shape: torch.tensor([]) would otherwise be a float32
    # tensor of shape (0,), which cannot be used to index an embedding table.
    X = torch.tensor(X, dtype=torch.long, device=device).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long, device=device)
    print(X.dtype, X.shape, Y.dtype, Y.shape)
    return X, Y

import random

# Shuffle a copy of the corpus rather than `words` itself. Shuffling the shared
# list in place makes this cell non-idempotent: running it a second time would
# shuffle an already shuffled list and silently produce a different
# train/dev/test split. On a first run the permutation is the same as an
# in-place shuffle, because it depends only on the seed and the list length.
words_shuffled = list(words)
random.seed(42)
random.shuffle(words_shuffled)
n1 = int(0.8*len(words_shuffled)) # end of the training slice
n2 = int(0.9*len(words_shuffled)) # end of the dev slice
Xtr, Ytr = build_dataset(words_shuffled[:n1])        # 80%
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])    # 10%
Xte, Yte = build_dataset(words_shuffled[n2:])        # 10%

torch.int64 torch.Size([182625, 3]) torch.int64 torch.Size([182625])
torch.int64 torch.Size([22655, 3]) torch.int64 torch.Size([22655])
torch.int64 torch.Size([22866, 3]) torch.int64 torch.Size([22866])


### batch norm - adding moving average statistics

In [7]:
n_embd = 10 # size of each character embedding vector
n_hidden = 200 # width of the MLP hidden layer

# Seeded generator; the draw order below (C, W1, b1, W2, b2) determines the values.
g = torch.Generator().manual_seed(2147483647)
fan_in = n_embd * block_size # number of inputs feeding each hidden unit
C = torch.randn((vocab_size, n_embd), generator=g)
# W1 is scaled by (5/3) / sqrt(fan_in)
W1 = torch.randn((fan_in, n_hidden), generator=g) * (5/3) / (fan_in**0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0 # starts at zero; the draw still advances g

# Batch-norm scale/shift (trained) and running statistics (updated outside autograd).
bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden)
bnmean_running = torch.zeros(1, n_hidden)
bnstd_running = torch.ones(1, n_hidden)

# Only these tensors are optimized; the running statistics are not in the list.
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad_(True)

12297


In [8]:
max_steps = 200000
batch_size = 32
lossi = [] # log10 of the minibatch loss, one entry per step

for i in range(max_steps):
    # draw a random minibatch (indices sampled with the seeded generator)
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb = Xtr[ix]
    Yb = Ytr[ix]

    # ---- forward pass ----
    emb = C[Xb] # look up an embedding for every context character
    embcat = emb.view(emb.size(0), -1) # flatten each example's context into one row
    hpreact = embcat @ W1 + b1 # hidden layer pre-activation
    # batch norm with the statistics of the current minibatch
    # NOTE(review): bnstdi is used as a divisor without an epsilon term.
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb)

    # exponential moving average of the batch statistics, kept out of autograd
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

    # ---- backward pass ----
    for p in parameters:
        p.grad = None
    loss.backward()

    # ---- SGD step; the learning rate drops after the first 100000 steps ----
    if i < 100000:
        lr = 0.1
    else:
        lr = 0.01
    for p in parameters:
        p.data.sub_(lr * p.grad)

    # ---- bookkeeping ----
    lossi.append(loss.log10().item())
    if i == max_steps-1 or i % 10000 == 0: # periodic progress line, plus the last step
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

      0/ 200000: 3.3147
  10000/ 200000: 2.1984
  20000/ 200000: 2.3375
  30000/ 200000: 2.4359
  40000/ 200000: 2.0119
  50000/ 200000: 2.2595
  60000/ 200000: 2.4775
  70000/ 200000: 2.1020
  80000/ 200000: 2.2788
  90000/ 200000: 2.1862
 100000/ 200000: 1.9474
 110000/ 200000: 2.3010
 120000/ 200000: 1.9837
 130000/ 200000: 2.4523
 140000/ 200000: 2.3839
 150000/ 200000: 2.1987
 160000/ 200000: 1.9733
 170000/ 200000: 1.8668
 180000/ 200000: 1.9973
 190000/ 200000: 1.8347
 199999/ 200000: 2.4315


In [9]:
@torch.no_grad() # evaluation only: no autograd graph is recorded
def split_loss(split):
    """Print the cross-entropy loss of the current parameters on one data split.

    Batch norm uses the running mean/std accumulated during training rather
    than statistics of the evaluated data.

    Args:
        split: one of 'train', 'val' or 'test'.

    Raises:
        KeyError: if `split` is not one of the names above.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }
    inputs, targets = datasets[split]
    embedded = C[inputs] # (N, block_size, n_embd)
    flat = embedded.view(embedded.shape[0], -1) # (N, block_size * n_embd)
    pre = flat @ W1 + b1
    normed = bngain * (pre - bnmean_running) / bnstd_running + bnbias
    hidden = torch.tanh(normed) # (N, n_hidden)
    scores = hidden @ W2 + b2 # (N, vocab_size)
    print(split, F.cross_entropy(scores, targets).item())

# Report the loss on the training and validation splits.
for split_name in ('train', 'val'):
    split_loss(split_name)

train 2.06659197807312
val 2.1050572395324707


### implementing cross entropy loss

In [27]:
n_embd = 10 # length of one character embedding
n_hidden = 200 # number of hidden units in the MLP

# Re-create every tensor from the same seed; the draw order (C, W1, b1, W2, b2)
# is what fixes the initial values.
g = torch.Generator().manual_seed(2147483647)
fan_in = n_embd * block_size # inputs per hidden unit
C = torch.randn((vocab_size, n_embd), generator=g)
# W1 is scaled by (5/3) / sqrt(fan_in)
W1 = torch.randn((fan_in, n_hidden), generator=g) * (5/3) / (fan_in**0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.01
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0 # zero bias; the draw still consumes generator state

# Batch norm: trainable gain/bias and the running statistics used at evaluation time.
bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden)
bnmean_running = torch.zeros(1, n_hidden)
bnstd_running = torch.ones(1, n_hidden)

# Tensors that receive gradients (the running statistics are deliberately excluded).
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad_(True)

12297


In [None]:
max_steps = 200000
batch_size = 32
lossi = [] # log10 of the minibatch loss, one entry per step

for i in range(max_steps):
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X, Y

    # forward pass
    emb = C[Xb] # embed the characters into vectors - (batch_size x block_size x n_embd)
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors - (batch_size x (block_size * n_embd))
    hpreact = embcat @ W1 + b1 # hidden layer pre-activation - (batch_size x n_hidden)
    bnmeani = hpreact.mean(dim=0, keepdim=True) # (1 x n_hidden)
    bnstdi = hpreact.std(dim=0, keepdim=True) # (1 x n_hidden)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias # (batch_size x n_hidden)
    h = torch.tanh(hpreact) # hidden layer - (batch_size x n_hidden)
    logits = h @ W2 + b2 # output layer - (batch_size x vocab_size)

    # cross entropy written out step by step (replaces F.cross_entropy(logits, Yb))
    logitsmax = logits.max(dim=1, keepdim=True).values # row maximum - (batch_size x 1)
    normlogits = logits - logitsmax # subtract the row maximum before exponentiating - (batch_size x vocab_size)
    counts = normlogits.exp() # (batch_size x vocab_size)
    countsum = counts.sum(dim=1, keepdim=True) # (batch_size x 1)
    countsuminv = countsum ** -1 # (batch_size x 1)
    probs = counts * countsuminv # (batch_size x vocab_size)
    logprobs = probs.log() # (batch_size x vocab_size)
    loss = -logprobs[torch.arange(batch_size), Yb].mean() # scalar

    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

    # manual backward pass through the loss, from logprobs back to counts.
    # These tensors are plain values, so they are computed without autograd.
    # They are not yet used for the update: the parameter gradients below
    # still come from loss.backward().
    with torch.no_grad():
        dlogprops = torch.zeros(logprobs.shape)
        dlogprops[torch.arange(batch_size), Yb] = -1./batch_size # loss = -mean of the picked entries
        dprobs = (1.0 / probs) * dlogprops # logprobs = probs.log()
        dcountsuminv = (counts * dprobs).sum(dim=1, keepdim=True) # countsuminv is broadcast across each row
        dcountsum = (-countsum**-2) * dcountsuminv # countsuminv = countsum ** -1
        dcounts = torch.ones_like(counts) * dcountsum # path through countsum = counts.sum(dim=1)
        dcounts += countsuminv * dprobs # direct path through probs = counts * countsuminv

    # backward pass (autograd); `parameters` was registered once, before the
    # loop, so it is not rebuilt or re-printed on every step
    for p in parameters:
        p.grad = None
    loss.backward()

    # update parameters
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data -= lr * p.grad

    # track stats
    if i % 10000 == 0 or i == max_steps-1: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

In [1]:
def cmp(s, dt, t):
    """Print how a hand-derived gradient compares with the one autograd stored.

    Args:
        s: label shown at the start of the report line.
        dt: manually computed gradient tensor.
        t: tensor whose `.grad` attribute holds the reference gradient.

    The line reports exact element-wise equality, `torch.allclose` agreement,
    and the largest absolute element-wise difference.
    """
    reference = t.grad
    ex = (dt == reference).all().item()
    app = torch.allclose(dt, reference)
    maxdiff = (dt - reference).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [38]:
import torch
import torch.nn.functional as F

# One minibatch for the gradient check, drawn from the generator's current state.
batch_size = 32
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix]

# Register the trainable tensors again and discard any gradient left over from
# earlier cells, so `.grad` holds only the result of this cell's backward pass.
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad_(True)
    p.grad = None

# forward pass, written as elementary steps so that every intermediate tensor
# can have its gradient compared against autograd
emb = C[Xb] # embed the characters into vectors - (batch_size x block_size x n_embd)
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors - (batch_size x (block_size * n_embd))
hprebn = embcat @ W1 + b1 # hidden layer pre-activation, before batch norm - (batch_size x n_hidden)
bnmeani = 1.0/batch_size * hprebn.sum(dim=0, keepdim=True) # per-feature batch mean - (1 x n_hidden)
bndiff = hprebn - bnmeani # centered pre-activations - (batch_size x n_hidden)
bndiff2 = bndiff ** 2 # squared deviations - (batch_size x n_hidden)
bnvar = 1.0/(batch_size-1) * bndiff2.sum(dim=0, keepdim=True) # variance with the (batch_size-1) denominator - (1 x n_hidden)
bnstdinv = (bnvar + 1e-5)**-0.5 # 1/sqrt(var + 1e-5); the constant keeps the result finite when var is 0 - (1 x n_hidden)
bnraw = bndiff * bnstdinv # normalized pre-activations - (batch_size x n_hidden)
hpreact = bngain * bnraw + bnbias # (batch_size x n_hidden)
h = torch.tanh(hpreact) # hidden layer - (batch_size x n_hidden)
logits = h @ W2 + b2 # output layer - (batch_size x vocab_size)
# logits = torch.rand((batch_size, 27), requires_grad=True)
logitsmax = logits.max(dim=1, keepdim=True).values # row maximum - (batch_size x 1)
normlogits = logits - logitsmax # (batch_size x vocab_size)
counts = normlogits.exp() # (batch_size x vocab_size)
countsum = counts.sum(dim=1, keepdim=True) # (batch_size x 1)
countsuminv = countsum ** -1 # (batch_size x 1)
probs = counts * countsuminv # (batch_size x vocab_size)
logprobs = probs.log() # (batch_size x vocab_size)
loss = -logprobs[torch.arange(batch_size), Yb].mean() # scalar

# Ask autograd to keep .grad on these intermediate tensors so the hand-derived
# gradients below have something to be compared with.
for t in (hpreact, logits, logitsmax, normlogits, counts, countsum, countsuminv, probs, logprobs, loss):
    t.retain_grad()
loss.backward()

# manual backward pass: walk the forward pass in reverse, one step at a time
dlogprops = torch.zeros_like(logprobs)
dlogprops[torch.arange(batch_size), Yb] = -1./batch_size # loss = -mean of the picked entries
dprobs = (1.0 / probs) * dlogprops # logprobs = probs.log()
dcountsuminv = (counts * dprobs).sum(dim=1, keepdim=True) # countsuminv is broadcast across each row
dcountsum = (-countsum**-2) * dcountsuminv # countsuminv = countsum ** -1
# counts is used twice: inside countsum (row sum) and directly in probs
dcounts = torch.ones_like(counts) * dcountsum + countsuminv * dprobs
dnormlogits = counts * dcounts # counts = normlogits.exp(), whose derivative is counts itself
dlogitsmax = -dnormlogits.sum(dim=1, keepdim=True) # for: -logitsmax in: normlogits = logits - logitsmax
# logits is used twice: directly in normlogits, and through logitsmax at each row's argmax
dlogits = dnormlogits + F.one_hot(logits.argmax(dim=1), num_classes=logits.shape[1]) * dlogitsmax

# Compare each hand-derived gradient with the value autograd stored in .grad.
for label, manual_grad, tensor in [
    ('dlogprops', dlogprops, logprobs),
    ('dprobs', dprobs, probs),
    ('dcountsuminv', dcountsuminv, countsuminv),
    ('dcountsum', dcountsum, countsum),
    ('dcounts', dcounts, counts),
    ('dnormlogits', dnormlogits, normlogits),
    ('dlogitsmax', dlogitsmax, logitsmax),
    ('dlogits', dlogits, logits),
]:
    cmp(label, manual_grad, tensor)

12297
dlogprops       | exact: True  | approximate: True  | maxdiff: 0.0
dprobs          | exact: True  | approximate: True  | maxdiff: 0.0
dcountsuminv    | exact: True  | approximate: True  | maxdiff: 0.0
dcountsum       | exact: True  | approximate: True  | maxdiff: 0.0
dcounts         | exact: True  | approximate: True  | maxdiff: 0.0
dnormlogits     | exact: True  | approximate: True  | maxdiff: 0.0
dlogitsmax      | exact: True  | approximate: True  | maxdiff: 0.0
dlogits         | exact: True  | approximate: True  | maxdiff: 0.0
