In [2]:
import random
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

# read data

In [3]:
# Load the dataset: one name per line. The context manager makes sure the
# file handle is closed as soon as the contents have been read.
with open('../data/names.txt', 'r') as f:
    words = f.read().splitlines()

In [4]:
# number of names loaded from the file
len(words)

32033

In [5]:
# peek at the first five names
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [6]:
# Alphabet of the dataset: every distinct character seen in any name, sorted.
chars = sorted(set(''.join(words)))

# util

In [7]:
# index -> character lookup; index 0 is reserved for '.', the characters
# from `chars` follow starting at index 1.
itos = {0: '.'}
itos.update({pos: ch for pos, ch in enumerate(chars, start=1)})

In [8]:
# character -> index lookup (the inverse of itos)
stoi = {ch: idx for idx, ch in itos.items()}

In [27]:
def cmp(s, dt, t):
    """
    Compare a manually derived gradient against the one autograd computed.

    s is the message (a label printed at the start of the line), dt is our
    grad, t is from pytorch (its .grad attribute is the reference value).

    Prints one line reporting:
      - exact:       whether every element of dt equals t.grad
      - approximate: whether they agree within torch.allclose's default tolerances
      - maxdiff:     the largest absolute element-wise difference

    Raises ValueError if t.grad is None (no gradient was stored on t).
    """
    if t.grad is None:
        # Non-leaf tensors only keep .grad if retain_grad() was called on them
        # before backward(); fail with a clear message instead of a confusing
        # error from the comparison below.
        raise ValueError(
            f'{s}: t.grad is None; call t.retain_grad() before loss.backward()'
        )
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    # Report all three measures: an exact match can fail purely because of
    # floating-point reordering, in which case approximate/maxdiff tell the story.
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

# Build a neural language model

## prepare data

In [9]:
def build_dataset(data, block_size=3):
    """
    Turn a list of words into (context, next-character) training pairs.

    For every word, a sliding window of `block_size` character indices is
    moved over the word followed by the '.' terminator. The window starts as
    all zeros; each position yields one example: the current window as input
    and the index of the next character as target.

    Args:
        data: iterable of strings; every character (and '.') must be a key of
            the module-level `stoi` mapping.
        block_size: number of preceding characters used as context
            (defaults to 3, the value that used to be hard-coded).

    Returns:
        (X, Y): X is an integer tensor of shape (num_examples, block_size),
        Y is an integer tensor of shape (num_examples,).

    Side effects:
        Prints the shapes of X and Y.
    """
    X, Y = [], []
    for w in data:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # slide the window: drop the oldest index, append the newest
            # (builds a new list, so the one stored in X is not mutated)
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed even when `data`
    # is empty (torch.tensor([]) alone would be a 1-D float tensor).
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

In [10]:
# Shuffle with a fixed seed so the split below is the same on every fresh run.
# NOTE: the shuffle is in place, so re-running this cell without reloading
# `words` shuffles the already-shuffled list and yields a different split.
random.seed(42)
random.shuffle(words)

# Split boundaries (word indices): first 80% train, next 10% dev, last 10% test.
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])
Xdv, Ydv = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


# Train a neural language model

In [46]:
# Model hyper-parameters.
n_emb = 10       # size of each character embedding vector
n_hidden = 200   # number of units in the hidden layer
block_size = 3   # number of context characters fed to the model

# Dedicated, seeded generator so the initial weights are identical on every
# run (previously the global RNG was used unseeded, making results
# irreproducible).
g = torch.Generator().manual_seed(2147483647)

C = torch.randn(len(itos), n_emb, generator=g)                # embedding table
w1 = torch.randn(n_emb*block_size, n_hidden, generator=g)     # hidden layer weights
b1 = torch.randn(n_hidden, generator=g)                       # hidden layer bias
# Output layer is scaled down so the initial logits are close to zero.
w2 = torch.randn(n_hidden, len(itos), generator=g) * 0.02
b2 = torch.randn(len(itos), generator=g) * 0.01
parameters = [C, w1, b1, w2, b2]

In [47]:
# total number of scalar values across all parameter tensors
sum(p.nelement() for p in parameters)

11897

In [48]:
# Ask autograd to track gradients for every parameter tensor.
for p in parameters:
    p.requires_grad_(True)

In [49]:
# Draw one random mini-batch of training examples (indices sampled with
# replacement from the whole training set).
batch_size = 32
n = batch_size  # shorthand for the batch size
ix = torch.randint(low=0, high=Xtr.shape[0], size=(batch_size,))
Xb = Xtr[ix]
Yb = Ytr[ix]

In [51]:
# forward pass, with the cross-entropy loss spelled out step by step so that
# intermediate tensors are available for manual gradient checks
emb = C[Xb] # (32, 3, 10): (batch_size, block_size, n_emb)
h = (emb.view(emb.shape[0], -1) @ w1 + b1).tanh() # (32, 200): (batch_size, n_hidden)
logits = h @ w2 + b2 # (32, 27)
# the lines below compute the same quantity as:
# loss = F.cross_entropy(logits, Yb)
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract the per-row max before exp(); the softmax result is unchanged
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum**-1
probs = counts * counts_sum_inv # rows are normalized to sum to 1
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean() # mean negative log-probability of the target index in each row

# backward
# reset gradients left over from any previous backward pass
for p in parameters:
    p.grad = None
# logprobs and probs are intermediate (non-leaf) tensors; retain_grad() asks
# autograd to keep their .grad so it can be inspected after backward()
for t in [logprobs, probs]:
    t.retain_grad()
    
loss.backward()
loss

tensor(3.3476, grad_fn=<NegBackward0>)

# manually compute grads

In [57]:
# one row per example in the batch, one column per entry of itos
logprobs.shape

torch.Size([32, 27])

In [59]:
# target indices of the batch: one integer per example
Yb.shape, Yb

(torch.Size([32]),
 tensor([18,  2, 18,  4, 12,  1,  0,  1, 19, 14, 18,  9, 15, 18, 18,  1,  0,  5,
          8,  3, 12,  1,  9,  0,  6, 19, 10,  5, 14,  8,  0,  0]))

In [65]:
# loss = -mean over rows i of logprobs[i, Yb[i]], so d(loss)/d(logprobs) is
# -1/n at each selected (row, target) entry and 0 everywhere else.
dlogprobs = torch.zeros_like(logprobs)
dlogprobs[torch.arange(n), Yb] = -1.0 / n

In [66]:
# check the hand-derived gradient against the one autograd stored in logprobs.grad
cmp('logprobs', dlogprobs, logprobs)

logprobs        | exact: True 


## compute loss on train/dev/test

In [17]:
def eval_dataset(X, Y):
    """
    Print the cross-entropy loss of the current model on a dataset.

    Runs the forward pass over all of X at once using the module-level
    parameters C, w1, b1, w2, b2, with gradient tracking disabled.

    Args:
        X: integer tensor of context indices, one row per example.
        Y: integer tensor of target indices, one per example.

    Returns:
        None; the loss is printed as a Python float.
    """
    with torch.no_grad():
        emb = C[X]                              # (num_examples, block_size, n_emb)
        flat_emb = emb.view(emb.shape[0], -1)   # concatenate the context embeddings per example
        hidden = torch.tanh(flat_emb @ w1 + b1)
        logits = hidden @ w2 + b2
        loss = F.cross_entropy(logits, Y)
    print(loss.item())

In [18]:
# report the loss on the training split and on the dev split
eval_dataset(Xtr, Ytr)
eval_dataset(Xdv, Ydv)
# test-split evaluation is disabled here (presumably kept for a final check)
# eval_dataset(Xte, Yte)

2.2350029945373535
2.2502639293670654
