In [29]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
import math
%matplotlib inline

In [2]:
words = open('./names.txt').read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
chars = sorted(list(set(''.join(words)))) #lol
token_lookup = {c: i+1 for i, c in enumerate(chars)}
token_lookup['.'] = 0
char_lookup = {i:c for c, i in token_lookup.items()}
TOTAL_TOKENS = len(char_lookup.keys())

In [34]:
# build the dataset

BLOCK_SIZE = 3 # context size to give the model in order to predict the next character

def build_dataset(corpus, block_size, codebook, padding_char="."):
    X, Y = [], []
    for word in corpus:
        start_padding = padding_char * block_size
        padded_word = f"{start_padding}{word}."
        tokenized_word = [codebook[c] for c in padded_word]
        for i in range(len(tokenized_word)-block_size):
            X.append(tokenized_word[i:i+block_size])
            Y.append(tokenized_word[i+block_size])
        
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y


random.shuffle(words)

n1 = int(len(words)*0.8)
n2 = int(len(words)*0.9)

x_train, y_train = build_dataset(words[:n1], BLOCK_SIZE, token_lookup)
x_valid, y_valid = build_dataset(words[n1:n2], BLOCK_SIZE, token_lookup)
x_test, y_test = build_dataset(words[n2:], BLOCK_SIZE, token_lookup)

In [81]:
TOTAL_NEURONS = 200
EMBEDDING_DIMS = 10 # embed 27 characters into N dimensional space

C = torch.randn((TOTAL_TOKENS, EMBEDDING_DIMS))
W1 = torch.randn((EMBEDDING_DIMS * BLOCK_SIZE, TOTAL_NEURONS)) * ((5/3)/(EMBEDDING_DIMS * BLOCK_SIZE)**0.5) # kaiming init
b1 = torch.randn((TOTAL_NEURONS,)) * 0.01
W2 = torch.randn(TOTAL_NEURONS, TOTAL_TOKENS) * 0.01
b2 = torch.randn((TOTAL_TOKENS,)) * 0.01

bn_gain = torch.ones((1, TOTAL_NEURONS))
bn_bias = torch.zeros((1, TOTAL_NEURONS))

parameters = [C, W1, b1, W2, b2, bn_gain, bn_bias]

for p in parameters:
    p.requires_grad = True

In [85]:
steps = 50000
print_steps = 5000
BATCH_SIZE = 256
lrs = [1e-1, 1e-2]
tlr = math.ceil(steps / len(lrs))

for t in range(steps):
    lr = lrs[int(t / tlr)]
    # forward pass
    batch_ix = torch.randint(0, x_train.shape[0], (32,))
    embeddings = C[x_train[batch_ix]]
    cat_embeddings = embeddings.view(-1, EMBEDDING_DIMS * BLOCK_SIZE)
    preact = cat_embeddings @ W1 + b1
    # batch norm
    preact = (preact - preact.mean(dim=0, keepdims=True)) / preact.std(dim=0, keepdims=True)
    preact = bn_gain * preact + bn_bias
    hidden_states_01 = torch.tanh(preact)
    logits = hidden_states_01 @ W2 + b2
    loss = F.cross_entropy(logits, y_train[batch_ix])
    if t % print_steps == 0:
        print(f'step {t}: {loss.item()}')
    #backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    for p in parameters:
        p.data += -lr * p.grad

step 0: 2.5478084087371826
step 5000: 2.377833127975464
step 10000: 2.454554319381714
step 15000: 2.3809127807617188
step 20000: 2.3064322471618652
step 25000: 2.449836492538452
step 30000: 2.2298200130462646
step 35000: 2.4332175254821777
step 40000: 1.733128547668457
step 45000: 2.1831178665161133


In [87]:
# clculate training and validation loss
@torch.no_grad()
def calc_loss(x_target, y_target):
    emb = C[x_target].view(-1, EMBEDDING_DIMS * BLOCK_SIZE)
    h = emb @ W1 + b1
    h = (h - h.mean(dim=0, keepdims=True)) / h.std(dim=0, keepdims=True)
    h = torch.tanh(bn_gain * h + bn_bias)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y_target)
    return loss

calc_loss(x_train, y_train), calc_loss(x_valid, y_valid)

(tensor(2.1053), tensor(2.1397))

In [94]:
# sample
@torch.no_grad()
def sample():
    tokens = [0] * BLOCK_SIZE
    while True:
        emb = C[torch.tensor(tokens[-BLOCK_SIZE:])].view(-1, EMBEDDING_DIMS * BLOCK_SIZE)
        h = emb @ W1 + b1
        print(h)
        h = (h - h.mean(dim=0, keepdims=True)) / h.std(dim=0, keepdims=True)
        print(h)
        h = torch.tanh(bn_gain * h + bn_bias)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1).item()
        tokens.append(ix)
        if ix == 0:
            break
    return "".join([char_lookup[t] for t in tokens][BLOCK_SIZE:-1])

for i in range(20):
    print(sample())

tensor([[-1.2226e+00, -5.7514e-01,  1.5726e+00, -8.9563e-01, -4.2299e-01,
          3.1737e-01, -3.7347e-01, -4.0377e-01,  2.5963e+00, -2.0262e+00,
         -1.2265e+00, -3.2221e-01, -1.0200e+00, -1.4708e-01,  1.2598e-01,
         -7.4295e-01,  1.9956e+00, -1.1344e+00,  1.4441e+00,  1.3393e-01,
          1.0885e+00,  1.1504e+00, -1.1677e+00, -3.1158e-02,  2.1367e-01,
          4.0217e-01, -5.6928e-01,  1.4484e+00,  3.3922e+00,  4.4614e-01,
         -2.1129e-01, -3.6620e-01,  1.6677e+00, -3.4504e-01,  1.4493e-01,
         -1.5573e+00, -1.1558e+00,  1.1900e+00, -2.9737e-01,  2.2364e+00,
         -2.5963e-01,  6.5549e-01, -3.1693e-01,  1.0640e+00, -1.3255e-01,
          9.3955e-01,  9.3410e-01, -1.8707e-01,  1.0591e+00,  1.3200e+00,
          9.8426e-01,  2.8303e-01, -5.5124e-01, -1.3585e+00, -3.2119e-01,
          1.0012e+00,  2.1714e+00, -1.2558e+00, -4.9660e-01,  1.2911e+00,
         -9.8416e-01,  2.7136e-01,  8.9102e-01, -3.2857e-01, -1.7429e+00,
          3.7507e-01,  1.4067e+00, -4.

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [9]:
a = -torch.tensor(1/27.).log()
logits = torch.tensor([100.,0.,5.,0.])
probs = torch.softmax(logits, dim=0)
probs, -probs[2].log()

(tensor([1.0000e+00, 3.7835e-44, 5.5211e-42, 3.7835e-44]), tensor(95.0000))