In [29]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
import math
%matplotlib inline

In [2]:
words = open('./names.txt').read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
chars = sorted(list(set(''.join(words)))) #lol
token_lookup = {c: i+1 for i, c in enumerate(chars)}
token_lookup['.'] = 0
char_lookup = {i:c for c, i in token_lookup.items()}
TOTAL_TOKENS = len(char_lookup.keys())

In [34]:
# build the dataset

BLOCK_SIZE = 3 # context size to give the model in order to predict the next character

def build_dataset(corpus, block_size, codebook, padding_char="."):
    X, Y = [], []
    for word in corpus:
        start_padding = padding_char * block_size
        padded_word = f"{start_padding}{word}."
        tokenized_word = [codebook[c] for c in padded_word]
        for i in range(len(tokenized_word)-block_size):
            X.append(tokenized_word[i:i+block_size])
            Y.append(tokenized_word[i+block_size])
        
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y


random.shuffle(words)

n1 = int(len(words)*0.8)
n2 = int(len(words)*0.9)

x_train, y_train = build_dataset(words[:n1], BLOCK_SIZE, token_lookup)
x_valid, y_valid = build_dataset(words[n1:n2], BLOCK_SIZE, token_lookup)
x_test, y_test = build_dataset(words[n2:], BLOCK_SIZE, token_lookup)

In [95]:
TOTAL_NEURONS = 200
EMBEDDING_DIMS = 10 # embed 27 characters into N dimensional space

C = torch.randn((TOTAL_TOKENS, EMBEDDING_DIMS))
W1 = torch.randn((EMBEDDING_DIMS * BLOCK_SIZE, TOTAL_NEURONS)) * ((5/3)/(EMBEDDING_DIMS * BLOCK_SIZE)**0.5) # kaiming init
b1 = torch.randn((TOTAL_NEURONS,)) * 0.01
W2 = torch.randn(TOTAL_NEURONS, TOTAL_TOKENS) * 0.01
b2 = torch.randn((TOTAL_TOKENS,)) * 0.01

bn_gain = torch.ones((1, TOTAL_NEURONS))
bn_bias = torch.zeros((1, TOTAL_NEURONS))
bn_epsilon = torch.tensor([1e-5])
bn_mean_running = torch.zeros((1, TOTAL_NEURONS))
bn_std_running = torch.zeros((1, TOTAL_NEURONS))
bn_momentum = 0.001

parameters = [C, W1, b1, W2, b2, bn_gain, bn_bias]

for p in parameters:
    p.requires_grad = True

In [96]:
steps = 50000
print_steps = 5000
BATCH_SIZE = 256
lrs = [1e-1, 1e-2]
tlr = math.ceil(steps / len(lrs))

for t in range(steps):
    lr = lrs[int(t / tlr)]
    
    # forward pass
    batch_ix = torch.randint(0, x_train.shape[0], (32,))
    embeddings = C[x_train[batch_ix]]
    cat_embeddings = embeddings.view(-1, EMBEDDING_DIMS * BLOCK_SIZE)
    preact = cat_embeddings @ W1 + b1

    # batch norm layer
    cur_bn_mean = preact.mean(dim=0, keepdims=True)
    cur_bn_std = preact.std(dim=0, keepdims=True) + bn_epsilon
    preact = (preact - cur_bn_mean) / cur_bn_std
    
    with torch.no_grad():
        bn_mean_running = ((1 - bn_momentum) * bn_mean_running) + (bn_momentum * cur_bn_mean)
        bn_std_running = ((1 - bn_momentum) * bn_std_running) + (bn_momentum * cur_bn_std)
    
    preact = bn_gain * preact + bn_bias
    
    # non linearity
    hidden_states_01 = torch.tanh(preact)
    
    # second layer to logits
    logits = hidden_states_01 @ W2 + b2
    
    # loss
    loss = F.cross_entropy(logits, y_train[batch_ix])
    
    if t % print_steps == 0:
        print(f'step {t}: {loss.item()}')
    #backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    for p in parameters:
        p.data += -lr * p.grad

step 0: 3.2858150005340576
step 5000: 2.1013684272766113
step 10000: 2.2706680297851562
step 15000: 2.6932806968688965
step 20000: 2.216247081756592
step 25000: 2.454237699508667
step 30000: 2.223783016204834
step 35000: 1.8723540306091309
step 40000: 2.0456132888793945
step 45000: 2.3392839431762695


In [97]:
# clculate training and validation loss
@torch.no_grad()
def calc_loss(x_target, y_target):
    emb = C[x_target].view(-1, EMBEDDING_DIMS * BLOCK_SIZE)
    h = emb @ W1 + b1
    h = (h - bn_mean_running) / bn_std_running
    h = torch.tanh(bn_gain * h + bn_bias)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y_target)
    return loss

calc_loss(x_train, y_train), calc_loss(x_valid, y_valid)

(tensor(2.1261), tensor(2.1555))

In [98]:
# sample
@torch.no_grad()
def sample():
    tokens = [0] * BLOCK_SIZE
    while True:
        emb = C[torch.tensor(tokens[-BLOCK_SIZE:])].view(-1, EMBEDDING_DIMS * BLOCK_SIZE)
        h = emb @ W1 + b1
        h = (h - bn_mean_running) / bn_std_running
        h = torch.tanh(bn_gain * h + bn_bias)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1).item()
        tokens.append(ix)
        if ix == 0:
            break
    return "".join([char_lookup[t] for t in tokens][BLOCK_SIZE:-1])

for i in range(20):
    print(sample())

letsuhandi
hazion
corixkendanae
slakyn
mador
kaveonte
amay
aryoni
raf
andrick
kos
gion
dakjarisho
roselaniquancoin
ameybeexaney
zyn
hoduly
aun
kalyleynosisyq
ado
