In [1]:
import torch
from torch.nn import functional as funct
g = torch.Generator().manual_seed(14442)

import pandas as pd
import altair as alt

import random
random.seed(30)

In [2]:
# Read the whole names file, one name per line.  The context manager closes
# the file handle as soon as the contents have been read.
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()

In [3]:
# Vocabulary: every character that occurs in the names, plus "." as the
# start/end-of-name marker.  The rest of the notebook treats index 0 as that
# marker (context padding and the sampling stop condition), so "." is pinned to
# position 0 explicitly instead of relying on it happening to sort first --
# which would fail if a name contained a character ordered before "." (space,
# apostrophe, hyphen, ...) or if `words` held fewer than two entries.
alphabet = ["."] + sorted(set("".join(words)) - {"."})

# Two-way lookup between characters and their integer ids.
char_to_idx = {c: i for i, c in enumerate(alphabet)}
idx_to_char = {i: c for i, c in enumerate(alphabet)}

In [4]:
# Hyperparameters shared by the dataset builder, the model and the training loops.
context_size = 3  # number of previous character ids that form one model input
emb_length = 50  # width of each row of the embedding lookup table C
batch_size = 100  # examples drawn per minibatch in the sweep and training loops
num_neurons = 300  # number of units in the tanh hidden layer (columns of W1)

def build_dataset(words: list, context_len=None, vocab=None):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with "." and scanned left to right; for each
    character the ids of the `context_len` preceding characters (padded on the
    left with the id of ".") form one input row and the character's own id is
    the matching target.

    Args:
        words: iterable of strings to encode.
        context_len: number of previous characters per example.  Defaults to
            the notebook-level `context_size`.
        vocab: mapping from character to integer id; must contain ".".
            Defaults to the notebook-level `char_to_idx`.

    Returns:
        (xs, ys): `xs` is an int64 tensor of shape (num_examples, context_len)
        and `ys` an int64 tensor of shape (num_examples,).  For an empty
        `words` the tensors are empty but keep that dtype and rank.

    Raises:
        KeyError: if a character (or ".") is missing from `vocab`.
    """
    # Fall back to the notebook globals only when no override was given.
    if context_len is None:
        context_len = context_size
    if vocab is None:
        vocab = char_to_idx

    # Pad with the id of the boundary marker rather than a hard-coded 0.
    pad_idx = vocab["."]

    xs, ys = [], []
    for w in words:
        context = [pad_idx] * context_len
        for ch in w + ".":
            idx = vocab[ch]
            xs.append(context)
            ys.append(idx)
            # Slide the window: drop the oldest id, append the newest
            # (builds a new list, so rows already stored are not mutated).
            context = context[1:] + [idx]

    # Explicit dtype/shape so that an empty dataset is still a (0, context_len)
    # integer tensor instead of a 1-D float tensor.
    num_examples = len(xs)
    xs = torch.tensor(xs, dtype=torch.long).reshape(num_examples, context_len)
    ys = torch.tensor(ys, dtype=torch.long)

    return xs, ys

In [5]:
# Shuffle the names in place so the three splits below come from a random
# ordering of the file.
random.shuffle(words)

# Split boundaries: first 80% train, next 10% validation, final 10% test.
num_words = len(words)
n1 = int(0.8 * num_words)
n2 = int(0.9 * num_words)

x_train, y_train = build_dataset(words[:n1])
x_val, y_val = build_dataset(words[n1:n2])
x_test, y_test = build_dataset(words[n2:])

In [6]:
# Fresh model parameters for the learning-rate sweep, all drawn from the
# seeded generator `g`.
vocab_size = len(alphabet)

# Lookup table: one emb_length-dimensional row per character
C = torch.randn((vocab_size, emb_length), generator=g)

# Hidden layer: consumes the context_size embeddings concatenated together
W1 = torch.randn((context_size * emb_length, num_neurons), generator=g)
b1 = torch.randn(num_neurons, generator=g)

# Output layer: one logit per character.  The bias is sized from the
# vocabulary (it was a hard-coded 27) so it always matches W2's output width.
W2 = torch.randn((num_neurons, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)

params = [C, W1, b1, W2, b2]

# Enable gradient tracking on every trainable tensor.
for p in params:
    p.requires_grad = True

### Learning rate tuning

In [7]:
# Candidate learning rates: 1000 exponents evenly spaced over [-3, 0], mapped
# through 10**e, i.e. values running from 1e-3 up to 1.
lr_exp = torch.linspace(start=-3, end=0, steps=1000)
lr_space = torch.pow(10, lr_exp)

In [8]:
# Learning-rate sweep: take one minibatch SGD step per candidate rate in
# `lr_space` and record the loss measured just before that step.
losses = []

for lr in lr_space:
    # Draw the minibatch indices from the seeded generator `g`; without it the
    # draw uses torch's global RNG, for which no seed is set in the visible
    # cells, so the sweep would differ on every kernel restart.
    batch = torch.randint(0, x_train.shape[0], (batch_size,), generator=g)

    # Forward pass: embed -> concatenate the context -> tanh hidden -> logits
    embedding = C[x_train[batch]]
    h1 = torch.tanh(embedding.view(-1, context_size * emb_length) @ W1 + b1)
    logits = h1 @ W2 + b2
    loss = funct.cross_entropy(logits, y_train[batch])  # Equivalent to mean of neg log-likelihood

    losses.append(loss.item())

    # Backward pass: clear stale gradients, then backpropagate
    for p in params:
        p.grad = None

    loss.backward()

    # SGD update, performed under no_grad rather than through `.data`
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad

In [9]:
# Loss recorded at each sweep step against the base-10 exponent of the
# learning rate used for that step.
source = pd.DataFrame({"x": lr_exp.numpy(), "f(x)": losses})

# Explicit axis titles and a chart title so the figure is readable on its own
# (the raw column names "x" / "f(x)" say nothing about what is plotted).
alt.Chart(source).mark_line().encode(
    x=alt.X('x:Q', title='log10(learning rate)'),
    y=alt.Y('f(x):Q', title='minibatch loss')
).properties(
    title='Learning-rate sweep',
    width=500,
    height=500
).interactive()

### Training

In [10]:
# Re-draw every parameter from `g` so that training starts from new random
# values rather than from the tensors the learning-rate sweep updated.
vocab_size = len(alphabet)

# Lookup table: one emb_length-dimensional row per character
C = torch.randn((vocab_size, emb_length), generator=g)

# Hidden layer: consumes the context_size embeddings concatenated together
W1 = torch.randn((context_size * emb_length, num_neurons), generator=g)
b1 = torch.randn(num_neurons, generator=g)

# Output layer: one logit per character.  The bias is sized from the
# vocabulary (it was a hard-coded 27) so it always matches W2's output width.
W2 = torch.randn((num_neurons, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)

params = [C, W1, b1, W2, b2]

# Enable gradient tracking on every trainable tensor.
for p in params:
    p.requires_grad = True

In [11]:
# Minibatch SGD training: 100000 steps with a two-stage learning rate.
for i in range(100000):
    # Sample the minibatch from the seeded generator `g` so that a fresh
    # Restart & Run All reproduces the same training trajectory.
    batch = torch.randint(0, x_train.shape[0], (batch_size,), generator=g)

    # Forward pass: embed -> concatenate the context -> tanh hidden -> logits
    embedding = C[x_train[batch]]
    h1 = torch.tanh(embedding.view(-1, context_size * emb_length) @ W1 + b1)
    logits = h1 @ W2 + b2
    loss = funct.cross_entropy(logits, y_train[batch])  # Equivalent to mean of neg log-likelihood

    # Progress report every 2500 steps (single-minibatch loss, hence noisy)
    if i % 2500 == 0:
        print(f"Iter {i} \t|\t Loss: {loss:.5f}")

    # Backward pass: clear stale gradients, then backpropagate
    for p in params:
        p.grad = None

    loss.backward()

    # Step schedule: 0.1 for the first 30000 iterations, 0.001 afterwards
    lr = 0.1 if i < 30000 else 0.001

    # SGD update, performed under no_grad rather than through `.data`
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad

Iter 0 	|	 Loss: 35.67497
Iter 2500 	|	 Loss: 3.94593
Iter 5000 	|	 Loss: 3.89794
Iter 7500 	|	 Loss: 2.66832
Iter 10000 	|	 Loss: 2.51858
Iter 12500 	|	 Loss: 2.03370
Iter 15000 	|	 Loss: 2.28029
Iter 17500 	|	 Loss: 2.37879
Iter 20000 	|	 Loss: 2.20791
Iter 22500 	|	 Loss: 2.29429
Iter 25000 	|	 Loss: 2.40089
Iter 27500 	|	 Loss: 2.18038
Iter 30000 	|	 Loss: 2.28181
Iter 32500 	|	 Loss: 2.21216
Iter 35000 	|	 Loss: 2.00417
Iter 37500 	|	 Loss: 2.12490
Iter 40000 	|	 Loss: 2.18462
Iter 42500 	|	 Loss: 2.43368
Iter 45000 	|	 Loss: 2.30173
Iter 47500 	|	 Loss: 2.21667
Iter 50000 	|	 Loss: 2.01487
Iter 52500 	|	 Loss: 1.94073
Iter 55000 	|	 Loss: 2.28348
Iter 57500 	|	 Loss: 2.15537
Iter 60000 	|	 Loss: 2.02759
Iter 62500 	|	 Loss: 2.20695
Iter 65000 	|	 Loss: 1.67594
Iter 67500 	|	 Loss: 2.06691
Iter 70000 	|	 Loss: 1.89820
Iter 72500 	|	 Loss: 2.04171
Iter 75000 	|	 Loss: 1.98580
Iter 77500 	|	 Loss: 1.90743
Iter 80000 	|	 Loss: 2.24756
Iter 82500 	|	 Loss: 1.83384
Iter 85000 	|	 Loss:

In [12]:
# Cross-entropy over the entire training split.  This is evaluation only, so
# it runs under no_grad: no autograd graph is built for the full-split forward
# pass and the displayed tensor carries no grad_fn.
with torch.no_grad():
    embedding_train = C[x_train]
    h1 = torch.tanh(embedding_train.view(-1, context_size * emb_length) @ W1 + b1)
    logits = h1 @ W2 + b2
    loss = funct.cross_entropy(logits, y_train)

loss

tensor(2.0999, grad_fn=<NllLossBackward0>)

In [13]:
# Cross-entropy over the entire validation split.  This is evaluation only, so
# it runs under no_grad: no autograd graph is built for the full-split forward
# pass and the displayed tensor carries no grad_fn.
with torch.no_grad():
    embedding_val = C[x_val]
    h1 = torch.tanh(embedding_val.view(-1, context_size * emb_length) @ W1 + b1)
    logits = h1 @ W2 + b2
    loss = funct.cross_entropy(logits, y_val)

loss

tensor(2.2723, grad_fn=<NllLossBackward0>)

In [14]:
from sklearn.manifold import TSNE
# Project the learned character embeddings down to 2-D for plotting.
# t-SNE is stochastic, so random_state is fixed to keep the layout identical
# from one run to the next.
C_emb = TSNE(n_components=2, perplexity=5, random_state=14442).fit_transform(C.detach().numpy())

In [15]:
# One row per character: its 2-D t-SNE coordinates and the character itself.
# (Named `emb_df` rather than `repr`, which would shadow Python's builtin
# repr() for the remainder of the kernel session.)
emb_df = pd.DataFrame({"x1": C_emb[:, 0], "x2": C_emb[:, 1], "char": alphabet})

# Encodings shared by both layers.
base = alt.Chart(emb_df).encode(
    x='x1',
    y='x2',
    tooltip=['char']
)

# Layer 1: a dot per character.
scatter = base.mark_circle(size=60)

# Layer 2: the character drawn just to the right of its dot.
chars = base.mark_text(
    align='left',
    baseline='middle',
    dx=7
).encode(
    text='char'
)

# Size and pan/zoom are applied once to the layered chart instead of being
# copied into each layer.
(scatter + chars).properties(
    width=500,
    height=500
).interactive()

### Sampling

In [16]:
# Generate 20 names by repeatedly sampling the next character from the model
# until the end marker (index 0) is drawn.  Inference only, so autograd is
# switched off for the whole loop.
with torch.no_grad():
    for _ in range(20):
        res = ""
        # Start from an all-padding context (index 0 repeated).
        cont = [0] * context_size
        while True:
            # Forward pass for the single current context
            emb = C[torch.tensor(cont)]
            h = torch.tanh(emb.view(-1, context_size * emb_length) @ W1 + b1)
            logits = h @ W2 + b2
            probs = funct.softmax(logits, dim=1)

            # Draw the next character id from the seeded generator `g`
            idx = torch.multinomial(probs, 1, replacement=True, generator=g).item()
            if idx == 0:
                break

            res += idx_to_char[idx]
            # Slide the context window forward by one character
            cont = cont[1:] + [idx]

        print(res)

pery
aley
khalon
jahoyvi
dya
shir
niciah
akelvimgoumareley
ashadre
cain
kayo
wilf
sofeclyn
maylia
cassoulysse
pra
magan
esaileenecy
jay
nice
