In [1]:
import torch
import pandas as pd
import numpy as np
import altair as alt

In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.backends.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print(f"Device: {device}")

Device: mps


In [11]:
import json
f = open('names.json')
names = json.load(f)

In [4]:
def atoi(a):
  return 0 if a == '.' else (ord(a) - ord('a') + 1)

def itoa(i):
  return '.' if i == 0 else (chr(i - 1 + ord('a')))

In [12]:
context_length = 3
train_x = []
train_y = []
for name in names:
  name = f"{'.' * context_length}{name.lower()}."
  samples = [name[idx-(context_length + 1):idx] for idx in range(context_length + 1, len(name) + 1)]
  train_x.extend([[atoi(x) for x in sample[0:-1]] for sample in samples])
  train_y.extend([atoi(sample[-1]) for sample in samples])

train_x = torch.tensor(train_x, device=device)
train_y = torch.nn.functional.one_hot(torch.tensor(train_y, device=device), num_classes=27).float().to(device)
print(train_x.shape, train_y.shape)

torch.Size([34894, 3]) torch.Size([34894, 27])


In [13]:
brain_capacity = 128
embedding_dims = 2
output_dim = 27

C = torch.randn((27, embedding_dims), device=device, requires_grad=True)
W1 = torch.randn((embedding_dims * context_length, brain_capacity), device=device, requires_grad=True)
b1 = torch.randn(brain_capacity, device=device, requires_grad=True)
W2 = torch.randn((brain_capacity, output_dim), device=device, requires_grad=True)
b2 = torch.randn(output_dim, device=device, requires_grad=True)

parms = [C, W1, b1, W2, b2]

def run_forward(x):
  emb = C[x].view(-1, embedding_dims * context_length)
  layer1 = (emb @ W1 + b1).tanh()
  layer2 = (layer1 @ W2 + b2) # softmax when executing, cross_entropy for loss
  return layer2

In [14]:
losses = []
batch_size = 1024

epoch_count = 1000
lr_exp = torch.linspace(0, -4, epoch_count)

for epoch in range(epoch_count):
  for i in range(train_x.shape[0] // batch_size):
    logits = run_forward(train_x[i*batch_size:(i+1)*batch_size])
    output = train_y[i*batch_size:(i+1)*batch_size]
    loss = torch.nn.functional.cross_entropy(logits, output)
    if i % 23 == 0:
      losses.append(loss.cpu().data)
    for parm in parms:
      parm.grad = None
    loss.backward()
    for parm in parms:
      parm.data -= 10**lr_exp[epoch] * parm.grad
  if epoch % 100 == 0:
    print(f"Epoch {epoch}, loss is around {losses[-1]}")

x = np.arange(len(losses))
fx = np.array(losses)
source = pd.DataFrame({
  'x': x,
  'f(x)': fx
})

alt.Chart(source).mark_line().encode(
    x='x',
    y='f(x)'
)

Epoch 0, loss is around 7.21897554397583
Epoch 100, loss is around 1.6332008838653564
Epoch 200, loss is around 1.6123837232589722
Epoch 300, loss is around 1.8806347846984863
Epoch 400, loss is around 1.8729294538497925
Epoch 500, loss is around 1.8615796566009521
Epoch 600, loss is around 1.8551467657089233
Epoch 700, loss is around 1.8522568941116333
Epoch 800, loss is around 1.8510527610778809
Epoch 900, loss is around 1.8505651950836182


In [15]:
import altair as alt
import pandas as pd

chars = list(".abcdefghijklmnopqrstuvwxyz")
char_type = ['start' if char == '.' else ('vowel' if char in "aeiouy" else 'consonant') for char in chars]
xy = C[[atoi(char) for char in chars]].cpu().T.data
data = pd.DataFrame({'char': chars,
                     'type': char_type,
                     'x': xy[0],
                     'y': xy[1],
                    })
alt.Chart(data).mark_text(align='center',
    baseline='middle',
    dx=0,
    dy=0,
    fontSize=14,
    fontWeight=600).encode(text='char', x='x', y='y', color='type')