In [1]:
#import libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as f

plt.rcParams['axes.prop_cycle'] = plt.cycler(color = sns.color_palette('dark'))
plt.style.use('dark_background')
plt.rcParams['figure.figsize'] = (20 , 20)

In [2]:
#load dataset:
words = open('./data/names.txt').read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
#build character tokens:
chars = sorted(list(set(''.join(words))))
cti = {ch : i + 1 for i , ch in enumerate(chars)}
cti['.'] = 0
print(cti)
#build inverse tokens
itc = {i : ch for ch , i in cti.items()}
print(itc)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [53]:
#convert dataset to n-gram format -> context of size n to predict the future word.
x , y = [] , []
k = 3   #context size
for word in words[:5]:
    # print(word)
    context = [0] * k
    for ch in word + '.':
        x.append(context)
        y.append(cti[ch])
        # print(f"{''.join([itc[i] for i in context])} -> {ch}")
        context = context[1:] + [cti[ch]]
    # print("*" * 40)
x = torch.tensor(x)
y = torch.tensor(y)
x.shape , x.dtype , y.shape , y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [54]:
#build embedding matrix:
c = torch.randn((27 , 2))
emb = c[x]
emb.shape , emb[0]

(torch.Size([32, 3, 2]),
 tensor([[-0.1053,  0.8348],
         [-0.1053,  0.8348],
         [-0.1053,  0.8348]]))

In [55]:
c[cti['.']]

tensor([-0.1053,  0.8348])

In [87]:
#initialize weights for first layer:
w1 = torch.randn((6, 100))
b = torch.randn(100)

#change embedding shape from (n , 3 , 2) -> (n , 6):

h = torch.tanh(emb.reshape((emb.shape[0] , -1)) @ w1 + b)
h.shape

torch.Size([32, 100])

In [86]:
#compute outer layer:
w2 = torch.randn((100 , 27))
b2 = torch.randn(27)
logits = h @ w2 + b2
counts = logits.exp()
probs = counts / counts.sum(dim = 1 , keepdims= True)
torch.isnan(probs).sum()
loss = -probs[torch.arange(32) , y].log().mean()
loss

tensor(inf)

In [79]:
(counts.sum(dim = 1)).shape , (counts/ counts.sum(dim = 1 , keepdim = True)).shape

(torch.Size([32]), torch.Size([32, 27]))

In [93]:
#final network:
gen = torch.Generator().manual_seed(2147483647)
c = torch.randn((27 , 2) , generator= gen)
w1 = torch.randn((6 , 100) , generator = gen)
b1 = torch.randn(100 , generator= gen)
w2 = torch.randn((100 , 27) , generator= gen)
b2 = torch.randn(27 , generator = gen)
parameters = [c , w1 , b1 , w2 , b2]
print(f"# of parameters: {sum(el.nelement() for el in parameters)}")

# of parameters: 3481


In [94]:
emb = c[x]
h = torch.tanh(emb.reshape((32 , -1)) @ w1 + b1)
logits = h @ w2 + b2
# logits = logits - logits.max(dim = 1 , keepdim = True).values
print(torch.isnan(logits).sum())
counts = logits.exp()
print(torch.isnan(counts).sum())
probs = counts / counts.sum(dim = 1 , keepdims = True)
print(torch.isnan(probs).sum())
loss = -probs[torch.arange(32) , y].log().mean()
print(torch.isnan(loss).sum())
loss

tensor(0)
tensor(0)
tensor(0)
tensor(0)


tensor(17.7697)

In [83]:
x.shape , y.shape , counts.sum(dim = 1 , keepdim= True).shape , emb.shape , torch.arange(32).shape

(torch.Size([32, 3]),
 torch.Size([32]),
 torch.Size([32, 1]),
 torch.Size([32, 3, 2]),
 torch.Size([32]))