In [1]:
import math
import numpy

import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

# Input

In [2]:
# Load the dataset: one name per line in names.txt.
# The `with` block closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

print(len(words))   # number of names loaded
words[:8]           # peek at the first few entries

32033


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

lookup tables

In [3]:
# Alphabet: the 26 lowercase letters, in order.
chars = [chr(code) for code in range(ord('a'), ord('z') + 1)]

# char -> index. Letters take 1..26; '.' is added last and gets index 0.
stoi_lookup = dict(zip(chars, range(1, len(chars) + 1)))
stoi_lookup['.'] = 0

# index -> char: the inverse of stoi_lookup.
itos_lookups = dict(zip(stoi_lookup.values(), stoi_lookup.keys()))

### Dataset creation

-  both BOS and EOS are represented using '.'

- context - characters to be considered for the next prediction
    - What is the context for the first letter? Only '.' padding; the number of '.' equals the context length.


- make sure to represent the EOS as well

In [32]:
block_size = 3 # context length

# Build (context, target) pairs from the first 5 words.
# X collects the contexts (block_size character indices each),
# Y collects the index of the character that follows each context.
X, Y = [], []

for word in words[:5]:
    # every word starts from an all-'.' context (index 0)
    window = [0] * block_size

    # the trailing '.' makes the end of the word a prediction target too
    for ch in word + '.':
        target = stoi_lookup[ch]
        X.append(window)
        Y.append(target)

        # slide the window: drop the oldest index, append the newest
        window = [*window[1:], target]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [33]:
# Sanity check: display the shape and dtype of the context tensor X and the target tensor Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [34]:
# Print every training example: the context row from X next to its target from Y
for ctx_row, target in zip(X, Y):
    print("context: ", ctx_row, " next char: ", target)

context:  tensor([0, 0, 0])  next char:  tensor(5)
context:  tensor([0, 0, 5])  next char:  tensor(13)
context:  tensor([ 0,  5, 13])  next char:  tensor(13)
context:  tensor([ 5, 13, 13])  next char:  tensor(1)
context:  tensor([13, 13,  1])  next char:  tensor(0)
context:  tensor([0, 0, 0])  next char:  tensor(15)
context:  tensor([ 0,  0, 15])  next char:  tensor(12)
context:  tensor([ 0, 15, 12])  next char:  tensor(9)
context:  tensor([15, 12,  9])  next char:  tensor(22)
context:  tensor([12,  9, 22])  next char:  tensor(9)
context:  tensor([ 9, 22,  9])  next char:  tensor(1)
context:  tensor([22,  9,  1])  next char:  tensor(0)
context:  tensor([0, 0, 0])  next char:  tensor(1)
context:  tensor([0, 0, 1])  next char:  tensor(22)
context:  tensor([ 0,  1, 22])  next char:  tensor(1)
context:  tensor([ 1, 22,  1])  next char:  tensor(0)
context:  tensor([0, 0, 0])  next char:  tensor(9)
context:  tensor([0, 0, 9])  next char:  tensor(19)
context:  tensor([ 0,  9, 19])  next char:

### Embeddings lookup table

In [Bengio et al.](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), 17k words were embedded into 30 dims.

We have to embed 27 chars into a small number of dims; let's start with a 2-dim embedding for now.

In [57]:
# Embedding lookup table: 27 rows (one per character), each a random
# `dims`-dimensional vector drawn from a standard normal.
dims = 2
C = torch.randn((27, dims))

# Print each character next to its embedding row, for inspection only
for idx in range(len(C)):
    print(itos_lookups[idx], ": ", C[idx])


. :  tensor([-1.0912, -1.5305])
a :  tensor([-0.0173, -0.3021])
b :  tensor([0.1031, 1.1025])
c :  tensor([-1.3304,  0.1416])
d :  tensor([-1.2072,  1.6474])
e :  tensor([-1.0494, -1.4846])
f :  tensor([-0.0662,  0.2333])
g :  tensor([-0.6286,  0.0836])
h :  tensor([-0.4492,  1.0370])
i :  tensor([ 1.0079, -1.0480])
j :  tensor([-1.1377,  1.1776])
k :  tensor([-1.8299, -0.3040])
l :  tensor([0.5433, 0.0497])
m :  tensor([ 0.6874, -0.3776])
n :  tensor([ 0.9250, -1.2943])
o :  tensor([0.1519, 0.4443])
p :  tensor([0.4901, 0.5606])
q :  tensor([ 0.0579, -0.9379])
r :  tensor([0.7701, 0.9760])
s :  tensor([-0.3260,  1.3472])
t :  tensor([ 0.5321, -1.5233])
u :  tensor([1.4296, 1.6443])
v :  tensor([-1.7678, -1.4295])
w :  tensor([-0.0855,  2.2427])
x :  tensor([-1.0920,  1.1340])
y :  tensor([-0.2220,  0.1523])
z :  tensor([0.6374, 0.0944])


now we have to embed the integers in the input X using the above lookup table C

In [58]:
# Index the lookup table C with the integer tensor X: every index in X is
# replaced by the corresponding row of C, adding a trailing embedding axis.
embs = C[X]
embs.shape

torch.Size([32, 3, 2])

### Hidden Layer

- What would be the input to this layer?

    The embeddings have shape [32, 3, 2]: we have 32 inputs, each input has three characters of context, and each character is a 2-dim embedding. Flattened per input, that gives 3 * 2 = 6 values going into the hidden layer.

- How many neurons should the layer have? We can try different values; let's say 100 for now.

In [102]:
# Hidden layer: fully connected + tanh.
# Sizes are derived rather than hard-coded so the cell keeps working when the
# context length, embedding size, or number of training examples changes.
n_hidden = 100                      # number of hidden neurons
fan_in = block_size * dims          # flattened context: one `dims`-vector per character

W1 = torch.randn((fan_in, n_hidden))
b1 = torch.randn(n_hidden)

# view(-1, fan_in): -1 lets PyTorch infer the number of examples from embs,
# instead of assuming the 32 examples of the 5-word debugging subset.
h = torch.tanh(embs.view(-1, fan_in) @ W1 + b1)

### Output Layer

In [100]:
# Output layer: map the 100 activations in h to 27 scores (one per character),
# then normalise each row into a probability distribution.
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

logits = h @ W2 + b2
counts = torch.exp(logits)                      # positive, un-normalised scores
row_totals = counts.sum(dim=1, keepdim=True)    # one total per input row
prob = counts / row_totals
print(prob[0].shape) # each input row holds a probability for all 27 chars

torch.Size([27])


In [111]:
# Row indices for every example; taken from Y so the cell does not break when
# the dataset is no longer the 32 examples of the 5-word subset.
ix = torch.arange(Y.shape[0])

# Y holds the index of the correct next character (the target) for each row.
# prob[ix, Y] picks the probability the model assigned to that target;
# the mean negative log of those probabilities is the negative log-likelihood.
loss = -prob[ix, Y].log().mean() # nll
loss

tensor(15.6169)

Everything together, i.e. the network so far:

In [114]:
print("the dataset: ", X.shape, Y.shape)

# Hyperparameters, named once so every shape below stays consistent and the
# cell does not depend on a `dims` value left over from an earlier cell.
vocab_size = 27     # number of rows in the embedding table / output scores
emb_dim = 2         # embedding size per character
n_hidden = 100      # hidden layer width

# Seeded generator so the initial parameters (and the loss) are reproducible.
# The draw order and shapes below match the original cell.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, emb_dim), generator=g)
W1 = torch.randn((block_size * emb_dim, n_hidden), generator=g)
b1 = torch.randn(n_hidden, generator=g)
W2 = torch.randn((n_hidden, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)

params = [C, W1, b1, W2, b2]

# Forward pass.
embs = C[X]
# view(-1, ...) infers the number of examples instead of hard-coding 32.
h = torch.tanh(embs.view(-1, block_size * emb_dim) @ W1 + b1)
logits = h @ W2 + b2

# Mean negative log-likelihood of the targets Y. F.cross_entropy replaces the
# manual exp / normalise / log / mean chain: same quantity, but it does not
# overflow to inf/nan when logits are large.
loss = F.cross_entropy(logits, Y)
loss

the dataset:  torch.Size([32, 3]) torch.Size([32])


tensor(17.7697)