In [123]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt 
%matplotlib inline 

In [124]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [125]:
len(words)

32033

In [126]:
#build vocab with numbers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
print(stoi)
itos = {i:s for s, i in stoi.items()}
itos

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [127]:
block_size = 3 # context length 
X, Y = [], []

for w in words:
    #print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print(''.join(itos[i] for i in context), '-->', itos[ix])
        context = context[1:] + [ix] # crop and append 
        
X = torch.tensor(X)
Y = torch.tensor(Y)    

In [128]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

## Embedding lookup table 

In [129]:
C = torch.randn((27, 2))

In [130]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([ 1.6514, -0.8885])

In [131]:
emb = C[X]
emb.shape

torch.Size([228146, 3, 2])

In [132]:
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [133]:
torch.cat([emb[:, 0,:], emb[:, 1,:], emb[:, 2,:]], 1).shape

torch.Size([228146, 6])

In [134]:
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([228146, 6])

In [135]:
torch.cat(torch.unbind(emb, 1), 1) == emb.view(32, 6)

RuntimeError: shape '[32, 6]' is invalid for input of size 1368876

In [136]:
logits1 = emb.view(emb.shape[0], -1) @ W1 + b1 #ensure correct broadcasting
logits1

tensor([[-3.7945, -2.0203, -1.0025,  ..., -0.5472,  2.3649,  0.9610],
        [-0.5094, -1.3745,  0.3461,  ..., -2.1075, -1.7925, -2.3127],
        [-3.2189,  2.8527,  1.1524,  ..., -2.4982,  3.9070,  1.2046],
        ...,
        [-1.0412,  0.5147, -0.7576,  ..., -0.6026, -0.6371, -0.3555],
        [-0.0694,  0.4640, -1.6301,  ...,  0.2602,  1.0451, -0.5038],
        [-2.3307,  0.4289, -2.0160,  ...,  0.5922, -0.4157,  1.4653]])

In [137]:
h = torch.tanh(logits1)

In [138]:
h.shape

torch.Size([228146, 100])

In [139]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [140]:
logits = h @ W2 + b2 
logits.shape

torch.Size([228146, 27])

In [141]:
counts = logits.exp()
prob = counts/counts.sum(1, keepdims=True)

In [142]:
Y

tensor([ 5, 13, 13,  ..., 26, 24,  0])

In [143]:
# nll loss
loss = -prob[torch.arange(32), Y].log().mean()
loss

IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [32], [228146]

In [144]:
# reproduce and clean up 

X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [145]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 50), generator=g)
W1 = torch.randn((150, 500), generator=g)
b1 = torch.randn(500, generator=g)
W2 = torch.randn((500, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [146]:
sum(p.nelement() for p in parameters) # num of params 

90377

In [147]:
for p in parameters:
    p.requires_grad = True 

In [148]:
for batch in range(50000):
    #minibatch construct 
    ix = torch.randint(0, X.shape[0], (32,))
    
    # forward pass 
    emb = C[X[ix]]
    h = torch.tanh(emb.view(-1, 150) @ W1 + b1)
    logits = h @ W2 + b2
    #counts = logits.exp()
    # prob = counts/counts.sum(1, keepdims=True)
    # loss = -prob[torch.arange(32), Y].log().mean()
    # loss
    loss = F.cross_entropy(logits, Y[ix])
 
    for p in parameters:
        p.grad = None
        
    loss.backward()
    # backward pass
    for p in parameters:
        p.data += -0.01 * p.grad
        
print(loss.item())    

1.7637887001037598


In [149]:
# forward pass 
emb = C[X]
h = torch.tanh(emb.view(-1, 150) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
print(loss)

tensor(3.0707, grad_fn=<NllLossBackward>)
