In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
words = open('names.txt','r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [7]:
len(words)

32033

In [11]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [58]:
block_size = 3 # context length: how many characters should we look at at once
X,Y = [], []
for w in words:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

### Goal
- get X tensor to be transformed into Y tensor
- we initalize to random numbers 

In [59]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [60]:
C = torch.randn((27,2))

In [61]:
C

tensor([[-1.4959,  0.5486],
        [ 1.4653, -1.6270],
        [-0.0720, -0.7127],
        [ 1.0548, -0.3560],
        [ 0.2174,  0.3312],
        [-0.6899, -0.3493],
        [ 1.9372,  0.0580],
        [ 0.1691,  0.1341],
        [ 1.1063, -0.2964],
        [ 0.5786, -0.5422],
        [-0.6661,  1.1575],
        [-0.7267, -1.3268],
        [-0.5680, -1.1404],
        [ 1.1535,  0.2077],
        [-0.7752,  0.7500],
        [-0.1632,  0.3342],
        [ 0.3085,  0.3519],
        [ 1.1856,  0.0638],
        [-0.3811, -1.3024],
        [-2.2456,  1.0355],
        [ 0.3671,  0.3592],
        [ 1.3277, -0.7941],
        [ 0.4666, -1.0034],
        [-0.2414, -0.4423],
        [-0.0833,  0.2128],
        [-0.3004, -0.2000],
        [ 1.5248, -2.4851]])

In [62]:
emb = C[X]
emb.shape

torch.Size([228146, 3, 2])

In [63]:
W1 = torch.randn((6,100))
b1 = torch.randn(100)

In [64]:
torch.cat([emb[:,0,:], emb[:,1,:], emb[:,2,:]],1).shape

torch.Size([228146, 6])

In [65]:
# this does the same thing as the code above but generalized
torch.cat(torch.unbind(emb,1),1).shape

torch.Size([228146, 6])

In [66]:
a = torch.arange(18)
a.view(3,3,2) # Very efficent reorganizes the 18 elements

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [67]:
emb.shape

torch.Size([228146, 3, 2])

In [68]:
h = torch.tanh(emb.view(emb.shape[0],6) @ W1 + b1)

In [69]:
h

tensor([[-0.9764, -0.9998, -0.9985,  ..., -0.9899,  0.9931, -0.9993],
        [-0.6162, -0.9992, -0.9750,  ..., -0.9968,  0.9998, -0.9756],
        [ 0.9533, -0.3049, -0.6274,  ..., -0.4537,  0.9974,  0.9916],
        ...,
        [ 0.5615,  1.0000, -0.5668,  ...,  1.0000, -1.0000,  0.7125],
        [ 0.9972,  1.0000,  0.9052,  ...,  0.9976,  0.1745,  0.4146],
        [ 0.5477,  0.9972, -0.1077,  ...,  0.9986, -0.4863,  0.9997]])

In [70]:
W2 = torch.randn((100,27))
b2 = torch.randn(27)

In [71]:
logits = h @ W2 + b2
logits.shape

torch.Size([228146, 27])

In [72]:
# counts = logits.exp()

In [73]:
# prob = counts / counts.sum(1,keepdims=True)

In [74]:
# prob.shape

In [75]:
# loss = -prob[torch.arange(32), Y].log().mean()
loss

tensor(0.2859, grad_fn=<NllLossBackward0>)

In [76]:
loss = F.cross_entropy(logits, Y) # replaces all of the commented lines

In [77]:
parameters = [C,W1,W2,b1,b2]

In [78]:
for p in parameters:
    p.requires_grad = True

In [83]:
# Forward pass
for _ in range (10):
    ix = torch.randint(0,X.shape[0],(32,)) # Minibatch construct
    emb = C[X[ix]]
    h = torch.tanh(emb.view(-1,6) @ W1 +b1)
    logits = h @ W2 +b2
    loss = F.cross_entropy(logits,Y[ix])
    for p in parameters:
        p.grad = None
    loss.backward()
    for p in parameters:
        p.data += -.1*p.grad
        
print(loss.item())

3.1899073123931885


In [81]:
torch.randint(0,X.shape[0],(32,))

tensor([  9412,  17402, 202574,  23759, 139326, 160478, 133368,  71927,  52673,
        219645, 221026, 186312,  91022, 117111, 135566, 103331,  28176,   8706,
        135977, 160640,  18188, 175423,  63686,  30565, 201930,  57826, 217499,
        180004,  64878,  29785, 227535, 139233])

In [85]:
X.shape[0]

228146

In [87]:
C[X[100]]

tensor([[ 0.1128, -0.6394],
        [ 0.1128, -0.6394],
        [-0.0146, -0.4521]], grad_fn=<IndexBackward0>)