In [115]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [116]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [117]:
len(words)

32033

In [118]:
chars = [chr(i) for i in range(97, 97 + 26)]
stoi = {s: i + 1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [119]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
  
  # print(w)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    # print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix] # crop and append
  
X = torch.tensor(X)
Y = torch.tensor(Y)

In [120]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [121]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [122]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [123]:
for p in parameters:
    p.requires_grad = True

In [128]:
for _ in range(1000):
    # minibatch construct
    ix = torch.randint(0, X.shape[0], (32,))
    # forward pass
    emb = C[X[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Y[ix])
    print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data += -0.01 * p.grad

3.0081984996795654
3.321211576461792
3.4563581943511963
2.555044174194336
2.5893754959106445
2.3814620971679688
2.8915741443634033
2.5867013931274414
2.811128616333008
2.531989574432373
2.620804786682129
2.6204469203948975
3.195596933364868
2.6940362453460693
3.0133068561553955
2.4713656902313232
2.687988042831421
2.495166540145874
2.836601495742798
2.7706642150878906
2.779841661453247
2.491499900817871
2.682466983795166
2.6625640392303467
2.772822618484497
2.6774349212646484
2.514741897583008
2.783639907836914
2.683440923690796
2.62882137298584
2.439507246017456
2.6930418014526367
3.2305943965911865
3.0262022018432617
2.804340362548828
2.6129231452941895
2.8836112022399902
2.799347400665283
2.7257936000823975
3.09517765045166
2.5167458057403564
2.412832021713257
2.8492753505706787
2.7930541038513184
2.31573224067688
2.804628610610962
2.8441731929779053
2.6647934913635254
2.062758207321167
2.972566604614258
3.2138612270355225
2.6504299640655518
2.4971444606781006
2.7007200717926025
2.7

In [131]:
emb = C[X] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Y)
loss

tensor(2.6765, grad_fn=<NllLossBackward0>)