In [25]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [26]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [27]:
len(words)

32033

In [28]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [29]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])


torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [30]:
Xtr.shape, Ytr.shape # dataset

(torch.Size([182625, 3]), torch.Size([182625]))

In [31]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [32]:
sum(p.nelement() for p in parameters)

3481

In [33]:
for p in parameters:
    p.requires_grad = True

In [34]:
for i in range(2000):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,)) # sample 32 at a time
    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27])
    loss = F.cross_entropy(logits, Ytr[ix])
    print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update
    lr = 0.1 if i < 1000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

19.492156982421875
15.893224716186523
17.50156593322754
17.20179557800293
13.083878517150879
15.48676872253418
12.041716575622559
11.982147216796875
12.2935791015625
11.77007007598877
10.57004451751709
8.111320495605469
12.873079299926758
8.109328269958496
9.94201946258545
9.274588584899902
9.384342193603516
9.082603454589844
9.701021194458008
10.259269714355469
8.159101486206055
10.363975524902344
8.872828483581543
6.904699325561523
8.141331672668457
6.304592132568359
6.426123142242432
6.226120948791504
5.723204135894775
6.649616241455078
7.094991683959961
6.361607074737549
5.666666030883789
8.414931297302246
5.500812530517578
7.081417560577393
7.298527240753174
6.5589447021484375
6.297941207885742
5.527717590332031
4.224705696105957
7.307300567626953
5.488017559051514
5.0774054527282715
6.597989559173584
6.980192184448242
6.4200119972229
6.736786365509033
4.549951076507568
4.139153957366943
5.537045955657959
5.52241325378418
5.264784812927246
6.276819705963135
5.082775115966797
5.502

In [35]:
emb = C[Xtr] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27])
loss = F.cross_entropy(logits, Ytr)
loss

tensor(2.5584, grad_fn=<NllLossBackward0>)

In [36]:
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27])
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.5506, grad_fn=<NllLossBackward0>)

In [37]:


# sample from the model
g = torch.Generator().manual_seed(2147483647)

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

der.
maleallesaila.
kayhn.
koliri.
tainrllzan.
kan.
a.
samiyauelgbsryi.
tas.
molie.
cavo.
karteda.
kaleyda.
sade.
enkavirny.
cols.
mhinin.
bvtallas.
dasdr.
ban.
