In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
words = open('names.txt', 'r').read().splitlines()
words[:8]


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
chars = sorted(list(set(''.join(words))))
chars = ['.'] + chars
# string to integer encoding
stoi = {s: i for i, s in enumerate(chars)}
print(stoi)

# integer to string encoding
itos = {i: s for i, s in enumerate(chars)}
print(itos)

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [12]:
# build the dataset

# use more than one char to predict the next char

block_size = 5

# the dataset would look like this
# X[0] = [0, 0, 0, ..., 0] -> Y[0] = [4] => the first character is a itos(4) = D
#        |--block_size---|
# X[1] = [0, 0, 0, ..., 1] -> Y[1] = [1] => the second character is a itos(1) = A

def build_dataset(words):
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    
    # end the word with a '.'
    for ch in w + '.':
      int_x = stoi[ch]
      X.append(context)
      Y.append(int_x)
    
      # remove oldest char and add latest 
      context = context[1:] + [int_x]
  
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y
  
# split the dataset into 3 sets - train, val and test
import random
random.seed(123)
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtrain, Ytrain = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])


torch.Size([182543, 5]) torch.Size([182543])
torch.Size([22787, 5]) torch.Size([22787])
torch.Size([22816, 5]) torch.Size([22816])


In [24]:
# create a lookup table for each neuron that maps the 27 characters to a 10D sapce
C = torch.randn((27, 10))
# C is basically random data through which we will pass of our input data
# and create it's embedded form
C[0]

tensor([-0.5769, -0.3844,  0.7500,  0.0776,  0.8179,  0.4520,  0.8600,  0.0649,
         0.1095, -0.0740])

In [36]:
# Now we need our embeddings
print(Xtrain.shape, C.shape)
emb = C[Xtrain]
emb.shape


torch.Size([182543, 5]) torch.Size([27, 10])


torch.Size([182543, 5, 10])

In [19]:
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([182543, 50])

In [44]:
# create a generator 
g = torch.Generator().manual_seed(12315324)
C = torch.randn((27, 10), generator=g)

hiddenLayerSize = 150
hiddenLayerInput = emb.shape[1] * emb.shape[2]
numOfPossibleOutputs = 27 # 26 characters + "."

W1 = torch.randn((hiddenLayerInput, hiddenLayerSize), generator=g)
b1 = torch.randn(hiddenLayerSize, generator=g)
W2 = torch.randn((hiddenLayerSize, numOfPossibleOutputs), generator=g)
b2 = torch.randn(numOfPossibleOutputs, generator=g)
parameters = [C, W1, b1, W2, b2]

In [45]:
sum(p.nelement() for p in parameters)

11997

In [46]:
for p in parameters:
  p.requires_grad = True

In [47]:
print(torch.cat(torch.unbind(emb, 1), 1).shape)
print(emb.view(-1, 50).shape)
print(torch.cat(torch.unbind(emb, 1), 1) == emb.view(-1, 50))

torch.Size([182543, 50])
torch.Size([182543, 50])
tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])


In [66]:
iters = 100000
for i in range(iters):
  # forward pass
  emb = C[Xtrain]
  h = torch.tanh(emb.view(-1, 50) @ W1 + b1)
  logits = h @ W2 + b2

  # loss
  loss = F.cross_entropy(logits, Ytrain)

  # backward pass
  for p in parameters:
    p.grad = None

  loss.backward()

  # Update weights
  for p in parameters:
    p.data += -0.1 * p.grad

  print(loss.data)




tensor(12.5689)
tensor(12.3501)
tensor(12.1487)
tensor(11.9636)
tensor(11.7922)
tensor(11.6427)
tensor(11.4865)
tensor(11.3527)
tensor(11.2027)
tensor(11.0548)
tensor(10.9313)
tensor(10.8022)
tensor(10.6891)
tensor(10.5717)
tensor(10.4568)
tensor(10.3417)
tensor(10.2334)
tensor(10.1319)
tensor(10.0410)
tensor(9.9565)
tensor(9.8886)
tensor(9.8232)
tensor(9.7942)
tensor(9.6651)
tensor(9.5706)
tensor(9.4848)
tensor(9.4091)
tensor(9.3459)
tensor(9.2866)
tensor(9.2379)
tensor(9.1678)
tensor(9.1183)
tensor(9.0385)
tensor(8.9992)
tensor(8.9109)
tensor(8.8707)
tensor(8.8073)
tensor(8.7565)
tensor(8.6874)
tensor(8.6553)
tensor(8.5711)
tensor(8.5268)
tensor(8.4757)
tensor(8.4391)
tensor(8.3732)
tensor(8.3496)
tensor(8.2725)
tensor(8.2314)
tensor(8.1868)
tensor(8.1637)
tensor(8.0976)
tensor(8.0716)
tensor(8.0105)
tensor(7.9766)
tensor(7.9374)
tensor(7.9266)
tensor(7.8485)
tensor(7.8085)
tensor(7.7768)
tensor(7.7533)
tensor(7.7130)
tensor(7.6972)
tensor(7.6305)
tensor(7.5993)
tensor(7.5754)
tensor

KeyboardInterrupt: 

In [68]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
  out = []
  ctx = [0] * block_size
  
  while True:
    emb = C[torch.tensor([ctx])]
    h = torch.tanh(emb.view(-1, 50) @ W1 + b1)
    logits = h @ W2 + b2
    probs = F.softmax(logits, dim=1)
    ix = torch.multinomial(probs, num_samples=1, generator=g).item()
    ctx = ctx[1:] + [ix]
    out.append(ix)
    if ix == 0:
      break
    
  print(''.join(itos[i] for i in out))

careah.
aablil.
khkimri.
rehty.
sacanane.
rahnen.
deliyha.
kaeei.
nerania.
ceriir.
kalenn.
dhlmo.
diniqhann.
salin.
alian.
qurrae.
maijaryni.
jaceininsa.
medde.
iia.


In [90]:
# ctx = [0] * block_size
# out = []

emb = C[torch.tensor([ctx])]
h = torch.tanh(emb.view(-1, 50) @ W1 + b1)
logits = h @ W2 + b2
print(f'{logits=} {logits.shape=}')
probs = F.softmax(logits, dim=1)
print(f'{probs=} {probs.shape=}')
ix = torch.multinomial(probs, num_samples=1, generator=g).item()
print(f'{ix}')
ctx = ctx[1:] + [ix]
out.append(ix)

logits=tensor([[  3.3956,   6.5123,  -3.1138, -10.4414,  -6.2226,   5.0934,  -6.2482,
         -14.9007,   5.8073,  -0.3638,   4.4702,   0.3045,   5.6061,   5.8509,
           5.9202,   5.0284,  -4.1170,  -2.9448,   5.5049,   7.5590,  -8.1065,
          -0.1808,   3.7585,  -8.1800,  -7.2704,   2.1393,   9.5375]],
       grad_fn=<AddBackward0>) logits.shape=torch.Size([1, 27])
probs=tensor([[1.6109e-03, 3.6360e-02, 2.3992e-06, 1.5766e-09, 1.0713e-07, 8.7982e-03,
         1.0443e-07, 1.8242e-11, 1.7967e-02, 3.7529e-05, 4.7180e-03, 7.3215e-05,
         1.4692e-02, 1.8766e-02, 2.0114e-02, 8.2450e-03, 8.7980e-07, 2.8409e-06,
         1.3278e-02, 1.0357e-01, 1.6284e-08, 4.5067e-05, 2.3157e-03, 1.5130e-08,
         3.7573e-08, 4.5861e-04, 7.4895e-01]], grad_fn=<SoftmaxBackward0>) probs.shape=torch.Size([1, 27])
26


In [92]:
out
print(''.join(itos[i] for i in out))

kalaras.ipvaz
