In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the dataset: names.txt holds one name per line.
# The context manager closes the file handle as soon as the read finishes instead
# of leaving an open handle behind for the garbage collector.
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()
# Preview the first eight names.
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Number of lines (names) read from names.txt.
len(words)

32033

In [5]:
# Vocabulary: every distinct character that occurs in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index. Numbering starts at 1 so that index 0 stays free for '.',
# the token used to pad the start of a context and to mark the end of a name.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Index -> character: the inverse mapping of stoi.
itos = dict((i, s) for s, i in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# Build (context -> next character) training pairs from the first five names.
# block_size is the number of preceding characters used to predict the next one.
block_size = 3
contexts, targets = [], []
for w in words[:5]:
    # Every name starts from a context made entirely of the '.' token (index 0).
    context = [0] * block_size
    for ch in f"{w}.":
        ix = stoi[ch]
        contexts.append(context)
        targets.append(ix)
        # Slide the window: drop the oldest index and append the one just seen.
        context = [*context[1:], ix]
X, Y = torch.tensor(contexts), torch.tensor(targets)
X, Y

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22],
         [ 9, 22,  9],
         [22,  9,  1],
         [ 0,  0,  0],
         [ 0,  0,  1],
         [ 0,  1, 22],
         [ 1, 22,  1],
         [ 0,  0,  0],
         [ 0,  0,  9],
         [ 0,  9, 19],
         [ 9, 19,  1],
         [19,  1,  2],
         [ 1,  2,  5],
         [ 2,  5, 12],
         [ 5, 12, 12],
         [12, 12,  1],
         [ 0,  0,  0],
         [ 0,  0, 19],
         [ 0, 19, 15],
         [19, 15, 16],
         [15, 16,  8],
         [16,  8,  9],
         [ 8,  9,  1]]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
          1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))

In [7]:
# Sanity check: X holds one row of block_size indices per example, Y one target per example.
X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([32, 3]), torch.Size([32]), torch.int64, torch.int64)

In [8]:
# Seed PyTorch's global RNG so that this and every later torch.randn call yields
# the same values on each Restart & Run All (the notebook was otherwise unseeded).
torch.manual_seed(2147483647)
# Embedding lookup table: one 2-dimensional vector for each of the 27 characters.
C = torch.randn((27, 2))

In [9]:
# Inspect the randomly initialised embedding table (one row per character index).
C

tensor([[ 0.6801,  1.0787],
        [-1.4146, -0.3287],
        [-0.6668, -0.2019],
        [ 1.0394, -0.5375],
        [-0.7690,  1.6483],
        [ 1.2095,  0.5990],
        [-0.2466, -0.5232],
        [-0.2043,  1.4570],
        [ 0.4200, -0.7183],
        [ 1.3345, -0.1253],
        [ 0.8496, -0.3642],
        [ 0.9389,  0.6935],
        [-0.4508, -0.4734],
        [-0.5621,  0.6765],
        [ 1.4382,  0.2040],
        [-0.5235, -0.1078],
        [ 0.1460,  1.9001],
        [-0.6188,  1.2268],
        [-0.6427,  1.8381],
        [ 0.3078,  0.2578],
        [ 0.3745,  0.5306],
        [ 1.5304, -0.8820],
        [ 0.9770, -1.8554],
        [-0.8653,  0.1883],
        [-0.5972, -0.8709],
        [ 0.3236,  2.1583],
        [-2.7509, -2.1017]])

In [10]:
# Embed every context: indexing C with the integer tensor X looks up one row of C
# per index, so emb has X's shape plus a trailing embedding dimension.
emb = C[X]
emb.shape, emb

(torch.Size([32, 3, 2]),
 tensor([[[ 0.6801,  1.0787],
          [ 0.6801,  1.0787],
          [ 0.6801,  1.0787]],
 
         [[ 0.6801,  1.0787],
          [ 0.6801,  1.0787],
          [ 1.2095,  0.5990]],
 
         [[ 0.6801,  1.0787],
          [ 1.2095,  0.5990],
          [-0.5621,  0.6765]],
 
         [[ 1.2095,  0.5990],
          [-0.5621,  0.6765],
          [-0.5621,  0.6765]],
 
         [[-0.5621,  0.6765],
          [-0.5621,  0.6765],
          [-1.4146, -0.3287]],
 
         [[ 0.6801,  1.0787],
          [ 0.6801,  1.0787],
          [ 0.6801,  1.0787]],
 
         [[ 0.6801,  1.0787],
          [ 0.6801,  1.0787],
          [-0.5235, -0.1078]],
 
         [[ 0.6801,  1.0787],
          [-0.5235, -0.1078],
          [-0.4508, -0.4734]],
 
         [[-0.5235, -0.1078],
          [-0.4508, -0.4734],
          [ 1.3345, -0.1253]],
 
         [[-0.4508, -0.4734],
          [ 1.3345, -0.1253],
          [ 0.9770, -1.8554]],
 
         [[ 1.3345, -0.1253],
          [ 0.9

In [11]:
# Spot check: the context stored at row 13 and the rows of C looked up for it.
X[13], C[X][13]

(tensor([0, 0, 1]),
 tensor([[ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [-1.4146, -0.3287]]))

In [12]:
# Hidden-layer parameters, drawn from a standard normal distribution:
# 6 inputs (3 context characters x 2 embedding values each) feeding 100 hidden units.
W1 = torch.randn(6, 100)
# One bias per hidden unit, i.e. per column of W1.
b1 = torch.randn(W1.shape[1])
W1, b1

(tensor([[ 5.6446e-01,  5.7322e-01, -1.0658e+00, -8.8464e-01, -1.6878e+00,
           1.7789e+00, -7.0581e-01, -1.1461e+00,  5.7458e-01,  4.4989e-01,
           2.4812e-02,  1.9764e-01, -9.2092e-01, -6.1858e-01,  2.3695e-01,
          -4.8251e-01, -1.9106e+00,  5.3021e-01, -1.4214e-01, -5.1786e-01,
          -5.8515e-01, -1.3208e+00,  2.2127e-01, -1.5059e+00,  8.4437e-01,
           1.2612e+00, -1.7412e+00,  2.1107e+00,  2.1371e+00,  1.1199e+00,
           3.4250e-01,  1.3917e+00, -1.3459e+00,  6.8115e-01,  1.0716e+00,
           3.1565e-01,  2.5913e-01, -9.0852e-01, -2.0255e-01, -6.3639e-01,
          -1.6248e-01, -5.4301e-01,  4.7667e-01,  2.8950e-02,  1.5940e+00,
          -1.0527e-02,  1.5972e+00, -7.6902e-01, -1.6148e-01, -5.9776e-01,
           1.1040e-01,  7.4919e-01, -1.5797e+00, -8.3646e-01, -5.6052e-01,
           4.4806e-01, -4.0644e-01, -5.2072e-01, -1.1130e+00,  5.3527e-01,
           6.6605e-01, -2.8036e-01, -1.9393e-01,  5.1311e-01, -8.7106e-02,
          -2.0306e+00, -1

In [13]:
# Confirm the hidden-layer parameter shapes.
W1.shape, b1.shape

(torch.Size([6, 100]), torch.Size([100]))

In [14]:
# The embeddings of the three context positions, taken one position at a time
# (each slice has one row per example).
emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]

(tensor([[ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 1.2095,  0.5990],
         [-0.5621,  0.6765],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [-0.5235, -0.1078],
         [-0.4508, -0.4734],
         [ 1.3345, -0.1253],
         [ 0.9770, -1.8554],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [-1.4146, -0.3287],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 1.3345, -0.1253],
         [ 0.3078,  0.2578],
         [-1.4146, -0.3287],
         [-0.6668, -0.2019],
         [ 1.2095,  0.5990],
         [-0.4508, -0.4734],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.3078,  0.2578],
         [-0.5235, -0.1078],
         [ 0.1460,  1.9001],
         [ 0.4200, -0.7183]]),
 tensor([[ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 1

In [15]:
# Concatenate the three per-position embeddings along dim 1, giving one flat row
# per example whose width matches the number of rows in W1.
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1)

tensor([[ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  1.2095,  0.5990],
        [ 0.6801,  1.0787,  1.2095,  0.5990, -0.5621,  0.6765],
        [ 1.2095,  0.5990, -0.5621,  0.6765, -0.5621,  0.6765],
        [-0.5621,  0.6765, -0.5621,  0.6765, -1.4146, -0.3287],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787, -0.5235, -0.1078],
        [ 0.6801,  1.0787, -0.5235, -0.1078, -0.4508, -0.4734],
        [-0.5235, -0.1078, -0.4508, -0.4734,  1.3345, -0.1253],
        [-0.4508, -0.4734,  1.3345, -0.1253,  0.9770, -1.8554],
        [ 1.3345, -0.1253,  0.9770, -1.8554,  1.3345, -0.1253],
        [ 0.9770, -1.8554,  1.3345, -0.1253, -1.4146, -0.3287],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787, -1.4146, -0.3287],
        [ 0.6801,  1.0787, -1.4146, -0.3287,  0.9770, -1.8554],
        [-1.4146, -0.3287,  0.9770, -1.8

In [16]:
# torch.unbind(emb, 1) removes dim 1 and returns a tuple of its slices, i.e. the
# same per-position tensors as emb[:, 0, :], emb[:, 1, :], emb[:, 2, :].
torch.unbind(emb, 1)

(tensor([[ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 1.2095,  0.5990],
         [-0.5621,  0.6765],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [-0.5235, -0.1078],
         [-0.4508, -0.4734],
         [ 1.3345, -0.1253],
         [ 0.9770, -1.8554],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [-1.4146, -0.3287],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 1.3345, -0.1253],
         [ 0.3078,  0.2578],
         [-1.4146, -0.3287],
         [-0.6668, -0.2019],
         [ 1.2095,  0.5990],
         [-0.4508, -0.4734],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 0.3078,  0.2578],
         [-0.5235, -0.1078],
         [ 0.1460,  1.9001],
         [ 0.4200, -0.7183]]),
 tensor([[ 0.6801,  1.0787],
         [ 0.6801,  1.0787],
         [ 1

In [17]:
# Combine cat and unbind so that the number of context positions is not hard-coded.
torch.cat(torch.unbind(emb , 1), 1)
# Same result as torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1), but the
# slices are not listed by hand, so it keeps working when block_size changes.

tensor([[ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  1.2095,  0.5990],
        [ 0.6801,  1.0787,  1.2095,  0.5990, -0.5621,  0.6765],
        [ 1.2095,  0.5990, -0.5621,  0.6765, -0.5621,  0.6765],
        [-0.5621,  0.6765, -0.5621,  0.6765, -1.4146, -0.3287],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787, -0.5235, -0.1078],
        [ 0.6801,  1.0787, -0.5235, -0.1078, -0.4508, -0.4734],
        [-0.5235, -0.1078, -0.4508, -0.4734,  1.3345, -0.1253],
        [-0.4508, -0.4734,  1.3345, -0.1253,  0.9770, -1.8554],
        [ 1.3345, -0.1253,  0.9770, -1.8554,  1.3345, -0.1253],
        [ 0.9770, -1.8554,  1.3345, -0.1253, -1.4146, -0.3287],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787, -1.4146, -0.3287],
        [ 0.6801,  1.0787, -1.4146, -0.3287,  0.9770, -1.8554],
        [-1.4146, -0.3287,  0.9770, -1.8

In [18]:
# A cheaper alternative to cat/unbind: reshape with Tensor.view.
emb.view(emb.shape[0], 6)
# Same values as torch.cat(torch.unbind(emb, 1), 1). view returns a tensor that shares
# emb's underlying data and only changes how it is laid out into rows and columns,
# whereas cat builds a new tensor and copies the values into it.
# The hard-coded 6 is block_size (3) * embedding size (2).

tensor([[ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  1.2095,  0.5990],
        [ 0.6801,  1.0787,  1.2095,  0.5990, -0.5621,  0.6765],
        [ 1.2095,  0.5990, -0.5621,  0.6765, -0.5621,  0.6765],
        [-0.5621,  0.6765, -0.5621,  0.6765, -1.4146, -0.3287],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787, -0.5235, -0.1078],
        [ 0.6801,  1.0787, -0.5235, -0.1078, -0.4508, -0.4734],
        [-0.5235, -0.1078, -0.4508, -0.4734,  1.3345, -0.1253],
        [-0.4508, -0.4734,  1.3345, -0.1253,  0.9770, -1.8554],
        [ 1.3345, -0.1253,  0.9770, -1.8554,  1.3345, -0.1253],
        [ 0.9770, -1.8554,  1.3345, -0.1253, -1.4146, -0.3287],
        [ 0.6801,  1.0787,  0.6801,  1.0787,  0.6801,  1.0787],
        [ 0.6801,  1.0787,  0.6801,  1.0787, -1.4146, -0.3287],
        [ 0.6801,  1.0787, -1.4146, -0.3287,  0.9770, -1.8554],
        [-1.4146, -0.3287,  0.9770, -1.8

In [19]:
# Hidden layer: flatten each example's context embeddings into a single row, apply
# the affine map (W1, b1), then squash the result with tanh.
# The flattened width is derived from emb's own shape instead of being hard-coded
# as 6, so this line follows block_size / embedding-size changes automatically
# (W1 must still have the matching number of rows).
n_examples, n_context, n_embd = emb.shape
h = torch.tanh(emb.view(n_examples, n_context * n_embd) @ W1 + b1)
h, h.shape
# Broadcasting: b1 has one value per hidden unit and is added to every row of the
# (n_examples, hidden units) matrix product.

(tensor([[-0.9999, -0.9918, -0.8626,  ...,  0.8578,  0.9879,  0.9868],
         [-1.0000, -0.9953, -0.7313,  ...,  0.7914,  0.7311,  0.9714],
         [-0.9857, -0.4610,  0.8747,  ...,  0.8123,  0.9998,  0.9847],
         ...,
         [-1.0000,  0.1054,  0.9542,  ..., -0.6029, -0.9866,  0.7940],
         [-0.9914, -0.9999,  0.9784,  ...,  0.6346, -0.0836, -0.1875],
         [ 0.8698,  0.9952,  0.9997,  ..., -0.0930,  0.9488,  0.9238]]),
 torch.Size([32, 100]))

In [20]:
# Output-layer parameters: 100 hidden activations -> 27 scores, one per character.
W2 = torch.randn(100, 27)
# One bias per output score, i.e. per column of W2.
b2 = torch.randn(W2.shape[1])

In [21]:
# Output scores (logits): one row per example, one column per character.
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [22]:
# Exponentiate the logits to get strictly positive values that can be read as
# unnormalised "counts" for each character.
# NOTE(review): exp grows very quickly (values around 1e9 already appear in the
# output of this cell), so large logits could overflow to inf — confirm this is acceptable.
counts = logits.exp()
counts

tensor([[1.1253e+02, 7.6225e-01, 1.6098e+04, 1.5970e+06, 3.4708e+04, 2.8725e+02,
         2.9077e-05, 1.6988e+06, 3.6304e+03, 7.0718e-03, 4.2636e-07, 1.2676e+04,
         1.2436e+04, 4.8907e-02, 1.2739e-05, 2.0333e-01, 8.9917e-06, 1.1470e-07,
         1.3226e+03, 3.9036e+06, 2.5330e+04, 1.4791e-03, 4.4970e+02, 1.4910e+04,
         2.5620e+03, 1.5208e+03, 5.8795e-01],
        [1.6404e+03, 3.9331e+00, 3.2003e+04, 1.4221e+09, 4.2275e+05, 7.2413e+02,
         4.8633e-02, 2.8727e+06, 4.3222e+00, 1.4333e-02, 1.0302e-05, 9.3116e+01,
         6.0575e+03, 1.0224e-02, 9.0347e-04, 1.6078e-03, 3.8061e-04, 1.2392e-07,
         2.9485e+03, 6.5621e+04, 9.7917e+01, 1.8935e-04, 5.0060e+03, 5.4720e+05,
         7.9554e+01, 4.8489e+04, 1.2739e+00],
        [7.0461e-02, 2.2809e+00, 8.3612e-04, 1.7116e+04, 2.4957e+00, 8.5643e+01,
         3.8005e-05, 7.2468e+02, 1.0682e+02, 2.5403e-03, 1.0774e-05, 5.1795e+03,
         1.3281e+04, 1.1540e+02, 9.5201e-03, 5.6337e+01, 6.6637e-02, 1.6938e-06,
         4.1786e-

In [23]:
# Turn each row of logits into a probability distribution over the 27 characters.
# F.softmax yields exp(logits) / exp(logits).sum(1, keepdim=True), i.e. the same
# quantity as counts / counts.sum(1, keepdims=True), but evaluates it in a
# numerically stable way: exponentiating large logits by hand can overflow to inf
# and turn the division into nan.
prob = F.softmax(logits, dim=1)

In [24]:
# Sanity check: one distribution per example, and the first row sums to 1.
prob.shape, prob[0].sum()

(torch.Size([32, 27]), tensor(1.0000))