In [1]:
# MLP character-level language model, following Bengio et al. (2003)

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [3]:
# Read the whole names file, one name per line.
# The `with` block guarantees the file handle is closed as soon as the
# contents are read, instead of leaking it until garbage collection.
with open('cleaned_names.txt', 'r') as f:
  words = f.read().splitlines()
words[:8]

['aaban', 'aabid', 'aabidah', 'aabir', 'aabriella', 'aada', 'aadam', 'aadarsh']

In [4]:
len(words)

29681

In [5]:
# Character vocabulary plus the two lookup tables between characters and integer ids.
chars = sorted(set(''.join(words)))  # every distinct character that appears in any word
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}  # ids start at 1 ...
stoi['.'] = 0  # ... because id 0 is reserved for the '.' marker
itos = {idx: ch for ch, idx in stoi.items()}  # inverse mapping: id -> character
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# Build (context -> next character) training pairs.

block_size = 3  # context length: number of preceding characters used to predict the next one
X, Y = [], []
for w in words[:5]:
  print(w)
  context = [0] * block_size  # start from an all-'.' context (id 0)
  for ch in w + '.':  # the appended '.' is the final target of each word
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    print(''.join(itos[i] for i in context), '---->', itos[ix])
    context = context[1:] + [ix]  # slide the window: drop the oldest id, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

aaban
... ----> a
..a ----> a
.aa ----> b
aab ----> a
aba ----> n
ban ----> .
aabid
... ----> a
..a ----> a
.aa ----> b
aab ----> i
abi ----> d
bid ----> .
aabidah
... ----> a
..a ----> a
.aa ----> b
aab ----> i
abi ----> d
bid ----> a
ida ----> h
dah ----> .
aabir
... ----> a
..a ----> a
.aa ----> b
aab ----> i
abi ----> r
bir ----> .
aabriella
... ----> a
..a ----> a
.aa ----> b
aab ----> r
abr ----> i
bri ----> e
rie ----> l
iel ----> l
ell ----> a
lla ----> .


In [7]:
X.shape, X.dtype, Y.shape, Y.dtype 

(torch.Size([36, 3]), torch.int64, torch.Size([36]), torch.int64)

In [8]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  1],
        [ 2,  1, 14],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  9],
        [ 2,  9,  4],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  9],
        [ 2,  9,  4],
        [ 9,  4,  1],
        [ 4,  1,  8],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  9],
        [ 2,  9, 18],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2, 18],
        [ 2, 18,  9],
        [18,  9,  5],
        [ 9,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1]])

In [9]:
Y # labels

tensor([ 1,  1,  2,  1, 14,  0,  1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,
         8,  0,  1,  1,  2,  9, 18,  0,  1,  1,  2, 18,  9,  5, 12, 12,  1,  0])

In [10]:
C = torch.randn(27, 2) # embedding table: each of the 27 characters is represented by a 2-D vector

In [11]:
C

tensor([[-1.3031e+00, -3.6236e-01],
        [ 5.5376e-01, -7.9104e-01],
        [-6.1251e-01, -1.9578e+00],
        [-1.6448e-02,  1.3773e+00],
        [-5.9627e-01,  1.2749e+00],
        [-2.2101e+00, -6.2283e-02],
        [-2.1936e+00,  1.7788e-01],
        [-1.3372e+00, -2.1797e+00],
        [-1.1250e+00, -5.5517e-01],
        [ 1.2213e+00,  4.8201e-01],
        [ 1.2218e+00, -1.5802e+00],
        [-1.4658e+00,  7.6298e-01],
        [ 1.6160e+00, -1.9099e-02],
        [-4.5613e-01, -4.3575e-01],
        [ 1.2443e-01, -8.6932e-01],
        [ 1.1588e+00,  1.6327e-02],
        [ 1.6506e+00,  1.0149e+00],
        [ 4.0574e-01,  5.1866e-01],
        [-8.6364e-01,  5.7886e-01],
        [-1.6941e+00,  1.2033e-03],
        [ 1.2299e-02,  9.8640e-01],
        [-6.8932e-01, -1.0770e-01],
        [ 5.9519e-01,  5.2548e-01],
        [-4.5347e-01,  1.1394e+00],
        [-1.2553e+00, -3.8083e-01],
        [ 7.4810e-01,  2.2655e-01],
        [-1.2261e+00,  2.8440e-01]])

In [12]:
C[5]

tensor([-2.2101, -0.0623])

In [13]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C # a one-hot vector times C picks out row 5, i.e. the same result as C[5]

tensor([-2.2101, -0.0623])

In [14]:
C[[5,6,7]] # indexing with list

tensor([[-2.2101, -0.0623],
        [-2.1936,  0.1779],
        [-1.3372, -2.1797]])

In [15]:
C[torch.tensor([5,6,7])] # indexing with tensor

tensor([[-2.2101, -0.0623],
        [-2.1936,  0.1779],
        [-1.3372, -2.1797]])

In [16]:
C[X].shape

torch.Size([36, 3, 2])

In [17]:
X[13,2] # row index 13, column index 2 (0-based): the third context character of the 14th example

tensor(1)

In [18]:
C[X[13,2]] # embedding of the character id stored at X[13, 2]

tensor([ 0.5538, -0.7910])

In [19]:
C[1]

tensor([ 0.5538, -0.7910])

In [20]:
emb = C[X]
emb.shape

torch.Size([36, 3, 2])

In [21]:
# Hidden layer parameters: 6 inputs (3 context characters x 2 embedding dims) -> 100 units
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [22]:
# Flatten each example's 3 two-dimensional embeddings into one 6-feature row,
# then apply the affine map of the hidden layer (no non-linearity yet).
h = emb.view(-1, 6) @ W1 + b1

In [23]:
h

tensor([[-1.8291,  3.4694, -0.8451,  ...,  1.0835,  2.7335, -1.0800],
        [-0.2129,  3.6277, -1.5535,  ..., -1.1340,  1.8812, -3.3228],
        [ 0.0992,  3.1450, -1.2577,  ..., -0.7864,  1.8737,  0.4081],
        ...,
        [-0.4359, -0.3801, -1.2317,  ..., -3.7264, -3.3775, -4.6495],
        [ 0.5047,  1.8811, -4.0027,  ..., -1.2813,  2.8477, -0.9158],
        [ 0.0375,  0.0330,  1.0419,  ..., -0.5893, -2.3872,  5.2425]])

In [24]:
# torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape # embeddings of the first, second and third context characters of every example, concatenated along dim 1

In [25]:
# torch.cat(torch.unbind(emb, 1), 1).shape # split the tensor along the second dimension

In [26]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [27]:
a.shape

torch.Size([18])

In [28]:
a.view(2, 9) # view as 2x9 matrix

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [29]:
a.view(9, 2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [30]:
a.view(3, 3, 2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [31]:
a.storage() # the flat 1-D storage underlying the tensor (its elements, not a memory address)
# NOTE(review): Tensor.storage() returns a TypedStorage, which recent PyTorch versions
# flag as deprecated; the saved output looks like it includes that warning - confirm.

  a.storage() # memory address


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [32]:
emb.shape

torch.Size([36, 3, 2])

In [33]:
# emb.view(-1, 6) == torch.cat(torch.unbind(emb, 1), 1) # emb has 36 rows, so view(32, 6) would not fit

In [34]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)

In [35]:
h

tensor([[-0.9497,  0.9981, -0.6885,  ...,  0.7945,  0.9916, -0.7932],
        [-0.2098,  0.9986, -0.9144,  ..., -0.8124,  0.9546, -0.9974],
        [ 0.0989,  0.9963, -0.8504,  ..., -0.6564,  0.9539,  0.3869],
        ...,
        [-0.4102, -0.3628, -0.8431,  ..., -0.9988, -0.9977, -0.9998],
        [ 0.4658,  0.9546, -0.9993,  ..., -0.8568,  0.9933, -0.7239],
        [ 0.0374,  0.0330,  0.7786,  ..., -0.5294, -0.9833,  0.9999]])

In [36]:
h.shape

torch.Size([36, 100])

In [37]:
(emb.view(-1, 6) @ W1).shape

torch.Size([36, 100])

In [38]:
b1.shape

torch.Size([100])

In [39]:
# 36, 100  <- emb.view(-1, 6) @ W1
# broadcasting: b1 is aligned on the right and copied across the 36 rows
#  1, 100  <- b1

In [40]:
# Output layer parameters: 100 hidden units -> 27 scores, one per character
W2 = torch.randn((100, 27))
b2 = torch.randn((27,))

In [41]:
logits = h @ W2 + b2 # one unnormalized score per character (27) for every example

In [42]:
logits.shape

torch.Size([36, 27])

In [43]:
# Exponentiate the logits into positive "counts" (unnormalized probabilities).
# exp() can overflow for large logits; F.cross_entropy, used later, avoids that.
counts = logits.exp()

In [44]:
prob = counts / counts.sum(-1, keepdim=True) # normalize every row so it sums to 1 (softmax over the last dimension)

In [45]:
prob.shape

torch.Size([36, 27])

In [46]:
prob

tensor([[5.6259e-03, 4.0365e-14, 1.1220e-12, 1.6429e-03, 6.2425e-07, 8.6124e-05,
         9.3170e-07, 2.1305e-06, 7.4542e-07, 2.1513e-10, 4.9405e-08, 1.0103e-05,
         1.7605e-06, 8.5938e-12, 1.2277e-10, 1.9547e-09, 2.9952e-07, 9.7833e-01,
         9.0586e-04, 7.4724e-10, 1.3368e-02, 4.4159e-09, 1.2711e-06, 1.9221e-09,
         4.5382e-10, 2.4110e-05, 2.2118e-10],
        [4.1534e-04, 1.6842e-16, 8.7745e-11, 4.9786e-05, 8.0236e-09, 1.2244e-04,
         1.9372e-08, 5.9999e-08, 9.3224e-01, 5.7765e-10, 1.6939e-10, 1.0296e-08,
         4.4263e-07, 1.3931e-15, 3.7864e-10, 8.8024e-14, 6.3250e-06, 6.7028e-02,
         9.3245e-10, 3.6358e-08, 1.6869e-07, 4.2361e-08, 4.3533e-13, 1.8655e-10,
         8.3037e-11, 1.3917e-04, 1.1000e-07],
        [4.1598e-05, 1.6091e-13, 1.4843e-10, 5.1148e-01, 2.8232e-06, 3.9842e-05,
         2.6763e-08, 4.1025e-01, 3.9622e-05, 4.5232e-04, 1.3787e-06, 1.6966e-06,
         1.9118e-08, 4.3731e-12, 7.7057e-02, 1.8821e-11, 1.6890e-07, 2.8826e-05,
         7.9090e-

In [47]:
prob[0].sum() # normalized

tensor(1.)

In [48]:
Y # labels

tensor([ 1,  1,  2,  1, 14,  0,  1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,
         8,  0,  1,  1,  2,  9, 18,  0,  1,  1,  2, 18,  9,  5, 12, 12,  1,  0])

In [49]:
torch.arange(36)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])

In [50]:
# For every row i, pick the probability assigned to its true next character Y[i].
# The row indices are derived from Y rather than a hard-coded 36, so the cell
# keeps working when the dataset is built from more than the first five words.
prob[torch.arange(Y.shape[0]), Y] # probabilities of the true characters

tensor([4.0365e-14, 1.6842e-16, 1.4843e-10, 1.6768e-11, 1.2347e-12, 1.6959e-08,
        4.0365e-14, 1.6842e-16, 1.4843e-10, 6.3647e-11, 2.4420e-07, 1.2752e-10,
        4.0365e-14, 1.6842e-16, 1.4843e-10, 6.3647e-11, 2.4420e-07, 1.6706e-18,
        8.2011e-08, 4.1916e-05, 4.0365e-14, 1.6842e-16, 1.4843e-10, 6.3647e-11,
        2.3240e-12, 3.7885e-09, 4.0365e-14, 1.6842e-16, 1.4843e-10, 3.5136e-12,
        1.6672e-10, 6.2075e-01, 1.3232e-10, 9.9121e-01, 4.4846e-17, 2.5647e-07])

In [124]:
# Cross-entropy loss: mean negative log-probability of the true next character.
# Row indices come from Y instead of a hard-coded 36 so the cell does not break
# when the number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.8125)

In [51]:
# ----------------- Summary -----------------

In [52]:
X.shape, Y.shape

(torch.Size([36, 3]), torch.Size([36]))

In [53]:
# Re-create every parameter from a fixed seed so the run is reproducible.
# The tensors are drawn from `g` in a fixed order; reordering these lines would change their values.
g = torch.Generator()
g.manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)     # embedding table
W1 = torch.randn(6, 100, generator=g)     # hidden layer weights
b1 = torch.randn((100,), generator=g)     # hidden layer bias
W2 = torch.randn((100, 27), generator=g)  # output layer weights
b2 = torch.randn((27,), generator=g)      # output layer bias
parameters = [C, W1, b1, W2, b2]

In [54]:
sum(p.nelement() for p in parameters) # total number of parameters

3481

In [62]:
# Turn on gradient tracking for every parameter so loss.backward() can fill in p.grad.
for p in parameters:
  p.requires_grad_(True)

In [None]:
for _ in range(1000): # gradient-descent iterations

  # Forward pass
  emb = C[X] # look up the embedding of every context character
  h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer
  logits = h @ W2 + b2 # output layer
  # F.cross_entropy computes the same loss as the manual version
  #   counts = logits.exp()                                   # unnormalized probabilities
  #   prob = counts / counts.sum(1, keepdim=True)             # normalized probabilities
  #   loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()  # cross entropy loss
  # but never use the manual version in practice: it is less efficient and numerically unstable.
  loss = F.cross_entropy(logits, Y)
  # print(loss.item())

  # Backward pass
  for p in parameters:
    p.grad = None # clear the gradients left over from the previous iteration
  loss.backward()

  # Update: step against the gradient with learning rate 0.1.
  # torch.no_grad() keeps the in-place update out of the autograd graph; it replaces
  # the legacy `p.data += ...` form, which bypasses autograd's safety checks.
  with torch.no_grad():
    for p in parameters:
      p -= 0.1 * p.grad

print(loss.item()) # loss of the last forward pass, i.e. measured before the final update

0.2250777930021286


In [61]:
# PyTorch's built-in cross entropy replaces the manual exp / normalize / log steps:
# it is more efficient in both the forward and the backward pass and numerically more stable.
F.cross_entropy(logits, Y)

tensor(17.8694)

In [71]:
logits.max(1) # Tensor.max along dim 1 returns a named tuple (values, indices); indices holds the highest-scoring character id per example

torch.return_types.max(
values=tensor([24.8031, 17.9717, 17.1211, 15.1294, 15.7555, 17.4552, 24.8031, 17.9717,
        17.1211, 15.1294, 15.3354, 12.2345, 24.8031, 17.9717, 17.1211, 15.1294,
        15.3354, 12.2345, 13.1716, 20.6208, 24.8031, 17.9717, 17.1211, 15.1294,
        15.3354, 16.3180, 24.8031, 17.9717, 17.1211, 15.1294, 17.5865, 17.6060,
        13.3892, 12.7876, 12.2740, 15.9797], grad_fn=<MaxBackward0>),
indices=tensor([ 1,  1,  2,  9, 14,  0,  1,  1,  2,  9,  4,  1,  1,  1,  2,  9,  4,  1,
         8,  0,  1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  9,  5, 12, 12,  1,  0]))

In [72]:
Y # labels

tensor([ 1,  1,  2,  1, 14,  0,  1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,
         8,  0,  1,  1,  2,  9, 18,  0,  1,  1,  2, 18,  9,  5, 12, 12,  1,  0])