In [51]:
# MLP character-level language model, following the approach of Bengio et al. (2003)

In [52]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [53]:
# Read in all the words (one name per line).
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until the object is garbage-collected.
with open('cleaned_names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['aaban', 'aabid', 'aabidah', 'aabir', 'aabriella', 'aada', 'aadam', 'aadarsh']

In [54]:
len(words)

29681

In [55]:
# Character vocabulary plus the two lookup tables between characters and integer ids.
chars = sorted(set(''.join(words)))
# Letters are numbered from 1 so that id 0 stays free for the '.' boundary token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse table: integer id -> character.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [56]:
# Build a small demo dataset of (context window, next character) pairs.

block_size = 3 # context length: how many characters to use to predict the next one?
X, Y = [], []
for w in words[:5]: # only the first five names, so the printout below stays readable

  print(w)
  context = [0] * block_size # start with a window made entirely of the '.' token (id 0)
  for ch in w + '.': # the trailing '.' marks the end of the name
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    window = ''.join(itos[i] for i in context)
    print(f'{window} ----> {itos[ix]}')
    context = [*context[1:], ix] # slide the window: drop the oldest id, append the newest

# Convert the collected Python lists into integer tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)

aaban
... ----> a
..a ----> a
.aa ----> b
aab ----> a
aba ----> n
ban ----> .
aabid
... ----> a
..a ----> a
.aa ----> b
aab ----> i
abi ----> d
bid ----> .
aabidah
... ----> a
..a ----> a
.aa ----> b
aab ----> i
abi ----> d
bid ----> a
ida ----> h
dah ----> .
aabir
... ----> a
..a ----> a
.aa ----> b
aab ----> i
abi ----> r
bir ----> .
aabriella
... ----> a
..a ----> a
.aa ----> b
aab ----> r
abr ----> i
bri ----> e
rie ----> l
iel ----> l
ell ----> a
lla ----> .


In [57]:
X.shape, X.dtype, Y.shape, Y.dtype 

(torch.Size([36, 3]), torch.int64, torch.Size([36]), torch.int64)

In [58]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  1],
        [ 2,  1, 14],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  9],
        [ 2,  9,  4],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  9],
        [ 2,  9,  4],
        [ 9,  4,  1],
        [ 4,  1,  8],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2,  9],
        [ 2,  9, 18],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        [ 1,  1,  2],
        [ 1,  2, 18],
        [ 2, 18,  9],
        [18,  9,  5],
        [ 9,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1]])

In [59]:
Y # labels

tensor([ 1,  1,  2,  1, 14,  0,  1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,
         8,  0,  1,  1,  2,  9, 18,  0,  1,  1,  2, 18,  9,  5, 12, 12,  1,  0])

In [60]:
# Embedding lookup table: one row per character id.
# Seed PyTorch's global RNG first so that this table, and the later torch.randn
# parameter initialisations when the notebook is run top to bottom, are reproducible.
torch.manual_seed(2147483647)
C = torch.randn(27, 2) # each of the 27 characters is represented by a 2D embedding

In [61]:
C

tensor([[ 0.2267, -1.1814],
        [-0.3037,  0.5494],
        [ 0.7256, -0.4971],
        [ 0.3972,  0.0945],
        [ 0.1055, -0.1769],
        [ 0.1621,  0.5746],
        [-0.8591,  0.1463],
        [ 0.6487,  0.3895],
        [-1.7086, -0.1245],
        [-0.4339, -0.5667],
        [-0.2502,  0.4662],
        [ 0.5453,  0.7484],
        [ 0.6187,  0.5553],
        [ 0.7460, -0.3221],
        [-1.0132, -1.2370],
        [ 0.7007,  0.9348],
        [-0.0927,  0.2925],
        [-1.5795, -0.6300],
        [-0.1514,  0.9762],
        [ 0.1321,  1.8256],
        [-0.2423, -0.5703],
        [-0.6588,  0.1696],
        [ 0.4282,  1.7926],
        [ 0.7237, -0.1539],
        [-0.5655,  0.5998],
        [ 0.1590, -0.8350],
        [-0.5081,  1.6276]])

In [62]:
C[5]

tensor([0.1621, 0.5746])

In [63]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C # a one-hot row vector for index 5 multiplied by C picks out row 5, i.e. the same values as C[5]

tensor([0.1621, 0.5746])

In [64]:
C[[5,6,7]] # indexing with list

tensor([[ 0.1621,  0.5746],
        [-0.8591,  0.1463],
        [ 0.6487,  0.3895]])

In [65]:
C[torch.tensor([5,6,7])] # indexing with tensor

tensor([[ 0.1621,  0.5746],
        [-0.8591,  0.1463],
        [ 0.6487,  0.3895]])

In [66]:
C[X].shape

torch.Size([36, 3, 2])

In [67]:
X[13,2] # row index 13 (zero-based, so the 14th example), context position 2 (the 3rd and last character)

tensor(1)

In [68]:
C[X[13,2]] # embedding of the character at row index 13, context position 2; that id is 1, so this equals C[1]

tensor([-0.3037,  0.5494])

In [69]:
C[1]

tensor([-0.3037,  0.5494])

In [70]:
# Index the embedding table with the whole integer tensor X at once:
# every id in X is replaced by its 2-D embedding row, adding a trailing dimension of size 2.
emb = C[X]
emb.shape

torch.Size([36, 3, 2])

In [71]:
# Hidden layer parameters.
# Input width 6 = 3 context characters x 2 embedding dimensions; 100 hidden units.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [93]:
# Hidden-layer pre-activation: view each example's 3x2 block of embeddings as one row of 6 values,
# then apply the weights W1 and add the bias b1.
h = emb.view(-1, 6) @ W1 + b1

In [94]:
h

tensor([[ 3.1537, -3.8829, -0.1728,  ...,  1.3942, -4.0677,  1.2994],
        [ 2.7803, -5.2128,  0.3832,  ...,  1.6790,  0.8570, -0.4494],
        [ 0.7629, -3.2683, -0.3648,  ...,  0.6772,  1.1944, -0.1140],
        ...,
        [ 0.2591,  1.8355, -0.4823,  ...,  0.3382,  0.9148, -1.9791],
        [ 1.6234,  1.0837, -0.5028,  ...,  0.4975, -0.2569, -2.4144],
        [ 2.0655, -3.0583, -0.4778,  ...,  0.3471, -0.1510, -0.9035]])

In [73]:
# Concatenate the embeddings of the first, second and third context character of every example.
# Left uncommented so that a fresh run reproduces the shape shown in this cell's output.
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape

torch.Size([36, 6])

In [74]:
# Same concatenation without hard-coding the context positions:
# torch.unbind(emb, 1) splits the tensor along the second dimension into a tuple of slices.
# Left uncommented so that a fresh run reproduces the shape shown in this cell's output.
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([36, 6])

In [75]:
# Small integer tensor 0..17 used below to illustrate how .view() reshapes data.
a = torch.arange(0, 18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [76]:
a.shape

torch.Size([18])

In [77]:
a.view(2, 9) # view as 2x9 matrix

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [78]:
a.view(9, 2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [79]:
a.view(3, 3, 2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [80]:
a.storage() # the flat one-dimensional storage backing `a` (its elements in order, not a memory address); the .view() calls above only reinterpret this same data

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [81]:
emb.shape

torch.Size([36, 3, 2])

In [87]:
# emb.view(-1, 6) == torch.cat(torch.unbind(emb, 1), 1)  # emb has 36 rows here, so a hard-coded view(32, 6) would fail

In [95]:
# Hidden-layer activations: flatten each example to 6 values, apply W1 and b1, then squash with tanh.
h = (emb.view(-1, 6) @ W1 + b1).tanh()

In [96]:
h

tensor([[ 0.9964, -0.9992, -0.1711,  ...,  0.8841, -0.9994,  0.8616],
        [ 0.9923, -0.9999,  0.3655,  ...,  0.9327,  0.6947, -0.4214],
        [ 0.6428, -0.9971, -0.3494,  ...,  0.5897,  0.8319, -0.1135],
        ...,
        [ 0.2535,  0.9504, -0.4481,  ...,  0.3259,  0.7234, -0.9625],
        [ 0.9251,  0.7946, -0.4643,  ...,  0.4601, -0.2514, -0.9841],
        [ 0.9684, -0.9956, -0.4445,  ...,  0.3338, -0.1499, -0.7180]])

In [98]:
h.shape

torch.Size([36, 100])

In [99]:
(emb.view(-1, 6) @ W1).shape

torch.Size([36, 100])

In [100]:
b1.shape

torch.Size([100])

In [101]:
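# 36, 100   <- shape of emb.view(-1, 6) @ W1
# broadcasting: the bias row is added to each of the 36 rows
#  1, 100   <- b1 (shape 100) is treated as a single 1 x 100 row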

In [102]:
# Output layer parameters: map the 100 hidden units to one logit per character (27 in total).
W2 = torch.randn((100, 27))
b2 = torch.randn((27,))

In [103]:
logits = h @ W2 + b2

In [104]:
logits.shape

torch.Size([36, 27])

In [106]:
# Exponentiate the logits so every entry is positive; these act as unnormalised "counts".
counts = logits.exp()

In [107]:
# Normalise each row of counts by its own total so that every row sums to 1.
prob = counts / counts.sum(dim=-1, keepdim=True)

In [108]:
prob.shape

torch.Size([36, 27])

In [109]:
prob

tensor([[8.8065e-09, 1.4690e-04, 1.9862e-05, 3.8407e-08, 7.6152e-08, 7.1989e-12,
         9.5434e-11, 2.1641e-07, 5.3638e-09, 2.0643e-08, 6.4877e-10, 1.4118e-02,
         4.4283e-11, 2.8704e-05, 3.8206e-03, 1.4188e-10, 7.2322e-01, 2.4760e-08,
         1.4033e-04, 4.8933e-08, 8.5709e-07, 1.9782e-06, 9.9065e-08, 9.2512e-10,
         2.5850e-01, 3.4688e-07, 1.8002e-10],
        [5.4459e-14, 1.2444e-08, 3.2858e-06, 9.8884e-06, 1.7350e-11, 1.9971e-09,
         2.8320e-07, 1.2003e-12, 2.7561e-10, 6.1739e-12, 7.5947e-04, 5.9870e-06,
         2.8088e-09, 5.1304e-07, 7.5357e-07, 2.0542e-08, 3.2429e-02, 6.0873e-12,
         2.6353e-04, 7.4122e-06, 2.2237e-08, 9.4560e-01, 9.5726e-11, 1.0953e-14,
         2.0924e-02, 7.3264e-09, 3.1278e-10],
        [7.0010e-04, 2.0603e-13, 1.5530e-06, 1.0714e-01, 1.3761e-02, 1.2612e-04,
         7.6711e-10, 3.4976e-10, 3.8315e-03, 3.6186e-10, 1.4581e-02, 3.2739e-09,
         3.2324e-08, 2.5105e-04, 3.8956e-04, 5.9131e-01, 8.0743e-06, 1.8042e-07,
         7.6603e-

In [117]:
prob[0].sum() # normalized

tensor(1.0000)

In [118]:
Y # labels

tensor([ 1,  1,  2,  1, 14,  0,  1,  1,  2,  9,  4,  0,  1,  1,  2,  9,  4,  1,
         8,  0,  1,  1,  2,  9, 18,  0,  1,  1,  2, 18,  9,  5, 12, 12,  1,  0])

In [122]:
torch.arange(36)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])

In [121]:
# Pick, for every row, the probability assigned to its true next character.
# The row indices are derived from Y rather than hard-coded, so this still works if the number of examples changes.
prob[torch.arange(Y.shape[0]), Y] # probabilities of the true characters

tensor([1.4690e-04, 1.2444e-08, 1.5530e-06, 7.8607e-14, 2.6853e-13, 1.3313e-03,
        1.4690e-04, 1.2444e-08, 1.5530e-06, 4.9895e-09, 1.6498e-07, 7.7934e-11,
        1.4690e-04, 1.2444e-08, 1.5530e-06, 4.9895e-09, 1.6498e-07, 2.0607e-09,
        1.3075e-05, 3.9029e-06, 1.4690e-04, 1.2444e-08, 1.5530e-06, 4.9895e-09,
        2.8789e-13, 3.5346e-11, 1.4690e-04, 1.2444e-08, 1.5530e-06, 4.6268e-11,
        8.5628e-11, 1.0484e-12, 2.0031e-13, 3.1575e-09, 2.1483e-12, 3.9099e-07])

In [124]:
# Mean negative log-likelihood of the true next characters.
# The row indices are derived from Y rather than hard-coded, so this still works if the number of examples changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean() # cross entropy loss
loss

tensor(17.8125)