[Building makemore Part 2: MLP](https://www.youtube.com/watch?v=TCH_1BHY58I)


This notebook is about training character embeddings (and a small MLP on top of them) on the names.txt file, in order to come up with new names.

It is based on the paper [A Neural Probabilistic Language Model](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf) by Yoshua Bengio et al.



![Neural Architecture](images/NeuralArchitecture.png)

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read in all the words (one name per line); the context manager makes sure
# the file handle is closed as soon as the contents have been read
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# total number of names read from names.txt
len(words)

32033

In [4]:
# vocabulary: every distinct character that occurs in the names, sorted
# (sorted() accepts the set directly and returns a list)
chars = sorted(set(''.join(words)))
chars

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [5]:
# special character appended to every name to mark its end; its index (0)
# is also used to pad the context at the start of a name
delimiter = '.'

In [6]:
# string -> integer lookup: the letters are numbered from 1 upwards in sorted
# order, and the delimiter is added afterwards with index 0
stoi = {letter: index for index, letter in enumerate(chars, start=1)}
stoi[delimiter] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [7]:
# integer -> string lookup: the inverse of stoi, built by pairing each index
# with its character
itos = dict(zip(stoi.values(), stoi.keys()))
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [8]:
# build the dataset
# block_size is the context length: the number of preceding characters the
# model sees when predicting the next one
block_size = 3 # aka context length: how many characters do we take to predict the next one?

In [9]:
# X collects the network inputs (one context per example)
X = []
# Y collects the labels (the character index that follows each context)
Y = []

In [10]:
# the starting context for every name: block_size copies of index 0 (the delimiter)
[0]*block_size

[0, 0, 0]

In [11]:
# Walk through the first five names and print every (context ---> target)
# pair. X and Y are reset here so that re-running this cell rebuilds the
# examples instead of appending duplicates to the existing lists.
X, Y = [], []
for w in words[:5]:

    print(w)
    # index 0 is the delimiter set above, so the initial context [0, 0, 0]
    # run through itos becomes ['.', '.', '.']
    context = [0]*block_size
    for ch in w + delimiter:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] # crop and append
    print('')

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .

olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .

ava
... ---> a
..a ---> v
.av ---> a
ava ---> .

isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .

sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .



In [12]:
# repeat the walk-through with a longer context window, just to show the difference
block_size = 10
X, Y = [], []
for w in words[:5]:
    print(w)
    # every name starts from a context made up entirely of index 0 (the delimiter)
    context = [0 for _ in range(block_size)]
    for ch in w + delimiter:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(map(itos.get, context)), '--->', itos[ix])
        # slide the window: drop the oldest index, append the newest
        context = [*context[1:], ix]
    print('')

emma
.......... ---> e
.........e ---> m
........em ---> m
.......emm ---> a
......emma ---> .

olivia
.......... ---> o
.........o ---> l
........ol ---> i
.......oli ---> v
......oliv ---> i
.....olivi ---> a
....olivia ---> .

ava
.......... ---> a
.........a ---> v
........av ---> a
.......ava ---> .

isabella
.......... ---> i
.........i ---> s
........is ---> a
.......isa ---> b
......isab ---> e
.....isabe ---> l
....isabel ---> l
...isabell ---> a
..isabella ---> .

sophia
.......... ---> s
.........s ---> o
........so ---> p
.......sop ---> h
......soph ---> i
.....sophi ---> a
....sophia ---> .



In [13]:
# Build the real training set from all the names in words: slide a window of
# block_size characters over each name (plus the trailing delimiter) and
# record every context in X together with the index of the next character in Y.
block_size = 3
X, Y = [], []
for w in words:
    # every name starts from a context made up entirely of index 0 (the delimiter)
    context = [0 for _ in range(block_size)]
    for ch in w + delimiter:
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # slide the window: drop the oldest index, append the newest
        context = [*context[1:], ix]

In [14]:
# first five contexts; each one is a list of block_size character indices
X[:5]

[[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1]]

In [15]:
# the matching labels: index of the character that follows each context
Y[:5]

[5, 13, 13, 1, 0]

In [16]:
# Convert the Python lists to tensors. torch.as_tensor builds a tensor from a
# list exactly like torch.tensor does, but hands an existing tensor back
# unchanged, so re-running this cell does not trigger the copy-construct
# warning that torch.tensor(<tensor>) emits.
X = torch.as_tensor(X)
Y = torch.as_tensor(Y)

In [17]:
# shapes and dtypes of the inputs and the labels after the tensor conversion
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [68]:
# number of (context, label) examples: the size of the first dimension of X
noOfRecordsInTheInputDataSet = X.shape[0]

Now let's build the embedding lookup table C, which has one row for each of the 27 characters.

In [18]:
# we will use a 2 dimensional embedding for every one of the 27 characters
embeddingWidth = 2
totalCharacters = len(stoi)
# Seed the global RNG before the first random draw so that C (and the weights
# drawn in later cells) come out the same on every Restart & Run All.
torch.manual_seed(2147483647)
# C has one row per character and embeddingWidth columns, filled with
# uniform random values in [0, 1)
C = torch.rand((totalCharacters,embeddingWidth))
C

tensor([[0.7689, 0.4173],
        [0.1792, 0.0637],
        [0.3606, 0.6921],
        [0.1082, 0.1951],
        [0.2746, 0.6797],
        [0.0286, 0.8893],
        [0.7852, 0.0673],
        [0.6604, 0.9907],
        [0.3910, 0.5829],
        [0.1748, 0.6415],
        [0.5945, 0.0623],
        [0.7853, 0.2322],
        [0.4170, 0.6745],
        [0.8084, 0.4869],
        [0.0700, 0.4625],
        [0.1576, 0.8202],
        [0.7082, 0.9137],
        [0.4770, 0.3962],
        [0.4747, 0.6037],
        [0.7008, 0.3890],
        [0.1702, 0.8767],
        [0.6228, 0.7113],
        [0.2681, 0.6115],
        [0.4365, 0.3354],
        [0.2940, 0.4697],
        [0.4872, 0.6582],
        [0.3608, 0.3832]])

In [19]:
# one-hot vector for character index 5: a single 1 at position 5, zeros elsewhere
F.one_hot(torch.tensor(5), num_classes=totalCharacters)

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [20]:
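# this fails ... the cells below show that one_hot returns an int64 tensor
# while C is float32; casting the one-hot vector with .float() makes it work
# F.one_hot(torch.tensor(5), num_classes=27) @ C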

In [21]:
# dtype of the one-hot tensor
F.one_hot(torch.tensor(5), num_classes=totalCharacters).dtype

torch.int64

In [22]:
# dtype of the embedding table, for comparison with the one-hot dtype
C.dtype

torch.float32

In [23]:
# cast the one-hot vector to float so both operands share a dtype;
# multiplying it with C picks out row 5 of C
F.one_hot(torch.tensor(5), num_classes=totalCharacters).float() @ C

tensor([0.0286, 0.8893])

In [24]:
# the one-hot product above is the same as indexing row 5 of C directly
C[5]

tensor([0.0286, 0.8893])

In [25]:
# indexing with a Python list fetches several rows at once
C[[5, 6, 7]]

tensor([[0.0286, 0.8893],
        [0.7852, 0.0673],
        [0.6604, 0.9907]])

In [26]:
# indexing with a tensor of indices returns the same rows
C[torch.tensor([5, 6, 7])]

tensor([[0.0286, 0.8893],
        [0.7852, 0.0673],
        [0.6604, 0.9907]])

In [27]:
# a slice returns the first six rows (indices 0 to 5)
C[:6]

tensor([[0.7689, 0.4173],
        [0.1792, 0.0637],
        [0.3606, 0.6921],
        [0.1082, 0.1951],
        [0.2746, 0.6797],
        [0.0286, 0.8893]])

In [28]:
# the listed rows do not have to be contiguous
C[[0, 1, 5, 13]]

tensor([[0.7689, 0.4173],
        [0.1792, 0.0637],
        [0.0286, 0.8893],
        [0.8084, 0.4869]])

In [29]:
# reminder of what the first five contexts look like now that X is a tensor
X[:5]

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]])

In [30]:
# so notice how we use X to index into C ...
# every character index in X is replaced by its row of C, so each context of
# 3 indices becomes a 3 x 2 block of embeddings
C[X][:5]

tensor([[[0.7689, 0.4173],
         [0.7689, 0.4173],
         [0.7689, 0.4173]],

        [[0.7689, 0.4173],
         [0.7689, 0.4173],
         [0.0286, 0.8893]],

        [[0.7689, 0.4173],
         [0.0286, 0.8893],
         [0.8084, 0.4869]],

        [[0.0286, 0.8893],
         [0.8084, 0.4869],
         [0.8084, 0.4869]],

        [[0.8084, 0.4869],
         [0.8084, 0.4869],
         [0.1792, 0.0637]]])

In [31]:
# look up the embedding of every character in every context in one step
emb = C[X]
emb.shape

torch.Size([228146, 3, 2])

In [32]:
# embeddings of the first example: its context is [0, 0, 0], so this is
# row 0 of C repeated three times
emb[0]

tensor([[0.7689, 0.4173],
        [0.7689, 0.4173],
        [0.7689, 0.4173]])

Now let's construct the hidden layer, which is the first layer after the input layer in Figure 1. (the tanh layer)

W1 holds the weights of this layer. The number of inputs into the layer is 3 × 2 = 6, i.e. the block size multiplied by the embedding width.

In [33]:
# each example feeds block_size embeddings of width embeddingWidth into the layer
noOfInputs = block_size * embeddingWidth
# width of the hidden layer
noOfNeurons = 100
# weights drawn from a standard normal distribution, one column per neuron
W1 = torch.randn((noOfInputs, noOfNeurons))
W1.shape

torch.Size([6, 100])

In [34]:
# one bias per hidden neuron, also drawn from a standard normal distribution
b1 = torch.randn(noOfNeurons)
b1.shape

torch.Size([100])

Now remember we would like to multiply the inputs (the embeddings) by W1 and add b1, but the problem is that the embeddings are stacked up in the tensor (shape 228146 × 3 × 2), so we can't simply multiply one by the other. We need to somehow concatenate the 3 embeddings for each input into a single row of size 6.

h = emb @ W1 + b1

A brief look at the best way to reshape a tensor with a PyTorch [view](https://pytorch.org/docs/stable/generated/torch.Tensor.view.html), which `returns a new tensor with the same data as the self tensor but of a different shape.`

(The page [PyTorch internals](http://blog.ezyang.com/2019/05/pytorch-internals/) provides insights into what a view does with a tensor.)



In [35]:
# a small 1-D tensor holding 0..17, used to try out view() below
a = torch.arange(0, 18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [36]:
# the same 18 elements arranged as 2 rows of 9
a.view(2,9)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [37]:
# ... or as 9 rows of 2
a.view(9,2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [38]:
# ... or as a 3 x 3 x 2 block
a.view(3, 3, 2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

So let's reshape the emb tensor to the desired shape.

In [39]:
# current shape: (number of examples, block_size, embeddingWidth)
emb.shape

torch.Size([228146, 3, 2])

In [40]:
# number of examples
emb.shape[0]

228146

In [41]:
# merge the last two dimensions (3 * 2 = 6) so every example becomes one row
emb.view(emb.shape[0], 6).shape

torch.Size([228146, 6])

In [42]:
# -1 lets view work out the first dimension from the remaining size
emb.view(-1, 6).shape

torch.Size([228146, 6])

In [43]:
# the same reshape, written with noOfInputs instead of the literal 6
emb.view(emb.shape[0], noOfInputs).shape

torch.Size([228146, 6])

So now let's calculate h, the output for the hidden layer.

In [44]:
# hidden layer before the non-linearity: flattened embeddings times W1, plus
# b1 (the bias vector is broadcast across all rows)
h = (emb.view(-1, noOfInputs) @ W1) + b1
h.shape

torch.Size([228146, 100])

In [45]:
# right ... don't forget tanh ...
# recompute h with the tanh activation applied element-wise; this replaces
# the pre-activation value assigned in the previous cell
h = torch.tanh((emb.view(-1, noOfInputs) @ W1) + b1)
h.shape

torch.Size([228146, 100])

Now let's look at the final layer, where we do the softmax.

The input width will be 100, as seen above, and the output width will be the total number of characters, which is 27.

In [46]:
# width of the output layer: one score per character
totalCharacters

27

In [47]:
# the last dimension of h is the number of values the hidden layer emits per example
hiddenLayerOutputWidth = h.shape[-1]
hiddenLayerOutputWidth

100

In [48]:
# output-layer weights: one column per character, drawn from a standard normal
W2 = torch.randn((hiddenLayerOutputWidth, totalCharacters))
W2.shape

torch.Size([100, 27])

In [49]:
# output-layer biases: one per character
b2 = torch.randn(totalCharacters)
b2.shape

torch.Size([27])

In [50]:
# reminder of the hidden-layer output shape before multiplying by W2
h.shape

torch.Size([228146, 100])

In [51]:
# output-layer product, bias not yet added
h @ W2

tensor([[-12.1043,  -3.7227, -10.3510,  ...,   3.7496,   3.5210,   9.4259],
        [ -6.9154,   7.1670,  -7.6735,  ...,  -1.4320,   6.1822,   2.6551],
        [-14.2813,   1.5332,  -4.7559,  ...,  -4.1289,   3.0676,  11.3423],
        ...,
        [ -5.0964,   3.2972,  -8.8796,  ...,  -3.0974,   4.4761,   8.9045],
        [ -8.9287,   0.3623,  -6.3219,  ...,  -3.0575,   6.6317,   8.5183],
        [ -6.5772,   6.9971, -12.0569,  ...,  -2.8294,   6.3067,   8.9538]])

In [52]:
# ... and with the bias b2 added to every row
h @ W2 + b2

tensor([[-11.8233,  -4.1823, -10.8996,  ...,   1.8566,   3.5062,   9.5870],
        [ -6.6344,   6.7075,  -8.2221,  ...,  -3.3250,   6.1675,   2.8162],
        [-14.0002,   1.0736,  -5.3045,  ...,  -6.0219,   3.0529,  11.5034],
        ...,
        [ -4.8153,   2.8376,  -9.4282,  ...,  -4.9904,   4.4613,   9.0656],
        [ -8.6477,  -0.0973,  -6.8706,  ...,  -4.9505,   6.6170,   8.6794],
        [ -6.2962,   6.5375, -12.6055,  ...,  -4.7223,   6.2919,   9.1149]])

In [54]:
# logits: the raw, unnormalised output-layer scores, one row per example and
# one column per character
logits = h @ W2 + b2
logits.shape

torch.Size([228146, 27])

In [58]:
# exponentiate the logits to get positive, unnormalised scores ("counts")
counts = logits.exp()
counts.shape

torch.Size([228146, 27])

In [59]:
# divide every row of counts by its own total so that each row becomes a
# probability distribution over the characters
prob = torch.div(counts, counts.sum(dim=1, keepdim=True))
prob.shape

torch.Size([228146, 27])

Notice how every row of prob will sum to 1.

In [60]:
# the probabilities of the first example add up to 1
prob[0].sum()

tensor(1.0000)

In [61]:
# ... and so do those of any other row, e.g. row 234
prob[234].sum()

tensor(1.0000)

In [62]:
# one label per example
Y.shape

torch.Size([228146])

In [64]:
# the first 16 labels (index of the correct next character for each example)
Y[:16]

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0])

In [71]:
# torch.arange(n) yields the indices 0..n-1; used below to address one entry per row
torch.arange(10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [72]:
# rows = examples, columns = characters
prob.shape

torch.Size([228146, 27])

In [69]:
# number of examples, matching the number of rows in prob
noOfRecordsInTheInputDataSet

228146

In [70]:
# for every row i, pick the probability the model assigned to the correct label Y[i]
prob[torch.arange(noOfRecordsInTheInputDataSet), Y]

tensor([6.9772e-10, 9.8193e-01, 9.7741e-02,  ..., 4.3038e-03, 1.2618e-08,
        3.9948e-10])

In [73]:
# loss: the negative mean of the log-probabilities assigned to the correct
# labels (average negative log-likelihood); lower is better
loss = -prob[torch.arange(noOfRecordsInTheInputDataSet), Y].log().mean()
loss

tensor(17.0373)