## Building the dataset

Based on [this paper](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), we build an MLP that trains character embeddings so that they capture some of the meaning behind the letters and how they interact with each other.

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
len(words)

32033

In [5]:
# Vocabulary: every distinct character that occurs in any name, sorted.
chars = sorted({ch for name in words for ch in name})

# Character -> index. Letters take 1..len(chars); index 0 is reserved for the
# special '.' token (added last so the printed mapping keeps the same order).
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# Inverse mapping: index -> character.
itos = {index: char for char, index in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# Build (context -> next character) training pairs from the first 5 words.
block_size = 3  # number of preceding characters used to predict the next one
X, Y = [], []
for w in words[:5]:
    print(w)
    # Every word starts from a context made entirely of '.' (index 0).
    context = [0] * block_size
    # The appended '.' gives each word an explicit end-of-word target.
    for ch in w + '.':
        ix = stoi[ch]
        print(''.join(itos[i] for i in context), '->', itos[ix])
        X.append(context)
        Y.append(ix)
        # Sliding window: drop the oldest index and append the newest one.
        # A new list is created, so the list already stored in X is untouched.
        context = [*context[1:], ix]

# Convert the collected Python lists into tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... -> e
..e -> m
.em -> m
emm -> a
mma -> .
olivia
... -> o
..o -> l
.ol -> i
oli -> v
liv -> i
ivi -> a
via -> .
ava
... -> a
..a -> v
.av -> a
ava -> .
isabella
... -> i
..i -> s
.is -> a
isa -> b
sab -> e
abe -> l
bel -> l
ell -> a
lla -> .
sophia
... -> s
..s -> o
.so -> p
sop -> h
oph -> i
phi -> a
hia -> .


In [7]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

## Embeddings layer

![image](images/MLP_arch.png)

In the paper, they reduce 17,000 words into 30D vectors. We can reduce our 27 characters to a 2D space. `C` is the lookup table.

In [8]:
C = torch.randn((27, 2), requires_grad=True)
C.shape

torch.Size([27, 2])

In [9]:
C[5]

tensor([-1.2969, -0.9388], grad_fn=<SelectBackward0>)

In [10]:
C[[0, 1, 6, 26]]

tensor([[-0.7415,  0.9407],
        [-1.2727, -0.9903],
        [ 0.8931,  0.9310],
        [ 0.6306,  1.2914]], grad_fn=<IndexBackward0>)

In [11]:
X

# ['.', '.', '.']
# ['.', '.', 'e']
# .
# .
# .

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [12]:
X.shape

torch.Size([32, 3])

In [13]:
C[X].shape

# ['.', '.', '.'] -> [2D embedding, 2D embedding, 2D embedding]
# For each of the 32 instances in X

torch.Size([32, 3, 2])

In [14]:
X[3, 2]

# instance at index 3: ['e', 'm', 'm']; character at index 2: 'm' -> 13

tensor(13)

In [15]:
print(C[X][3, 2])
print(C[13])

tensor([1.0754, 2.2166], grad_fn=<SelectBackward0>)
tensor([1.0754, 2.2166], grad_fn=<SelectBackward0>)


In [16]:
emb_X = C[X]
emb_X.shape

torch.Size([32, 3, 2])

## Hidden layer (first linear layer in MLP)

Our model has a block size of 3, so 3 2D vectors are the input to this layer -> 6D i/p

In [17]:
W1 = torch.randn((6, 100), requires_grad=True) # 100 neurons in the hidden layer
b1 = torch.randn((100), requires_grad=True)

In [18]:
emb_X.shape, W1.shape, b1.shape

(torch.Size([32, 3, 2]), torch.Size([6, 100]), torch.Size([100]))

In [19]:
emb_X @ W1 + b1

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [20]:
# Embeddings of the first char (3 block size) of each instance
emb_X[:, 0, :].shape

torch.Size([32, 2])

In [21]:
torch.cat([emb_X[:, 0, :], emb_X[:, 1, :], emb_X[:, 2, :]], dim=1).shape

# dim=0 would stack the pieces along the rows instead, giving shape (96, 2)


torch.Size([32, 6])

We can use `torch.unbind` to do this dynamically, so that it works for any block size.

In [22]:
ex = torch.randn((32, 3, 2))
len(torch.unbind(ex, dim=0))

# unbind(dim=0) removes the 0th dimension and returns a tuple of 32 tensors,
# one per instance, each of shape (3, 2),
# e.g. [[1, 1], [1, 2], [27, 32]]

# Unpacked about the 0th dimension

32

In [23]:
len(torch.unbind(ex, dim=1))

3

In [24]:
ex[0]

tensor([[-0.6008,  0.4004],
        [ 0.8582, -1.3808],
        [ 0.0350, -1.3037]])

In [25]:
ex[1]

tensor([[1.5084, 1.5885],
        [0.2781, 0.4130],
        [0.6727, 0.8347]])

In [26]:
torch.unbind(ex, dim=1)[0] # First chars (total 3) of each instance -> 32, each 2D

tensor([[-0.6008,  0.4004],
        [ 1.5084,  1.5885],
        [-0.0767,  0.2994],
        [-1.2582, -0.0549],
        [-0.0673,  0.7079],
        [ 0.9577,  0.1720],
        [ 1.8537,  0.4496],
        [-0.5099,  0.0539],
        [-0.8096,  0.6276],
        [ 0.0947, -0.2766],
        [-0.9441,  0.7434],
        [-1.1875,  1.2586],
        [ 0.2718, -0.3905],
        [-0.5370, -1.0579],
        [ 0.0290, -0.3944],
        [-0.9186, -0.7583],
        [ 0.7286, -0.7116],
        [-0.6313, -0.2918],
        [ 1.0501,  0.3888],
        [-0.0739,  0.9379],
        [-0.0638, -0.6771],
        [ 0.3108, -2.2547],
        [-0.5364,  1.1548],
        [ 1.2160, -1.4501],
        [-0.8232, -0.4846],
        [ 1.6128, -1.9849],
        [-1.7903,  0.7469],
        [ 0.9014,  0.3917],
        [-1.7914, -1.4323],
        [ 0.8031,  0.1079],
        [-1.6243, -0.0184],
        [-0.8825, -0.0607]])

In [27]:
torch.unbind(ex, dim=1)[0].shape

torch.Size([32, 2])

In [28]:
torch.cat(torch.unbind(ex, dim=1), dim=1).shape

# same as torch.cat([emb_X[:, 0, :], emb_X[:, 1, :], emb_X[:, 2, :]], dim=1).shape but for any block size

torch.Size([32, 6])

We can use `Tensor.view` instead, which is simpler and more efficient (it reinterprets the existing storage without copying).

In [29]:
ex2 = torch.randn(18)
ex2.storage()

 0.3952986001968384
 -1.8756723403930664
 0.10505320876836777
 0.38649794459342957
 1.0866100788116455
 -1.2979038953781128
 1.020769476890564
 -0.12894396483898163
 1.8218743801116943
 -0.06773625314235687
 1.7524528503417969
 -0.13102704286575317
 0.5515129566192627
 -0.6447035670280457
 -0.9541137218475342
 -0.0990188792347908
 1.213905930519104
 -0.9784950613975525
[torch.FloatStorage of size 18]

A tensor's values are always stored as a flat, one-dimensional sequence of numbers in memory. We can use `.view()` to reinterpret that sequence in whatever shape we want.

In [30]:
ex2.view(3, 6)

tensor([[ 0.3953, -1.8757,  0.1051,  0.3865,  1.0866, -1.2979],
        [ 1.0208, -0.1289,  1.8219, -0.0677,  1.7525, -0.1310],
        [ 0.5515, -0.6447, -0.9541, -0.0990,  1.2139, -0.9785]])

In [31]:
emb_X.view(32, 6) == torch.cat(torch.unbind(emb_X, 1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

Finally,

In [32]:
h = torch.tanh(emb_X.view(emb_X.shape[0], 6) @ W1 + b1)
h.shape # activations of 100 neurons for each of the 32 instances

torch.Size([32, 100])

Even more dynamic,

In [33]:
h = torch.tanh(emb_X.view(-1, 6) @ W1 + b1) # PyTorch will infer the required size based on that dim 1 = 6
h.shape

torch.Size([32, 100])

## Output

We can build layer 2 now. This is the output layer so we need 27 neurons here (each taking the 100D o/p of previous layer as i/p) to get a prob dist for the next char given a context window of block size 3.

In [34]:
W2 = torch.randn((100, 27), requires_grad=True)
b2 = torch.randn((27), requires_grad=True)

In [35]:
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [36]:
counts = logits.exp()
probs = counts / counts.sum(dim=1, keepdim=True)

In [37]:
probs[0].sum()

tensor(1.0000, grad_fn=<SumBackward0>)

In [39]:
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [40]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [41]:
probs[torch.arange(32), Y] # for 0th instance (32 total), what is the probability of the correct char and so on?

tensor([1.6394e-12, 1.8248e-10, 6.5286e-01, 1.0694e-12, 3.9033e-06, 3.2258e-14,
        1.1710e-13, 1.1330e-09, 1.0305e-13, 1.2611e-10, 1.0948e-13, 7.2170e-04,
        4.6182e-11, 3.0503e-06, 3.6985e-08, 8.5468e-06, 6.4139e-12, 6.2201e-05,
        5.4088e-07, 5.7261e-14, 7.1735e-17, 6.4859e-07, 1.3852e-03, 7.9844e-07,
        4.2719e-10, 4.8082e-04, 4.6977e-09, 7.5686e-06, 1.4736e-09, 6.6716e-09,
        5.2144e-09, 1.3438e-05], grad_fn=<IndexBackward0>)

In [42]:
loss = -probs[torch.arange(32), Y].log().mean()
loss

tensor(19.0173, grad_fn=<NegBackward0>)

In [43]:
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [45]:
# Re-create all model parameters from a fixed seed so runs are reproducible.
g = torch.Generator()
g.manual_seed(2147483647)

# Draw order matters for reproducibility: C, W1, b1, W2, b2 consume the
# generator's stream in exactly this sequence.
#   C  : (27, 2)   embedding lookup table
#   W1 : (6, 100)  hidden-layer weights
#   b1 : (100,)    hidden-layer bias
#   W2 : (100, 27) output-layer weights
#   b2 : (27,)     output-layer bias
parameters = [
    torch.randn(shape, generator=g, requires_grad=True)
    for shape in ((27, 2), (6, 100), (100,), (100, 27), (27,))
]
C, W1, b1, W2, b2 = parameters

In [46]:
sum(p.nelement() for p in parameters)

3481

In [48]:
emb = C[X]
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
logits = h @ W2 + b2
counts = logits.exp()
probs = counts / counts.sum(dim=1, keepdim=True)
loss = -probs[torch.arange(32), Y].log().mean()
loss



tensor(17.7697, grad_fn=<NegBackward0>)

In [50]:
loss = F.cross_entropy(logits, Y) 
loss

tensor(17.7697, grad_fn=<NllLossBackward0>)

In [52]:
for p in parameters:
    p.requires_grad = True

In [54]:
# Gradient-descent training loop: 10 steps over the whole (X, Y) dataset.
for _ in range(10):
    # forward pass
    emb = C[X]  # look up the embedding of every context character
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # -1 lets PyTorch infer the batch size
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None  # clear old gradients so they do not accumulate
    loss.backward()

    # update
    # Do the in-place step under no_grad instead of mutating `.data`: the
    # values are the same, but the update is not recorded in the graph and
    # autograd's in-place version tracking is not bypassed.
    with torch.no_grad():
        for p in parameters:
            p -= 0.1 * p.grad

13.656403541564941
11.298772811889648
9.452458381652832
7.984263896942139
6.891322135925293
6.100015640258789
5.452036380767822
4.8981523513793945
4.4146647453308105
3.985849618911743
