In [3]:
# MLP - with using paper Bengio et al.2003

In [4]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [5]:
# read in all the words
words = open('cleaned_names.txt', 'r').read().splitlines()
words[:8]

['aaban', 'aabid', 'aabidah', 'aabir', 'aabriella', 'aada', 'aadam', 'aadarsh']

In [6]:
len(words)

29681

In [7]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [8]:
# build the dataset

block_size = 3 # context length: how many characters to use to predict the next one?
X, Y = [], []
for w in words: # [:5]

  # print(w)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    # print(''.join([itos[i] for i in context]), '---->', itos[ix])
    context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

In [9]:
X.shape, X.dtype, Y.shape, Y.dtype 

(torch.Size([212725, 3]), torch.int64, torch.Size([212725]), torch.int64)

In [10]:
# Converting the above code into a function

def build_dataset(words):
  block_size = 3 # context length: how many characters to use to predict the next one
  X, Y = [], []
  for w in words:

    # print(w)
    context = [0] * block_size
    for ch in w + '.':
      X.append(context)
      Y.append(ix)
      # print(''.join([itos[i] for i in context]), '---->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1]) # 80%
Xdev, Ydev = build_dataset(words[n1:n2]) # 10%
Xte, Yte = build_dataset(words[n2:]) # 10%

torch.Size([170379, 3]) torch.Size([170379])
torch.Size([21222, 3]) torch.Size([21222])
torch.Size([21124, 3]) torch.Size([21124])


In [11]:
len(words)

29681

In [12]:
n1

23744

In [13]:
n2

26712

In [14]:
n2 - n1

2968

In [15]:
len(words) - n2

2969

In [16]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1,  1],
        ...,
        [26, 26, 25],
        [26, 25, 26],
        [25, 26, 24]])

In [17]:
Y # labels

tensor([ 1,  1,  2,  ..., 26, 24,  0])

In [18]:
C = torch.randn(27, 2) # each 27 characters is represented by a 2D embedding

In [19]:
C

tensor([[-0.1283, -0.5436],
        [-0.7863,  0.7388],
        [ 0.7298,  1.0991],
        [-0.9613, -0.5001],
        [ 1.1867,  0.0904],
        [ 1.1607, -0.7505],
        [-0.4988, -2.0246],
        [ 0.4446, -0.1064],
        [ 0.0395,  0.2550],
        [ 1.5678, -0.9642],
        [ 1.3875,  0.0267],
        [ 0.2311, -0.5623],
        [ 1.9236, -0.2768],
        [ 1.2787,  0.4626],
        [-0.6334,  0.5176],
        [-1.6907, -1.1979],
        [-1.5329,  2.2698],
        [ 0.5447, -0.0792],
        [-0.9182,  0.0518],
        [-0.6797, -0.0091],
        [ 0.5102, -2.1360],
        [ 1.4016,  0.2388],
        [ 0.2124,  0.3835],
        [ 0.2622,  0.7438],
        [-0.1799,  1.0232],
        [-1.5902, -0.2433],
        [-2.1036,  2.1955]])

In [20]:
C[5]

tensor([ 1.1607, -0.7505])

In [21]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C # one hot vector

tensor([ 1.1607, -0.7505])

In [22]:
C[[5,6,7]] # indexing with list

tensor([[ 1.1607, -0.7505],
        [-0.4988, -2.0246],
        [ 0.4446, -0.1064]])

In [23]:
C[torch.tensor([5,6,7])] # indexing with tensor

tensor([[ 1.1607, -0.7505],
        [-0.4988, -2.0246],
        [ 0.4446, -0.1064]])

In [24]:
C[X].shape

torch.Size([212725, 3, 2])

In [25]:
X[13,2] # 13th example, 2nd character

tensor(1)

In [26]:
C[X[13,2]] # embedding of the 13th example, 2nd character

tensor([-0.7863,  0.7388])

In [27]:
C[1]

tensor([-0.7863,  0.7388])

In [28]:
emb = C[X]
emb.shape

torch.Size([212725, 3, 2])

In [29]:
# Constructing the hidden layer
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [30]:
h = emb.view(-1, 6) @ W1 + b1

In [31]:
h

tensor([[ 2.7301,  0.6152, -0.7696,  ..., -0.5793, -1.6912,  0.5131],
        [-1.0546,  1.7785, -2.6869,  ..., -3.0657, -0.0700,  1.3441],
        [-3.1788,  1.8426, -1.8700,  ..., -1.9635,  2.0214, -0.5566],
        ...,
        [-1.8488,  2.3305, -0.3258,  ...,  1.3349,  0.6511, -5.8985],
        [-5.4550,  4.2311, -7.8543,  ..., -4.0529,  0.0493, -1.1333],
        [-4.5005,  2.8655, -3.2786,  ..., -0.8207,  3.6445, -2.9424]])

In [32]:
# torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape # first, second, third characters of the first example concatenated

In [33]:
# torch.cat(torch.unbind(emb, 1), 1).shape # split the tensor along the second dimension

In [34]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [35]:
a.shape

torch.Size([18])

In [36]:
a.view(2, 9) # view as 2x9 matrix

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [37]:
a.view(9, 2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [38]:
a.view(3, 3, 2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [39]:
a.storage() # memory address

  a.storage() # memory address


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [40]:
emb.shape

torch.Size([212725, 3, 2])

In [41]:
# emb.view(32, 6) == torch.cat(torch.unbind(emb, 1), 1)

In [42]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)

In [43]:
h

tensor([[ 0.9915,  0.5478, -0.6467,  ..., -0.5222, -0.9343,  0.4724],
        [-0.7836,  0.9445, -0.9908,  ..., -0.9957, -0.0699,  0.8726],
        [-0.9965,  0.9510, -0.9536,  ..., -0.9614,  0.9655, -0.5054],
        ...,
        [-0.9516,  0.9813, -0.3147,  ...,  0.8704,  0.5724, -1.0000],
        [-1.0000,  0.9996, -1.0000,  ..., -0.9994,  0.0492, -0.8121],
        [-0.9998,  0.9935, -0.9972,  ..., -0.6754,  0.9986, -0.9945]])

In [44]:
h.shape

torch.Size([212725, 100])

In [45]:
(emb.view(-1, 6) @ W1).shape

torch.Size([212725, 100])

In [46]:
b1.shape

torch.Size([100])

In [47]:
# 32, 100
# broadcasting to 100
# 1 , 100

In [48]:
# Creating the final layer (output layer)
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

In [49]:
logits = h @ W2 + b2

In [50]:
logits.shape

torch.Size([212725, 27])

In [51]:
counts = logits.exp()

In [52]:
prob = counts / counts.sum(-1, keepdim=True)

In [53]:
prob.shape

torch.Size([212725, 27])

In [54]:
prob

tensor([[3.7164e-06, 3.6963e-10, 5.1644e-13,  ..., 2.4925e-08, 2.4920e-09,
         3.4923e-05],
        [1.0526e-04, 1.1668e-05, 3.7881e-05,  ..., 4.2597e-07, 2.6370e-02,
         7.6387e-01],
        [1.0022e-04, 1.8795e-09, 4.3519e-03,  ..., 1.6499e-08, 3.5516e-04,
         5.6229e-05],
        ...,
        [2.4770e-03, 1.8276e-08, 2.6603e-06,  ..., 1.2900e-07, 1.2202e-08,
         3.2308e-10],
        [6.3587e-01, 3.1959e-07, 1.9488e-08,  ..., 8.5252e-08, 1.0425e-02,
         6.1740e-05],
        [7.0301e-05, 3.1253e-07, 5.5297e-07,  ..., 1.4377e-09, 7.4961e-05,
         8.6576e-07]])

In [55]:
prob[0].sum() # normalized

tensor(1.)

In [56]:
Y # labels

tensor([ 1,  1,  2,  ..., 26, 24,  0])

In [57]:
torch.arange(36)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35])

In [58]:
# prob[torch.arange(36), Y] # probabilities of the true characters

In [59]:
# loss = -prob[torch.arange(36), Y].log().mean() # cross entropy loss
# loss

In [60]:
# ----------------- Summary -----------------

In [61]:
Xtr.shape, Ytr.shape

(torch.Size([170379, 3]), torch.Size([170379]))

In [62]:
g = torch.Generator().manual_seed(2147483647) # seed for reproducibility
C = torch.randn(27, 2, generator=g)
W1 = torch.randn((6, 300), generator=g)
b1 = torch.randn(300, generator=g)
W2 = torch.randn(300, 27, generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [63]:
sum(p.nelement() for p in parameters) # total number of parameters

10281

In [64]:
# Enabling gradient computation
for p in parameters:
  p.requires_grad = True

In [141]:
# finding out the learning rate
lre = torch.linspace(-3, 0, 1000) # 1000 points between 0.001 and 1
lrs = 10 ** lre # learning rates
# lrs

In [65]:
lri = []
lossi = []
step = []

for i in range(30000): # iterations

# mini-batch construction
  ix = torch.randint(0, Xtr.shape[0], (36,)) # 36 random indices from the dataset
# Forward pass

  emb = C[Xtr[ix]] # embedding 
  h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer
  logits = h @ W2 + b2 # output layer
# counts = logits.exp() # unnormalized probabilities
# prob = counts / counts.sum(1, keepdim=True) # normalized probabilities
# loss = -prob[torch.arange(36), Y].log().mean() # cross entropy loss
#  but never use the above code in practice as it is not efficient and numerically unstable
  loss = F.cross_entropy(logits, Ytr[ix]) # cross entropy loss
  # print(loss.item())


  # Backward pass
  for p in parameters:
    p.grad = None

  loss.backward()

  # Update
  # lr = lrs[i]
  lr = 0.01
  for p in parameters:
    p.data += -lr * p.grad # learning rate e.g. -1 (lr)

  # track stats
  # lri.append(lre[i])
  step.append(i)
  lossi.append(loss.item())  

In [66]:
print(loss.item())  

4.291525783628458e-06


In [170]:
# plt.plot(lri, lossi)
# plt.xscale('log')

In [171]:
# F.cross_entropy(logits, Y) # PyTorch's implementation of cross entropy we can put the cross entropy directly in this code with the help of Pytorch directly in this code which will be more efficient for forward pass and backward pass and numerical stability.

In [172]:
logits.max(1) # max in python is a function that returns two values: the maximum value and its index

torch.return_types.max(
values=tensor([42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211,
        42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211,
        42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211,
        42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211, 42.9211,
        42.9211, 42.9211, 42.9211, 42.9211], grad_fn=<MaxBackward0>),
indices=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [173]:
Y # labels

tensor([ 1,  1,  2,  ..., 26, 24,  0])

In [174]:
# creating minibatches
torch.randint(0, X.shape[0], (36,)) # random integers between 0 and 4

tensor([211940,  89486, 119083, 167078, 163006, 101152,  11620,  82442, 181455,
        198500, 152551, 110890,  28649,  86630,  21326, 117285, 165645, 204168,
          9095,  20786,  62294,  50311,  96603,  97153,  47111,  91442, 118660,
        164717,  78551,  30250, 199397,  91119, 164276,  17668,  93384, 167547])

In [67]:
# evaluation

emb = C[Xdev] # embedding 
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer
logits = h @ W2 + b2 # output layer
loss = F.cross_entropy(logits, Ydev) # cross entropy loss
loss


tensor(4.2915e-06, grad_fn=<NllLossBackward0>)

In [68]:
# loss on training set
emb = C[Xtr] # embedding
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # hidden layer
logits = h @ W2 + b2 # output layer
loss = F.cross_entropy(logits, Ytr) # cross entropy loss
loss

tensor(4.2915e-06, grad_fn=<NllLossBackward0>)

In [2]:
# plt.figure(figsize=(8, 8))
# plt.scatter(C[:, 0].data, C[:, 1].data, s=200)
# for i in range(C.shape[0]):
  # plt.text(C[i, 0].item(), C[i, 1].item(), itos[i], ha='center', va='center', color='white')
# plt.grid('minor')

In [177]:
# Training split, dev / validation split, test split
# Training split: 80% of the data
# Dev split: 10% of the data
# Test split: 10% of the data