<a href="https://colab.research.google.com/github/romenlaw/NaiveNeuralNetwork/blob/main/makemore_backprop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Makemore - backprop ninja

## prepare datasets

In [16]:
!curl -O https://raw.githubusercontent.com/romenlaw/NaiveNeuralNetwork/main/names.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  222k  100  222k    0     0   639k      0 --:--:-- --:--:-- --:--:--  638k


In [37]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [39]:
# Read the corpus: one name per line.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()
# cell output: number of names, longest name length, first few names
len(words), max(len(w) for w in words), words[:8]

(32033,
 15,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [33]:
# Character vocabulary: every distinct character seen in the names, sorted,
# with the '.' boundary marker placed in front so that it gets index 0.
vocab = ['.'] + sorted(set(''.join(words)))
stoi = dict(zip(vocab, range(len(vocab))))  # character -> index
itos = dict(enumerate(vocab))               # index -> character
vocab_size = len(vocab)  # 27

In [58]:
block_size = 3  # context length: how many previous tokens feed each prediction

def build_dataset(words):
  """Turn a list of words into a pair of tensors (X, Y).

  Every word is terminated with '.' and scanned one character at a time.
  For each character, one row is added to X holding the block_size token
  indices that precede it (the start of a word is padded with 0, the index
  of '.'), and the index of the character itself is added to Y.
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size  # start-of-word padding
    for ch in word + '.':
      target = stoi[ch]
      contexts.append(window)
      targets.append(target)
      # slide the window: drop the oldest token, append the newest one
      window = [*window[1:], target]

  return torch.tensor(contexts), torch.tensor(targets)

# full dataset, built from the words in their current order
X, Y = build_dataset(words)

# Split at word level into 3 sets:
#   80% for training
#   10% for validation/development
#   10% for testing
# The shuffle is applied to a copy so that `words` itself is left untouched.
# Shuffling `words` in place would make every re-run of this cell start from
# an already-shuffled list and therefore produce a different split each time,
# letting examples move between the train, dev and test sets.
import random
random.seed(42)
n1 = int(len(words) * .8)
n2 = int(len(words) * .9)
words_shuffled = list(words)
random.shuffle(words_shuffled)  # in-place, but only on the copy
X_train, Y_train = build_dataset(words_shuffled[:n1])
X_dev, Y_dev = build_dataset(words_shuffled[n1:n2])
X_test, Y_test = build_dataset(words_shuffled[n2:])

(X_train.shape, Y_train.shape), (X_dev.shape, Y_dev.shape), (X_test.shape, Y_test.shape)

((torch.Size([182515, 3]), torch.Size([182515])),
 (torch.Size([22840, 3]), torch.Size([22840])),
 (torch.Size([22791, 3]), torch.Size([22791])))

## MLP

In [64]:
embed_dim = 10    # length of each token's embedding vector
hidden_dim = 200  # number of units in the hidden layer

# single seeded generator; C, W1 and W2 are drawn from it in this order
g = torch.Generator().manual_seed(20240824)

# embedding table: one row per vocabulary entry
C = torch.randn((vocab_size, embed_dim), generator=g)

# hidden layer: its input is block_size embeddings concatenated together.
# There is no bias term here because the batch-norm layer that follows
# carries its own bias (bn_bias).
fan_in = embed_dim * block_size
W1 = torch.randn((fan_in, hidden_dim), generator=g) * ((5/3) / fan_in**0.5)

# output layer: small initial weights, zero bias
W2 = torch.randn((hidden_dim, vocab_size), generator=g) * 0.01
b2 = torch.zeros(vocab_size)

# 1-D batch normalisation applied to the hidden layer, so every tensor
# has hidden_dim columns
bn_gamma = torch.ones((1, hidden_dim))
bn_bias = torch.zeros((1, hidden_dim))
# running statistics; these are not part of `parameters`
bn_running_mean = torch.zeros((1, hidden_dim))
bn_running_std = torch.ones((1, hidden_dim))

parameters = [C, W1, W2, b2, bn_gamma, bn_bias]
print('total params: ', sum(p.nelement() for p in parameters))
for p in parameters:
  p.requires_grad = True

total params:  12097


## training

In [73]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):

  # construct mini-batch:
  # generate a list of random indices, length of list is batch_size
  ix = torch.randint(low=0, high=X_train.shape[0], size=(batch_size,), generator=g)
  xs = X_train[ix]  # (batch_size, block_size)
  ys = Y_train[ix]  # (batch_size)

  ################
  # forward pass
  ################
  # embedding ---------------------------
  emb = C[xs] # (batch_size, block_size, embed_dim)
  # hidden layer ------------------------
  # flatten the block_size embeddings of each example into one row before
  # the matrix multiply
  h_preact = emb.view(batch_size, -1) @ W1  # (batch_size, hidden_dim)
  # BN layer ----------------------------
  # normalise each hidden unit with the statistics of the current mini-batch
  bn_mean = h_preact.mean(dim=0, keepdim=True)
  bn_std = h_preact.std(dim=0, keepdim=True)
  x_hat = (h_preact - bn_mean) / bn_std
  h_preact = bn_gamma * x_hat + bn_bias
  # update the running statistics outside of the autograd graph
  with torch.no_grad():
    bn_running_mean = 0.999 * bn_running_mean + 0.001 * bn_mean
    bn_running_std = 0.999 * bn_running_std + 0.001 * bn_std
  # Non-linearity ----------------------
  h = torch.tanh(h_preact)  # (batch_size, hidden_dim)
  # output layer -----------------------
  # apply the output weights and bias; previously logits was just an alias
  # of h and therefore had hidden_dim columns instead of vocab_size
  logits = h @ W2 + b2  # (batch_size, vocab_size)


  # stop after a single iteration: only the forward pass exists so far
  break

In [74]:
# shape of the hidden activations left over from the last executed forward pass
h.size()

torch.Size([32, 200])