# Preparing the data

## Importing packages

In [1]:
import random

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import notebook

# Seed for Python's `random` module (used when shuffling the names)
RANDOM_SEED = 42
# Base seed for the `torch.Generator` objects used for parameter
# initialisation, minibatch sampling and name generation
TORCH_GEN_SEED = 2147483647

## Loading the data

In [2]:
# Reading the names dataset (one name per line) into a list of strings.
# The context manager closes the file handle as soon as reading is done.
with open("../names.txt", "r") as names_file:
    words = names_file.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

## Building the vocabulary and creating character-number mappings

In [3]:
# Collecting the distinct characters that occur in the names, sorted
chars = sorted(set(''.join(words)))

# Character -> id mapping; ids start at 1 because id 0 is reserved for
# the start_of_word/end_of_word token "."
char2id = {char: idx for idx, char in enumerate(chars, start=1)}
char2id['.'] = 0

# Inverse mapping: id -> character
id2char = dict(zip(char2id.values(), char2id.keys()))

# The vocabulary is every letter plus the "." token
vocab_size = len(id2char)

# Displaying the mappings and vocab size
for title, mapping in (
    ("Character -> Identifier:", char2id),
    ("Identifier -> Character:", id2char),
):
    print(title)
    print(mapping)
    print()
print(f"Vocabulary size: {vocab_size}")

Character -> Identifier:
{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}

Identifier -> Character:
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}

Vocabulary size: 27


## Splitting the data

In [4]:
# Seeding Python's RNG, then shuffling the names.
# NOTE: `random.shuffle` reorders `words` in place, so re-running this cell
# without reloading the data shuffles an already shuffled list.
random.seed(RANDOM_SEED)
random.shuffle(words)

# Index boundaries of the split: the first 80% of the shuffled names form
# the training set, the next 10% the dev set and the last 10% the test set
total_words = len(words)
cutoff_train = int(0.8 * total_words)
cutoff_dev = int(0.9 * total_words)

print(f"Training set: {0}-{cutoff_train-1:,}")
print(f"Development set: {cutoff_train:,}-{cutoff_dev-1:,}")
print(f"Testing set: {cutoff_dev:,}-{len(words)-1:,}")

print(f"\nTotal words: {len(words):,}")

Training set: 0-25,625
Development set: 25,626-28,828
Testing set: 28,829-32,032

Total words: 32,033


In [5]:
# Slicing the shuffled names into train / dev / test at the cutoff indices
words_train, words_dev, words_test = (
    words[:cutoff_train],
    words[cutoff_train:cutoff_dev],
    words[cutoff_dev:],
)

# Reporting the size of every split and its share of the full dataset
print(f"Training set examples: {len(words_train):,} ({len(words_train)/len(words):.0%})")
print(f"Development set examples: {len(words_dev):,} ({len(words_dev)/len(words):.0%})")
print(f"Testing set examples: {len(words_test):,} ({len(words_test)/len(words):.0%})")

Training set examples: 25,626 (80%)
Development set examples: 3,203 (10%)
Testing set examples: 3,204 (10%)


In [6]:
def build_dataset(words, block_size):
    """Builds (context, next-character) example pairs from a list of words.

    Every word is extended with the '.' end-of-word token and scanned from
    left to right. For each character, the ids of the ``block_size``
    preceding characters (padded with id 0 at the start of the word) form one
    row of ``X`` and the id of the character itself is the matching entry of
    ``Y``.

    Args:
        words: Iterable of strings. Every character (and '.') must be a key
            of the module-level ``char2id`` mapping.
        block_size: Number of preceding characters used as context
            (a positive integer).

    Returns:
        A tuple ``(X, Y)`` of int64 tensors with shapes ``(N, block_size)``
        and ``(N,)``, where N is the total number of characters in ``words``
        plus one end-of-word token per word. For an empty ``words`` the
        tensors have shapes ``(0, block_size)`` and ``(0,)``.

    Raises:
        KeyError: If a character is missing from ``char2id``.
    """
    # Accumulating the examples as plain Python lists
    contexts, targets = [], []
    for word in words:
        # Creating the starting context: block_size padding ids
        context = [0] * block_size
        # Iterating through the entire word plus the end-of-word token
        for char in word + '.':
            index = char2id[char]
            contexts.append(context)
            targets.append(index)
            # Sliding the window: drop the oldest id, append the newest
            context = context[1:] + [index]

    # Casting as PyTorch tensors. The explicit dtype and reshape keep the
    # result an int64 (N, block_size) tensor even when there are no examples
    # (torch.tensor([]) alone would be a float32 tensor of shape (0,)).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)

    return X, Y

In [7]:
# Context length: how many preceding characters are used to predict the next
block_size = 3

# Turning each split of names into (context, target) tensors
X_train, Y_train = build_dataset(words=words_train, block_size=block_size)
X_dev, Y_dev = build_dataset(words=words_dev, block_size=block_size)
X_test, Y_test = build_dataset(words=words_test, block_size=block_size)

In [8]:
def cmp(s, dt, t):
    """Prints how a manually computed gradient compares to autograd's.

    Args:
        s: Label shown at the start of the printed line.
        dt: Manually computed gradient tensor.
        t: Tensor whose ``.grad`` (filled in by autograd) is the reference.
    """
    autograd_grad = t.grad
    # Bitwise equality of every element
    ex = (dt == autograd_grad).all().item()
    # Equality within torch.allclose's default tolerances
    app = torch.allclose(dt, autograd_grad)
    # Largest absolute element-wise deviation
    maxdiff = (dt - autograd_grad).abs().max().item()
    # Reporting all three measures on a single line
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

# Exercise 4: Training MLP with a manual backward pass

## Verifying the gradients

First, we will initialize the MLP, train it for a few iterations, and then compare the manually computed gradients with the gradients computed by torch autograd:

In [9]:
# Size of each character embedding and number of hidden-layer units
n_embd, n_hidden = 10, 200

# Dedicated torch RNG with a fixed seed
generator = torch.Generator()
generator.manual_seed(TORCH_GEN_SEED)

In [10]:
# Embedding table: one n_embd-dimensional vector per vocabulary entry
C  = torch.randn((vocab_size, n_embd), generator=generator)

# Hidden layer. The weights are scaled by (5/3) / sqrt(fan_in), where
# fan_in = n_embd * block_size is the width of the concatenated embeddings.
W1 = torch.randn((n_embd * block_size, n_hidden), generator=generator)
W1 *= (5 / 3) / ((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden, generator=generator) * 0.1

# Output layer: hidden units -> one logit per vocabulary entry
W2 = torch.randn((n_hidden, vocab_size), generator=generator) * 0.1
b2 = torch.randn(vocab_size, generator=generator) * 0.1

# Batch-norm gain (around 1) and bias (around 0). They are drawn from the
# seeded generator like every other parameter; without `generator=` they
# would come from torch's unseeded global RNG and differ on every run.
bngain = torch.randn((1, n_hidden), generator=generator) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden), generator=generator) * 0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
num_params = sum(p.nelement() for p in parameters)
print(f"Number of trainable parameters: {num_params:,}")

# Enabling autograd tracking so that loss.backward() fills in p.grad
for p in parameters:
    p.requires_grad = True

Number of trainable parameters: 12,297


In [11]:
# Upper bound on the number of optimisation steps and the minibatch size
max_steps, batch_size = 200_000, 32
# Short alias for the batch size, used in the manual backward-pass formulas
n = batch_size
# History of log10(loss), one entry per step
lossi = list()

In [12]:
for i in range(max_steps):

    # Constructing the minibatch
    ix = torch.randint(0, X_train.shape[0], (batch_size,), generator=generator)
    Xb, Yb = X_train[ix], Y_train[ix]

    # Forward pass
    emb = C[Xb]
    # Concatenating the embeddings of all context characters per example
    embcat = emb.view(emb.shape[0], -1)
    # -------------------------------------------------------------
    # Linear layer
    # -------------------------------------------------------------
    hprebn = embcat @ W1 + b1
    # -------------------------------------------------------------
    # BatchNorm layer (batch statistics, Bessel-corrected variance)
    # -------------------------------------------------------------
    bnmean = hprebn.mean(0, keepdim=True)
    bnvar = hprebn.var(0, keepdim=True, unbiased=True)
    bnvar_inv = (bnvar + 1e-5)**-0.5
    bnraw = (hprebn - bnmean) * bnvar_inv
    hpreact = bngain * bnraw + bnbias
    # -------------------------------------------------------------
    # Tanh-nonlinearity and output layer
    # -------------------------------------------------------------
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)
    # -------------------------------------------------------------

    # Backward pass (using autograd for gradient verification)
    for p in parameters:
        p.grad = None
    loss.backward()

    # Manual backpropagation (from Exercises 1, 2, 3).
    # It only needs the forward-pass values, so autograd tracking is
    # switched off to avoid building a second, unused graph.
    with torch.no_grad():
        # ---------------------------------------------------------
        # Cross-entropy backprop (Exercise 2): (softmax - one_hot) / n
        # ---------------------------------------------------------
        dlogits = F.softmax(logits, 1)
        dlogits[range(n), Yb] -= 1
        dlogits /= n
        # ---------------------------------------------------------
        # 2nd layer backprop
        # ---------------------------------------------------------
        dh = dlogits @ W2.T
        dW2 = h.T @ dlogits
        db2 = dlogits.sum(0)
        # ---------------------------------------------------------
        # Tanh backprop
        # ---------------------------------------------------------
        dhpreact = (1.0 - h**2) * dh
        # ---------------------------------------------------------
        # Batchnorm backprop (Exercise 3); the n/(n-1) factor matches
        # the unbiased variance used in the forward pass
        # ---------------------------------------------------------
        dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
        dbnbias = dhpreact.sum(0, keepdim=True)
        dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))
        # ---------------------------------------------------------
        # 1st layer backprop
        # ---------------------------------------------------------
        dembcat = dhprebn @ W1.T
        dW1 = embcat.T @ dhprebn
        db1 = dhprebn.sum(0)
        # ---------------------------------------------------------
        # Embedding backprop: every gradient row of demb is added to the
        # row of dC selected by the corresponding character id in Xb
        # ---------------------------------------------------------
        demb = dembcat.view(emb.shape)
        dC = torch.zeros_like(C)
        dC.index_add_(0, Xb.reshape(-1), demb.reshape(-1, demb.shape[-1]))
        # ---------------------------------------------------------

    # Collecting all manually computed gradients (same order as `parameters`)
    grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]

    # Updating using learning rate decay. During verification the step is
    # taken with the autograd gradients; the manual gradients in `grads`
    # are only compared against them afterwards.
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # Tracking statistics
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

    # Stopping early: the loop body runs for i = 0..100 (101 steps)
    if i >= 100:
        break

      0/ 200000: 3.8135


In [13]:
# Checking gradients
for p, g in zip(parameters, grads):
    cmp(str(tuple(p.shape)), g, p)

(27, 10)        | exact: False | approximate: True  | maxdiff: 1.1874362826347351e-08
(30, 200)       | exact: False | approximate: True  | maxdiff: 9.313225746154785e-09
(200,)          | exact: False | approximate: True  | maxdiff: 4.6566128730773926e-09
(200, 27)       | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08
(27,)           | exact: False | approximate: True  | maxdiff: 7.450580596923828e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09
(1, 200)        | exact: False | approximate: True  | maxdiff: 5.587935447692871e-09


We see that our backward pass works correctly, so we can now implement the neural net without torch's autograd mechanism.

## MLP from scratch

### Initialization

In [14]:
# Size of each character embedding and number of hidden-layer units
n_embd, n_hidden = 10, 200

# Fresh torch RNG, re-seeded with the same fixed seed
generator = torch.Generator()
generator.manual_seed(TORCH_GEN_SEED)

In [15]:
# Embedding table: one n_embd-dimensional vector per vocabulary entry
C  = torch.randn((vocab_size, n_embd), generator=generator)

# Hidden layer. The weights are scaled by (5/3) / sqrt(fan_in), where
# fan_in = n_embd * block_size is the width of the concatenated embeddings.
W1 = torch.randn((n_embd * block_size, n_hidden), generator=generator)
W1 *= (5 / 3) / ((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden, generator=generator) * 0.1

# Output layer: hidden units -> one logit per vocabulary entry
W2 = torch.randn((n_hidden, vocab_size), generator=generator) * 0.1
b2 = torch.randn(vocab_size, generator=generator) * 0.1

# Batch-norm gain (around 1) and bias (around 0). They are drawn from the
# seeded generator like every other parameter; without `generator=` they
# would come from torch's unseeded global RNG and differ on every run.
bngain = torch.randn((1, n_hidden), generator=generator) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden), generator=generator) * 0.1

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
num_params = sum(p.nelement() for p in parameters)
print(f"Number of trainable parameters: {num_params:,}")

# Marking every parameter as requiring gradients
for p in parameters:
    p.requires_grad = True

Number of trainable parameters: 12,297


### Training

To save time, we will run the training loop for 50000 iterations.

In [16]:
# Number of optimisation steps and the minibatch size
max_steps, batch_size = 50_000, 32
# Short alias for the batch size, used in the manual backward-pass formulas
n = batch_size
# History of log10(loss), one entry per step
lossi = list()

Since we will not be using autograd functionality, we can specify in advance that no gradients will be computed (there will be no `backward()` call) by using the `torch.no_grad()` context manager:

In [17]:
# Training with hand-written gradients only; autograd is disabled for the
# whole loop because backward() is never called.
with torch.no_grad():

    for i in notebook.tqdm(range(max_steps)):

        # Constructing the minibatch
        ix = torch.randint(0, X_train.shape[0], (batch_size,), generator=generator)
        Xb, Yb = X_train[ix], Y_train[ix]

        # Forward pass
        emb = C[Xb]
        # Concatenating the embeddings of all context characters per example
        embcat = emb.view(emb.shape[0], -1)
        # -------------------------------------------------------------
        # Linear layer
        # -------------------------------------------------------------
        hprebn = embcat @ W1 + b1
        # -------------------------------------------------------------
        # BatchNorm layer (batch statistics, Bessel-corrected variance)
        # -------------------------------------------------------------
        bnmean = hprebn.mean(0, keepdim=True)
        bnvar = hprebn.var(0, keepdim=True, unbiased=True)
        bnvar_inv = (bnvar + 1e-5)**-0.5
        bnraw = (hprebn - bnmean) * bnvar_inv
        hpreact = bngain * bnraw + bnbias
        # -------------------------------------------------------------
        # Tanh-nonlinearity and output layer
        # -------------------------------------------------------------
        h = torch.tanh(hpreact)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Yb)
        # -------------------------------------------------------------

        # Manual backpropagation
        # -------------------------------------------------------------
        # Cross-entropy backprop: (softmax - one_hot) / n
        # -------------------------------------------------------------
        dlogits = F.softmax(logits, 1)
        dlogits[range(n), Yb] -= 1
        dlogits /= n
        # -------------------------------------------------------------
        # 2nd layer backprop
        # -------------------------------------------------------------
        dh = dlogits @ W2.T
        dW2 = h.T @ dlogits
        db2 = dlogits.sum(0)
        # -------------------------------------------------------------
        # Tanh-backprop
        # -------------------------------------------------------------
        dhpreact = (1.0 - h**2) * dh
        # -------------------------------------------------------------
        # Batchnorm backprop; the n/(n-1) factor matches the unbiased
        # variance used in the forward pass
        # -------------------------------------------------------------
        dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
        dbnbias = dhpreact.sum(0, keepdim=True)
        dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))
        # -------------------------------------------------------------
        # 1st layer backprop
        # -------------------------------------------------------------
        dembcat = dhprebn @ W1.T
        dW1 = embcat.T @ dhprebn
        db1 = dhprebn.sum(0)
        # -------------------------------------------------------------
        # Embedding backprop: every gradient row of demb is added to the
        # row of dC selected by the corresponding character id in Xb.
        # A single index_add_ replaces a Python loop over all
        # (example, position) pairs.
        # -------------------------------------------------------------
        demb = dembcat.view(emb.shape)
        dC = torch.zeros_like(C)
        dC.index_add_(0, Xb.reshape(-1), demb.reshape(-1, demb.shape[-1]))
        # -------------------------------------------------------------

        # Collecting the gradients (same order as `parameters`)
        grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]

        # Updating using learning rate decay
        lr = 0.1 if i < 25000 else 0.01
        for p, grad in zip(parameters, grads):
            p.data += -lr * grad

        # Tracking statistics
        if i % 5000 == 0:
            print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        lossi.append(loss.log10().item())

  0%|          | 0/50000 [00:00<?, ?it/s]

      0/  50000: 3.8197
   5000/  50000: 2.2817
  10000/  50000: 2.1724
  15000/  50000: 2.1108
  20000/  50000: 2.4136
  25000/  50000: 2.3025
  30000/  50000: 2.3608
  35000/  50000: 2.2418
  40000/  50000: 2.0018
  45000/  50000: 1.7366


### Evaluation

In [18]:
# Calibrating batch norm at the end of training
with torch.no_grad():
    emb = C[X_train]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    # Computing the mean/std over the entire training set
    bnmean = hpreact.mean(0, keepdim=True)
    bnvar = hpreact.var(0, keepdim=True, unbiased=True)

In [19]:
@torch.no_grad()
def split_loss(split):
    """Prints the cross-entropy loss of the network on one data split.

    Args:
        split: One of 'train', 'val' or 'test'.

    The batch-norm layer uses the module-level ``bnmean`` / ``bnvar``
    statistics instead of statistics of the evaluated data.
    """
    datasets = {
      'train': (X_train, Y_train),
      'val': (X_dev, Y_dev),
      'test': (X_test, Y_test),
    }
    inputs, targets = datasets[split]
    # Embedding lookup, flattened to one vector per example
    embedded = C[inputs]
    flattened = embedded.view(embedded.shape[0], -1)
    # Linear layer followed by batch norm with the stored statistics
    prebn = flattened @ W1 + b1
    centered_scaled = bngain * (prebn - bnmean)
    preact = centered_scaled * (bnvar + 1e-5)**-0.5 + bnbias
    # Non-linearity, output layer and loss
    hidden = torch.tanh(preact)
    logits = hidden @ W2 + b2
    loss = F.cross_entropy(logits, targets)
    print(split, loss.item())

In [20]:
# Reporting the loss on every split, in train / val / test order
for split_name in ('train', 'val', 'test'):
    split_loss(split_name)

train 2.1317732334136963
val 2.1516716480255127
test 2.1520540714263916


### Names generation from the model

In [21]:
# Separate seeded RNG so that the sampled names are reproducible
g = torch.Generator().manual_seed(TORCH_GEN_SEED + 42)

# Sampling is inference only, so autograd tracking is switched off
with torch.no_grad():

    # The batch-norm scale factor does not change between samples
    bnstd_inv = (bnvar + 1e-5)**-0.5

    for _ in range(100):

        out = []
        # Starting from a context made of start-of-word tokens only
        context = [0] * block_size
        while True:

            # Forward pass for a single context (batch of size 1), using the
            # calibrated batch-norm statistics bnmean / bnvar
            emb = C[torch.tensor([context])]
            embcat = emb.view(emb.shape[0], -1)
            hpreact = embcat @ W1 + b1
            hpreact = bngain * (hpreact - bnmean) * bnstd_inv + bnbias
            h = torch.tanh(hpreact)
            logits = h @ W2 + b2

            # Sampling the next character id from the predicted distribution
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # Id 0 is the end-of-word token: the name is complete
            if ix == 0:
                break
            # Sliding the context window and recording the sampled character
            context = context[1:] + [ix]
            out.append(ix)

        print(''.join(id2char[i] for i in out))

amelia
mai
yara
aayle
otbuw
riyah
darco
ira
ketman
addon
kele
zalir
sim
pra
damahil
charie
jenni
kensi
yomaniya
ellle
levynishariyah
lie
jahanthi
cha
grita
mon
malee
rhylusman
guhristen
jay
join
anon
irabellah
torby
peya
brica
myre
lina
ala
rayoni
minie
aag
tay
nila
tauilyan
kio
bevticonnie
cali
odkiraha
wilon
iniya
lilley
marody
sha
dhan
matre
grefiyanna
kilani
van
loxyn
sow
aleigh
briah
xin
muraceyson
kalisodyn
hary
dego
zen
tem
rea
rozi
zary
lype
kylingu
jcoslee
zen
kalle
quyanae
ezlee
eira
ractany
adhima
jomana
essyer
zaant
einna
nik
dochurbestyn
aveut
jana
adcebdauthi
jopelr
trino
kriana
kias
nalyanniero
maley
faidrentylynnield
akolyanison
