In [104]:
import math
import random
import numpy

import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

# Input

In [105]:
# Load the training corpus: one name per line.
# The `with` block closes the file handle as soon as the contents are read
# (the bare open(...).read() left it open until garbage collection).
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(len(words))
words[:8]

32033


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

lookup tables

In [106]:
# Character <-> integer lookup tables.
# Letters take indices 1..26; index 0 is reserved for the '.' boundary token.
chars = list('abcdefghijklmnopqrstuvwxyz')
stoi_lookup = {letter: idx for idx, letter in enumerate(chars, start=1)}
stoi_lookup['.'] = 0
# Inverse table: integer index -> character
itos_lookups = {idx: letter for letter, idx in stoi_lookup.items()}

### Dataset creation

-  both BOS and EOS are represented using '.'

- context - characters to be considered for the next prediction
    - what would be context for the first letter? just '.', the number of '.' will depend on the length of the context


- make sure to represent the EOS as well

In [107]:
block_size = 3  # context length: number of previous characters used per prediction

X, Y = [], []

for name in words:
    # every name starts from an all-'.' context (index 0)
    window = [0] * block_size
    # the trailing '.' is the end-of-name target, so the end of a name is an example too
    for ch in name + '.':
        X.append(window)
        target = stoi_lookup[ch]  # index of the character to be predicted
        Y.append(target)
        # slide the window: drop the oldest character, append the one just seen
        window = [*window[1:], target]

X, Y = torch.tensor(X), torch.tensor(Y)

X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

### Embeddings lookup table

In [Bengio et al](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), 17k words were embedded into 30 dims

We have to embed 27 chars into a small number of dims; let's start with a 2-dim embedding for now

In [108]:
# Embedding table: one randomly initialised `dims`-dimensional row for each of the 27 characters
dims = 2
C = torch.randn(27, dims)

# Indexing C with the integer tensor X swaps every character index in X
# for its embedding row from the lookup table
embs = C[X]
embs.shape


torch.Size([228146, 3, 2])

### Hidden & Output Layer

- What would be the input to this layer?

    the embeddings have the shape [N, 3, 2] (N = 228146 here), which means that we have N inputs and, for each input, three (one per context character) two-dimensional embeddings => each input flattens to 3 * 2 = 6 values

- What would be the number of neurons, we can try it with different values, lets say 100 for now

In [10]:
# hidden layer: every example is block_size embeddings of `dims` values,
# flattened into a single row of block_size * dims inputs
W1 = torch.randn((block_size * dims, 100))
b1 = torch.randn(100)
h = torch.tanh(embs.view(-1, block_size*dims) @ W1 + b1)

# output layer: one logit per character of the 27-symbol vocabulary
W2 = torch.randn(100, 27)
b2 = torch.randn(27)

logits = h @ W2 + b2
counts = logits.exp()
prob = counts / counts.sum(1, keepdim=True)  # normalise each row into a probability distribution
print(prob[0].shape) # we have prob for all 27 chars for all our inputs

# calculate the loss
# One row index per example. A hard-coded arange(32) only fits a 32-example
# dataset and cannot be broadcast against Y when X/Y hold the full dataset.
ix = torch.arange(Y.shape[0])

# Y holds the index of the correct next character, so prob[ix, Y] picks the
# probability the network assigned to the right answer for every example
loss = -prob[ix, Y].log().mean() # nll
loss

torch.Size([27])


tensor(18.4791)

### Putting the Network Together

In [128]:
print("the dataset: ", X.shape, Y.shape)

# Seeded generator: every parameter starts from the same values on each run.
# The draw order below matters, since all tensors share the one generator.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)      # embedding table: 27 characters x 2 dims
W1 = torch.randn(6, 300, generator=g)    # hidden layer weights: 6 inputs -> 300 units
b1 = torch.randn(300, generator=g)       # hidden layer bias
W2 = torch.randn(300, 27, generator=g)   # output layer weights: 300 units -> 27 logits
b2 = torch.randn(27, generator=g)        # output layer bias

params = [C, W1, b1, W2, b2]

# total number of trainable scalars
print(sum(t.nelement() for t in params))

the dataset:  torch.Size([228146, 3]) torch.Size([228146])
10281


In [130]:
# Turn on gradient tracking for every parameter so that loss.backward() populates p.grad
for p in params:
    p.requires_grad_(True)

In [28]:
l_rate = 0.1  # learning rate

for step in range(10):
    # forward pass over the whole dataset
    embs = C[X]
    h = torch.tanh(embs.view(-1, block_size * dims) @ W1 + b1)
    logits = h @ W2 + b2
    # cross_entropy does the exp / normalise / negative-log-likelihood steps in one call
    loss = F.cross_entropy(logits, Y)
    print(f'{loss=}')

    # backward pass: clear stale gradients first (IMP), then backpropagate the loss
    for p in params:
        p.grad = None
    loss.backward()

    # gradient-descent step on every parameter
    for p in params:
        p.data -= l_rate * p.grad

loss=tensor(19.5052, grad_fn=<NllLossBackward0>)
loss=tensor(17.0845, grad_fn=<NllLossBackward0>)
loss=tensor(15.7765, grad_fn=<NllLossBackward0>)
loss=tensor(14.8333, grad_fn=<NllLossBackward0>)
loss=tensor(14.0026, grad_fn=<NllLossBackward0>)
loss=tensor(13.2533, grad_fn=<NllLossBackward0>)
loss=tensor(12.5799, grad_fn=<NllLossBackward0>)
loss=tensor(11.9831, grad_fn=<NllLossBackward0>)
loss=tensor(11.4705, grad_fn=<NllLossBackward0>)
loss=tensor(11.0519, grad_fn=<NllLossBackward0>)


mini_batches 

Why? 
- running the forward and backward pass over all 228146 examples on every step takes a significant amount of time

What? 
- mini batch - take some portion of the dataset and do the work (fwd, bkwd, update) on just that mini batch. Can this be done in parallel though? That would be rad!

How? 
- what should be the size of the mini_batch, lets say 32, so each mini_batch will have 32 data points(their indices)

Adding mini_batches to our NN

In [112]:
l_rate = 0.1  # learning rate (constant, so it is set once outside the loop)

for step in range(10000):
    # draw a fresh random mini-batch of 32 example indices for this step
    ix = torch.randint(0, X.shape[0], (32, ))

    # forward pass, restricted to the examples in the current mini-batch
    embs = C[X[ix]]
    h = torch.tanh(embs.view(-1, block_size*dims) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # backward pass: reset the gradients first (IMP), then backpropagate the loss
    for p in params:
        p.grad = None
    loss.backward()

    # update the params in place; no_grad keeps the update itself out of autograd
    with torch.no_grad():
        for p in params:
            p -= l_rate * p.grad

# loss of the last mini-batch only, not of the whole dataset
print(f'{loss=}')

loss=tensor(2.6592, grad_fn=<NllLossBackward0>)


training(80%), val(10%), test(10%) splits 
 - training set is used for optimizing the params of the model
 - validation set is used for tuning the hyperparameters (hidden layer size, embedding size etc)
 - test set is used for evaluating the final performance of the model

In [131]:
def build_dataset(words, block_size=3):
    """Turn a list of names into (context, next-character) training tensors.

    Args:
        words: iterable of strings; every character must be a key of the
            module-level ``stoi_lookup`` table.
        block_size: context length, i.e. how many previous characters are used
            to predict the next one. Defaults to 3, the value that used to be
            hard-coded inside this function.

    Returns:
        (X, Y): X is an int64 tensor of shape [N, block_size] holding the
        context indices and Y is an int64 tensor of shape [N] holding the index
        of the character that follows each context. One example is produced per
        character of every word, plus one for its terminating '.'.

    Raises:
        ValueError: if block_size is smaller than 1.
        KeyError: if a word contains a character missing from ``stoi_lookup``.
    """
    if block_size < 1:
        raise ValueError(f'block_size must be >= 1, got {block_size}')

    X, Y = [], []
    for word in words:
        context = [0] * block_size   # pad the start of each word with '.' (index 0)
        for char in word + '.':      # the trailing '.' marks the end of the word
            X.append(context)
            y_i = stoi_lookup[char]  # index of the character to be predicted
            Y.append(y_i)
            context = context[1:] + [y_i]  # slide the window one character forward

    # Explicit dtype and shape keep the result well-formed even when `words` is
    # empty (a bare torch.tensor([]) would be float32 with shape [0]).
    X = torch.tensor(X, dtype=torch.long).view(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print("shapes: ", X.shape, Y.shape)
    return X, Y

# Shuffle a copy so `words` itself is left untouched: shuffling in place made
# this cell non-idempotent (re-running it reshuffled an already shuffled list
# and silently changed which names land in each split).
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)

# Split boundaries: first 80% train, next 10% validation, last 10% test
train_size = int(0.8*len(shuffled_words))
val_size = int(0.9*len(shuffled_words))  # end index of the validation slice, not its length

X_train, Y_train = build_dataset(shuffled_words[:train_size])
X_val, Y_val = build_dataset(shuffled_words[train_size:val_size])
X_test, Y_test = build_dataset(shuffled_words[val_size:])

shapes:  torch.Size([182441, 3]) torch.Size([182441])
shapes:  torch.Size([22902, 3]) torch.Size([22902])
shapes:  torch.Size([22803, 3]) torch.Size([22803])


In [147]:
# training (mini-batch gradient descent on the training split only)
# NOTE(review): params are not re-initialised in this cell, so when the notebook
# is run top-to-bottom training continues from weights that the earlier cells
# already updated on the full X/Y (which contain the validation/test names).
# Re-run the parameter-initialisation cells first if the held-out losses must be clean.

l_rate = 0.01  # learning rate (constant, so it is set once outside the loop)

# per-step history of the mini-batch loss
steps = []
losses = []

for step in range(30000):
    # draw a fresh random mini-batch of 32 training-example indices
    ix = torch.randint(0, X_train.shape[0], (32, ))

    # forward pass, restricted to the examples in the current mini-batch
    embs = C[X_train[ix]]
    h = torch.tanh(embs.view(-1, block_size*dims) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y_train[ix])

    # backward pass: reset the gradients first (IMP), then backpropagate the loss
    for p in params:
        p.grad = None
    loss.backward()

    # update the params in place; no_grad keeps the update itself out of autograd
    with torch.no_grad():
        for p in params:
            p -= l_rate * p.grad

    steps.append(step)
    losses.append(loss.item())

In [148]:
# training loss
embs = C[X_train]
h = torch.tanh(embs.view(-1, block_size*dims) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y_train)
loss

tensor(2.2218, grad_fn=<NllLossBackward0>)

In [149]:
# validation loss

embs = C[X_val]
h = torch.tanh(embs.view(-1, block_size*dims) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y_val)
loss

tensor(2.2338, grad_fn=<NllLossBackward0>)