In [2]:
import torch
import torch.nn.functional as F

# Goal: create a NN that takes trigrams (or longer contexts) as input
1. Create X with the input letters and Y with the output (next) letter. This requires creating the `itos` and `stoi` lookup tables.
2. Create C, a lookup (embedding) table of dimension 2.
3. Create a NN that takes those embeddings and gives us the probabilities of the next letter.
4. Calculate the loss and reduce it.

In [3]:
# Load the corpus: one name per line of names.txt.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(words[:5])

['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [4]:
# Build the character vocabulary.
# itos: index -> character, with the letters numbered 1..N in sorted order and
#       index 0 reserved for the '.' token that marks word boundaries.
# stoi: the inverse mapping, character -> index.
chars = sorted({ch for word in words for ch in word})
itos = dict(enumerate(chars, start=1))
itos[0] = '.'
stoi = {ch: idx for idx, ch in itos.items()}



In [30]:
import numpy as np
# build the dataset
# block_size is read as a global by build_dataset: every example's input is the
# block_size previous character indices, and each word starts from a context of
# block_size zeros (index 0 is the '.' token).
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Turn a list of words into (context, next-character) training tensors.

    Each word is extended with the '.' end token. For every character of the
    extended word one example is emitted: the input is the `block_size`
    previous character indices (starting from all zeros at the beginning of
    the word) and the target is the index of the current character.

    Reads the module-level `stoi` and `block_size`.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): X holds one context per row, Y the matching target index.
        Their shapes are printed as a side effect.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size
        for token in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(token)
            # slide the window: drop the oldest index, append the newest
            window = [*window[1:], token]

    X, Y = torch.tensor(contexts), torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

import random
# Seeded shuffle followed by an 80% / 10% / 10% split (by word) into
# train / dev / test.
# NOTE: the shuffle is in place, so re-running this cell shuffles the already
# shuffled list and produces a different split than a fresh kernel run.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

# Splits are built in train -> dev -> test order; each call prints its shapes.
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    build_dataset(split)
    for split in (words[:n1], words[n1:n2], words[n2:])
)

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [34]:
# Model sizes.
emb_size = 10      # width of each character embedding (columns of C)
hidden_size = 300  # number of units in the tanh hidden layer

# All random initial values come from this one seeded generator, in the order
# C, W1, W2, B2.
g = torch.Generator().manual_seed(2147483647)

fan_in = block_size * emb_size        # width of the concatenated context embedding
w1_scale = (5/3)/(fan_in**0.5)        # 5/3 gain (hidden layer is tanh) over sqrt(fan_in)

C = torch.randn((27, emb_size), generator=g)
W1 = torch.randn((fan_in, hidden_size), generator=g) * w1_scale
# No first-layer bias (B1 stays disabled): the batch-norm mean subtraction would
# cancel any constant added before it, and bnbias provides the shift instead.
#B1 = torch.randn((hidden_size), generator=g) * 0.01
W2 = torch.randn((hidden_size, 27), generator=g) * 0.01
B2 = torch.randn(27, generator=g) * 0  # all zeros, but still consumes a draw from g

# Batch-norm scale and shift (trained by gradient descent).
bngain = torch.ones(1, hidden_size)
bnbias = torch.zeros(1, hidden_size)

# Batch-norm running statistics (not in `parameters`; updated by hand during training).
bnmean_running = torch.zeros(1, hidden_size)
bnstd_running = torch.ones(1, hidden_size)

parameters = [C, W1, W2, B2, bngain, bnbias]
print(sum(param.numel() for param in parameters))
for param in parameters:
    param.requires_grad_(True)

17997


In [35]:
# Minibatch SGD training of the MLP with a hand-written batch-norm layer.
batch_size = 32
n_epochs = 200000  # NOTE: this is the number of minibatch steps (loop iterations below)

for i in range(n_epochs):
    # forward pass
    #minibatch construct
    # Row indices are sampled with torch.randint and no generator argument,
    # i.e. from torch's default RNG rather than from `g`.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # Look up one embedding row of C per context character, then flatten the
    # per-character embeddings of each example into a single row.
    emb = C[Xtr[ix]]
    embcat = emb.view(-1, block_size * emb_size)
    #-------------batch norm----------------
    hprev = embcat @ W1 #+ B1
    # Statistics are taken over the batch dimension (dim 0) of this minibatch.
    bnmeani = hprev.mean(0, keepdim=True)
    bnstdi = hprev.std(0, keepdim=True)
    # NOTE(review): no epsilon is added to the denominator, so a hidden unit
    # with zero spread across the batch would divide by zero.
    hprev = (hprev - bnmeani)/bnstdi
    # Learnable per-unit scale and shift applied after normalisation.
    hprev = hprev * bngain + bnbias

    # Exponential moving average of the batch statistics (weight 0.001 on the
    # newest batch). Done under no_grad so it is not part of the autograd graph;
    # the evaluation cells normalise with these running values.
    with torch.no_grad():
        bnmean_running = bnmean_running * 0.999 + bnmeani * 0.001
        bnstd_running = bnstd_running * 0.999 + bnstdi * 0.001

    h = torch.tanh(hprev)
    #-------------------------------
    logits = h @ W2 + B2
    loss = F.cross_entropy(logits, Ytr[ix])


    # Clear gradients from the previous step before backpropagating.
    for p in parameters:
        p.grad = None

    loss.backward()
    # Step decay: 0.1 for the first 80% of the steps, 0.01 afterwards.
    lr = 0.1 if i < (n_epochs * 0.8) else 0.01
    # Plain SGD update, written through .data so the update itself is not
    # recorded by autograd.
    for p in parameters:
        p.data -= lr * p.grad 
# Loss of the final minibatch only (a single-batch value, not a full-split loss).
print(loss.item())

2.061699390411377


In [37]:
# Dev-set evaluation. Target to beat: a dev loss of 2.17.
# The whole split goes through the network in one pass, normalised with the
# running batch-norm statistics instead of per-batch statistics.
# torch.no_grad() avoids building an autograd graph over the full split;
# nothing here is backpropagated.
with torch.no_grad():
    emb = C[Xdev]
    embcat = emb.view(-1, block_size * emb_size)
    hprev = embcat @ W1 #+ B1
    hprev = (hprev - bnmean_running) / bnstd_running
    hprev = hprev * bngain + bnbias
    h = torch.tanh(hprev)
    logits = h @ W2 + B2
    loss = F.cross_entropy(logits, Ydev)
print(loss.item())

2.097026824951172


In [38]:
# Test-set evaluation: the same forward pass as training, but normalised with
# the running batch-norm statistics instead of per-batch statistics.
# torch.no_grad() avoids building an autograd graph over the full split;
# nothing here is backpropagated.
with torch.no_grad():
    emb = C[Xte]
    embcat = emb.view(-1, block_size * emb_size)
    hprev = embcat @ W1 #+ B1
    hprev = (hprev - bnmean_running) / bnstd_running
    hprev = hprev * bngain + bnbias
    h = torch.tanh(hprev)
    logits = h @ W2 + B2
    loss = F.cross_entropy(logits, Yte)
print(loss.item())

2.100803852081299


## Below is the code to torchify the whole Linear layer along with batch norm

In [None]:
class Linear:
    """Fully connected layer computing ``x @ weight`` plus an optional bias.

    Args:
        fan_in: number of input features (rows of ``weight``).
        fan_out: number of output features (columns of ``weight``).
        bias: when True (default) a zero-initialised bias of length
            ``fan_out`` is added to the output; when False there is no bias.

    The weight is drawn from the module-level generator ``g`` as a standard
    normal scaled by ``1 / sqrt(fan_in)``.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=g) / (fan_in**0.5)
        # torch.zeros is deterministic and accepts no `generator` argument;
        # passing one raises TypeError.
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        """Apply the layer to ``x``; the result is also kept on ``self.out``."""
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        """Return the layer's tensors as a list: ``[weight]`` or ``[weight, bias]``."""
        return [self.weight] + ([] if self.bias is None else [self.bias])
    

class BatchNorm1d:
    def __init__(self, dim, esp=1e-5, momentum=0.1):
        self.esp = esp
        self.momentum = momentum
        self.training = True
        # running mean to keep track of things
        self.mean_running = torch.zeros(1,dim)
        self.std_running = torch.ones(1, dim)
        # paramaters for normalization
        self.bngain = torch.ones(1, dim)
        self.bnbias = torch.zeros(1, dim)

        def __call__(self, x):
            
            
