In [1]:
import random
import torch
import torch.nn.functional as F 
import matplotlib.pyplot as plt
%matplotlib inline

TimeoutError: [Errno 60] Operation timed out

In [None]:
# Read the word list, one entry per line. The context manager closes the file
# handle as soon as the contents are read instead of leaving it open.
with open("words_alpha.txt", "r") as f:
    words = f.read().splitlines()

# preview a few entries (indices 1 through 9)
words[1:10]

In [None]:
# number of entries in `words`
len(words)

In [None]:
# lookup tables

# every distinct character that occurs in the word list, in sorted order
characters = sorted(set("".join(words)))
# characters get ids 1..N; id 0 is reserved for the "." boundary token
stoi = {s : i + 1 for i, s in enumerate(characters)}
stoi["."] = 0
itos = {i : s for s, i in stoi.items()}

# Derived from the id table instead of hard-coded: one more than the largest
# id, so every id is a valid row index (27 for a lowercase a-z word list).
vocab_size = max(itos) + 1

In [None]:
# dataset construction settings
block_size = 3  # context length: how many preceding chars predict the next one
random.seed(2)  # fix Python's RNG so the word shuffle is repeatable

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and every character of the terminated
    word yields one example: the ids of the `block_size` preceding characters
    (padded on the left with id 0) paired with the id of that character.

    Returns:
        (X, y): tensors built from the collected contexts and target ids.
    """
    contexts, targets = [], []

    for word in words:
        window = [0] * block_size  # every word starts from an all-zero context
        for char_id in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(char_id)
            window = window[1:] + [char_id]  # slide: drop oldest id, add newest

    return torch.tensor(contexts), torch.tensor(targets)

# Shuffle a copy rather than `words` itself. Shuffling in place made this cell
# non-idempotent: re-running it reshuffled the already-shuffled list, silently
# changing which words land in each split. `words` now keeps its file order.
shuffled_words = list(words)
random.shuffle(shuffled_words)

# 80% train / 10% dev / 10% test, split at the word level
n1 = int(0.8 * len(shuffled_words))
n2 = int(0.9 * len(shuffled_words))

X_train, y_train = build_dataset(shuffled_words[:n1])
X_dev, y_dev = build_dataset(shuffled_words[n1:n2])
X_test, y_test = build_dataset(shuffled_words[n2:])


In [None]:
# Model size hyperparameters
n_dims, n_hidden = 10, 100  # embedding width per character, units per hidden layer

In [None]:
# PyTorch-like API for neural net layers

class Linear:
    """Affine layer computing ``x @ weight`` plus an optional bias.

    The weight is drawn from a standard normal and divided by sqrt(num_in);
    the bias, when enabled, starts at zero. The most recent output is kept on
    ``self.out``.
    """

    def __init__(self, num_in, num_out, bias = True):
        scale = num_in ** 0.5
        self.weight = torch.randn((num_in, num_out)) / scale
        self.bias = None
        if bias:
            self.bias = torch.zeros(num_out)

    def __call__(self, x):
        result = x @ self.weight
        if self.bias is not None:
            result += self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the layer's tensors: the weight, then the bias if present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
    
class BatchNorm1D:
    """Batch normalisation over dimension 0 of the input.

    While ``training`` is True each call normalises with the statistics of the
    current batch and folds them into the running estimates ``rmean`` and
    ``rvar``. While it is False the running estimates are used instead and are
    left untouched.

    Args:
        dim: number of features (size of the scale/shift vectors).
        eps: added to the variance before the square root to avoid dividing
            by zero.
        momentum: weight given to the current batch when updating the
            running statistics.
    """

    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # scale and shift, exposed through parameters()
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # running statistics, updated outside of autograd
        self.rmean = torch.zeros(dim)
        self.rvar = torch.ones(dim)

    def __call__(self, x):
        if self.training:
            xmean = x.mean(0, keepdim = True)
            xvar = x.var(0, keepdim = True)
        else:
            xmean = self.rmean
            # was `self.var`, an attribute that does not exist, so every
            # call in eval mode raised AttributeError
            xvar = self.rvar
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        if self.training:
            # exponential moving average; no_grad keeps it out of the graph
            with torch.no_grad():
                self.rmean = (1 - self.momentum) * self.rmean + self.momentum * xmean
                self.rvar = (1 - self.momentum) * self.rvar + self.momentum * xvar
        return self.out

    def parameters(self):
        """Return the scale and shift tensors."""
        return [self.gamma, self.beta]
    
class Tanh:
    """Elementwise tanh activation that keeps its latest output on ``self.out``."""

    def __call__(self, x):
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """This layer has no tensors to return."""
        return []
    
# class Embeddings:
    

In [None]:
# Seed torch's global RNG. `random.seed` only covers Python's `random` module,
# so without this the parameter initialisation (and any later torch sampling)
# differs on every run.
torch.manual_seed(2)

# embedding table: one n_dims-wide row per vocabulary entry
C = torch.randn((vocab_size, n_dims))
layers = [
    Linear(n_dims * block_size, n_hidden), BatchNorm1D(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), BatchNorm1D(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), BatchNorm1D(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), BatchNorm1D(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden), BatchNorm1D(n_hidden), Tanh(),
    Linear(           n_hidden, vocab_size), BatchNorm1D(vocab_size),
]

with torch.no_grad():
    # make last layer less confident (the last layer is a BatchNorm1D, so
    # shrinking its gamma shrinks the initial logits)
    layers[-1].gamma *= 0.1
    # apply gain on all remaining layers
    # (5/3 matches the gain torch.nn.init.calculate_gain reports for tanh)
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= 5/3

# the embedding table followed by every layer's tensors
parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

In [None]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the loss, recorded at every step
log_every = max(1, max_steps // 10)  # integer logging interval, never zero

for i in range(max_steps):
    # sample a random minibatch of training examples (indices may repeat)
    ix = torch.randint(0, X_train.shape[0], (batch_size, ))
    Xb, yb = X_train[ix], y_train[ix]

    # forward
    emb = C[Xb]  # look up the embedding row of every context character
    x = emb.view(emb.shape[0], -1)  # flatten each example's embeddings into one row
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, yb)

    # backward
    for layer in layers:
        # keep gradients on the intermediate outputs; the update below does not
        # need them -- presumably kept for later inspection
        layer.out.retain_grad()
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain SGD, learning rate dropped 10x for the second half
    lr = 0.1 if i < max_steps / 2 else 0.01
    with torch.no_grad():  # in-place step without going through `.data`
        for p in parameters:
            p += -lr * p.grad

    # tracking stats
    if i % log_every == 0 or i == max_steps - 1:
        # log the scalar value rather than the tensor repr
        print(f'Run {i}/{max_steps - 1}, Loss: {loss.item():.4f}')
    lossi.append(loss.log10().item())