In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
words = open('names.txt', 'r').read().splitlines()
print(len(words))
words[:8]

32033


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [19]:
chars = set()
for word in words:
    for char in word:
        chars.add(char)

chars.add('.')
chars =sorted(chars)

stoi = {s:i for i,s in enumerate(chars)}
stoi['.'] = 0

stoi

{'.': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [20]:
itos = {i:s for s, i in stoi.items()}

In [23]:
vocab_size = len(itos)
print(vocab_size)

27


In [24]:
import random
random.seed(42)
random.shuffle(words)

In [37]:
block_size = 4
def build_dataset(words):
    X, Y =[],[]

    for w in words[:100]:
        context = [0]*block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

n1 = int(0.8*len(words))
n2 = int(0.8*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
    

torch.Size([744, 4]) torch.Size([744])
torch.Size([0]) torch.Size([0])
torch.Size([678, 4]) torch.Size([678])


In [38]:
for x,y in zip(Xtr[:20], Ytr[:20]):
    print(''.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])

.... --> y
...y --> u
..yu --> h
.yuh --> e
yuhe --> n
uhen --> g
heng --> .
.... --> d
...d --> i
..di --> o
.dio --> n
dion --> d
iond --> r
ondr --> e
ndre --> .
.... --> x
...x --> a
..xa --> v
.xav --> i
xavi --> e


In [69]:
class Linear:
    def __init__(self, fan_out, fan_in, bias = True):
        self.weight = torch.randn((fan_in,fan_out))/(fan_in**0.5)
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__ (self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias] )


In [68]:
class BatchNorm1d:
    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        #parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running momentum update)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        #calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0,1)
            xmean = x.mean(dim, keepdim = True) # batch mean
            xvar = x.var(dim, keepdim = True) #batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x-xmean)/torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma*xhat + self.beta
        #update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum)*self.running_mean + self.momentum*xmean
                self.running_var = (1-self.momentum)(self.running_var) + self.momentum*xvar
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]
    

In [70]:
class Tanh:
    def __call__ (self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

class Embedding:
    def __init__(self, num_embeddings, embeddings_dim):
        self.weight = torch.randn((num_embeddings, embeddings_dim))

    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    
    def parameters(self):
        return [self.weight]

class FlattenConsecutive:
    def __init__(self, n):
        self.n = n
    
    def __call__(self, x):
        B,T,C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out
    
    def parameters(self):
        return []
    
class Sequential:
    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    
    def parameters(self):
        # get parameters of all layers and stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]
    


In [72]:
torch.manual_seed(42);

In [None]:
#original network
n_embd = 10
n_hidden = 300
model = Sequential([
    Embedding(vocab_size, n_embd),
    FlattenConsecutive(8), Linear(n_embd*8, n_hidden, bias =False), BatchNorm1d(n_hidden),Tanh(),
    Linear(n_hidden, vocab_size)
])

#heirarchical network
n_embd = 24
n_hidden = 128
model = Sequential([
    Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), Linear(n_embd*2, n_hidden, bias = False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_embd*2, n_hidden, bias = False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_embd*2, n_hidden, bias = False), BatchNorm1d(n_hidden), Tanh(),
])

#parameter init
with torch.no_grad():
    model.layers[-1].weight *= 0.1 # last layer makes less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
    p.requires_grad = True

