In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Load one name per line. Empty lines (including the empty string that a
# trailing newline leaves behind after split) are dropped so they cannot
# turn into zero-length names further down the notebook.
with open("../Names.txt", 'r') as f:
    names = [line for line in f.read().split('\n') if line]

In [None]:
# Sanity check: number of entries read from the names file.
print(len(names))

In [None]:
# Show the distinct characters that occur anywhere in the raw names.
print(np.unique(list(''.join(names))))

In [None]:
# Characters that disqualify a name: space, hyphen and full stop.
stopchars = list(" -.")

In [None]:
# Keep only names that contain none of the stop characters, lower-cased.
names = [
    name.lower()
    for name in names
    if all(char not in stopchars for char in name)
]

In [None]:
# Vocabulary: every character used in the names plus the '!' marker.
chars = np.unique(list('!' + ''.join(names)))

In [None]:
# Map each vocabulary character to its position in `chars`
# (`chars` comes from np.unique, so every character appears exactly once).
char_index = {char: position for position, char in enumerate(chars)}

In [None]:
window_length = 8
# Index of the '!' marker. It serves both as left padding for the context
# window and as the end-of-name target. It is looked up instead of being
# assumed to be 0, so the padding stays correct even if some character in
# the data sorts before '!' in the vocabulary.
pad_index = char_index['!']
Xs,ys = [],[]
for name in names:
    # Every name starts from a context made entirely of padding.
    window = [pad_index]*window_length
    for c in name+'!':
        target = char_index[c]
        Xs.append(window)
        ys.append(target)
        # Slide the context: drop the oldest index, append the one just predicted.
        # A new list is built each time, so rows already stored in Xs are untouched.
        window = window[1:]+[target]

In [None]:
# Prefer the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Convert the collected contexts and targets to int64 tensors on `device`.
Xs, ys = (
    torch.tensor(samples, dtype=torch.long, device=device)
    for samples in (Xs, ys)
)

In [None]:
# Decode the first 30 (context, target) pairs back to characters for a visual check.
for row in range(30):
    context_ix, target_ix = Xs[row].cpu(), ys[row].cpu()
    print(chars[context_ix], chars[target_ix])

In [None]:
# Display the shapes and dtypes of the input and target tensors.
Xs.shape, Xs.dtype, ys.shape, ys.dtype

In [None]:
class Linear():
    """Affine layer computing ``X @ weight`` plus an optional bias.

    ``weight`` has shape (fan_in, fan_out) and is sampled from a standard
    normal divided by sqrt(fan_in); ``bias`` starts at zero, or is ``None``
    when ``bias=False``.
    """

    def __init__(self, fan_in, fan_out, bias=True, device="cpu"):
        scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out), device=device) / scale
        if bias:
            self.bias = torch.zeros(fan_out, device=device)
        else:
            self.bias = None

    def __call__(self,X):
        """Apply the layer to X; the result is also kept on ``self.out``."""
        self.out = torch.matmul(X, self.weight)
        if self.bias is None:
            return self.out
        # In-place add on the freshly created product tensor.
        self.out += self.bias
        return self.out

    def parameters(self):
        """Return the trainable tensors: weight, then bias when present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

class BatchNorm1d():
    """Batch normalisation with a learnable scale (gamma) and shift (beta).

    Statistics are reduced over dimension 0 of the input and kept with shape
    (1, dim), matching the shapes of gamma, beta and the running buffers.

    Args:
        dim: number of features being normalised.
        eps: constant added to the variance before the square root.
        momentum: weight of the current batch in the running statistics.
        device: device on which the parameters and buffers are created.

    Set ``self.training = False`` to normalise with the running statistics
    instead of the statistics of the current batch.
    """
    def __init__(self, dim, eps=1e-5, momentum=0.001, device="cpu"):
        self.gamma = torch.ones((1,dim), device=device)
        self.beta = torch.zeros((1,dim), device=device)
        self.eps = eps
        self.momentum = momentum
        # Running buffers: updated during training, read during evaluation.
        self.running_mean = torch.zeros((1,dim), device=device)
        self.running_var = torch.ones((1,dim), device=device)
        self.training = True
    def __call__(self, X):
        """Normalise X and apply gamma/beta; returns a tensor shaped like X."""
        if self.training:
            # The batch statistics are computed with autograd enabled so the
            # backward pass accounts for their dependence on X.
            mean = X.mean(0, keepdim=True)
            var = X.var(0, keepdim=True)
            # Only the bookkeeping of the running buffers is excluded from
            # the autograd graph.
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * mean
                self.running_var = (1-self.momentum) * self.running_var + self.momentum * var
        else:
            mean = self.running_mean
            var = self.running_var
        # eps sits inside the square root: this is the standard formulation
        # and keeps the gradient of the square root finite when var is 0.
        return self.gamma * (X - mean) / torch.sqrt(var + self.eps) + self.beta
    def parameters(self):
        """Return the trainable tensors [gamma, beta]."""
        return [self.gamma, self.beta]

class Embedding():
    """Lookup table: integer indices select rows of ``weight``.

    ``weight`` has shape (num_embeddings, dim) and is drawn from a standard normal.
    """

    def __init__(self, num_embeddings, dim, device="cpu"):
        table_shape = (num_embeddings, dim)
        self.weight = torch.randn(table_shape, device=device)

    def __call__(self, ix):
        """Index ``weight`` with ix; the result is also kept on ``self.out``."""
        looked_up = self.weight[ix]
        self.out = looked_up
        return looked_up

    def parameters(self):
        """Return the trainable tensors (just the table)."""
        return [self.weight]

class Flatten():
    """Collapse every dimension after the first into a single one."""

    def __call__(self, X):
        """Reshape X to (X.shape[0], -1); the result is also kept on ``self.out``."""
        leading = X.shape[0]
        flattened = X.reshape(leading, -1)
        self.out = flattened
        return flattened

    def parameters(self):
        """This layer has no trainable tensors."""
        return list()

class Tanh():
    """Element-wise hyperbolic tangent activation."""

    def __call__(self, X):
        """Apply tanh to X; the result is also kept on ``self.out``."""
        activated = torch.tanh(X)
        self.out = activated
        return activated

    def parameters(self):
        """This layer has no trainable tensors."""
        return list()

class Sequential():
    """Container that applies its layers one after another.

    Args:
        layers: list of callables that each expose ``parameters()``.
            When omitted, the instance gets its own new empty list.
    """
    def __init__(self, layers=None):
        # A `[]` default would be one list shared by every instance created
        # without arguments; build a new list per instance instead. A list
        # passed by the caller is stored as-is (not copied).
        self.layers = [] if layers is None else layers
    def __call__(self, X):
        """Feed X through every layer in order and return the final output."""
        for layer in self.layers:
            X = layer(X)
        return X
    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        return [p for layer in self.layers for p in layer.parameters()]

In [None]:
embedding_dim = 15
hidden_layer = 100
# Width of the flattened context: one embedding per window position
# (window_length is defined in the dataset-building cell).
l = window_length * embedding_dim
model = Sequential([
    Embedding(num_embeddings=len(chars), dim=embedding_dim, device=device),
    Flatten(),
    # No bias here: the BatchNorm1d that follows supplies its own shift (beta).
    Linear(fan_in=l, fan_out=hidden_layer, bias=False, device=device),
    BatchNorm1d(dim=hidden_layer, device=device),
    Tanh(),
    Linear(fan_in=hidden_layer, fan_out=len(chars), device=device),
])
# Mark every parameter as trainable, then report the total parameter count.
for p in model.parameters():
    p.requires_grad_(True)
print(sum(p.numel() for p in model.parameters()))

In [None]:
# Candidate learning rates for the sweep: 1000 exponents spaced evenly
# from -3 to 0, and the corresponding rates 10**exponent.
lre = np.linspace(-3, 0, 1000)
lrs = np.power(10, lre)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing, then 10% of what remains for
# validation. Both splits use the same fixed seed.
split_options = dict(test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(Xs, ys, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

In [None]:
# Learning-rate sweep: one minibatch step per candidate rate in `lrs`,
# recording the loss seen at each rate.
# NOTE(review): the sweep updates `model` in place, ending at the largest
# rates; consider rebuilding the model before the main training run.
batch_size=32
exp_rates = []
losses = []
# The step count follows the grid, so resizing `lrs` cannot cause an
# IndexError or a partial sweep.
for i in range(len(lrs)):
    #minibatch
    batch = torch.randint(0,len(X_train), (batch_size,))
    x_batch=X_train[batch]
    y_batch=y_train[batch]
    # forward pass
    logits = model(x_batch)
    loss = F.cross_entropy(logits, y_batch)
    # backward pass
    for p in model.parameters():
        p.grad = None
    loss.backward()
    # update parameters
    # A plain Python float keeps the update independent of how NumPy scalars
    # and tensors dispatch arithmetic between each other.
    lr = float(lrs[i])
    # no_grad is the supported way to modify leaf tensors in place without
    # recording the update in the autograd graph.
    with torch.no_grad():
        for p in model.parameters():
            p -= lr * p.grad
    losses.append(loss.item())
    exp_rates.append(lre[i])

In [None]:
# Loss recorded at each sweep step against the learning-rate exponent (log10 of the rate).
plt.plot(exp_rates,losses)

In [None]:
# 10**-1 = 0.1 seems to be a good learning rate
batch_size=32
losses = []
epochs = 200000  # number of minibatch updates (not passes over the dataset)
lr_drop_step = epochs // 2  # step at which the rate drops from 0.1 to 0.01
# Put the layers back into training mode so this cell still uses batch
# statistics when it is re-run after the evaluation cell has set
# training=False on them.
for layer in model.layers:
    if hasattr(layer, "training"):
        layer.training = True
for i in range(epochs):
    #minibatch
    batch = torch.randint(0,len(X_train), (batch_size,))
    x_batch=X_train[batch]
    y_batch=y_train[batch]
    # forward pass
    logits = model(x_batch)
    loss = F.cross_entropy(logits, y_batch)
    # backward pass
    for p in model.parameters():
        p.grad = None
    loss.backward()
    # update parameters (step decay: larger rate for the first half of the run)
    lr = 0.1 if i < lr_drop_step else 0.01
    # no_grad is the supported way to modify leaf tensors in place without
    # recording the update in the autograd graph.
    with torch.no_grad():
        for p in model.parameters():
            p -= lr * p.grad
    # print(loss.item())
    losses.append(loss.item())

In [None]:
# Smooth the training curve by averaging the recorded losses in consecutive
# chunks of 1000 steps, then plot one point per chunk.
losses = np.array(losses)
chunk_means = losses.reshape(-1, 1000).mean(axis=1)
plt.plot(chunk_means)

In [None]:
# Switch every layer to evaluation behaviour (BatchNorm1d then normalises
# with its running statistics instead of the batch statistics).
for module in model.layers:
    module.training = False
with torch.inference_mode():
    # Score the whole training split and the whole validation split in one pass each.
    train_loss = F.cross_entropy(model(X_train), y_train)
    x = X_val
    logits = model(x)
    val_loss = F.cross_entropy(logits, y_val)
print(f'Train loss: {train_loss.item()}, Val loss: {val_loss.item()}')

In [None]:
import os
import pickle

# Make sure the output directory exists; otherwise the writes below fail
# on a fresh checkout.
os.makedirs('params', exist_ok=True)
# NOTE: torch.save on the model object pickles the instances of the layer
# classes defined in this notebook, so the same class definitions must be
# importable wherever the file is loaded. Only load such files from trusted
# sources, since unpickling can execute arbitrary code.
torch.save(model, 'params/model.pt')
# Vocabulary array and character-to-index mapping needed to encode/decode names.
with open('params/chars.pkl', 'wb') as f:
    pickle.dump(chars, f)
with open('params/char_index.pkl', 'wb') as f:
    pickle.dump(char_index, f)