# Prepare

In [75]:
!wget "https://github.com/romenlaw/NaiveNeuralNetwork/blob/main/names.txt?raw=True" -O names.txt

--2024-08-28 04:58:00--  https://github.com/romenlaw/NaiveNeuralNetwork/blob/main/names.txt?raw=True
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/romenlaw/NaiveNeuralNetwork/raw/main/names.txt [following]
--2024-08-28 04:58:00--  https://github.com/romenlaw/NaiveNeuralNetwork/raw/main/names.txt
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/romenlaw/NaiveNeuralNetwork/main/names.txt [following]
--2024-08-28 04:58:01--  https://raw.githubusercontent.com/romenlaw/NaiveNeuralNetwork/main/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting

In [76]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [77]:
words = open('names.txt', 'r').read().splitlines()
len(words), max([len(w) for w in words]), words[:8]

(32033,
 15,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [78]:
vocab = sorted(list(set(''.join(words))))
vocab.insert(0, '.')
vocab_size = len(vocab)
itos={i:s for i, s in zip(range(vocab_size), vocab)}
stoi={s:i for i, s in zip(range(vocab_size), vocab)}

In [79]:
# build datasets: 80% training, 10% validation, 10% testing
import random
random.seed(42)
random.shuffle(words) # shuffle is in-place


block_size=3
X, Y = [], []
context = []
for w in words:
  context = [0] * block_size # context contains indices
  for c in (w + '.'):
    ix = stoi[c]
    X.append(context)
    Y.append(ix)
    if c=='.':
      break
    context = context[1:] + [ix]

n1 = int(len(X)*.8)
n2 = int(len(X)*.9)
X_train = torch.tensor(X[:n1])
Y_train = torch.tensor(Y[:n1])
X_val = torch.tensor(X[n1:n2])
Y_val = torch.tensor(Y[n1:n2])
X_test = torch.tensor(X[n2:])
Y_test = torch.tensor(Y[n2:])

X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape

(torch.Size([182516, 3]),
 torch.Size([182516]),
 torch.Size([22815, 3]),
 torch.Size([22815]),
 torch.Size([22815, 3]),
 torch.Size([22815]))

In [80]:
X_val[:5], Y_val[:5]

(tensor([[ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [13,  1, 12],
         [ 1, 12,  9]]),
 tensor([13,  1, 12,  9,  5]))

# MLP

## classes

In [100]:
class Linear:
  def __init__(self, fan_in, fan_out, bias=True):
    self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
    self.bias = torch.zeros(fan_out) if bias else None

  def __call__(self, x):
    self.out = x @ self.weight
    if self.bias is not None:
      self.out += self.bias
    return self.out

  def parameters(self):
    return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps=eps
    self.momentum = momentum
    self.training = True

    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    if self.training:
      xmean = x.mean(dim=0, keepdim=True)
      xvar = x.var(dim=0, keepdim=True)
      with torch.no_grad():
        self.running_mean = (1-self.momentum)*self.running_mean + self.momentum*xmean
        self.running_var = (1-self.momentum)*self.running_var + self.momentum*xvar
    else:
      xmean = self.running_mean
      xvar = self.running_var

    x_hat = (x - xmean) / (xvar + self.eps)**0.5
    self.out = self.gamma * x_hat + self.beta

    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

class Tanh:
  def __call__(self, x):
    self.out = torch.tanh(x)
    return self.out

  def parameters(self):
    return []

## initialisation

In [101]:
torch.manual_seed(42)

emb_dim = 10
hidden_dim = 200

C = torch.randn((vocab_size, emb_dim))
layers = [
    Linear(emb_dim*block_size, hidden_dim, bias=False), BatchNorm1d(hidden_dim), Tanh(),
    Linear(hidden_dim, vocab_size) # output layer
]
with torch.no_grad():
  layers[-1].weight *= 0.1 # make output layer less confident(ly wrong)

parameters = [C] + [p for layer in layers for p in layer.parameters()]
for p in parameters:
  p.requires_grad=True
print(sum(p.nelement() for p in parameters))

12097


In [83]:
emb_dim*block_size * hidden_dim + hidden_dim*2 + hidden_dim*vocab_size +vocab_size+ vocab_size * emb_dim

12097

## training

In [102]:
max_steps=200000
batch_size=32
lossi=[]

for step in range(max_steps):
  # create mini_batch
  ix = torch.randint(0, X_train.shape[0], size=(batch_size,))
  Xb, Yb = X_train[ix], Y_train[ix]

  # forward pass
  emb = C[Xb]  # (batch_size, vocab_size, emb_dim)
  x = emb.view(emb.shape[0], -1)  # (batch_size, vocab_size*emb_dim)
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, Yb)

  # backward pass
  for p in parameters:
    p.grad=None
  loss.backward()

  # update
  lr = 0.1 if step<100000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  if step % 10000 ==0:
    print('%7d/%7d: %2.10f' % (step, max_steps, loss))
  lossi.append(loss)
print('%7d/%7d: %2.10f' % (step, max_steps, loss))

      0/ 200000: 3.3003702164
  10000/ 200000: 2.1085405350
  20000/ 200000: 2.3290817738
  30000/ 200000: 1.6030472517
  40000/ 200000: 2.4885463715
  50000/ 200000: 2.0115578175
  60000/ 200000: 2.2257134914
  70000/ 200000: 2.1586527824
  80000/ 200000: 2.0268161297
  90000/ 200000: 2.1670222282
 100000/ 200000: 2.2194030285
 110000/ 200000: 2.1041414738
 120000/ 200000: 1.8756295443
 130000/ 200000: 2.0888373852
 140000/ 200000: 2.3188447952
 150000/ 200000: 2.2042424679
 160000/ 200000: 2.3583600521
 170000/ 200000: 2.3546292782
 180000/ 200000: 1.9698082209
 190000/ 200000: 2.0460581779
 199999/ 200000: 1.9368194342


## validate

In [103]:
for layer in layers:
  layer.training = False

@torch.no_grad()
def split_loss(s):
  """s is one of 'train', 'val', 'test'
  """
  xs, ys = { 'train': (X_train, Y_train),
          'val': (X_val, Y_val,),
           'test': (X_test, Y_test)}[s]
  # forward pass
  emb = C[xs]
  x = emb.view(emb.shape[0], -1)
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, ys)
  print(s, loss.item())

split_loss('train')
split_loss('val')

train 2.0648605823516846
val 2.1072237491607666


In [99]:
layers[1].running_mean.shape

torch.Size([32, 200])

## Sample

In [115]:
with torch.no_grad():
  for _ in range(20):
    out=[]
    context=[0]*block_size

    while True:
      emb = C[torch.tensor([context])] # (1, block_size, emb_dim)
      x = emb.view(emb.shape[0], -1)
      for layer in layers:
        x = layer(x)
      probs = F.softmax(x, dim=1)
      ix = torch.multinomial(probs, num_samples=1)
      context = context[1:]+[ix]
      out.append(ix.item())
      if ix==0:
        break
    print(''.join(itos[i] for i in out))

alyz.
aris.
tri.
gere.
sayah.
ayvorie.
rosson.
emon.
catine.
aib.
alitithira.
liza.
jemin.
ana.
alynna.
jamaur.
ben.
quan.
tori.
makyia.
