<a href="https://colab.research.google.com/github/sharon504/ml-learning/blob/main/makemore_name_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [None]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-06-04 06:32:24--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2024-06-04 06:32:24 (6.27 MB/s) - ‘names.txt’ saved [228145/228145]



In [None]:
# Read one name per line. The context manager closes the file handle as soon
# as the read finishes instead of leaving it open until garbage collection.
with open('names.txt', 'r', encoding='utf-8') as names_file:
  words = names_file.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [None]:
# Character <-> index lookup tables. '.' is placed first so it gets index 0;
# the remaining characters follow in sorted order.
stoi = {ch: idx for idx, ch in enumerate(['.', *sorted(set("".join(words)))])}
# Inverse table: index -> character.
itos = dict(zip(stoi.values(), stoi.keys()))
vocab_size = len(stoi)
vocab_size

27

In [None]:
def build_dataset(words, block_size, char_to_index=None):
  """Turn words into (context, next-character) training pairs.

  Every word is scanned left to right with a sliding window holding the
  indices of the previous `block_size` characters. The window starts filled
  with index 0 (the '.' entry of the notebook's `stoi`). Each word is
  terminated with '.', so the end-of-word transition is part of the targets;
  without it the model never sees index 0 as a label and cannot learn where
  a name stops.

  Args:
    words: iterable of strings.
    block_size: number of preceding characters in each context window.
    char_to_index: optional character -> index mapping that must contain
      '.'. Defaults to the module-level `stoi`.

  Returns:
    (x, y): `x` is an int64 tensor of shape (num_examples, block_size) with
    the context windows, `y` an int64 tensor of shape (num_examples,) with
    the index of the character following each window.
  """
  mapping = stoi if char_to_index is None else char_to_index
  contexts = []
  targets = []

  for word in words:
    context = [0] * block_size
    for char in word + '.':
      idx = mapping[char]
      contexts.append(context)
      targets.append(idx)
      # Slide the window: drop the oldest index, append the newest.
      context = context[1:] + [idx]

  # Explicit dtype and shape keep an empty input well-formed: an int64
  # (0, block_size) tensor instead of a float tensor of shape (0,).
  x = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
  y = torch.tensor(targets, dtype=torch.long)
  return x, y

In [None]:
def split_dataset(words, train, test, block_size):
  """Shuffle the words reproducibly and build train/test/validation tensors.

  Args:
    words: list of strings. It is left unmodified; a shuffled copy is split.
    train: fraction of the words assigned to the training split.
    test: fraction of the words assigned to the test split. Whatever remains
      after the training and test slices forms the validation split.
    block_size: context length forwarded to `build_dataset`.

  Returns:
    x_train, x_test, x_val, y_train, y_test, y_val — the tensors produced by
    `build_dataset` for each slice, in that order.
  """
  import random

  # Shuffle a copy with a private fixed-seed generator. Shuffling `words` in
  # place made every re-run of the calling cell reshuffle an already shuffled
  # list (a different split each time), and random.seed() reset the global
  # RNG state for the whole notebook. Random(42) yields the same permutation
  # that seed(42) + shuffle() gave on a first run.
  shuffled = list(words)
  random.Random(42).shuffle(shuffled)
  n = len(shuffled)

  n1 = int(train * n)
  n2 = int(test * n) + n1
  x_train, y_train = build_dataset(shuffled[:n1], block_size)
  x_test, y_test = build_dataset(shuffled[n1:n2], block_size)
  x_val, y_val = build_dataset(shuffled[n2:], block_size)
  return x_train, x_test, x_val, y_train, y_test, y_val

In [None]:
class Embedding:
  """Lookup table that maps integer indices to rows of a weight matrix."""

  def __init__(self, vocab_size, emb_dim):
    # One row of standard-normal values per vocabulary entry.
    self.weight = torch.randn((vocab_size, emb_dim))

  def __call__(self, x):
    """Return the rows of `weight` selected by the index tensor `x`."""
    looked_up = self.weight[x]
    # The latest result stays on the instance so it can be inspected later.
    self.out = looked_up
    return looked_up

  def parameters(self):
    """Return the tensors owned by this layer."""
    return [self.weight]

In [None]:
class FlattenConsecutive:
  """Merge every `n` consecutive positions of a 3-D tensor into one."""

  def __init__(self, n):
    self.n = n

  def __call__(self, x):
    """Reshape (a, b, c) into (a, b // n, c * n).

    When the middle dimension collapses to size 1 it is squeezed away, giving
    a 2-D (a, c * n) result.
    """
    batch, steps, width = x.shape
    grouped = x.view(batch, steps // self.n, width * self.n)
    self.out = grouped.squeeze(dim=1) if grouped.shape[1] == 1 else grouped
    return self.out

  def parameters(self):
    """This layer owns no tensors."""
    return list()

In [None]:
class Linear:
  """Affine layer computing `x @ weight`, plus `bias` when one is present."""

  def __init__(self, fan_in, fan_out, bias=True):
    # Standard-normal weights scaled down by sqrt(fan_in).
    self.weight = torch.randn((fan_in, fan_out)) / fan_in ** 0.5
    self.bias = None
    if bias:
      self.bias = torch.zeros(fan_out)

  def __call__(self, x):
    """Apply the layer to `x` and remember the result in `self.out`."""
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias
    self.out = result
    return result

  def parameters(self):
    """Return the weight, followed by the bias if the layer has one."""
    if self.bias is None:
      return [self.weight]
    return [self.weight, self.bias]

In [None]:
class BatchNorm:
  """Batch normalisation over every dimension except the last one."""

  def __init__(self, dims, momentum=1e-2, eps=1e-5, training=True):
    self.dims = dims
    self.momentum = momentum
    self.eps = eps
    self.training = training

    # Scale and shift applied after normalisation.
    self.gamma = torch.ones(dims)
    self.beta = torch.zeros(dims)

    # Running statistics, used instead of batch statistics when
    # `training` is False.
    self.bnmean_running = torch.zeros(dims)
    self.bnvar_running = torch.ones(dims)

  def _blend(self, running, batch_stat):
    """Exponential moving average step weighted by `momentum`."""
    return running * (1 - self.momentum) + batch_stat * self.momentum

  def __call__(self, x):
    """Normalise `x`, then scale by `gamma` and shift by `beta`.

    In training mode the mean/variance come from `x` itself and the running
    statistics are updated; otherwise the running statistics are used as-is.
    """
    if not self.training:
      normalised = (x - self.bnmean_running) / torch.sqrt(self.bnvar_running + self.eps)
      self.out = self.gamma * normalised + self.beta
      return self.out

    # Reduce over dim 0 for inputs of up to two dimensions, otherwise over
    # all dimensions but the last.
    reduce_over = 0 if x.ndim <= 2 else tuple(range(x.ndim - 1))
    batch_mean = x.mean(reduce_over, keepdim=True)
    batch_var = x.var(reduce_over, keepdim=True)

    normalised = (x - batch_mean) / torch.sqrt(batch_var + self.eps)
    self.out = self.gamma * normalised + self.beta

    # The running-statistics update is computed under torch.no_grad().
    with torch.no_grad():
      self.bnmean_running = self._blend(self.bnmean_running, batch_mean)
      self.bnvar_running = self._blend(self.bnvar_running, batch_var)

    return self.out

  def parameters(self):
    """Return the scale and shift tensors."""
    return [self.gamma, self.beta]

In [None]:
class Tanh:
  """Element-wise hyperbolic tangent layer without any parameters."""

  def __call__(self, x):
    """Apply tanh to `x` and remember the result in `self.out`."""
    activated = x.tanh()
    self.out = activated
    return activated

  def parameters(self):
    """This layer owns no tensors."""
    return list()

In [None]:
class Sequential:
  """Container that applies a list of layers one after another."""

  def __init__(self, layers):
    self.layers = layers

  def __call__(self, x):
    """Feed `x` through each layer in order and return the final result."""
    activation = x
    for stage in self.layers:
      activation = stage(activation)
    self.out = activation
    return activation

  def parameters(self):
    """Concatenate the parameter lists of all layers, in layer order."""
    collected = []
    for stage in self.layers:
      collected.extend(stage.parameters())
    return collected

In [None]:
def training(X, Y, model, epoch=2e5, batch_size=32):
  """Train `model` with mini-batch SGD on the cross-entropy loss.

  Args:
    X: input tensor; rows are sampled along its first dimension.
    Y: target tensor indexed with the same row indices as `X`.
    model: callable returning logits, exposing `parameters()` -> list of
      tensors.
    epoch: number of optimisation steps (converted with `int`).
    batch_size: number of rows drawn (with replacement) per step.

  Returns:
    (lossi, parameters): `lossi` holds one Python float per step with that
    step's mini-batch loss; `parameters` is the list returned by
    `model.parameters()`, updated in place.
  """
  lossi = []
  parameters = model.parameters()
  print(sum(p.nelement() for p in parameters))
  for p in parameters:
    p.requires_grad = True
  for i in range(int(epoch)):
    # Draw a random mini-batch.
    ix = torch.randint(0, X.shape[0], (batch_size,))
    x = X[ix]
    y = Y[ix]

    logits = model(x)
    loss = F.cross_entropy(logits, y)

    for p in parameters:
      p.grad = None
    loss.backward()

    # Step learning-rate schedule: 0.1 for the first 10k steps, 0.01 after.
    lr = 1e-1 if i < 1e4 else 1e-2
    for p in parameters:
      p.data -= p.grad * lr

    # Keep a plain float rather than the loss tensor: storing the tensor
    # holds on to a graph-attached object for every step and makes the
    # progress line print a tensor repr instead of a number.
    loss_value = loss.item()
    if i % 10000 == 0:
      print(f"{i}/{epoch} - {loss_value}")
    lossi.append(loss_value)
  return lossi, parameters

In [None]:
# Hyperparameters.
emb_dim = 24      # size of each character embedding vector
n_hidden = 128    # width of every hidden Linear layer
flatten_by = 2    # consecutive positions merged by each FlattenConsecutive
block_size = 8    # context length; three flatten_by=2 stages reduce 8 -> 4 -> 2 -> 1
x_train, x_test, x_val, y_train, y_test, y_val = split_dataset(words, 0.8, 0.1, block_size)

# Seed PyTorch's global RNG so that weight initialisation (and the mini-batch
# sampling that follows on a fresh Restart & Run All) is reproducible.
torch.manual_seed(42)
model = Sequential([
    Embedding(vocab_size, emb_dim),
    FlattenConsecutive(flatten_by), Linear(emb_dim * flatten_by, n_hidden, bias=False), BatchNorm(n_hidden), Tanh(),
    FlattenConsecutive(flatten_by), Linear(flatten_by * n_hidden, n_hidden, bias=False), BatchNorm(n_hidden), Tanh(),
    FlattenConsecutive(flatten_by), Linear(flatten_by * n_hidden, n_hidden, bias=False), BatchNorm(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size)
])

with torch.no_grad():
  # NOTE: damping the output layer to make it less confident
  # (`model.layers[-1].weight *= 0.1`) is currently disabled.
  # All Linear layers except the last get a 5/3 gain on their weights.
  for layer in model.layers[:-1]:
    if isinstance(layer, Linear):
      layer.weight *= 5/3


In [None]:
# Train on the training split. `loss` receives the per-step loss history and
# `params` the model's parameter list, both as returned by `training`.
loss, params = training(X=x_train, Y=y_train, model=model)

76579
0/200000.0 - 3.397691488265991
10000/200000.0 - 2.3353497982025146
20000/200000.0 - 2.4722025394439697
30000/200000.0 - 2.259833574295044
40000/200000.0 - 2.0992672443389893
50000/200000.0 - 2.4642746448516846
60000/200000.0 - 1.8219878673553467
70000/200000.0 - 2.1999118328094482
80000/200000.0 - 1.6512300968170166


In [None]:
# Show the shape of the output each layer stored on its most recent call.
for layer in model.layers:
  layer_name = type(layer).__name__
  out_shape = tuple(layer.out.shape)
  print(layer_name, ': ', out_shape)

Embedding :  (32, 8, 10)
FlattenConsecutive :  (32, 4, 20)
Linear :  (32, 4, 68)
BatchNorm :  (32, 4, 68)
Tanh :  (32, 4, 68)
FlattenConsecutive :  (32, 2, 136)
Linear :  (32, 2, 68)
BatchNorm :  (32, 2, 68)
Tanh :  (32, 2, 68)
FlattenConsecutive :  (32, 136)
Linear :  (32, 68)
BatchNorm :  (32, 68)
Tanh :  (32, 68)
Linear :  (32, 27)


In [None]:
# Smooth the loss curve by averaging over consecutive windows of steps.
window = 10000
# float() accepts both plain numbers and 0-dim tensors.
loss_values = [float(step_loss) for step_loss in loss]
# Drop a trailing partial window: a fixed-width reshape raises when the
# history length is not a multiple of the window size.
n_windows = len(loss_values) // window
smoothed = torch.tensor(loss_values[:n_windows * window]).reshape(n_windows, window).mean(1)
plt.plot(smoothed)
plt.xlabel(f"training step (x{window})")
plt.ylabel("mean cross-entropy loss")
plt.title("Training loss");

In [None]:
# Switch every BatchNorm layer to inference mode so it normalises with its
# running statistics instead of per-batch statistics.
# isinstance takes (object, class); the swapped order raised a TypeError.
for layer in model.layers:
  if isinstance(layer, BatchNorm):
    layer.training = False

In [None]:
# Compute the loss on the full training and validation splits without
# tracking gradients.
with torch.no_grad():
  logits_train = model(x_train)
  loss_train = F.cross_entropy(logits_train, y_train)

  logits_val = model(x_val)
  loss_val = F.cross_entropy(logits_val, y_val)

# Jupyter only displays the last *top-level* expression of a cell, so the
# result must sit outside the `with` block to be shown.
loss_train.item(), loss_val.item()