In [5]:
!pip install torch
!pip install matplotlib
!pip install numpy
!pip install kagglehub

Collecting matplotlib
  Using cached matplotlib-3.10.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.58.4-cp313-cp313-macosx_10_13_universal2.whl.metadata (106 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.2 kB)
Collecting numpy>=1.23 (from matplotlib)
  Using cached numpy-2.3.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (8.9 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotlib-3.10.3-cp313-cp3

In [271]:
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import kagglehub
import torch
import os
from torch.distributions import Categorical
%matplotlib inline

In [278]:
def download_dataset() -> str:
    path = kagglehub.dataset_download("rishitjakharia/names-txt")
    return path # for easier access later

In [279]:
dataset_path = download_dataset() + "/names.txt"

In [281]:
words = open(dataset_path, 'r').read().splitlines()

In [282]:
def make_int_char_maps() -> tuple[dict, dict]:
  chars = sorted(list(set(''.join(words))))
  ctoi = {c: i + 1 for i, c in enumerate(chars)}
  ctoi['.'] = 0
  itoc = {i: c for c, i in ctoi.items()}
  return ctoi, itoc

In [283]:
ctoi, itoc = make_int_char_maps()

In [284]:
print(itoc)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [285]:
print(ctoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [286]:
vocab_size = len(ctoi)
print(vocab_size)

27


In [287]:
block_size = 7
def build_dataset(inp_words) -> tuple[torch.tensor, torch.tensor]:
  X, Y = [], []
  for w in words:
      # pad with '.' for start, append '.' for end
      padded = '.' * block_size + w + '.'
      for i in range(len(w) + 1):  # +1 to include the end token
          context = padded[i:i+block_size]
          target = padded[i+block_size]
          X.append([ctoi[c] for c in context])
          Y.append(ctoi[target])
  X = torch.tensor(X, dtype=torch.long)
  Y = torch.tensor(Y, dtype=torch.long)
  return X, Y

In [288]:
import random
random.seed(42)

random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [289]:
Xtr.shape, Ytr.shape

(torch.Size([228146, 7]), torch.Size([228146]))

In [207]:
for x,y in zip(Xtr[:30], Ytr[:30]):
  print(''.join(itoc[ix.item()] for ix in x), '-->', itoc[y.item()])

....... --> n
......n --> a
.....na --> s
....nas --> i
...nasi --> a
..nasia --> h
.nasiah --> .
....... --> g
......g --> i
.....gi --> l
....gil --> a
...gila --> n
..gilan --> a
.gilana --> .
....... --> e
......e --> v
.....ev --> e
....eve --> r
...ever --> l
..everl --> e
.everle --> a
everlea --> .
....... --> k
......k --> h
.....kh --> y
....khy --> r
...khyr --> i
..khyri --> e
.khyrie --> .
....... --> s


In [290]:
import torch.nn as nn

In [399]:
n_emb = 35 # dimensionality of the character vectors
n_hidden = 200 # num hidden layer neurons

In [400]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)  
        self.rnn = nn.LSTM(input_size=vocab_size, hidden_size=hidden_size, num_layers=num_layers)
        self.decoder = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, input_seq, hidden_state=None):
        # input_seq: (seq_len, batch)
        embedding = self.embedding(input_seq)  
        output, hidden_state = self.rnn(embedding, hidden_state)
        out = self.decoder(output)  
        return out, (hidden_state[0].detach(), hidden_state[1].detach())

In [401]:
model = CharRNN(vocab_size, n_emb, n_hidden)

# make the last layer less confident
with torch.no_grad():
    model.decoder.weight.mul(0.1)

params = model.parameters()
print(f"Num Params: {sum(p.nelement() for p in params)}")
for param in params:
    param.requires_grad = True


optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50_000, gamma=0.5) 


Num Params: 2016581


In [402]:
print(model)

CharRNN(
  (embedding): Embedding(27, 27)
  (rnn): LSTM(27, 35, num_layers=200)
  (decoder): Linear(in_features=35, out_features=27, bias=True)
)


In [403]:
def sample_name(model, block_size, ctoi, itoc, device):
    model.eval()
    context = [ctoi['.']] * block_size
    name = ''
    hidden = None
    while True:
        x = torch.tensor([context], dtype=torch.long).to(device)
        logits, hidden = model(x, hidden)            # logits: (1, seq_len, vocab_size)
        logits = logits[:, -1, :]                    # take the output at the last timestep: (1, vocab_size)
        probs = F.softmax(logits, dim=-1)            # (1, vocab_size)
        dist = Categorical(probs)
        idx = dist.sample().item()
        char = itoc[idx]
        if char == '.':
            break
        name += char
        context = context[1:] + [idx]
    model.train()
    return name.capitalize()

In [404]:
def train(
    X, Y, model, epochs=1_000, batch_size=64, lr=0.01, device='mps'
):
    model = model.to(device)
    X, Y = X.to(device), Y.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(1, epochs+1):
        idxs = torch.randint(0, X.size(0), (batch_size,))
        xb, yb = X[idxs], Y[idxs]
        logits, _ = model(xb)
        logits = logits[:, -1, :]   # (batch, vocab_size), at last timestep
        loss = loss_fn(logits, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 100 == 0:
            print(f"Epoch {epoch} | Loss: {loss:.4f}")
            # Sample after each epoch
            print("-- Sampled names: --")
            for _ in range(5):
                print(sample_name(model, block_size, ctoi, itoc, device))
            print('-' * 30)

In [405]:
train(X=Xtr, Y=Ytr, model=model)

Epoch 100 | Loss: 2.8217
-- Sampled names: --
I
Llndraeceeta

Baislni
Ca
------------------------------
Epoch 200 | Loss: 2.7302
-- Sampled names: --
Naae
Ot

Rnkts
Serznohaboheai
------------------------------
Epoch 300 | Loss: 2.8032
-- Sampled names: --
T
Reellainiiirm
Iky
Tlgsyayiiaziaaal
Lrenyaiiszha
------------------------------


KeyboardInterrupt: 

In [237]:
model.eval()


RNN_Model(
  (embedding): Embedding(27, 32)
  (gru): GRU(32, 150, batch_first=True)
  (linear): Linear(in_features=150, out_features=27, bias=True)
)

In [240]:
@torch.no_grad()
def split_loss(split):
    x,y = {
        'train' : (Xtr, Ytr),
        'val' : (Xdev, Ydev),
        'test': (Xte, Yte),
    }[split]
    
    x = model(x)
    probs = x.view(x.shape[0], -1)
    loss = F.cross_entropy(probs, y)
    print(split, loss.item())