# Seq LM playground

Implement RNN-, LSTM-, and GRU-based character-level language models (LMs) for makemore.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import heapq
import random
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
%matplotlib inline
# Seed the Python, NumPy and PyTorch random number generators with the same
# fixed value so shuffling and weight initialisation are repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7dd49aeac730>

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## Step 1: Preprocess Data

In [3]:
# Load one name per line from the names file.
file_path = "../names.txt"

# Iterate the file lazily (no intermediate readlines() list), pin the encoding
# so the result does not depend on the platform locale, and drop blank lines so
# a stray empty line never becomes an empty "name".
with open(file_path, "r", encoding="utf-8") as f:
    names = [stripped for stripped in (line.strip() for line in f) if stripped]

print(names[:5])

['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [4]:
# Number of names loaded from the file.
len(names)

32033

In [5]:
# Character vocabulary: every distinct character found in any name, sorted.
vocab = sorted({ch for name in names for ch in name})
vocab

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [6]:
# Token <-> index lookup tables.
# Index 0 is the "." start/end marker, the vocabulary characters follow from 1,
# and "<pad>" takes the next free index in each table.
stoi = {".": 0}
stoi.update((ch, idx) for idx, ch in enumerate(vocab, start=1))
stoi["<pad>"] = len(stoi)

itos = {0: "."}
itos.update((idx, ch) for idx, ch in enumerate(vocab, start=1))
itos[len(itos)] = "<pad>"

print(stoi)
print(itos)

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '<pad>': 27}
{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 27: '<pad>'}


In [7]:
import random

# Shuffle a copy of the names with a fixed seed, then cut it 80/10/10 into
# train/val/test; the test split takes whatever remains after the first two.
random.seed(42)
names_shuffled = list(names)
random.shuffle(names_shuffled)

n = len(names_shuffled)
n_train, n_val = int(0.8 * n), int(0.1 * n)

train_names = names_shuffled[:n_train]
val_names = names_shuffled[n_train:][:n_val]
test_names = names_shuffled[n_train + n_val:]

print(f"Train: {len(train_names)}, Val: {len(val_names)}, Test: {len(test_names)}")

Train: 25626, Val: 3203, Test: 3204


In [8]:
def create_data(names, padding=False, max_seq_len=None):
    """Turn names into next-character (input, target) index sequences.

    Each name is wrapped in "." markers and split into aligned pairs, e.g.
    "soham" -> ".soham." gives inputs [., s, o, h, a, m] and targets
    [s, o, h, a, m, .]. Characters are mapped to indices through the
    module-level ``stoi`` table.

    Args:
        names: iterable of name strings.
        padding: when True, right-pad every sequence with ``stoi["<pad>"]`` up
            to ``max_seq_len``.
        max_seq_len: target length; required when ``padding`` is True.

    Returns:
        ``(x, y)``: two lists of lists of ints, one inner list per name.

    Raises:
        ValueError: if ``padding`` is True and ``max_seq_len`` is None, or if a
            sequence is longer than ``max_seq_len`` (which would otherwise
            yield ragged rows that cannot be stacked into a tensor).
        KeyError: if a name contains a character missing from ``stoi``.
    """
    if padding and max_seq_len is None:
        raise ValueError("max_seq_len must be provided when padding=True")

    # Look the pad index up once instead of once per name.
    pad_id = stoi["<pad>"] if padding else None

    x, y = [], []

    for name in names:
        wrapped = "." + name + "."
        ids = [stoi[ch] for ch in wrapped]
        # Inputs are every token but the last; targets are shifted by one.
        tx, ty = ids[:-1], ids[1:]

        if padding:
            if len(tx) > max_seq_len:
                raise ValueError(
                    f"sequence for {name!r} has length {len(tx)}, "
                    f"which exceeds max_seq_len={max_seq_len}"
                )
            n_pad = max_seq_len - len(tx)
            tx.extend([pad_id] * n_pad)
            ty.extend([pad_id] * n_pad)

        x.append(tx)
        y.append(ty)

    return x, y

In [9]:
# Encode each split with padding to a fixed length of 64 so every split stacks
# into a dense (num_names, 64) LongTensor.
train_x, train_y = map(torch.LongTensor, create_data(train_names, padding=True, max_seq_len=64))
val_x, val_y = map(torch.LongTensor, create_data(val_names, padding=True, max_seq_len=64))
test_x, test_y = map(torch.LongTensor, create_data(test_names, padding=True, max_seq_len=64))

for inputs, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y)):
    print(inputs.shape, targets.shape)

torch.Size([25626, 64]) torch.Size([25626, 64])
torch.Size([3203, 64]) torch.Size([3203, 64])
torch.Size([3204, 64]) torch.Size([3204, 64])


In [10]:
# Pair each split's inputs with its targets so they are indexed and batched together.
train_dataset, val_dataset, test_dataset = (
    TensorDataset(inputs, targets)
    for inputs, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

In [11]:
# Sanity check: shape of the input tensor stored inside the training dataset.
train_dataset.tensors[0].shape

torch.Size([25626, 64])

## Training code

In [13]:
def compute_loss(dataset, model, criterion, device):
    """Evaluate ``criterion`` over an entire dataset in one forward pass.

    Args:
        dataset: object exposing ``tensors`` with the inputs at index 0 and the
            targets at index 1 (e.g. a ``TensorDataset``), both (B, seq_len).
        model: module mapping (B, seq_len) inputs to (B, seq_len, V) logits.
        criterion: loss callable taking (N, V) logits and (N,) targets.
        device: device the tensors are moved to before the forward pass.

    Returns:
        The loss as a Python float.

    Note:
        The model is switched to eval mode and left there; a caller that keeps
        training must call ``model.train()`` again.
    """
    model.eval()
    x = dataset.tensors[0].to(device)  # (B, seq_len)
    y = dataset.tensors[1].to(device)  # (B, seq_len)

    # Evaluation never backpropagates, so disable autograd: without this a
    # graph is recorded for the whole dataset, wasting memory.
    with torch.no_grad():
        logits = model(x)  # (B, C, V)
        B, C, V = logits.shape
        # reshape (rather than view) also accepts non-contiguous tensors.
        loss = criterion(logits.reshape(B * C, V), y.reshape(B * C))
    return loss.item()

def train_instance_wise(train_dataset, val_dataset, batch_size, n_eopchs, model, optimizer, criterion, device):
    """Train ``model`` on mini-batches and report full-dataset losses every epoch.

    Args:
        train_dataset: dataset of (inputs, targets) pairs used for the gradient
            updates; also passed to ``compute_loss`` for the train loss.
        val_dataset: dataset passed to ``compute_loss`` for the validation loss.
        batch_size: mini-batch size of the training DataLoader.
        n_eopchs: number of passes over ``train_dataset`` (the spelling is kept
            so existing keyword callers keep working).
        model: module mapping (B, seq_len) inputs to (B, seq_len, V) logits.
        optimizer: optimizer that is zeroed and stepped once per batch.
        criterion: loss callable taking (N, V) logits and (N,) targets.
        device: device each batch is moved to.

    Returns:
        ``(train_loss_dict, val_loss_dict)``: dicts mapping the 1-based epoch
        number to the loss measured after that epoch.
    """
    # Only the training split needs a DataLoader: validation loss is computed
    # on the whole split by compute_loss, so no validation loader is built.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)

    train_loss_dict, val_loss_dict = {}, {}

    # Baseline losses of the untrained model.
    train_loss = compute_loss(train_dataset, model, criterion, device)
    val_loss = compute_loss(val_dataset, model, criterion, device)

    print(f"Start of training: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    for epoch in range(n_eopchs):
        # compute_loss leaves the model in eval mode, so re-enable train mode
        # at the start of every epoch.
        model.train()
        for x, y in train_dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            B, C, V = logits.shape
            # Flatten batch and time so every position is one classification.
            loss = criterion(logits.reshape(B * C, V), y.reshape(B * C))
            loss.backward()
            optimizer.step()

        # End-of-epoch losses over the full splits.
        train_loss = compute_loss(train_dataset, model, criterion, device)
        val_loss = compute_loss(val_dataset, model, criterion, device)

        train_loss_dict[epoch + 1] = train_loss
        val_loss_dict[epoch + 1] = val_loss

        print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    return train_loss_dict, val_loss_dict

## Step 2: Create a model (RNN Model)

In [42]:
class RNNModel(nn.Module):
    """Character-level language model: embedding -> RNN -> LayerNorm -> linear.

    Args:
        vocab_size: number of token indices (embedding rows and output logits).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        self.ln = nn.LayerNorm(hidden_dim)
        self.ffn = nn.Linear(hidden_dim, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Xavier-normal init for all weight matrices, zeros for all biases."""
        init.xavier_normal_(self.embeddings.weight)

        # Input-to-hidden and hidden-to-hidden matrices get the same treatment.
        for name, param in self.rnn.named_parameters():
            if 'weight_ih' in name or 'weight_hh' in name:
                init.xavier_normal_(param)
            elif 'bias' in name:
                init.zeros_(param)

        # Output projection.
        init.xavier_normal_(self.ffn.weight)
        init.zeros_(self.ffn.bias)

    def forward(self, x):
        """Map token indices (B, seq_len) to logits (B, seq_len, vocab_size)."""
        B, _ = x.shape

        emb = self.embeddings(x)  # (B, seq_len, emb_dim)

        # Create the initial hidden state on the same device/dtype as the
        # activations instead of relying on a notebook-global `device`, so the
        # model works wherever it (and its input) happens to live.
        h0 = torch.zeros(self.num_layers, B, self.hidden_dim, device=emb.device, dtype=emb.dtype)
        out, _ = self.rnn(emb, h0)  # out: (B, seq_len, hidden_dim)
        out = self.ln(out)

        logits = self.ffn(out)  # (B, seq_len, vocab_size)
        return logits

### 1 layer RNN

In [25]:
# Hyperparameters for the single-layer RNN.
vocab_size = len(stoi)  # counts the "." marker and the "<pad>" token too
embedding_dim = 256
hidden_dim = 256
num_layers = 1

In [26]:
# Build the model from the hyperparameter cell above. vocab_size is passed
# through (instead of recomputing len(stoi)) so the hyperparameter cell is the
# single place that controls the model's size.
model = RNNModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)
model.to(device)

RNNModel(
  (embeddings): Embedding(28, 256)
  (rnn): RNN(256, 256, batch_first=True)
  (ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (ffn): Linear(in_features=256, out_features=28, bias=True)
)

In [27]:
# Count every scalar parameter held by the model.
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 146460


In [28]:
# Training configuration for the single-layer run.
batch_size = 256
n_epochs = 20
lr = 0.002
optimizer = optim.AdamW(model.parameters(), lr=lr)
# Target positions holding the "<pad>" index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=stoi["<pad>"])

In [29]:
# Train the model and keep the per-epoch train/validation losses.
train_loss_dict, val_loss_dict = train_instance_wise(
    train_dataset,
    val_dataset,
    batch_size,
    n_epochs,
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
)

Start of training: Train Loss: 3.3484, Val Loss: 3.3499


Epoch 1: Train Loss: 2.2175, Val Loss: 2.2261
Epoch 2: Train Loss: 2.1510, Val Loss: 2.1674
Epoch 3: Train Loss: 2.1017, Val Loss: 2.1276
Epoch 4: Train Loss: 2.0595, Val Loss: 2.0941
Epoch 5: Train Loss: 2.0227, Val Loss: 2.0661
Epoch 6: Train Loss: 2.0019, Val Loss: 2.0585
Epoch 7: Train Loss: 1.9682, Val Loss: 2.0340
Epoch 8: Train Loss: 1.9521, Val Loss: 2.0316
Epoch 9: Train Loss: 1.9288, Val Loss: 2.0197
Epoch 10: Train Loss: 1.9147, Val Loss: 2.0110
Epoch 11: Train Loss: 1.8980, Val Loss: 2.0077
Epoch 12: Train Loss: 1.8903, Val Loss: 2.0047
Epoch 13: Train Loss: 1.8770, Val Loss: 2.0027
Epoch 14: Train Loss: 1.8636, Val Loss: 1.9988
Epoch 15: Train Loss: 1.8515, Val Loss: 1.9958
Epoch 16: Train Loss: 1.8414, Val Loss: 1.9989
Epoch 17: Train Loss: 1.8316, Val Loss: 1.9954
Epoch 18: Train Loss: 1.8264, Val Loss: 1.9997
Epoch 19: Train Loss: 1.8144, Val Loss: 1.9952
Epoch 20: Train Loss: 1.8070, Val Loss: 1.9975


lr = 0.002
Train Loss: 1.8070, Val Loss: 1.9975

In [30]:
# Loss of the trained model on the held-out test split.
test_loss = compute_loss(test_dataset, model, criterion, device)
print(test_loss)

1.9966771602630615


### 2 layer RNN

In [43]:
# Hyperparameters for the two-layer RNN; only num_layers differs from the
# single-layer setup.
vocab_size = len(stoi)  # counts the "." marker and the "<pad>" token too
embedding_dim = 256
hidden_dim = 256
num_layers = 2

In [44]:
# Build the model from the hyperparameter cell above. vocab_size is passed
# through (instead of recomputing len(stoi)) so the hyperparameter cell is the
# single place that controls the model's size.
model = RNNModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)
model.to(device)

RNNModel(
  (embeddings): Embedding(28, 256)
  (rnn): RNN(256, 256, num_layers=2, batch_first=True)
  (ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (ffn): Linear(in_features=256, out_features=28, bias=True)
)

In [45]:
# Count every scalar parameter held by the model.
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print(f"Total parameters in model1: {total_params}")

Total parameters in model1: 278044


In [46]:
# Training configuration for the two-layer run (lower learning rate than the
# single-layer run).
batch_size = 256
n_epochs = 20
lr = 0.001
optimizer = optim.AdamW(model.parameters(), lr=lr)
# Target positions holding the "<pad>" index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=stoi["<pad>"])

In [47]:
# Train the model and keep the per-epoch train/validation losses.
train_loss_dict, val_loss_dict = train_instance_wise(
    train_dataset,
    val_dataset,
    batch_size,
    n_epochs,
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
)

Start of training: Train Loss: 4.3736, Val Loss: 4.3664
Epoch 1: Train Loss: 2.2449, Val Loss: 2.2556
Epoch 2: Train Loss: 2.1895, Val Loss: 2.2039
Epoch 3: Train Loss: 2.1476, Val Loss: 2.1673
Epoch 4: Train Loss: 2.1152, Val Loss: 2.1407
Epoch 5: Train Loss: 2.0821, Val Loss: 2.1150
Epoch 6: Train Loss: 2.0551, Val Loss: 2.0930
Epoch 7: Train Loss: 2.0221, Val Loss: 2.0697
Epoch 8: Train Loss: 1.9951, Val Loss: 2.0550
Epoch 9: Train Loss: 1.9764, Val Loss: 2.0438
Epoch 10: Train Loss: 1.9590, Val Loss: 2.0370
Epoch 11: Train Loss: 1.9346, Val Loss: 2.0237
Epoch 12: Train Loss: 1.9165, Val Loss: 2.0199
Epoch 13: Train Loss: 1.9007, Val Loss: 2.0147
Epoch 14: Train Loss: 1.8862, Val Loss: 2.0100
Epoch 15: Train Loss: 1.8690, Val Loss: 2.0038
Epoch 16: Train Loss: 1.8549, Val Loss: 2.0041
Epoch 17: Train Loss: 1.8397, Val Loss: 1.9972
Epoch 18: Train Loss: 1.8322, Val Loss: 2.0046
Epoch 19: Train Loss: 1.8104, Val Loss: 1.9929
Epoch 20: Train Loss: 1.7998, Val Loss: 1.9952


lr = 0.001
Train Loss: 1.7998, Val Loss: 1.9952

In [41]:
# Loss of the trained model on the held-out test split.
test_loss = compute_loss(test_dataset, model, criterion, device)
print(test_loss)

1.9841831922531128


In [2]:
import torch

def clear_gpu_memory_pytorch():
    """Empty PyTorch's CUDA cache when CUDA is available and print what happened."""
    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    torch.cuda.empty_cache()
    print("PyTorch CUDA cache cleared.")

In [8]:
!nvidia-smi

Sat Jun 28 20:03:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:46:00.0 Off |                  Off |
|  0%   50C    P8              4W /  450W |     482MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Test pretrain code

In [3]:
import string

# Build a vocabulary set: the 26 lowercase ASCII letters plus "." and "<pad>".
vocab = set(string.ascii_lowercase) | {".", "<pad>"}
len(vocab)

28

In [8]:
import re

def clean_file(input_path):
    """Read a UTF-8 text file and return its lowercased, filtered contents.

    Every character other than a-z, ".", space and newline is removed.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        lowered = source.read().lower()

    # Drop everything outside the allowed character set in a single pass.
    return re.sub(r'[^a-z.\n ]', '', lowered)

In [9]:
# NOTE(review): absolute, machine-specific path; this cell only runs where that
# file exists. It also rebinds file_path, which earlier pointed at the names file.
file_path = "/root/makemore/personal-extension/character_level_lm/cleaned_merged_fairy_tales_without_eos.txt"
cleaned = clean_file(file_path)
# Preview the first 200 characters of the cleaned text.
cleaned[:200]

'the happy prince.\nhigh above the city on a tall column stood the statue of the happy prince.  he was gilded all over with thin leaves of fine gold for eyes he had two bright sapphires and a large red '

In [7]:
# Load the pre-built pretraining splits (NumPy arrays) from the .npz archive.
data = np.load("/root/makemore/personal-extension/character_level_lm/pretrain_data/pretrain_data_seq_len_64.npz")
x_train, y_train = data["x_train"], data["y_train"]
x_val, y_val = data["x_val"], data["y_val"]
x_test, y_test = data["x_test"], data["y_test"]

for inputs, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(inputs.shape, targets.shape)

print("-" * 100)

# Convert every split to a LongTensor. Note that this rebinds train_x/val_x/
# test_x (and the matching *_y names) that held the names dataset earlier.
train_x, train_y = map(torch.LongTensor, (x_train, y_train))
val_x, val_y = map(torch.LongTensor, (x_val, y_val))
test_x, test_y = map(torch.LongTensor, (x_test, y_test))

for inputs, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y)):
    print(inputs.shape, targets.shape)

(16364504, 64) (16364504, 64)
(2045563, 64) (2045563, 64)
(2045563, 64) (2045563, 64)
----------------------------------------------------------------------------------------------------
torch.Size([16364504, 64]) torch.Size([16364504, 64])
torch.Size([2045563, 64]) torch.Size([2045563, 64])
torch.Size([2045563, 64]) torch.Size([2045563, 64])


In [10]:
# Inspect one input sequence from the pretraining split.
train_x[5]

tensor([59, 74, 74, 83,  1, 45, 76, 67, 72, 61, 63, 13,  0, 37, 38, 36, 37,  1,
        59, 60, 73, 80, 63,  1, 78, 66, 63,  1, 61, 67, 78, 83, 11,  1, 73, 72,
         1, 59,  1, 78, 59, 70, 70,  1, 61, 73, 70, 79, 71, 72, 11,  1, 77, 78,
        73, 73, 62,  1, 78, 66, 63,  1, 77, 78])

In [11]:
# Inspect the target sequence stored at the same index as the input above.
train_y[5]

tensor([74, 74, 83,  1, 45, 76, 67, 72, 61, 63, 13,  0, 37, 38, 36, 37,  1, 59,
        60, 73, 80, 63,  1, 78, 66, 63,  1, 61, 67, 78, 83, 11,  1, 73, 72,  1,
        59,  1, 78, 59, 70, 70,  1, 61, 73, 70, 79, 71, 72, 11,  1, 77, 78, 73,
        73, 62,  1, 78, 66, 63,  1, 77, 78, 59])