# Train NanoGPT on Karpathy's hyperparams

## Load the dataset

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1337)
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
%matplotlib inline

In [38]:
with open("input.txt") as file:
    data = file.read()

len(data)

1115394

In [39]:
chars = sorted(list(set(data)))
chars

['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [40]:
len(chars)

65

In [41]:
stoi = {c:i for i, c in enumerate(chars)}
itos = {i:c for i, c in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

In [42]:
tokens = encode(data)
len(tokens)

1115394

In [43]:
data_size = len(tokens)
split_idx = int(0.8 * data_size)
train_tokens = tokens[:split_idx]
val_tokens = tokens[split_idx:]
len(train_tokens), len(val_tokens)

(892315, 223079)

In [44]:
def get_batch(tokens, block_size, batch_size):
    batch = torch.randint(0, len(tokens)-block_size, (batch_size,)) # B dimension array of random indices
    Xb = torch.stack([torch.LongTensor(tokens[i:i+block_size]) for i in batch], dim=0) # Create (B, T) dimension array
    yb = torch.stack([torch.LongTensor(tokens[i+1:i+block_size+1]) for i in batch], dim=0) # Create (B, T) dimension array
    return Xb, yb

In [48]:
@torch.no_grad()
def compute_loss(tokens, block_size, batch_size, model, device):
    loss_values = []
    for _ in range(100):
        Xb, yb = get_batch(tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        _, loss = model(Xb, yb)
        loss_values.append(loss.item())

    mean_loss = torch.FloatTensor(loss_values).mean().item()
    return mean_loss

In [69]:
def train(train_tokens, val_tokens, model, optimizer, device, block_size, batch_size, n_iters, eval_interval):
    train_lossi, val_lossi = [], []

    for i in range(n_iters):
        model.train()
        Xb, yb = get_batch(train_tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        # forward
        _, loss = model(Xb, yb)

        # set grads to zero
        optimizer.zero_grad(set_to_none=True)

        # do backward
        loss.backward()

        # optimizer step
        optimizer.step()

        if (i % eval_interval == 0) or (i == n_iters - 1):
            model.eval()
            train_loss = compute_loss(train_tokens, block_size, batch_size, model, device)
            val_loss = compute_loss(val_tokens, block_size, batch_size, model, device)

            train_lossi.append(train_loss)
            val_lossi.append(val_loss)

            print(f"Step {i}/{n_iters} --> Train: {train_loss:.4f} | Val: {val_loss:.4f}")

        # break

    return train_lossi, val_lossi

## Implement NanoGPT

![nanogpt-architecture](nanogpt-architecture.png)

In [72]:
class SelfAttentionDecoder(nn.Module):
    def __init__(self, channel_dim, head_dim, block_size, dropout):
        super().__init__()
        self.k_project = nn.Linear(channel_dim, head_dim, bias=False)
        self.q_project = nn.Linear(channel_dim, head_dim, bias=False)
        self.v_project = nn.Linear(channel_dim, head_dim, bias=False)
        self.dropout = nn.Dropout(dropout)
        mask = torch.triu(torch.ones(block_size, block_size), diagonal=1).bool()
        self.register_buffer('mask', mask)

    def forward(self, x):
        # x shape: (B, T, C)
        # create keys queries and values
        k, q, v = self.k_project(x), self.q_project(x), self.v_project(x) # all shaped (B, T, H)

        # get the weights
        B, T, H = k.shape
        wei = (q @ k.transpose(-2, -1)) * (H**-0.5) # B,T,H @ B,H,T --> B,T,T
        wei = wei.masked_fill(self.mask[:T, :T], float('-inf')) # B,T,T
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # get the out
        out = wei @ v # (B,T,T) @ (B,T,H) -> (B,T,H)
        return out

class MultiHeadSelfAttentionDecoderBlock(nn.Module):
    def __init__(self, channel_dim, num_heads, head_dim, block_size, dropout):
        super().__init__()
        # MHA
        self.ln1 = nn.LayerNorm(channel_dim)
        self.mha_decoder = nn.ModuleList([SelfAttentionDecoder(channel_dim, head_dim, block_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads*head_dim, channel_dim)
        self.dropout1 = nn.Dropout(dropout)

        # FFN
        self.ln2 = nn.LayerNorm(channel_dim)
        self.ffn = nn.Sequential(
            nn.Linear(channel_dim, 4 * channel_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(4 * channel_dim, channel_dim),
        )
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        x = x + self.proj(torch.cat([head(self.ln1(x)) for head in self.mha_decoder],\
                           dim=-1)) # head (x) -> (B,T,H) * NH -> [B,T,H*NH]
        x = self.dropout1(x)
        x = x + self.ffn(self.ln2(x))
        x = self.dropout2(x)

        return x

class NanoGPT(nn.Module):
    def __init__(self, emb_dim, vocab_size, block_size, num_heads, n_layers, dropout, device):
        super().__init__()
        self.device = device
        self.block_size = block_size
        # get the embeddings matrix
        self.tok_embs = nn.Embedding(vocab_size, emb_dim)
        self.pos_embs = nn.Embedding(block_size, emb_dim)

        self.mha_block = nn.Sequential(
            *[MultiHeadSelfAttentionDecoderBlock(emb_dim, num_heads, emb_dim//num_heads, block_size, dropout) for _ in range(n_layers)]
        )

        # Final linear layer
        self.ln = nn.LayerNorm(emb_dim)
        self.lm_layer = nn.Linear(emb_dim, vocab_size)

        self.apply(self._init_weights)

        print(f"No. of parameters: {sum([p.numel() for p in self.parameters()])}")

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, x, targets=None):
        # x shape (B, T), targets shape (B, T)
        B, T = x.shape
        token_embs = self.tok_embs(x) #(B,T,C)
        position_embs = self.pos_embs(torch.arange(T).to(self.device)) #(T,C)
        embs = token_embs + position_embs
        embs = self.mha_block(embs)
        logits = self.lm_layer(self.ln(embs))

        if targets is None:
            loss = None
        else:
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

        return logits, loss


In [78]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
n_iters = 5000
eval_interval = n_iters//10
lr = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
emb_dim = 384
num_heads = 6
n_layers = 6
dropout = 0.2
vocab_size = len(stoi)

In [79]:
model = NanoGPT(emb_dim=emb_dim, vocab_size=vocab_size, block_size=block_size, num_heads=num_heads,\
                 n_layers=n_layers, dropout=dropout, device=device)
model = model.to(device)

No. of parameters: 10788929


In [25]:
10,788,929

(10, 788, 929)

In [80]:
optimizer = optim.AdamW(model.parameters(), lr=lr)

In [None]:
train_lossi, val_lossi = train(train_tokens=train_tokens, val_tokens=val_tokens, model=model, optimizer=optimizer,\
      device=device, block_size=block_size, batch_size=batch_size, n_iters=n_iters, eval_interval=eval_interval)

Step 0/5000 --> Train: 3.8190 | Val: 3.8336
Step 500/5000 --> Train: 2.0245 | Val: 2.1289
Step 1000/5000 --> Train: 1.6913 | Val: 1.9363
Step 1500/5000 --> Train: 1.5734 | Val: 1.8870
Step 2000/5000 --> Train: 1.5145 | Val: 1.8665


In [None]:
@torch.no_grad()
def generate(model, max_tokens=1000, random_seed=1337):
  model.eval()

  torch.manual_seed(random_seed)

  curr_window = torch.LongTensor([[0]]).to(device)

  full_window = []

  for _ in range(max_tokens):
    # truncate current window to block size
    curr_window = curr_window[:, -block_size:]

    # forward pass and get logits
    logits, _ = model(curr_window) # (B,T,V)

    # get probs and sample
    probs = torch.softmax(logits, dim=-1)
    B,T,C = probs.shape
    next_idx = torch.multinomial(probs.view(B*T, C), num_samples=1, replacement=True)[-1].item() # (B*T,)

    # collect the resulst
    full_window.append(next_idx)
    curr_window = torch.cat([curr_window,\
                             torch.LongTensor([[next_idx]]).to(device)], dim=-1)
    
  generated_str = decode(full_window)
  return generated_str

In [None]:
display(Markdown(generate(model)))

In [None]:
generated_str = generate(model, max_tokens=10000, random_seed=42)
with open("out.txt", "w") as f:
  f.write(generated_str)