In [1]:
# @title 1. Setup & Data Loading
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd
import math
import time
from google.colab import drive

# 1. setup environment
# Mount Google Drive; every artefact of this notebook lives under BASE_DIR.
drive.mount('/content/drive')
BASE_DIR = '/content/drive/My Drive/HW6_Project'

# Output folders: logs -> CSVs, images -> plots, checkpoints -> model weights.
for _subdir in ("logs", "images", "checkpoints"):
    os.makedirs(f"{BASE_DIR}/{_subdir}", exist_ok=True)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# 2. tokenizers
class CharTokenizer:
    """Character-level tokenizer.

    The vocabulary is the set of distinct characters in `text`; ids follow the
    sorted order of those characters.
    """

    def __init__(self, text):
        alphabet = sorted(set(text))
        self.vocab_size = len(alphabet)
        self.stoi = dict(zip(alphabet, range(self.vocab_size)))
        self.itos = dict(enumerate(alphabet))

    def encode(self, s):
        """Return the list of ids for the characters of `s` (KeyError if unseen)."""
        lookup = self.stoi
        return [lookup[ch] for ch in s]

    def decode(self, l):
        """Return the string obtained by joining the characters for ids in `l`."""
        return ''.join(self.itos[token_id] for token_id in l)

class WordTokenizer:
    """Whitespace word-level tokenizer built from a text file.

    Newlines in the file are replaced by the token `<eos>` before splitting on
    whitespace. The vocabulary is the sorted set of resulting tokens.

    Attributes:
        vocab: sorted list of distinct tokens.
        vocab_size: number of distinct tokens.
        stoi / itos: token -> id and id -> token mappings.
        unk_id: id used by `encode` for tokens that are not in the vocabulary.
    """

    def __init__(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        words = text.replace('\n', ' <eos> ').split()
        self.vocab = sorted(set(words))
        self.vocab_size = len(self.vocab)
        self.stoi = { w:i for i,w in enumerate(self.vocab) }
        self.itos = { i:w for i,w in enumerate(self.vocab) }
        # Out-of-vocabulary words used to fall back to id 0, i.e. whichever
        # token happens to sort first. Prefer the corpus' own `<unk>` token
        # when it exists; id 0 remains the fallback otherwise.
        self.unk_id = self.stoi.get('<unk>', 0)

    def encode(self, s_list):
        """Map a list of word strings to ids; unknown words map to `unk_id`."""
        unk = self.unk_id
        return [self.stoi.get(w, unk) for w in s_list]

    def decode(self, l):
        """Join the tokens for the ids in `l` with single spaces."""
        return ' '.join([self.itos[i] for i in l])

# 3. data loader
def load_data_from_drive(dataset_name, mode='word'):
    """Load one dataset folder from Drive and tokenize its splits.

    Files are looked up in `{BASE_DIR}/datasets/{dataset_name}`.

    Args:
        dataset_name: name of the sub-folder holding the split files.
        mode: 'char' builds a CharTokenizer; any other value builds a
            WordTokenizer from `train.txt`.

    Returns:
        dict with keys 'train', 'val', 'test' (1-D long tensors of token ids,
        read from train.txt / valid.txt / test.txt) and 'tokenizer'.
        A split whose file is missing yields an empty tensor and a printed
        warning. The one exception is 'train': when only `input.txt` exists,
        its first 90% of characters is used.
    """
    folder = f"{BASE_DIR}/datasets/{dataset_name}"
    print(f"--- Loading {dataset_name} ({mode}) ---")

    def read_text(path):
        # Always decode as UTF-8 so the split files are read the same way
        # WordTokenizer reads train.txt, independent of the platform locale.
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    # create tokenizer from TRAIN set
    if mode == 'char':
        # use train.txt when the split files exist, otherwise the full input.txt
        vocab_source = f"{folder}/train.txt"
        if not os.path.exists(vocab_source):
            vocab_source = f"{folder}/input.txt"
        tokenizer = CharTokenizer(read_text(vocab_source))
    else:
        tokenizer = WordTokenizer(f"{folder}/train.txt")

    # helper to load file -> tensor
    def file_to_tensor(fname):
        path = f"{folder}/{fname}"
        if os.path.exists(path):
            content = read_text(path)
        elif fname == 'train.txt' and os.path.exists(f"{folder}/input.txt"):
            # No explicit split: train on the first 90% of input.txt.
            content = read_text(f"{folder}/input.txt")
            content = content[:int(0.9 * len(content))]
        else:
            print(f"Warning: {fname} not found in {folder}")
            return torch.tensor([], dtype=torch.long)

        if mode == 'char':
            encoded = tokenizer.encode(content)
        else:
            # Same newline -> <eos> convention the WordTokenizer vocabulary uses.
            encoded = tokenizer.encode(content.replace('\n', ' <eos> ').split())
        return torch.tensor(encoded, dtype=torch.long)

    return {
        'train': file_to_tensor('train.txt'),
        'val':   file_to_tensor('valid.txt'),
        'test':  file_to_tensor('test.txt'),
        'tokenizer': tokenizer
    }

# 4. load data
try:
    shakespeare = load_data_from_drive('tiny_shakespeare', mode='char')
    wiki = load_data_from_drive('wikitext-2', mode='word')
    ptb = load_data_from_drive('ptb', mode='word')
    print("\nSUCCESS: All datasets loaded.")
    print(f"Shakespeare Vocab: {shakespeare['tokenizer'].vocab_size}")
    print(f"WikiText Vocab: {wiki['tokenizer'].vocab_size}")
    print(f"PTB Vocab: {ptb['tokenizer'].vocab_size}")
except Exception as e:
    print(f"\nERROR: {e}")
    print("Please check your Drive folder structure.")
    # Every later cell needs these datasets. Stop here with the real traceback
    # instead of letting a later cell fail with an unrelated NameError.
    raise

Mounted at /content/drive
Using device: cuda
--- Loading tiny_shakespeare (char) ---
--- Loading wikitext-2 (word) ---
--- Loading ptb (word) ---

SUCCESS: All datasets loaded.
Shakespeare Vocab: 65
WikiText Vocab: 33278
PTB Vocab: 10000


In [5]:
# @title 2. Model Definitions (Fixed Mamba Einsum & Split)
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# --- A. HELPER MODULES ---
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with an optional bias.

    Args:
        ndim: size of the normalised (last) dimension.
        bias: when falsy, no bias parameter is created (`self.bias` is None).
    """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        eps = 1e-5
        return F.layer_norm(input, self.weight.shape, weight=self.weight, bias=self.bias, eps=eps)

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, apply ReLU, project back to n_embd."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
        ]
        # Kept as `net` with the same indices so checkpoint keys stay unchanged.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# --- B. LINEAR ---
class LinearModel(nn.Module):
    """Baseline language model: token embedding followed by a linear read-out.

    `block_size` is accepted in the constructor but is not used by this model.
    """

    def __init__(self, vocab_size, n_embd, block_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`; `loss` is None when `targets` is None."""
        logits = self.lm_head(self.token_embedding(idx))
        if targets is None:
            return logits, None
        batch, steps, n_classes = logits.shape
        flat_logits = logits.view(batch * steps, n_classes)
        flat_targets = targets.view(batch * steps)
        return logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result."""
        produced = 0
        while produced < max_new_tokens:
            # Only the prediction at the last position is sampled from.
            last_logits = self(idx)[0][:, -1, :]
            next_token = torch.multinomial(F.softmax(last_logits, dim=-1), num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
            produced += 1
        return idx

# --- C. MLP ---
class MLPModel(nn.Module):
    """Per-token MLP language model: embedding -> 3-layer ReLU MLP -> linear read-out.

    `block_size` is accepted in the constructor but is not used by this model.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_hidden=256):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        to_hidden = nn.Linear(n_embd, n_hidden)
        hidden_to_hidden = nn.Linear(n_hidden, n_hidden)
        to_embedding = nn.Linear(n_hidden, n_embd)
        # Same layer order/indices as before so state-dict keys are unchanged.
        self.mlp = nn.Sequential(to_hidden, nn.ReLU(), hidden_to_hidden, nn.ReLU(), to_embedding)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`; `loss` is None when `targets` is None."""
        hidden = self.mlp(self.token_embedding(idx))
        logits = self.lm_head(hidden)
        if targets is None:
            return logits, None
        batch, steps, n_classes = logits.shape
        flat_logits = logits.view(batch * steps, n_classes)
        flat_targets = targets.view(batch * steps)
        return logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result."""
        produced = 0
        while produced < max_new_tokens:
            # Only the prediction at the last position is sampled from.
            last_logits = self(idx)[0][:, -1, :]
            next_token = torch.multinomial(F.softmax(last_logits, dim=-1), num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
            produced += 1
        return idx

# --- D. ATTENTION ---
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key/query/value projections.
        n_embd: width of the incoming embeddings.
        block_size: maximum sequence length; sizes the causal mask buffer.
    """

    def __init__(self, head_size, n_embd, block_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask: position t may only attend to positions <= t.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        """Attend over `x` of shape (B, T, n_embd); returns (B, T, head_size)."""
        B, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        # Scale by 1/sqrt(head_size), the width of q and k. The previous code
        # scaled by the embedding width of `x`, which over-damps the scores
        # whenever there is more than one head.
        scale = k.shape[-1] ** -0.5
        wei = (q @ k.transpose(-2, -1)) * scale
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        return wei @ v

class MultiHeadAttention(nn.Module):
    """Runs `num_heads` attention heads side by side and mixes them with a linear layer."""

    def __init__(self, num_heads, head_size, n_embd, block_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size, n_embd, block_size))
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        # Concatenate the per-head outputs along the feature axis, then project.
        per_head = [head(x) for head in self.heads]
        return self.proj(torch.cat(per_head, dim=-1))

# --- E. SELF-ATTENTION ONLY ---
class SelfAttentionModel(nn.Module):
    """Single multi-head self-attention layer between embeddings and the read-out.

    Args:
        vocab_size: number of token ids.
        n_embd: embedding width.
        block_size: maximum context length (size of the position table).
        n_head: number of attention heads; each head gets n_embd // n_head features.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_head):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.sa_head = MultiHeadAttention(n_head, n_embd//n_head, n_embd, block_size)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`; `loss` is None when `targets` is None."""
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        # Build positions on the input's device rather than on a notebook-level
        # `device` global, so the model works wherever its inputs live.
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result."""
        for _ in range(max_new_tokens):
            # The position table only covers block_size steps: crop the context.
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# --- F. TRANSFORMER ---
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual add."""

    def __init__(self, n_embd, n_head, block_size):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head, n_embd, block_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = LayerNorm(n_embd, bias=True)
        self.ln2 = LayerNorm(n_embd, bias=True)

    def forward(self, x):
        # Each sub-layer sees a normalised copy; its output is added back to the stream.
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class TransformerModel(nn.Module):
    """Decoder-only transformer language model.

    Args:
        vocab_size: number of token ids.
        n_embd: embedding width.
        block_size: maximum context length (size of the position table).
        n_head: attention heads per block.
        n_layer: number of stacked TransformerBlocks.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head, block_size) for _ in range(n_layer)])
        self.ln_f = LayerNorm(n_embd, bias=True)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`; `loss` is None when `targets` is None."""
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        # Build positions on the input's device rather than on a notebook-level
        # `device` global, so the model works wherever its inputs live.
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result."""
        for _ in range(max_new_tokens):
            # The position table only covers block_size steps: crop the context.
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# --- G. MAMBA ---
class MinimalMambaBlock(nn.Module):
    """Minimal selective state-space block with a sequential Python scan.

    Args:
        n_embd: width of the input and output features.
        d_state: size of the per-channel recurrent state.
        d_conv: kernel size of the depthwise causal convolution.
        expand: inner width multiplier (d_inner = expand * n_embd).
    """

    def __init__(self, n_embd, d_state=16, d_conv=4, expand=2):
        super().__init__()
        self.d_inner = int(expand * n_embd)
        self.dt_rank = math.ceil(n_embd / 16)
        self.d_state = d_state

        # One projection produces both the SSM input and the gate (split in forward).
        self.in_proj = nn.Linear(n_embd, self.d_inner * 2, bias=False)
        # Depthwise conv; padding d_conv - 1 plus truncation in forward keeps it causal.
        self.conv1d = nn.Conv1d(in_channels=self.d_inner, out_channels=self.d_inner,
                                kernel_size=d_conv, groups=self.d_inner, padding=d_conv - 1)
        self.x_proj = nn.Linear(self.d_inner, self.dt_rank + d_state * 2, bias=False)
        self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True)

        # Ensure A_log is compatible with varying d_state
        A_init = torch.arange(1, d_state + 1, dtype=torch.float32).repeat(self.d_inner, 1)
        self.A_log = nn.Parameter(torch.log(A_init))
        self.D = nn.Parameter(torch.ones(self.d_inner))
        self.out_proj = nn.Linear(self.d_inner, n_embd, bias=False)
        self.act = nn.SiLU()

    def selective_scan(self, u, delta, A, B, C, D):
        """Run the recurrence x_t = exp(delta_t * A) * x_{t-1} + delta_t * B_t * u_t.

        Shapes: u, delta (batch, seq_len, d_inner); A (d_inner, d_state);
        B, C (batch, seq_len, d_state); D (d_inner,).
        Returns y of shape (batch, seq_len, d_inner) with y_t = x_t @ C_t + u_t * D.
        """
        batch, seq_len, d_inner = u.shape
        d_state = A.shape[-1]

        # Explicit reshaping to avoid Einsum ambiguity
        # delta: (B, L, D_in) -> (B, L, D_in, 1)
        # A: (D_in, D_state) -> (1, 1, D_in, D_state)
        # Result: (B, L, D_in, D_state)
        deltaA = torch.exp(delta.unsqueeze(-1) * A.view(1, 1, d_inner, d_state))

        # delta: (B, L, D_in, 1), u: (B, L, D_in, 1), B: (B, L, 1, D_state)
        deltaB_u = delta.unsqueeze(-1) * u.unsqueeze(-1) * B.unsqueeze(2)

        # Allocate the state in the working dtype. A default float32 state
        # mismatches C inside the einsum when the inputs are half/bfloat16.
        x = torch.zeros((batch, d_inner, d_state), device=u.device, dtype=deltaA.dtype)
        ys = []
        for i in range(seq_len):
            x = deltaA[:, i] * x + deltaB_u[:, i]
            # x: (B, D_in, D_state), C[:, i]: (B, D_state) -> y: (B, D_in)
            y = torch.einsum('bdn,bn->bd', x, C[:, i])
            ys.append(y)

        y = torch.stack(ys, dim=1)
        y = y + u * D  # skip connection weighted per channel by D
        return y

    def forward(self, x):
        """Map x of shape (batch, seq_len, n_embd) to an output of the same shape."""
        B, L, E = x.shape
        xz = self.in_proj(x)
        x_in, z = xz.chunk(2, dim=-1)
        # Conv1d works on (batch, channels, length); keep only the first L outputs
        # so position t never sees inputs from positions > t.
        x_conv = self.conv1d(x_in.transpose(1, 2))[:, :, :L].transpose(1, 2)
        x_conv = self.act(x_conv)
        x_dbl = self.x_proj(x_conv)

        delta, B_ssm, C_ssm = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)

        delta = F.softplus(self.dt_proj(delta))  # positive step sizes
        A = -torch.exp(self.A_log)               # negative A -> decaying state
        y = self.selective_scan(x_conv, delta, A, B_ssm, C_ssm, self.D)
        y = y * self.act(z)                      # gate with the second half of in_proj
        return self.out_proj(y)

class MambaModel(nn.Module):
    """Language model made of stacked (MinimalMambaBlock, LayerNorm) pairs.

    No positional embedding is used; `block_size` only limits the context
    that `generate` feeds back into the model.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_layer, d_state=16):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        layers = []
        for _ in range(n_layer):
            layers.append(
                nn.Sequential(MinimalMambaBlock(n_embd, d_state=d_state), LayerNorm(n_embd, bias=True))
            )
        self.blocks = nn.Sequential(*layers)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx, targets=None):
        """Return `(logits, loss)`; `loss` is None when `targets` is None."""
        _batch, _steps = idx.shape  # idx must be 2-D: (batch, time)
        hidden = self.blocks(self.token_embedding(idx))
        logits = self.lm_head(hidden)
        if targets is None:
            return logits, None
        batch, steps, n_classes = logits.shape
        flat_logits = logits.view(batch * steps, n_classes)
        flat_targets = targets.view(batch * steps)
        return logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result."""
        produced = 0
        while produced < max_new_tokens:
            context = idx[:, -self.block_size:]
            last_logits = self(context)[0][:, -1, :]
            next_token = torch.multinomial(F.softmax(last_logits, dim=-1), num_samples=1)
            idx = torch.cat((idx, next_token), dim=1)
            produced += 1
        return idx

In [None]:
# @title 3. The Training Engine (Strict Data Hygiene)
import time
import json
import torch
import pandas as pd
import numpy as np

# --- CONFIGURATION ---
# Context length (tokens per sequence) used for every run except the Linear
# context sweep, which passes its own value.
BLOCK_SIZE = 128
# Sequences per batch for the Tiny Shakespeare (character-level) sweeps.
BATCH_SIZE = 64  # Shakespeare
# Sequences per batch for the word-level benchmarks.
WORD_BATCH_SIZE = 32 # Wiki/PTB
# Number of optimizer steps per experiment.
MAX_ITERS = 1000
# Training loss is estimated and logged every EVAL_INTERVAL steps.
EVAL_INTERVAL = 100
# AdamW learning rates for the character-level and word-level runs.
LR_CHAR = 1e-3
LR_WORD = 5e-4

# --- HELPERS ---
def get_batch(data_dict, split, block_size, batch_size):
    """Sample a batch of (input, target) sequences from one split.

    Args:
        data_dict: mapping with a 1-D token tensor under `split`.
        split: key of the split to sample from (e.g. 'train', 'test').
        block_size: length of each sampled sequence.
        batch_size: number of sequences to sample.

    Returns:
        (x, y) on `device`; y is x shifted one token to the right. When the
        split holds at most `block_size` tokens, the whole split is returned
        as a single sequence (batch of 1) instead of random windows.
    """
    data = data_dict[split]
    if len(data) <= block_size:
        # Too short for random windows: use the whole split once.
        x, y = data[:-1].unsqueeze(0), data[1:].unsqueeze(0)
    else:
        ix = torch.randint(len(data) - block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # Both branches move to the compute device. The short-split branch used to
    # return CPU tensors, which breaks a model living on the GPU.
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_single_split_loss(model, data_dict, split, block_size, batch_size, eval_iters=20):
    """Estimate the mean loss of `model` on ONE split (e.g. train OR test).

    Draws `eval_iters` random batches via `get_batch`, evaluates the model in
    eval mode without gradients, and returns the mean loss as a Python float.
    The model's previous train/eval mode is restored afterwards, even if a
    batch raises.
    """
    was_training = model.training
    model.eval()
    try:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data_dict, split, block_size, batch_size)
            _, loss = model(X, Y)
            losses[k] = loss.item()
    finally:
        # Put the model back the way the caller had it, rather than forcing
        # train mode unconditionally.
        model.train(was_training)
    return losses.mean().item()

def count_flops(model_name, params, steps, batch_size, block_size):
    """Rough training-FLOPs estimate: factor * params * tokens processed.

    The factor is picked from substrings of `model_name`: 2 for names
    containing 'Linear' or 'MLP', 4 for names containing 'Mamba' (which wins
    if both match), and 6 otherwise.
    """
    # Rules are applied in order, so a later match overrides an earlier one.
    rules = (
        (('Linear', 'MLP'), 2),
        (('Mamba',), 4),
    )
    per_token_factor = 6
    for tags, value in rules:
        if any(tag in model_name for tag in tags):
            per_token_factor = value
    return per_token_factor * params * steps * batch_size * block_size

def run_experiment(name, model, data_dict, block_size, batch_size, max_iters, lr):
    """Train `model` with AdamW, log progress, and evaluate once on the test split.

    Args:
        name: run identifier; used for the FLOPs heuristic and the output file names.
        model: module whose forward returns `(logits, loss)`.
        data_dict: mapping with 'train' and 'test' token tensors.
        block_size: sequence length of each sampled batch.
        batch_size: sequences per batch.
        max_iters: number of optimizer steps.
        lr: AdamW learning rate.

    Side effects:
        Writes `{BASE_DIR}/logs/{name}.csv` and `{BASE_DIR}/checkpoints/{name}.pt`.
        CSV columns: step, flops, train_loss, test_loss, params, epoch.
        Periodic rows carry train_loss (test_loss is NaN); the final row
        carries test_loss (train_loss is NaN).

    Returns:
        The final test-set loss (float).
    """
    print(f"--> Training {name}...")
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    logs = []
    n_params = sum(p.numel() for p in model.parameters())

    # "Effective epochs" = tokens processed / tokens in the training split.
    # The plotting cell reads this column; max(..., 1) avoids dividing by zero.
    tokens_per_step = batch_size * block_size
    n_train_tokens = max(len(data_dict['train']), 1)

    def epochs_after(n_steps):
        return n_steps * tokens_per_step / n_train_tokens

    # --- TRAINING LOOP ---
    for step in range(max_iters):  # `step`, not `iter`, to avoid shadowing the builtin
        xb, yb = get_batch(data_dict, 'train', block_size, batch_size)
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        # Log Training Progress
        if step % EVAL_INTERVAL == 0:
            train_loss = estimate_single_split_loss(model, data_dict, 'train', block_size, batch_size)
            flops = count_flops(name, n_params, step+1, batch_size, block_size)
            logs.append({
                'step': step, 'flops': flops,
                'train_loss': train_loss,
                'test_loss': np.nan,
                'params': n_params,
                'epoch': epochs_after(step + 1)
            })
            print(f"    Step {step}: Train Loss {train_loss:.3f}")

    # --- FINAL EVALUATION  ---
    print("    Training complete. Performing Final Test Set Evaluation...")
    final_test_loss = estimate_single_split_loss(model, data_dict, 'test', block_size, batch_size)
    total_flops = count_flops(name, n_params, max_iters, batch_size, block_size)

    # Log the final result
    logs.append({
        'step': max_iters, 'flops': total_flops,
        'train_loss': np.nan,
        'test_loss': final_test_loss,
        'params': n_params,
        'epoch': epochs_after(max_iters)
    })
    print(f"    >> FINAL RESULTS: Total FLOPs={total_flops:.2e}, Test Loss={final_test_loss:.4f}")

    pd.DataFrame(logs).to_csv(f"{BASE_DIR}/logs/{name}.csv", index=False)
    torch.save(model.state_dict(), f"{BASE_DIR}/checkpoints/{name}.pt")
    return final_test_loss

# --- 1. HYPERPARAMETER SWEEPS (Tiny Shakespeare) ---
# Each sweep trains one model per hyperparameter value and records the final
# test loss returned by run_experiment as {'val': <hyperparameter>, 'loss': <test loss>}.
# NOTE(review): the "winners" below are therefore selected on the TEST split;
# consider selecting on the 'val' split instead.
print("\n=== DELIVERABLE 1 SWEEPS (Tiny Shakespeare) ===")
sweep_results = {'Linear': [], 'MLP': [], 'SelfAttn': [], 'Transformer': [], 'Mamba': []}

# A. Linear
# Sweeps the context length `c`. LinearModel itself ignores block_size, so `c`
# only changes the shape of the batches (and the FLOPs estimate).
for c in [32, 64, 128, 256]:
    m = LinearModel(shakespeare['tokenizer'].vocab_size, 128, c).to(device)
    loss = run_experiment(f"Linear_ctx{c}", m, shakespeare, c, BATCH_SIZE, MAX_ITERS, LR_CHAR)
    sweep_results['Linear'].append({'val': c, 'loss': loss})

# B. MLP
# Sweeps the hidden width of the MLP.
for h in [64, 128, 256, 512]:
    m = MLPModel(shakespeare['tokenizer'].vocab_size, 128, BLOCK_SIZE, n_hidden=h).to(device)
    loss = run_experiment(f"MLP_hid{h}", m, shakespeare, BLOCK_SIZE, BATCH_SIZE, MAX_ITERS, LR_CHAR)
    sweep_results['MLP'].append({'val': h, 'loss': loss})

# C. Self-Attn
# Sweeps the number of attention heads (embedding width stays 128).
for h in [2, 4, 8]:
    m = SelfAttentionModel(shakespeare['tokenizer'].vocab_size, 128, BLOCK_SIZE, n_head=h).to(device)
    loss = run_experiment(f"SelfAttn_head{h}", m, shakespeare, BLOCK_SIZE, BATCH_SIZE, MAX_ITERS, LR_CHAR)
    sweep_results['SelfAttn'].append({'val': h, 'loss': loss})

# D. Transformer (Layers)
# Sweeps depth with 4 heads per block.
for l in [2, 4, 6]:
    m = TransformerModel(shakespeare['tokenizer'].vocab_size, 128, BLOCK_SIZE, n_head=4, n_layer=l).to(device)
    loss = run_experiment(f"Transformer_lay{l}", m, shakespeare, BLOCK_SIZE, BATCH_SIZE, MAX_ITERS, LR_CHAR)
    sweep_results['Transformer'].append({'val': l, 'loss': loss})

# E. Mamba (State Dim)
# Sweeps the SSM state size with a fixed depth of 4 layers.
for s in [8, 16, 32]:
    m = MambaModel(shakespeare['tokenizer'].vocab_size, 128, BLOCK_SIZE, n_layer=4, d_state=s).to(device)
    loss = run_experiment(f"Mamba_state{s}", m, shakespeare, BLOCK_SIZE, BATCH_SIZE, MAX_ITERS, LR_CHAR)
    sweep_results['Mamba'].append({'val': s, 'loss': loss})

# Persist the sweep summary; the plotting and resume cells read this file.
with open(f"{BASE_DIR}/logs/sweep_summary.json", 'w') as f: json.dump(sweep_results, f)

# --- FIND WINNERS ---
# Lowest final test loss wins within each family.
best_tf = min(sweep_results['Transformer'], key=lambda x: x['loss'])
WINNER_LAYERS = best_tf['val']

best_mamba = min(sweep_results['Mamba'], key=lambda x: x['loss'])
WINNER_STATE = best_mamba['val']

print("\n" + "="*50)
print(f">> TRANSFORMER WINNER: {WINNER_LAYERS} Layers (Loss: {best_tf['loss']:.4f})")
print(f">> MAMBA WINNER:       State Dim {WINNER_STATE} (Loss: {best_mamba['loss']:.4f})")
print("="*50 + "\n")

# --- 2. WORD LEVEL BENCHMARKS (Adaptive) ---
# Train the winning Transformer and Mamba configurations on each word-level dataset.
print(f"=== DELIVERABLE 2 BENCHMARKS (Adaptive Settings) ===")
datasets = [('WikiText2', wiki), ('PTB', ptb)]

for ds_name, ds_data in datasets:
    vocab = ds_data['tokenizer'].vocab_size

    # 1. Transformer
    tf = TransformerModel(vocab, 128, BLOCK_SIZE, n_head=4, n_layer=WINNER_LAYERS).to(device)
    run_experiment(f"{ds_name}_Transformer", tf, ds_data, BLOCK_SIZE, WORD_BATCH_SIZE, MAX_ITERS, LR_WORD)

    # 2. Mamba
    # Depth reuses the Transformer's winning layer count (the Mamba sweep only
    # varied d_state at a fixed depth of 4).
    ma = MambaModel(vocab, 128, BLOCK_SIZE, n_layer=WINNER_LAYERS, d_state=WINNER_STATE).to(device)
    run_experiment(f"{ds_name}_Mamba", ma, ds_data, BLOCK_SIZE, WORD_BATCH_SIZE, MAX_ITERS, LR_WORD)

print("\nALL TRAINING COMPLETE.")


=== DELIVERABLE 1 SWEEPS (Tiny Shakespeare) ===
--> Training Linear_ctx32...
    Step 0: Train Loss 4.344
    Step 100: Train Loss 2.719
    Step 200: Train Loss 2.555
    Step 300: Train Loss 2.501
    Step 400: Train Loss 2.497
    Step 500: Train Loss 2.474
    Step 600: Train Loss 2.471
    Step 700: Train Loss 2.473
    Step 800: Train Loss 2.450
    Step 900: Train Loss 2.466
    Training complete. Performing Final Test Set Evaluation...
    >> FINAL RESULTS: Total FLOPs=6.84e+10, Test Loss=2.5150
--> Training Linear_ctx64...
    Step 0: Train Loss 4.300
    Step 100: Train Loss 2.703
    Step 200: Train Loss 2.528
    Step 300: Train Loss 2.500
    Step 400: Train Loss 2.475
    Step 500: Train Loss 2.471
    Step 600: Train Loss 2.466
    Step 700: Train Loss 2.464
    Step 800: Train Loss 2.472
    Step 900: Train Loss 2.457
    Training complete. Performing Final Test Set Evaluation...
    >> FINAL RESULTS: Total FLOPs=1.37e+11, Test Loss=2.5149
--> Training Linear_ctx128...

In [None]:
# @title 4. Plotting
import matplotlib.pyplot as plt
import pandas as pd
import json
import os
import numpy as np

# --- SETUP ---
# Paths are re-declared so this cell can run without the training cell.
BASE_DIR = '/content/drive/My Drive/HW6_Project'
IMG_DIR = f"{BASE_DIR}/images"
LOG_DIR = f"{BASE_DIR}/logs"
os.makedirs(IMG_DIR, exist_ok=True)

# load sweep data: {model family: [{'val': hyperparameter, 'loss': final test loss}, ...]}
with open(f"{LOG_DIR}/sweep_summary.json", 'r') as f:
    sweeps = json.load(f)

colors = {'Linear': 'tab:blue', 'MLP': 'tab:orange', 'SelfAttn': 'tab:green',
          'Transformer': 'tab:red', 'Mamba': 'tab:purple'}
markers = {'Linear': 'o', 'MLP': 's', 'SelfAttn': '^', 'Transformer': 'D', 'Mamba': 'v'}

# Log-file prefix used by the training cell for each model family.
LOG_PREFIXES = {
    'Linear': 'Linear_ctx',
    'MLP': 'MLP_hid',
    'SelfAttn': 'SelfAttn_head',
    'Transformer': 'Transformer_lay',
    'Mamba': 'Mamba_state',
}

# identify best configs for the combined plots
best_files = {}
for model_name, results in sweeps.items():
    prefix = LOG_PREFIXES.get(model_name)
    if prefix is None or not results:
        # An unknown family used to reuse the previous iteration's file name
        # (or raise NameError); an empty result list used to crash min().
        print(f"  Warning: skipping sweep '{model_name}' (unknown model or no results).")
        continue
    best_run = min(results, key=lambda x: x['loss'])
    val = best_run['val']
    fname = f"{prefix}{val}"
    best_files[model_name] = fname

print("Best Configurations Found:")
for k, v in best_files.items(): print(f"  {k}: {v}")


# ==============================================================================
# 1. DELIVERABLE: plot_1_shakespeare_train.png (Combined Training Convergence)
# ==============================================================================
print("\nGenerating Plot 1: Shakespeare Training Convergence...")
plt.figure(figsize=(10, 6))

# Load every available training curve first so one shared x-axis can be chosen.
curves = []
for model_label, filename in best_files.items():
    try:
        df = pd.read_csv(f"{LOG_DIR}/{filename}.csv")
    except FileNotFoundError:
        print(f"  Warning: {filename} not found.")
        continue
    curves.append((model_label, df.dropna(subset=['train_loss'])))

# Logs always contain a 'step' column but may lack 'epoch'. Use epochs only
# when every curve has them, so all curves share the same axis; indexing a
# missing 'epoch' column would raise KeyError.
has_epochs = bool(curves) and all('epoch' in train_df.columns for _, train_df in curves)
x_col, x_axis_label = ('epoch', 'Effective Epochs') if has_epochs else ('step', 'Training Steps')

for model_label, train_df in curves:
    plt.plot(train_df[x_col], train_df['train_loss'],
             label=model_label, color=colors[model_label], linewidth=2, alpha=0.8)

plt.xlabel(x_axis_label, fontsize=12, fontweight='bold')
plt.ylabel('Training Loss', fontsize=12, fontweight='bold')
plt.title('Training Convergence on Tiny Shakespeare', fontsize=14)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.savefig(f"{IMG_DIR}/plot_1_shakespeare_train.png", dpi=300)
plt.close()
print(f"  Saved: {IMG_DIR}/plot_1_shakespeare_train.png")


# ==============================================================================
# 2. DELIVERABLE: plot_2_hyperparam_sensitivity.png (5-Panel Figure)
# ==============================================================================
print("\nGenerating Plot 2: Hyperparameter Sensitivity...")
fig, axes = plt.subplots(1, 5, figsize=(20, 4), sharey=True)

# One panel per model family: (key into `sweeps`, x-axis label, panel title)
configs = [
    ('Linear', 'Context Size', 'Linear'),
    ('MLP', 'Hidden Dim', 'MLP'),
    ('SelfAttn', 'Heads', 'SelfAttn'),
    ('Transformer', 'Layers', 'Transformer'),
    ('Mamba', 'State Dim', 'Mamba')
]

for ax, (model_key, x_label, title) in zip(axes, configs):
    points = sweeps[model_key]  # list of dicts: {'val': 32, 'loss': 2.5}

    # order by hyperparameter value (in place) so the line is drawn left to right
    points.sort(key=lambda entry: entry['val'])

    x_vals = [entry['val'] for entry in points]
    y_vals = [entry['loss'] for entry in points]

    ax.plot(x_vals, y_vals, marker='o', linestyle='-', color=colors[model_key], linewidth=2)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(x_label, fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.5)

    # discrete params (Layers, Heads): put ticks exactly on the swept values
    if title in ['Transformer', 'SelfAttn']:
        ax.set_xticks(x_vals)

# The y-axis is shared, so only the leftmost panel carries the label.
axes[0].set_ylabel('Final Test Set Loss', fontsize=12, fontweight='bold')
plt.suptitle('Hyperparameter Sensitivity Analysis', fontsize=16, y=1.05)
plt.tight_layout()
plt.savefig(f"{IMG_DIR}/plot_2_hyperparam_sensitivity.png", dpi=300, bbox_inches='tight')
plt.close()
print(f"  Saved: {IMG_DIR}/plot_2_hyperparam_sensitivity.png")


# ==============================================================================
# 3. DELIVERABLE: plot_3_compute_efficiency.png (Efficiency Scatter)
# ==============================================================================
print("\nGenerating Plot 3: Compute Efficiency...")
plt.figure(figsize=(10, 7))

for model_label, filename in best_files.items():
    # The last CSV row is the final evaluation: total FLOPs and test loss.
    try:
        summary = pd.read_csv(f"{LOG_DIR}/{filename}.csv").iloc[-1]
    except FileNotFoundError:
        continue
    total_flops = summary['flops']
    test_loss = summary['test_loss']

    # one marker per model family
    plt.scatter(total_flops, test_loss,
                color=colors[model_label], marker=markers[model_label], s=250,
                label=model_label, edgecolors='black', zorder=5)

    # numeric label slightly below the marker
    plt.text(total_flops, test_loss - 0.05,
             f"{test_loss:.2f}", ha='center', fontsize=9)

plt.xscale('log')
plt.xlabel('Training FLOPs (Log Scale)', fontsize=12, fontweight='bold')
plt.ylabel('Final Test Set Loss', fontsize=12, fontweight='bold')
plt.title('Compute Efficiency Frontier', fontsize=14)
plt.legend(title="Best Configurations")
plt.grid(True, which="both", linestyle='--', alpha=0.4)
plt.tight_layout()
plt.savefig(f"{IMG_DIR}/plot_3_compute_efficiency.png", dpi=300)
plt.close()
print(f"  Saved: {IMG_DIR}/plot_3_compute_efficiency.png")


# ==============================================================================
# 4. INDIVIDUAL PLOTS
# ==============================================================================
print("\nGenerating Individual Backup Plots...")
for model_label, filename in best_files.items():
    # Only a missing log file is an expected condition. The previous bare
    # `except: pass` also hid the KeyError raised by the missing 'epoch'
    # column, so no individual plot was ever written.
    try:
        df = pd.read_csv(f"{LOG_DIR}/{filename}.csv")
    except FileNotFoundError:
        print(f"  Warning: {filename} not found.")
        continue
    train_df = df.dropna(subset=['train_loss'])

    # Logs always have 'step'; use 'epoch' only when the log provides it.
    if 'epoch' in train_df.columns:
        x_col, x_axis_label = 'epoch', "Epochs"
    else:
        x_col, x_axis_label = 'step', "Steps"

    single_fig = plt.figure(figsize=(8, 6))
    try:
        plt.plot(train_df[x_col], train_df['train_loss'], color=colors[model_label])
        plt.title(f"{model_label} Training Curve")
        plt.xlabel(x_axis_label)
        plt.ylabel("Train Loss")
        plt.grid(True, alpha=0.3)
        plt.savefig(f"{IMG_DIR}/individual_train_{model_label}.png")
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(single_fig)

print("\nALL PLOTS GENERATED.")

In [None]:
# @title 5. Emergency Resume (Run ONLY if Runtime died during Benchmarks)
# Run Block 1 & 2 first!
import json
import torch
import pandas as pd

# This cell needs more than Blocks 1 & 2: it calls `run_experiment`, which is
# defined in Block 3 together with `get_batch`, `estimate_single_split_loss`,
# `count_flops` and EVAL_INTERVAL. Fail early with a clear message if they are
# missing, rather than with a NameError in the middle of the loop.
if 'run_experiment' not in globals():
    raise RuntimeError(
        "run_experiment is not defined. Run the helper definitions from Block 3 "
        "(get_batch, estimate_single_split_loss, count_flops, run_experiment and "
        "EVAL_INTERVAL) before this cell."
    )

# Load the winner configs from the saved JSON
with open(f"{BASE_DIR}/logs/sweep_summary.json", 'r') as f:
    sweep_results = json.load(f)

# Lowest final test loss wins within each family (same rule as the training cell).
best_tf = min(sweep_results['Transformer'], key=lambda x: x['loss'])
WINNER_LAYERS = best_tf['val']

best_mamba = min(sweep_results['Mamba'], key=lambda x: x['loss'])
WINNER_STATE = best_mamba['val']

print(f"Resuming with Winners -> Transformer Layers: {WINNER_LAYERS}, Mamba State: {WINNER_STATE}")

# Benchmarks
# Settings are re-declared so they do not depend on the training cell's config.
# BLOCK_SIZE was used below without being re-declared; it matches the training cell.
datasets = [('WikiText2', wiki), ('PTB', ptb)]
BLOCK_SIZE = 128
WORD_BATCH_SIZE = 32
MAX_ITERS = 1000
LR_WORD = 5e-4

for ds_name, ds_data in datasets:
    vocab = ds_data['tokenizer'].vocab_size

    # A run counts as done when its CSV log exists (written at the end of run_experiment).
    # Check if Transformer is already done
    if os.path.exists(f"{BASE_DIR}/logs/{ds_name}_Transformer.csv"):
        print(f"Skipping {ds_name}_Transformer (Already Done)")
    else:
        tf = TransformerModel(vocab, 128, BLOCK_SIZE, n_head=4, n_layer=WINNER_LAYERS).to(device)
        run_experiment(f"{ds_name}_Transformer", tf, ds_data, BLOCK_SIZE, WORD_BATCH_SIZE, MAX_ITERS, LR_WORD)

    # Check if Mamba is already done
    if os.path.exists(f"{BASE_DIR}/logs/{ds_name}_Mamba.csv"):
         print(f"Skipping {ds_name}_Mamba (Already Done)")
    else:
        ma = MambaModel(vocab, 128, BLOCK_SIZE, n_layer=WINNER_LAYERS, d_state=WINNER_STATE).to(device)
        run_experiment(f"{ds_name}_Mamba", ma, ds_data, BLOCK_SIZE, WORD_BATCH_SIZE, MAX_ITERS, LR_WORD)

print("All Benchmarks Complete.")

___