In [None]:
# === CELL 2: SETUP + DRIVE STAGING ===
from google.colab import drive
import os
import time
import subprocess
from pathlib import Path

print("üîß Mounting Google Drive...")
drive.mount('/content/drive')

# === PATHS (single source of truth) ===
# Every persistent artifact lives under DRIVE_BASE; later cells build their
# own file names from these constants with f-strings, so they stay plain str.
DRIVE_BASE = '/content/drive/MyDrive/NYU_ML_Project'
DRIVE_TARBALL = f'{DRIVE_BASE}/Data/lmd_full.tar.gz'

PROCESSED_DIR = f'{DRIVE_BASE}/Data/processed_v3'
CHECKPOINT_DIR = f'{DRIVE_BASE}/checkpoints_v3'
OUTPUT_DIR = f'{DRIVE_BASE}/outputs'

# Create the output folders up front (no-op when they already exist).
for _persistent_dir in (PROCESSED_DIR, CHECKPOINT_DIR, OUTPUT_DIR):
    os.makedirs(_persistent_dir, exist_ok=True)

# === STAGE DATA TO LOCAL DISK ===
# Destination on the VM's local disk for the extracted corpus.
LOCAL_ABC_DIR = '/content/abc_corpus'

def find_abc_files(root, max_count=5000, timeout=10):
    """Count ``.abc`` files under ``root``, giving up early when possible.

    This is a quick presence check, not an exact census: the walk stops as
    soon as ``max_count`` matches have been seen or ``timeout`` seconds have
    elapsed, whichever comes first.

    Args:
        root: Directory to walk recursively.
        max_count: Stop once this many ``.abc`` files have been counted.
        timeout: Wall-clock budget in seconds for the whole walk.

    Returns:
        The number of ``.abc`` files counted before stopping.
    """
    deadline = time.time() + timeout
    count = 0
    for _dirpath, _dirnames, filenames in os.walk(root):
        for fn in filenames:
            if fn.endswith('.abc'):
                count += 1
                if count >= max_count:
                    return count
            # Check the clock on every entry, not only on matches, so a tree
            # with few or no .abc files cannot run past the time budget.
            if time.time() > deadline:
                return count
        # Directories without any files still count against the budget.
        if time.time() > deadline:
            return count
    return count

# Check if already staged.
# RAW_ABC_DIR stays None until a usable local copy of the corpus is confirmed.
RAW_ABC_DIR = None
if Path(LOCAL_ABC_DIR).exists():
    existing = find_abc_files(LOCAL_ABC_DIR, max_count=1000)
    if existing >= 1000:
        print(f"‚úÖ Local data already staged: {LOCAL_ABC_DIR} ({existing:,}+ files)")
        RAW_ABC_DIR = LOCAL_ABC_DIR
    else:
        print("‚ö†Ô∏è Local dir exists but has few files. Re-staging...")
        subprocess.run(['rm', '-rf', LOCAL_ABC_DIR], check=False)

if RAW_ABC_DIR is None:
    # Extract tarball to local
    if not Path(DRIVE_TARBALL).exists():
        raise FileNotFoundError(f"Tarball not found: {DRIVE_TARBALL}")

    print(f"üì¶ Extracting {DRIVE_TARBALL} to local VM...")
    print("   (This avoids Drive I/O errors on 178K small files)")
    extract_root = Path('/content/abc_extract')
    # Start from an empty scratch dir so leftovers of an interrupted
    # extraction cannot be mixed into the staged corpus.
    subprocess.run(['rm', '-rf', str(extract_root)], check=False)
    extract_root.mkdir(exist_ok=True)

    subprocess.run(['tar', '-xzf', DRIVE_TARBALL, '-C', str(extract_root)], check=True)

    # Collect every directory that directly holds .abc files. Moving only the
    # first one found would silently drop the rest of the corpus whenever the
    # tarball spreads its files over several subdirectories.
    abc_dirs = [
        root
        for root, _dirs, files in os.walk(extract_root)
        if any(fn.endswith('.abc') for fn in files)
    ]
    if not abc_dirs:
        raise RuntimeError("Could not find .abc files in tarball")

    # Deepest directory that still contains all of them; for a flat tarball
    # this is exactly the single directory holding the files.
    abc_root = os.path.commonpath(abc_dirs)
    # Move to standard location
    subprocess.run(['mv', abc_root, LOCAL_ABC_DIR], check=True)
    RAW_ABC_DIR = LOCAL_ABC_DIR

    print(f"‚úÖ Staged to: {RAW_ABC_DIR}")

# Verify
count = find_abc_files(RAW_ABC_DIR, max_count=5000)
if count == 0:
    raise FileNotFoundError(f"No .abc files found in {RAW_ABC_DIR}")
print(f"‚úÖ ABC files detected: {count:,}+")

# GPU check: report the device name, or warn that none is available.
import torch
if not torch.cuda.is_available():
    print("‚ö†Ô∏è No GPU detected! Change runtime type to GPU (L4 recommended)")
else:
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")

print(f"\n‚úÖ Setup complete. Ready to tokenize from: {RAW_ABC_DIR}")

In [None]:
# === CELL 3: INSTALL DEPENDENCIES ===
# `!` is an IPython/Colab shell escape: this line runs pip in a subshell and
# is only valid inside a notebook kernel (it is not plain Python).
!pip install -q tokenizers scipy matplotlib tqdm
# Printed unconditionally; it does not check whether pip succeeded.
print("‚úÖ Packages installed")

In [None]:
# === CELL 4: TOKENIZATION (STREAMING, MEMORY-SAFE) ===
import hashlib
import pickle
import numpy as np
from tqdm import tqdm
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer_path = f'{PROCESSED_DIR}/music_bpe.json'
meta_path = f'{PROCESSED_DIR}/meta.pkl'
train_bin = f'{PROCESSED_DIR}/train.bin'
val_bin = f'{PROCESSED_DIR}/val.bin'
test_bin = f'{PROCESSED_DIR}/test.bin'

# meta.pkl is the last artifact written, so require every output (not just the
# tokenizer and train.bin) before declaring the pipeline done. Otherwise a run
# interrupted mid-split is mistaken for a finished one and then fails on the
# missing meta.pkl.
required_outputs = (tokenizer_path, train_bin, val_bin, test_bin, meta_path)

if all(os.path.exists(p) for p in required_outputs):
    print("‚úÖ Tokenization already complete. Loading metadata...")
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    print(f"   Vocab: {meta['vocab_size']}, Train tokens: {meta['train_tokens']:,}")
else:
    print("üî§ Starting tokenization pipeline...")

    # Discover all .abc files
    abc_files = [
        os.path.join(root, fn)
        for root, _, files in os.walk(RAW_ABC_DIR)
        for fn in files
        if fn.endswith('.abc')
    ]

    print(f"   Found {len(abc_files):,} ABC files")
    if len(abc_files) == 0:
        raise FileNotFoundError("No .abc files discovered")

    # Deduplicate by hash (streaming): only digests are kept in memory.
    print("   Step 1/4: Deduplicating...")
    seen = set()
    unique_files = []
    for fp in tqdm(abc_files, desc="Dedup"):
        try:
            text = Path(fp).read_text(errors='ignore')
        except OSError:
            # Unreadable file: skip it, but let Ctrl-C and real bugs surface.
            continue
        if len(text) < 50:
            continue  # too short to be a usable tune
        h = hashlib.md5(text.encode('utf-8', errors='ignore')).hexdigest()
        if h not in seen:
            seen.add(h)
            unique_files.append(fp)

    print(f"   Kept {len(unique_files):,} unique files")

    # Train BPE tokenizer
    print("   Step 2/4: Training BPE tokenizer (vocab=5000)...")
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=5000, special_tokens=["[UNK]", "[PAD]"], show_progress=False)

    def text_iter():
        """Yield the text of each unique file, skipping unreadable ones."""
        for fp in unique_files:
            try:
                yield Path(fp).read_text(errors='ignore')
            except OSError:
                continue

    tokenizer.train_from_iterator(text_iter(), trainer=trainer)
    tokenizer.save(tokenizer_path)
    vocab_size = tokenizer.get_vocab_size()
    print(f"   ‚úÖ Tokenizer saved: vocab={vocab_size}")

    # Token ids are stored as uint16 below; refuse a vocab that cannot fit.
    if vocab_size > np.iinfo(np.uint16).max + 1:
        raise ValueError(f"vocab_size={vocab_size} does not fit in uint16 token storage")

    # Encode to single binary (streaming)
    print("   Step 3/4: Encoding tokens to disk...")
    all_bin = f'{PROCESSED_DIR}/all.bin'
    total_tokens = 0
    skipped = 0
    with open(all_bin, 'wb') as f:
        for fp in tqdm(unique_files, desc="Encode"):
            try:
                text = Path(fp).read_text(errors='ignore')
                ids = tokenizer.encode(text).ids
            except Exception:
                # Best effort per file; KeyboardInterrupt still propagates.
                skipped += 1
                continue
            if ids:
                # The write is outside the try on purpose: a failed write
                # would corrupt all.bin and must not be silently ignored.
                arr = np.array(ids, dtype=np.uint16)
                arr.tofile(f)
                total_tokens += len(arr)

    if skipped:
        print(f"   Skipped {skipped:,} files that could not be read or encoded")
    print(f"   Total tokens: {total_tokens:,}")
    if total_tokens == 0:
        raise RuntimeError("Tokenization produced no tokens; nothing to split")

    # Split 98/1/1 (contiguous slices of the token stream, in file order)
    print("   Step 4/4: Splitting train/val/test (98/1/1)...")
    n_train = int(total_tokens * 0.98)
    n_val = int(total_tokens * 0.01)
    n_test = total_tokens - n_train - n_val

    data = np.memmap(all_bin, dtype=np.uint16, mode='r')
    np.asarray(data[:n_train]).tofile(train_bin)
    np.asarray(data[n_train:n_train+n_val]).tofile(val_bin)
    np.asarray(data[n_train+n_val:]).tofile(test_bin)
    del data  # release the memmap before deleting its backing file
    os.remove(all_bin)

    # Written last: its presence marks a complete run (see check above).
    meta = {
        'vocab_size': vocab_size,
        'train_tokens': n_train,
        'val_tokens': n_val,
        'test_tokens': n_test,
        'total_files': len(unique_files)
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    print(f"\n‚úÖ Tokenization complete!")
    print(f"   Train: {n_train:,} | Val: {n_val:,} | Test: {n_test:,}")
    if n_train >= 100_000_000:
        print("   ‚úÖ Meets 100M token requirement")
    else:
        print(f"   ‚ö†Ô∏è Only {n_train:,} tokens (project asks for 100M+)")

print(f"\n‚úÖ Data ready in: {PROCESSED_DIR}")

In [None]:
# === CELL 5: MODEL DEFINITIONS ===
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from dataclasses import dataclass

@dataclass
class GPTConfig:
    """Hyperparameters consumed by the GPT model classes in this cell."""

    # Number of token ids; sizes the token embedding and the output head.
    vocab_size: int
    # Number of stacked transformer blocks.
    n_layer: int = 6
    # Attention heads per block; CausalSelfAttention asserts it divides n_embd.
    n_head: int = 6
    # Width of the embeddings / residual stream.
    n_embd: int = 384
    # Sizes the position-embedding table and the causal mask buffer.
    block_size: int = 256
    # Dropout probability passed to every nn.Dropout in the model.
    dropout: float = 0.1

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position only sees earlier ones.

    ``config`` must provide ``n_embd``, ``n_head``, ``block_size`` and
    ``dropout``; ``n_embd`` has to be divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One projection produces queries, keys and values in a single matmul.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Lower-triangular ones: entry (i, j) is 1 when j <= i. Registered as
        # "bias" so it follows the module across devices and state dicts.
        lower_tri = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("bias", lower_tri.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to an output of the same shape."""
        B, T, C = x.size()
        head_dim = C // self.n_head

        def split_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (split_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
        # Future positions get -inf so softmax assigns them zero weight.
        hidden_future = self.bias[:, :, :T, :T] == 0
        weights = F.softmax(scores.masked_fill(hidden_future, float('-inf')), dim=-1)
        weights = self.attn_dropout(weights)

        # (B, n_head, T, head_dim) -> (B, T, C)
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(merged))

class MLP(nn.Module):
    """Position-wise feed-forward layer: expand 4x, GELU, project back, dropout."""

    def __init__(self, config):
        super().__init__()
        inner_dim = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, inner_dim)
        self.c_proj = nn.Linear(inner_dim, config.n_embd)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply the feed-forward transform; the last dimension is preserved."""
        expanded = F.gelu(self.c_fc(x))
        projected = self.c_proj(expanded)
        return self.dropout(projected)

class Block(nn.Module):
    """One pre-norm transformer layer: attention then MLP, each with a residual."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after both residual sub-layers; the shape is unchanged."""
        attended = self.attn(self.ln_1(x))
        x = x + attended
        transformed = self.mlp(self.ln_2(x))
        return x + transformed

class GPTModel(nn.Module):
    """Decoder-only transformer language model built from ``Block`` layers.

    ``config`` supplies ``vocab_size``, ``n_embd``, ``block_size``,
    ``dropout`` and ``n_layer``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),   # token embeddings
            'wpe': nn.Embedding(config.block_size, config.n_embd),   # learned positions
            'drop': nn.Dropout(config.dropout),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embd),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for token ids ``idx`` of shape (b, t).

        ``loss`` is the cross-entropy against ``targets`` when they are given,
        otherwise ``None``.
        """
        _, seq_len = idx.size()
        positions = torch.arange(0, seq_len, dtype=torch.long, device=idx.device).unsqueeze(0)
        hidden = self.transformer.wte(idx) + self.transformer.wpe(positions)
        hidden = self.transformer.drop(hidden)
        for layer in self.transformer.h:
            hidden = layer(hidden)
        logits = self.lm_head(self.transformer.ln_f(hidden))
        if targets is None:
            return logits, None
        flat_logits = logits.view(-1, logits.size(-1))
        return logits, F.cross_entropy(flat_logits, targets.view(-1))

class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear head language model."""

    def __init__(self, vocab_size, hidden_size, num_layers, dropout=0.1):
        super().__init__()
        # The embedding width equals the LSTM hidden size.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)``; ``loss`` is ``None`` without ``targets``."""
        hidden_states, _final_state = self.lstm(self.embedding(idx))
        logits = self.fc(hidden_states)
        if targets is None:
            return logits, None
        flat_logits = logits.view(-1, logits.size(-1))
        return logits, F.cross_entropy(flat_logits, targets.view(-1))

class DataLoader:
    """Random-window batch sampler over a flat uint16 token file."""

    def __init__(self, data_path, block_size, batch_size):
        # Memory-map the file so only the sampled windows are read from disk.
        self.data = np.memmap(data_path, dtype=np.uint16, mode='r')
        self.block_size = block_size
        self.batch_size = batch_size

    def get_batch(self):
        """Return ``(x, y)`` int64 tensors of shape (batch_size, block_size).

        ``y`` is ``x`` shifted one token to the right (next-token targets).
        """
        width = self.block_size
        starts = torch.randint(len(self.data) - width, (self.batch_size,)).tolist()
        inputs = [torch.from_numpy(self.data[s:s + width].astype(np.int64)) for s in starts]
        targets = [torch.from_numpy(self.data[s + 1:s + 1 + width].astype(np.int64)) for s in starts]
        return torch.stack(inputs), torch.stack(targets)

print("‚úÖ Model classes defined (GPT, LSTM, DataLoader)")

In [None]:
# === CELL 6: TRAIN ALL MODELS (5 GPT + 4 LSTM) ===
import time
import json

# Token counts and vocab size written by the tokenization cell.
with open(meta_path, 'rb') as meta_file:
    meta = pickle.load(meta_file)

BLOCK_SIZE = 256
BATCH_SIZE = 32
VOCAB_SIZE = meta['vocab_size']
# Each optimizer step consumes BATCH_SIZE * BLOCK_SIZE tokens, so this many
# steps amount to roughly one pass over the training split.
STEPS_PER_EPOCH = meta['train_tokens'] // (BATCH_SIZE * BLOCK_SIZE)

print(f"üìä Training config: vocab={VOCAB_SIZE}, block={BLOCK_SIZE}, batch={BATCH_SIZE}")
print(f"   Steps per epoch (1 pass over data): {STEPS_PER_EPOCH:,}")

def get_lr(step, max_steps, base_lr, warmup=500):
    """Learning rate for ``step``: linear warmup followed by cosine decay.

    Args:
        step: Zero-based optimizer step.
        max_steps: Step at which the cosine decay reaches zero.
        base_lr: Peak learning rate, reached at the end of warmup.
        warmup: Number of linear warmup steps.

    Returns:
        The learning rate to use at ``step``.
    """
    if step < warmup:
        return base_lr * (step + 1) / warmup
    # Guard the degenerate schedule where warmup already covers max_steps.
    decay_steps = max(1, max_steps - warmup)
    # Clamp so steps past max_steps stay at the floor instead of the cosine
    # swinging the rate back up.
    progress = min(1.0, (step - warmup) / decay_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

def _estimate_val_loss(model, val_loader, device, n_batches):
    """Return the mean loss over ``n_batches`` random validation batches."""
    was_training = model.training
    model.eval()
    total = 0.0
    with torch.no_grad():
        for _ in range(n_batches):
            xv, yv = val_loader.get_batch()
            _, vl = model(xv.to(device), yv.to(device))
            total += vl.item()
    if was_training:
        model.train()
    return total / n_batches


def _atomic_torch_save(state, path):
    """Write ``state`` to a temp file, then rename it over ``path``."""
    tmp_path = f"{path}.tmp"
    torch.save(state, tmp_path)
    os.replace(tmp_path, path)


def train_one_epoch(model, config, name, eval_batches=20):
    """Train ``model`` for ``STEPS_PER_EPOCH`` steps with resumable checkpoints.

    Args:
        model: Module whose ``forward(x, y)`` returns ``(logits, loss)``.
        config: Dict with ``'lr'`` (peak learning rate) and ``'wd'`` (weight
            decay applied to parameters with two or more dimensions).
        name: Run name used for the checkpoint and final file names.
        eval_batches: Number of validation batches averaged per evaluation.
            A single 32-sequence batch is too noisy to compare runs, and the
            last evaluation is what gets reported as the run's ``val_loss``.

    Returns:
        ``(final_val_loss, elapsed_seconds)``; the elapsed time includes time
        recorded by earlier, interrupted sessions of the same run.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    # Optimizer with weight decay: matrices/embeddings decay, 1-D params
    # (biases, norm gains) do not.
    trainable = [p for _, p in model.named_parameters() if p.requires_grad]
    decay = [p for p in trainable if p.dim() >= 2]
    nodecay = [p for p in trainable if p.dim() < 2]
    optim = torch.optim.AdamW([
        {'params': decay, 'weight_decay': config['wd']},
        {'params': nodecay, 'weight_decay': 0.0}
    ], lr=config['lr'])

    train_loader = DataLoader(train_bin, BLOCK_SIZE, BATCH_SIZE)
    val_loader = DataLoader(val_bin, BLOCK_SIZE, BATCH_SIZE)

    # Resume from checkpoint
    ckpt_path = f"{CHECKPOINT_DIR}/{name}_checkpoint.pt"
    start_step = 0
    val_losses = []
    prior_elapsed = 0.0
    if os.path.exists(ckpt_path):
        ckpt = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(ckpt['model'])
        optim.load_state_dict(ckpt['optimizer'])
        start_step = ckpt['step']
        val_losses = ckpt.get('val_losses', [])
        # Older checkpoints have no 'elapsed' entry; treat that as zero.
        prior_elapsed = ckpt.get('elapsed', 0.0)
        print(f"   ‚Ü©Ô∏è Resuming from step {start_step}")

    eval_batches = max(1, int(eval_batches))
    last_train_loss = None
    model.train()
    t0 = time.time()

    pbar = tqdm(range(start_step, STEPS_PER_EPOCH), desc=name)
    for step in pbar:
        lr = get_lr(step, STEPS_PER_EPOCH, config['lr'])
        for g in optim.param_groups:
            g['lr'] = lr

        x, y = train_loader.get_batch()
        x, y = x.to(device), y.to(device)

        _, loss = model(x, y)
        optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()

        last_train_loss = loss.item()
        pbar.set_postfix({'loss': f"{last_train_loss:.4f}", 'lr': f"{lr:.2e}"})

        # Evaluate + checkpoint every 1000 steps and on the final step
        if (step + 1) % 1000 == 0 or step == STEPS_PER_EPOCH - 1:
            val_losses.append(_estimate_val_loss(model, val_loader, device, eval_batches))
            # Atomic write: an interrupted save must not destroy the previous
            # good checkpoint.
            _atomic_torch_save({
                'model': model.state_dict(),
                'optimizer': optim.state_dict(),
                'step': step + 1,
                'val_losses': val_losses,
                'elapsed': prior_elapsed + (time.time() - t0),
            }, ckpt_path)

    elapsed = prior_elapsed + (time.time() - t0)
    if val_losses:
        final_val = val_losses[-1]
    elif last_train_loss is not None:
        final_val = last_train_loss
    else:
        # No step ran and nothing was ever evaluated (e.g. zero-step epoch).
        final_val = float('nan')

    # Save final
    final_path = f"{CHECKPOINT_DIR}/{name}_final.pt"
    _atomic_torch_save({
        'model': model.state_dict(),
        'val_loss': final_val,
        'time': elapsed,
        'config': config
    }, final_path)

    print(f"   ‚úÖ Done: val_loss={final_val:.4f}, time={elapsed/3600:.2f}h")
    return final_val, elapsed

# 9 experiments: 5 GPT sizes and 4 LSTM sizes, each with its own peak
# learning rate ('lr') and weight decay ('wd').
experiments = [
    {'name': 'gpt_tiny',   'type': 'gpt', 'n_layer': 2,  'n_embd': 128, 'n_head': 4,  'lr': 1e-3, 'wd': 0.01},
    {'name': 'gpt_small',  'type': 'gpt', 'n_layer': 4,  'n_embd': 256, 'n_head': 4,  'lr': 6e-4, 'wd': 0.01},
    {'name': 'gpt_medium', 'type': 'gpt', 'n_layer': 6,  'n_embd': 384, 'n_head': 6,  'lr': 3e-4, 'wd': 0.1},
    {'name': 'gpt_large',  'type': 'gpt', 'n_layer': 10, 'n_embd': 512, 'n_head': 8,  'lr': 2e-4, 'wd': 0.1},
    {'name': 'gpt_xl',     'type': 'gpt', 'n_layer': 16, 'n_embd': 768, 'n_head': 12, 'lr': 1e-4, 'wd': 0.1},
    {'name': 'lstm_small', 'type': 'lstm', 'layers': 2, 'hidden': 256,  'lr': 1e-3, 'wd': 0.0},
    {'name': 'lstm_medium','type': 'lstm', 'layers': 2, 'hidden': 512,  'lr': 6e-4, 'wd': 0.0},
    {'name': 'lstm_large', 'type': 'lstm', 'layers': 3, 'hidden': 768,  'lr': 3e-4, 'wd': 0.0},
    {'name': 'lstm_xl',    'type': 'lstm', 'layers': 4, 'hidden': 1024, 'lr': 1e-4, 'wd': 0.0},
]

# Results accumulate across sessions so finished runs can be skipped.
results_pkl = f'{OUTPUT_DIR}/all_results.pkl'
if os.path.exists(results_pkl):
    with open(results_pkl, 'rb') as f:
        all_results = pickle.load(f)
else:
    all_results = {}

print(f"\nüöÄ Training {len(experiments)} models (skips already done)\n")

for exp in experiments:
    name = exp['name']
    final_path = f"{CHECKPOINT_DIR}/{name}_final.pt"

    # Both the final weights and the recorded metrics must exist to skip.
    if os.path.exists(final_path) and name in all_results:
        print(f"‚è© Skipping {name} (already trained)")
        continue

    print(f"\n‚ñ∂Ô∏è  Training {name}")

    model = None
    try:
        if exp['type'] == 'gpt':
            model = GPTModel(GPTConfig(
                vocab_size=VOCAB_SIZE,
                n_layer=exp['n_layer'],
                n_embd=exp['n_embd'],
                n_head=exp['n_head'],
                block_size=BLOCK_SIZE
            ))
        else:
            model = LSTMModel(VOCAB_SIZE, exp['hidden'], exp['layers'])

        n_params = sum(p.numel() for p in model.parameters())
        print(f"   Params: {n_params:,}")

        val_loss, train_time = train_one_epoch(model, {'lr': exp['lr'], 'wd': exp['wd']}, name)

        all_results[name] = {
            'type': exp['type'],
            'params': n_params,
            'val_loss': val_loss,
            'time_h': train_time / 3600,
            'config': exp
        }

        # Persist after every run so a later crash loses nothing.
        with open(results_pkl, 'wb') as f:
            pickle.dump(all_results, f)

    except Exception as e:
        # One failed run must not abort the remaining experiments.
        print(f"   ‚ùå Error: {e}")
    finally:
        # Release the model even when training failed, so a run that died
        # (e.g. out of memory) does not keep its weights on the GPU.
        del model
        torch.cuda.empty_cache()

# Save JSON summary
summary = {k: {'params': v['params'], 'val_loss': v['val_loss'], 'time_h': v['time_h']} for k, v in all_results.items()}
with open(f'{OUTPUT_DIR}/scaling_results.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\n‚úÖ Training complete! {len(all_results)} models trained.")
print(f"   Results: {OUTPUT_DIR}/scaling_results.json")

In [None]:
# === CELL 7: SCALING LAW + GENERATION + MIDI ===
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import subprocess

print("üìà Step 1/3: Fitting scaling laws...\n")


def _runs_of_type(kind):
    """Return the subset of ``all_results`` whose 'type' equals ``kind``."""
    return {run_name: run for run_name, run in all_results.items() if run['type'] == kind}


def _column(runs, key):
    """Collect ``run[key]`` for every run, in dict order, as a NumPy array."""
    return np.array([run[key] for run in runs.values()])


# Separate GPT vs LSTM
gpt_res = _runs_of_type('gpt')
lstm_res = _runs_of_type('lstm')

gpt_params = _column(gpt_res, 'params')
gpt_losses = _column(gpt_res, 'val_loss')
lstm_params = _column(lstm_res, 'params')
lstm_losses = _column(lstm_res, 'val_loss')

def power_law(N, a, alpha, c):
    """Evaluate the scaling-law form ``a * N**(-alpha) + c``.

    ``N`` may be a scalar or a NumPy array; arrays are evaluated elementwise.
    """
    decaying_term = a * N**(-alpha)
    return decaying_term + c

def _fit_scaling_law(label, params, losses):
    """Fit ``power_law`` to one model family; return ``(a, alpha, c)``.

    A failed fit is reported and yields ``(None, None, None)`` so the rest of
    the cell can still plot the raw points.
    """
    try:
        (a, alpha, c), _ = curve_fit(power_law, params, losses, p0=[1.0, 0.1, 1.0], maxfev=10000)
    except Exception as e:
        print(f"‚ùå {label} fit failed: {e}")
        return None, None, None
    print(f"‚úÖ {label}: L = {a:.4f}¬∑N^(-{alpha:.4f}) + {c:.4f}")
    return a, alpha, c


def _plot_family(ax, label, color, names, params, losses, fit):
    """Scatter one family's runs on ``ax`` and overlay its fitted curve, if any."""
    ax.scatter(params, losses, s=100, c=color, alpha=0.7, label=label)
    for run_name, n_params, loss in zip(names, params, losses):
        ax.annotate(run_name, (n_params, loss), xytext=(5,5), textcoords='offset points', fontsize=8)
    a, alpha, c = fit
    # Explicit None check: a fitted coefficient of exactly 0.0 is still a fit.
    if a is not None:
        N_fit = np.logspace(np.log10(params.min()), np.log10(params.max()), 100)
        ax.plot(N_fit, power_law(N_fit, a, alpha, c), 'r--', linewidth=2, label=f'Œ±={alpha:.4f}')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Parameters')
    ax.set_ylabel('Val Loss')
    ax.set_title(f'{label} Scaling Law')
    ax.legend()
    ax.grid(alpha=0.3)


def _json_float(value):
    """Convert a fitted coefficient to a plain float, keeping ``None`` as is."""
    return None if value is None else float(value)


# Fit GPT
ga, galpha, gc = _fit_scaling_law('GPT', gpt_params, gpt_losses)

# Fit LSTM
la, lalpha, lc = _fit_scaling_law('LSTM', lstm_params, lstm_losses)

# Plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
_plot_family(ax1, 'GPT', 'blue', gpt_res.keys(), gpt_params, gpt_losses, (ga, galpha, gc))
_plot_family(ax2, 'LSTM', 'green', lstm_res.keys(), lstm_params, lstm_losses, (la, lalpha, lc))

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/scaling_laws.png', dpi=300, bbox_inches='tight')
print(f"‚úÖ Plot saved: {OUTPUT_DIR}/scaling_laws.png\n")

# Save params (None marks a family whose fit failed)
with open(f'{OUTPUT_DIR}/scaling_params.json', 'w') as f:
    json.dump({
        'gpt': {'a': _json_float(ga), 'alpha': _json_float(galpha), 'c': _json_float(gc)},
        'lstm': {'a': _json_float(la), 'alpha': _json_float(lalpha), 'c': _json_float(lc)}
    }, f, indent=2)

# === GENERATION ===
print("üéµ Step 2/3: Generating music samples...\n")

# The run with the lowest recorded validation loss, across both families.
best_name = min(all_results, key=lambda run_name: all_results[run_name]['val_loss'])
best = all_results[best_name]
print(f"Using best model: {best_name} (val_loss={best['val_loss']:.4f})")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
ckpt = torch.load(f"{CHECKPOINT_DIR}/{best_name}_final.pt", map_location=device)

# Rebuild the architecture from the stored experiment config, then load weights.
cfg = best['config']
if best['type'] != 'gpt':
    model = LSTMModel(VOCAB_SIZE, cfg['hidden'], cfg['layers'])
else:
    model = GPTModel(GPTConfig(
        vocab_size=VOCAB_SIZE,
        n_layer=cfg['n_layer'],
        n_embd=cfg['n_embd'],
        n_head=cfg['n_head'],
        block_size=BLOCK_SIZE,
    ))
model = model.to(device)

model.load_state_dict(ckpt['model'])
model.eval()

tokenizer = Tokenizer.from_file(tokenizer_path)

def nucleus_sample(logits, top_p=0.9, temp=0.8):
    """Sample one token id per row with temperature and top-p filtering.

    Args:
        logits: Unnormalised scores of shape ``(vocab,)`` or ``(batch, vocab)``.
        top_p: Cumulative probability mass to keep. The token that crosses the
            threshold is kept, so at least one token always survives.
        temp: Softmax temperature; the logits are divided by it.

    Returns:
        A long tensor of sampled token ids, shape ``(1,)`` for 1-D input and
        ``(batch, 1)`` for 2-D input.
    """
    probs = F.softmax(logits / temp, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cum = torch.cumsum(sorted_probs, dim=-1)

    # Drop a token when the mass of the tokens ranked before it already
    # exceeds top_p (the cumulative mask shifted right by one position).
    drop = torch.zeros_like(cum, dtype=torch.bool)
    drop[..., 1:] = cum[..., :-1] > top_p

    kept = sorted_probs.masked_fill(drop, 0.0)
    # Renormalise each row on its own; a global sum would mix batch rows.
    kept = kept / kept.sum(dim=-1, keepdim=True)

    choice = torch.multinomial(kept, 1)
    # gather maps positions in the sorted order back to vocabulary ids along
    # the last dim; plain indexing would index the batch dim for 2-D input.
    return sorted_idx.gather(-1, choice)

@torch.no_grad()
def generate(prompt, max_tokens=400):
    """Extend ``prompt`` by ``max_tokens`` sampled tokens and decode the result.

    Uses the module-level ``tokenizer``, ``model`` and ``device``; the model
    only ever sees the last ``BLOCK_SIZE`` tokens as context.
    """
    prompt_ids = tokenizer.encode(prompt).ids
    tokens = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    for _step in range(max_tokens):
        context = tokens[:, -BLOCK_SIZE:]
        logits, _loss = model(context)
        # Only the prediction at the final position decides the next token.
        sampled = nucleus_sample(logits[:, -1, :])
        tokens = torch.cat([tokens, sampled], dim=1)
    generated_ids = tokens[0].tolist()
    return tokenizer.decode(generated_ids)

# ABC headers used as generation prompts: tune index, meter (M:) and key (K:).
prompts = [
    "X:1\nM:4/4\nK:C\n",
    "X:1\nM:6/8\nK:G\n",
    "X:1\nM:3/4\nK:D\n",
    "X:1\nM:4/4\nK:Am\n",
    "X:1\nM:2/4\nK:F\n",
    "X:1\nM:4/4\nK:Em\n",
    "X:1\nM:6/8\nK:A\n",
    "X:1\nM:3/4\nK:Bm\n",
    "X:1\nM:4/4\nK:E\n",
    "X:1\nM:2/2\nK:Bb\n",
    "X:1\nM:9/8\nK:D\n",
    "X:1\nM:5/4\nK:Gm\n",
]

samples_dir = f'{OUTPUT_DIR}/generated_samples'
os.makedirs(samples_dir, exist_ok=True)
samples = []

# One sample per prompt, written to a 1-based, zero-padded file name.
for sample_id, prompt in enumerate(tqdm(prompts, desc="Generate"), start=1):
    text = generate(prompt)
    path = f'{samples_dir}/sample_{sample_id:02d}.abc'
    with open(path, 'w') as f:
        f.write(text)
    samples.append({'id': sample_id, 'prompt': prompt, 'text': text, 'path': path})

print(f"‚úÖ Generated {len(samples)} samples\n")

# === MIDI CONVERSION ===
print("üéπ Step 3/3: Converting to MIDI & computing metrics...\n")

# Probe for abc2midi and install it only when the executable cannot be run.
# Catching OSError (which includes FileNotFoundError) instead of everything
# keeps Ctrl-C and unrelated errors from triggering an apt install.
try:
    subprocess.run(['abc2midi', '-h'], capture_output=True)
except OSError:
    os.system('apt-get update -qq && apt-get install -y abcmidi')

def is_valid_abc(text):
    """Return True when ``text`` contains all of the 'X:', 'M:' and 'K:' fields.

    This is a substring check only; it does not parse the ABC notation.
    """
    for header in ('X:', 'M:', 'K:'):
        if header not in text:
            return False
    return True

def to_midi(abc_path, midi_path):
    """Convert ``abc_path`` to MIDI at ``midi_path`` using ``abc2midi``.

    Returns:
        True only when abc2midi exits with status 0 and the output file
        exists; False when the tool is missing, times out (20 s), fails, or
        produces no file.
    """
    try:
        r = subprocess.run(['abc2midi', abc_path, '-o', midi_path], capture_output=True, timeout=20)
    except (OSError, ValueError, subprocess.SubprocessError):
        # Missing executable, bad path argument, or timeout: not converted.
        # Anything else (including Ctrl-C) is allowed to propagate.
        return False
    # A zero exit status alone does not prove that a MIDI file was written.
    return r.returncode == 0 and os.path.exists(midi_path)

midi_dir = f'{OUTPUT_DIR}/generated_midi'
os.makedirs(midi_dir, exist_ok=True)

# Header check on the generated text, independent of MIDI conversion.
valid = sum(1 for s in samples if is_valid_abc(s['text']))

# Try to convert every sample; remember the MIDI path of each success.
converted = 0
for s in samples:
    midi_path = f"{midi_dir}/sample_{s['id']:02d}.mid"
    if to_midi(s['path'], midi_path):
        s['midi'] = midi_path
        converted += 1

n_samples = len(samples)
pct_valid = 100 * valid / n_samples
pct_midi = 100 * converted / n_samples
# Perplexity is exp(validation loss) of the selected model.
perplexity = float(np.exp(best['val_loss']))

metrics = {
    'model': best_name,
    'params': best['params'],
    'val_loss': best['val_loss'],
    'perplexity': perplexity,
    'samples': n_samples,
    'valid_pct': pct_valid,
    'midi_pct': pct_midi
}

with open(f'{OUTPUT_DIR}/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"‚úÖ Perplexity: {perplexity:.2f}")
print(f"‚úÖ Valid ABC: {valid}/{len(samples)} ({pct_valid:.1f}%)")
print(f"‚úÖ MIDI converted: {converted}/{len(samples)} ({pct_midi:.1f}%)")
print(f"\n‚úÖ All outputs saved to: {OUTPUT_DIR}")

## 🎉 COMPLETE!

### Files Created

**In Drive (`/MyDrive/NYU_ML_Project/`):**

📁 `Data/processed_v3/`
- `music_bpe.json` — BPE tokenizer (vocab=5000)
- `train.bin`, `val.bin`, `test.bin` — 98/1/1 split
- `meta.pkl` — token counts

📁 `checkpoints_v3/`
- `gpt_tiny_final.pt` ... `lstm_xl_final.pt` — 9 trained models
- `*_checkpoint.pt` — resume points

📁 `outputs/`
- `scaling_results.json` — all training results
- `scaling_params.json` — fitted α exponents
- `scaling_laws.png` — log-log plots
- `metrics.json` — perplexity, % valid, % MIDI
- `generated_samples/*.abc` — 12 generated songs
- `generated_midi/*.mid` — MIDI files

---

### Next Steps

1. **Download results:** Right-click `outputs/` → Download
2. **Listen to music:** Open `.mid` files in any MIDI player
3. **Write report (6-10 pages):**
   - Intro + motivation
   - Dataset (178K ABC files, 100M+ tokens)
   - Methods (BPE tokenization, 9 models, 1 epoch)
   - Results (scaling laws, α exponents, plots)
   - Analysis (sample quality, MIDI conversion rates)
   - Conclusion
4. **Submit by Dec 15, 2025**

---

### Key Results

Check these files for your report:
- **GPT α:** `outputs/scaling_params.json` → `gpt.alpha`
- **LSTM α:** `outputs/scaling_params.json` → `lstm.alpha`
- **Best model:** `outputs/metrics.json` → `model`
- **Perplexity:** `outputs/metrics.json` → `perplexity`
- **Sample quality:** `outputs/metrics.json` → `valid_pct`, `midi_pct`

---

**Total runtime:** ~20-24 hours (mostly Cell 6 training)