# Scaling Laws & HPO with nanoGPT

This notebook demonstrates how to run scaling studies (e.g., batch size, depth) and visualize the results.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from experiments import ExperimentRunner

%matplotlib inline

## 1. Setup Experiment Runner
We define the base configuration and the output directory.

In [None]:
# No base config file is supplied (empty path); every relevant setting is
# passed explicitly as an override instead.
runner = ExperimentRunner(
    base_config_path='',
    output_root='experiments_out',
)

# Shared overrides that keep each run tiny, for fast testing.
base_params = dict(
    dataset='shakespeare_char',
    device='cpu',  # Use 'cuda' for real runs
    compile=False,
    eval_iters=1,
    log_interval=10,
    max_iters=50,
    block_size=64,
    n_embd=64,
    n_head=4,
    n_layer=2,
)


## 2. Batch Scaling Study
Vary the batch size and observe how the best validation loss changes.

In [None]:
# Batch sizes to sweep on top of the shared base_params overrides.
batch_sizes = [2, 4, 8]
batch_grid = {'batch_size': batch_sizes}
runner.run_grid(grid_name='batch_scaling', base_params=base_params, grid_params=batch_grid)

### Analyze Batch Scaling Results

In [None]:
# Load the results recorded for the 'batch_scaling' grid into a DataFrame.
metrics = runner.load_metrics(grid_name_filter='batch_scaling')
df = pd.DataFrame(metrics)

# Nothing is plotted when no results were loaded.
if not df.empty:
    # Lift the swept value out of each run's config (accessed via .get)
    # into its own column so it can be used as the x axis.
    df['batch_size'] = df['config'].map(lambda cfg: cfg.get('batch_size'))

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=df, x='batch_size', y='best_val_loss', marker='o', ax=ax)
    ax.set_title('Batch Size vs Validation Loss')
    ax.set_xlabel('Batch Size')
    ax.set_ylabel('Best Val Loss')
    ax.grid(True)
    plt.show()

## 3. Depth Scaling Study
Vary `n_layer` (the number of layers) to see how validation loss scales with model depth.

In [None]:
# Layer counts to sweep on top of the shared base_params overrides.
layers = [2, 4]
depth_grid = {'n_layer': layers}
runner.run_grid(grid_name='depth_scaling', base_params=base_params, grid_params=depth_grid)

In [None]:
# Load the results recorded for the 'depth_scaling' grid into a DataFrame.
metrics = runner.load_metrics(grid_name_filter='depth_scaling')
df = pd.DataFrame(metrics)

# Nothing is plotted when no results were loaded.
if not df.empty:
    # Pull the swept layer count out of each run's config (accessed via .get).
    df['n_layer'] = df['config'].apply(lambda c: c.get('n_layer'))

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=df, x='n_layer', y='best_val_loss', marker='o', ax=ax)
    ax.set_title('Depth (Layers) vs Validation Loss')
    # Explicit labels, in the same style as the batch-size plot; without them
    # the axes fall back to the raw column names.
    ax.set_xlabel('Number of Layers')
    ax.set_ylabel('Best Val Loss')
    # Layer counts are discrete, so tick only at the values that were actually
    # run instead of letting matplotlib pick fractional tick positions.
    layer_ticks = sorted(df['n_layer'].dropna().unique())
    if layer_ticks:
        ax.set_xticks(layer_ticks)
    ax.grid(True)
    plt.show()

## 4. HPO: Learning Rate vs Weight Decay
Grid search over two hyperparameters: learning rate and weight decay.

In [None]:
# Candidate values for the two hyperparameters passed to the grid run.
lrs = [1e-3, 6e-4]
wds = [0.0, 0.1]

hpo_grid = {'learning_rate': lrs, 'weight_decay': wds}
runner.run_grid(grid_name='hpo_lr_wd', base_params=base_params, grid_params=hpo_grid)

In [None]:
# Load the results recorded for the 'hpo_lr_wd' grid into a DataFrame.
metrics = runner.load_metrics(grid_name_filter='hpo_lr_wd')
df = pd.DataFrame(metrics)

# Nothing is plotted when no results were loaded.
if not df.empty:
    # Pull both swept hyperparameters out of each run's config (accessed via .get).
    df['learning_rate'] = df['config'].apply(lambda c: c.get('learning_rate'))
    df['weight_decay'] = df['config'].apply(lambda c: c.get('weight_decay'))

    # pivot_table rather than pivot: DataFrame.pivot raises ValueError when the
    # same (learning_rate, weight_decay) pair appears more than once, which
    # would happen if results from a repeated grid run are loaded together.
    # Repeated pairs are averaged into a single cell.
    pivot = df.pivot_table(
        index='learning_rate',
        columns='weight_decay',
        values='best_val_loss',
        aggfunc='mean',
    )

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(pivot, annot=True, fmt='.4f', cmap='viridis_r', ax=ax)
    ax.set_title('HPO: LR vs Weight Decay (Val Loss)')
    ax.set_xlabel('Weight Decay')
    ax.set_ylabel('Learning Rate')
    plt.show()