# Exp0: Clean baseline (thin notebook)

Purpose:
- Quick sanity check of the classical baseline on clean data.
- Uses reusable modules in `src/sir` to avoid code duplication.

How to use:
1) Adjust the config cell (seed, limit, test_size, etc.).
2) Run all cells to compute metrics on a small random subset of the test split (at most `max_test` samples).
3) For full runs and CSV outputs, use:
   `python scripts/exp0_run.py --test-size 0.1 --max-test 200 --n-starts 5`


In [None]:
from pathlib import Path
import sys
import numpy as np

def find_repo_root(start, marker=Path("src") / "sir"):
    """Return the closest directory at or above *start* that contains *marker*.

    Jupyter usually starts the kernel in the notebook's own folder, so the
    current working directory is not necessarily the repository root. Walking
    up the parents lets the `src.sir` imports resolve from a sub-folder too.

    Args:
        start: Directory to begin the search from.
        marker: Relative path that must exist as a directory under the root.

    Returns:
        The first matching ancestor (including *start* itself), or *start*
        unchanged when no ancestor contains *marker*.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    # No marker found: keep the previous behaviour (use the start directory).
    return start


repo_root = find_repo_root(Path.cwd())
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from src.sir.config import DEFAULTS, set_global_seed
from src.sir.datasets import load_sir_pkl, build_Xy_I_only, train_val_test_split
from src.sir.baseline import fit_mse
from src.sir.metrics import per_param_metrics, timing_summary


In [None]:
# Configuration
# -- Reproducibility: one seed drives both the global seeding helper and the
#    local NumPy generator used by the cells below.
seed = 42
set_global_seed(seed)
rng = np.random.default_rng(seed)

# -- Data source and the `limit` passed to the loader.
data_path = DEFAULTS.data_path
limit = 5_000  # reduce for quick sanity checks

# -- Split sizes handed to train_val_test_split (test and validation).
test_size = val_size = 0.10

# -- Baseline fit: n_starts is forwarded to fit_mse for every sample,
#    max_test caps how many test samples are fitted.
n_starts, max_test = 3, 200


In [None]:
# Load the dataset (bounded by `limit`) and build the feature/target arrays
# without normalisation.
data = load_sir_pkl(data_path, limit=limit, rng=rng)
X, y = build_Xy_I_only(data, normalize=None)

# Split using the same generator; the result is indexed by string keys
# such as 'X_test' / 'y_test' in the next cell.
split_options = {
    "test_size": test_size,
    "val_size": val_size,
    "rng": rng,
    "return_indices": True,
}
splits = train_val_test_split(X, y, **split_options)


In [None]:
# Pull the held-out portion of the split.
X_test, y_test = splits['X_test'], splits['y_test']

# Draw at most `max_test` distinct test rows to keep the notebook fast.
n_fit = min(max_test, X_test.shape[0])
idx = rng.choice(X_test.shape[0], size=n_fit, replace=False)
X_fit, y_fit = X_test[idx], y_test[idx]

# Fit each selected sample independently; every fit gets its own generator
# seeded with `seed + position` so results do not depend on the shared `rng`.
fits = [
    fit_mse(sample, n_starts=n_starts, rng=np.random.default_rng(seed + pos))
    for pos, sample in enumerate(X_fit)
]

# Keep the first two fitted parameters and the summed timings of each fit.
preds = np.asarray([result.params[:2] for result in fits])
times = [sum(result.times) for result in fits]

# Per-parameter metrics, extended with a timing summary; the bare name on the
# last line displays the result in the notebook.
metrics = per_param_metrics(y_fit, preds)
metrics.update(timing_summary(np.asarray(times)))
metrics


Run the full benchmark with:

```bash
python scripts/exp0_run.py --test-size 0.1 --max-test 200 --n-starts 5
```
