# Best individual model selection
Load saved artifacts from previous models (XGBoost, LightGBM, Random Forest, Linear Regression), compare validation/test metrics, and record the two top-performing individual models (best and runner-up) under `artifacts/best_individual_model`.

Load artifacts produced by prior notebooks (`artifacts/xgboost`, `artifacts/lightgbm`, `artifacts/random_forest`, `artifacts/linear_regression`), gather metrics, rank by validation RMSE (tie-break on test RMSE), and write a summary/comparison to `artifacts/best_individual_model`.

In [1]:
from pathlib import Path
import json
from datetime import datetime
import pandas as pd
import numpy as np

In [2]:
# Resolve the repository root and the artifact directories used by this notebook.
# A directory counts as the repo root when it contains either marker file below;
# the search starts at the working directory and walks up through its parents.
_MARKERS = (
    'notebooks/model_evaluation_final/artifacts/data/split_config.json',
    'data/master_dataset.parquet',
)
_start = Path.cwd().resolve()
_candidates = [_start, *_start.parents]
_repo = next(
    (
        candidate
        for candidate in _candidates
        if any((candidate / marker).exists() for marker in _MARKERS)
    ),
    None,
)
# Fall back to the working directory when no marker was found on the way up.
REPO_ROOT = _repo if _repo else _start
ARTIFACTS_DIR = REPO_ROOT / 'notebooks/model_evaluation_final/artifacts'
MODEL_FOLDERS = ['xgboost', 'lightgbm', 'random_forest', 'linear_regression']
# Keep only the model folders that are actually present on disk.
EXISTING_FOLDERS = [
    model_dir
    for model_dir in (ARTIFACTS_DIR / folder_name for folder_name in MODEL_FOLDERS)
    if model_dir.exists()
]
BEST_DIR = ARTIFACTS_DIR / 'best_individual_model'
BEST_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR, EXISTING_FOLDERS, BEST_DIR

(PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts'),
 [PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/xgboost'),
  PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/lightgbm'),
  PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/random_forest'),
  PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/linear_regression')],
 PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/best_individual_model'))

In [3]:
# helper functions
def load_json(path: Path):
    """Load a JSON file, returning ``None`` when the file does not exist.

    Args:
        path: Location of the JSON file. A plain string path is accepted too.

    Returns:
        The decoded JSON content, or ``None`` if ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    path = Path(path)
    if not path.exists():
        return None
    # Read as UTF-8 explicitly: the platform default encoding is
    # locale-dependent and can mis-decode non-ASCII content.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def extract_best_from_metrics(metrics: dict):
    """Locate the best model's name and metrics block in a metrics dict.

    Several layouts are tried in order:

    1. an explicit ``'best'`` dict,
    2. a legacy ``'best_metrics'`` dict,
    3. a ``'best_model'`` string naming another key that holds a dict,
    4. a ``'best_name'`` string used the same way,
    5. the first dict-valued entry that contains a ``'val'`` key.

    Returns:
        ``(name, block)`` for the first layout that matches, otherwise
        ``(None, None)`` (also returned when ``metrics`` is ``None``).
    """
    if metrics is None:
        return None, None

    # Explicit blocks. The name lookup order differs between the two layouts,
    # so each block key carries its own ordered list of name keys.
    explicit_layouts = (
        ('best', ('best_model', 'best_name')),
        ('best_metrics', ('best_name', 'best_model')),
    )
    for block_key, name_keys in explicit_layouts:
        candidate = metrics.get(block_key)
        if not isinstance(candidate, dict):
            continue
        for name_key in name_keys:
            label = metrics.get(name_key)
            if label:
                return label, candidate
        # No top-level name: use the block's own 'name', else the block key.
        return (candidate.get('name') or block_key), candidate

    # Pointer layouts: a string entry names the key that holds the block.
    for pointer_key in ('best_model', 'best_name'):
        label = metrics.get(pointer_key)
        if not isinstance(label, str):
            continue
        candidate = metrics.get(label) or metrics.get('best_metrics')
        if isinstance(candidate, dict):
            return label, candidate

    # Last resort: the first dict-valued entry that carries 'val' metrics.
    return next(
        (
            (key, value)
            for key, value in metrics.items()
            if isinstance(value, dict) and 'val' in value
        ),
        (None, None),
    )


def flatten_metrics(folder: Path):
    """Summarise one model folder as a flat comparison row.

    Reads ``metrics.json`` (required) and ``hyperparams.json`` (optional)
    from ``folder``.

    Returns:
        ``None`` when ``metrics.json`` is missing or no best-model block can
        be found in it. Otherwise a tuple ``(row, metrics, hyper)`` where
        ``row`` is the flat summary dict, ``metrics`` is the raw metrics
        content and ``hyper`` is the raw hyperparameter content (``None`` if
        that file is absent).
    """
    metrics = load_json(folder / 'metrics.json')
    if metrics is None:
        return None

    best_name, best_block = extract_best_from_metrics(metrics)
    if best_block is None:
        return None

    split_names = ('val', 'test')
    stat_names = ('rmse', 'mae', 'r2')
    # A missing or empty split is treated as an empty mapping.
    per_split = {split: best_block.get(split, {}) or {} for split in split_names}

    row = {'model_folder': folder.name, 'model_name': best_name}
    for split in split_names:
        for stat in stat_names:
            row[f'{split}_{stat}'] = per_split[split].get(stat)
    row['has_test'] = len(per_split['test']) > 0
    row['source_path'] = str(folder)
    row['best_cv'] = metrics.get('best_cv')

    hyper = load_json(folder / 'hyperparams.json')
    return row, metrics, hyper

In [4]:
# Build one summary row per available model folder and keep the raw
# metrics/hyperparameter payloads keyed by folder name for later cells.
rows = []
details = {}
for folder in EXISTING_FOLDERS:
    extracted = flatten_metrics(folder)
    if extracted is not None:
        row, metrics_raw, hyper_raw = extracted
        rows.append(row)
        details[folder.name] = {'metrics_raw': metrics_raw, 'hyper_raw': hyper_raw}
    else:
        print(f"Skipping {folder.name}: no metrics found")


# Rank by validation RMSE, breaking ties on test RMSE; missing values sort last.
comparison_df = pd.DataFrame()
if rows:
    comparison_df = pd.DataFrame(rows)
if not comparison_df.empty:
    ranked = comparison_df.sort_values(
        by=['val_rmse', 'test_rmse'],
        ascending=[True, True],
        na_position='last',
    )
    comparison_df = ranked.reset_index(drop=True)
comparison_df

Unnamed: 0,model_folder,model_name,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2,has_test,source_path,best_cv
0,xgboost,xgb_tuned,0.009073,0.005583,0.45382,0.00487,0.00412,-0.750092,True,/Users/aayushrijal/Documents/GitHub/volatility...,"{'rmse': {'mean': 0.011617455066421985, 'std':..."
1,lightgbm,lgbm_tuned,0.009286,0.005615,0.427788,0.004717,0.003923,-0.642417,True,/Users/aayushrijal/Documents/GitHub/volatility...,"{'rmse': {'mean': 0.0113725325645134, 'std': 0..."
2,random_forest,rf_baseline,0.009302,0.005954,0.425829,0.005006,0.004405,-0.84931,True,/Users/aayushrijal/Documents/GitHub/volatility...,
3,linear_regression,ridge_base,0.009371,0.005724,0.417381,0.004635,0.003977,-0.585514,True,/Users/aayushrijal/Documents/GitHub/volatility...,


In [5]:
# Take the two best-ranked rows of comparison_df (already sorted by val RMSE,
# then test RMSE) as plain dicts; an empty table yields an empty list.
if not comparison_df.empty:
    top_rows = comparison_df.iloc[:2].to_dict(orient='records')
    print("Top models selected (by val_rmse, tie test_rmse):")
    display(pd.DataFrame(top_rows))
else:
    top_rows = []
    print("No models found to compare.")

top_rows

Top models selected (by val_rmse, tie test_rmse):


Unnamed: 0,model_folder,model_name,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2,has_test,source_path,best_cv
0,xgboost,xgb_tuned,0.009073,0.005583,0.45382,0.00487,0.00412,-0.750092,True,/Users/aayushrijal/Documents/GitHub/volatility...,"{'rmse': {'mean': 0.011617455066421985, 'std':..."
1,lightgbm,lgbm_tuned,0.009286,0.005615,0.427788,0.004717,0.003923,-0.642417,True,/Users/aayushrijal/Documents/GitHub/volatility...,"{'rmse': {'mean': 0.0113725325645134, 'std': 0..."


[{'model_folder': 'xgboost',
  'model_name': 'xgb_tuned',
  'val_rmse': 0.00907279671175083,
  'val_mae': 0.00558345122421248,
  'val_r2': 0.453820241414309,
  'test_rmse': 0.004869681452443808,
  'test_mae': 0.0041195188750506005,
  'test_r2': -0.7500920503923685,
  'has_test': True,
  'source_path': '/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/xgboost',
  'best_cv': {'rmse': {'mean': 0.011617455066421985,
    'std': 0.0030266469104378384},
   'mae': {'mean': 0.007408219298831768, 'std': 0.0009563987357859141},
   'r2': {'mean': 0.14651438041199663, 'std': 0.35973434965695955}}},
 {'model_folder': 'lightgbm',
  'model_name': 'lgbm_tuned',
  'val_rmse': 0.009286497847619708,
  'val_mae': 0.005615033211137025,
  'val_r2': 0.4277877303640396,
  'test_rmse': 0.004717498467452408,
  'test_mae': 0.003922820627835008,
  'test_r2': -0.6424165838684288,
  'has_test': True,
  'source_path': '/Users/aayushrijal/Documents/GitHub/volatility_fo

In [None]:
# persist results to best_individual_model
#
# Writes, under BEST_DIR:
#   - metrics.json           validation metrics of the best and second-best model
#   - comparison.parquet/csv the full ranked comparison table
#   - best_hyperparams.json  raw hyperparameters of those models (when available)
import math
from datetime import timezone

if not top_rows:
    print("No best models to persist.")
else:
    first_row = top_rows[0]
    second_row = top_rows[1] if len(top_rows) > 1 else None

    def _json_safe(value):
        """Map NaN/inf floats to None so the written file is strict JSON."""
        # Metrics missing for a model can come back from the DataFrame as NaN,
        # which json.dump would emit as the non-standard token `NaN`.
        if isinstance(value, float) and not math.isfinite(value):
            return None
        return value

    def build_entry(row):
        """Return the metrics.json entry for one ranked row (None passes through)."""
        if row is None:
            return None
        return {
            'folder': Path(row['source_path']).name,
            'name': row['model_name'],
            'val': {
                'rmse': _json_safe(row.get('val_rmse')),
                'mae': _json_safe(row.get('val_mae')),
                'r2': _json_safe(row.get('val_r2'))
            }
        }

    # Aware UTC clock instead of the deprecated datetime.utcnow(); tzinfo is
    # dropped before formatting so the string keeps its '<iso>Z' shape.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    metrics_payload = {
        'first': build_entry(first_row),
        'second': build_entry(second_row),
        'ranking': {
            'primary': 'val_rmse',
            'secondary': 'test_rmse (if available)'
        },
        'timestamp': utc_now.isoformat() + 'Z'
    }
    with open(BEST_DIR / 'metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics_payload, f, indent=2)

    comparison_df.to_parquet(BEST_DIR / 'comparison.parquet', index=False)
    comparison_df.to_csv(BEST_DIR / 'comparison.csv', index=False)

    hyper_payload = {}

    def fetch_hyper(row, label):
        """Copy the row's raw hyperparameters into hyper_payload under `label`."""
        if row is None:
            return
        folder = Path(row['source_path']).name
        raw = details.get(folder, {})
        if raw.get('hyper_raw') is not None:
            hyper_payload[label] = raw['hyper_raw']

    fetch_hyper(first_row, 'first')
    fetch_hyper(second_row, 'second')
    # Only write the hyperparameter file when at least one model provided them.
    if hyper_payload:
        with open(BEST_DIR / 'best_hyperparams.json', 'w', encoding='utf-8') as f:
            json.dump(hyper_payload, f, indent=2)

    print(f"Saved best model metrics to {BEST_DIR}")

Saved best model metrics to /Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/best_individual_model
