# Ensemble of top individual models
Load the top-two individual models from `artifacts/best_individual_model`, stack them with a light meta-learner, and save the winner (ensemble vs best single) under `artifacts/best_overall_model`.

### Plan (keep the loose style from 3_xgboost)
- Pull top-two picks from `best_individual_model` (no other folders) and load train/val/test from `artifacts/data` saved by 0_data_prep.ipynb.
- Load their stored predictions, align val/test, and train a Ridge stacker.
- Also try a simple blend (weighted average) on val to see if it beats stacking.
- Compare all candidates against the best individual by val RMSE (ties broken by test RMSE when present), then write the winner's metrics + hyperparams into `best_overall_model`.

In [1]:
from pathlib import Path
import json
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Resolve artifact locations (same lookup as the other notebooks): scan the
# working directory and each of its parents for a known marker file; the first
# directory containing one is treated as the repository root.
_MARKER_FILES = (
    'notebooks/model_evaluation_final/artifacts/best_individual_model/metrics.json',
    'data/master_dataset.parquet',
)
_start = Path.cwd().resolve()
_candidates = [_start, *_start.parents]
_repo = next(
    (
        folder
        for folder in _candidates
        if any((folder / marker).exists() for marker in _MARKER_FILES)
    ),
    None,
)
# No marker found anywhere up the tree: fall back to the working directory.
REPO_ROOT = _start if _repo is None else _repo

ARTIFACTS_DIR = REPO_ROOT / 'notebooks/model_evaluation_final/artifacts'
DATA_DIR = ARTIFACTS_DIR / 'data'
BEST_INDIVIDUAL_DIR = ARTIFACTS_DIR / 'best_individual_model'
BEST_OVERALL_DIR = ARTIFACTS_DIR / 'best_overall_model'
# The output folder is created up front so the final cell can write into it.
BEST_OVERALL_DIR.mkdir(parents=True, exist_ok=True)

BEST_INDIVIDUAL_DIR, BEST_OVERALL_DIR

(PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/best_individual_model'),
 PosixPath('/Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/best_overall_model'))

In [3]:
# Read the 0_data_prep artifacts. They are only printed here as a sanity
# check; no consistency assertion is made against the model predictions.
feature_cols = json.loads((DATA_DIR / 'feature_columns.json').read_text())
split_cfg = json.loads((DATA_DIR / 'split_config.json').read_text())


def _read_target(split_name):
    """Return the target column of ``y_<split_name>.parquet`` as an array."""
    frame = pd.read_parquet(DATA_DIR / f'y_{split_name}.parquet')
    return frame[split_cfg['target_col']].values


y_train = _read_target('train')
y_val = _read_target('val')
X_val_shape = pd.read_parquet(DATA_DIR / 'X_val.parquet').shape
# The test split is optional; an empty array stands in when it is absent.
has_test = (DATA_DIR / 'y_test.parquet').exists()
if has_test:
    y_test = _read_target('test')
else:
    y_test = np.array([])

print('Loaded data from artifacts/data:')
for _label, _value in (
    ('  X_val shape:', X_val_shape),
    ('  y_val len  :', len(y_val)),
    ('  has_test   :', has_test),
):
    print(_label, _value)


Loaded data from artifacts/data:
  X_val shape: (788, 20)
  y_val len  : 788
  has_test   : True


In [4]:
# Read the ranked picks ('first' / 'second') and their tuned hyperparameters
# from best_individual_model. Only the first pick is mandatory.
best_metrics = json.loads((BEST_INDIVIDUAL_DIR / 'metrics.json').read_text())
best_hyper = json.loads((BEST_INDIVIDUAL_DIR / 'best_hyperparams.json').read_text())

top_first, top_second = (best_metrics.get(rank) for rank in ('first', 'second'))
assert top_first is not None, 'No first-ranked model found in best_individual_model/metrics.json'

print('Top picks:')
for _label, _pick in (('  first :', top_first), ('  second:', top_second)):
    print(_label, _pick)
# Cell result: the hyperparameter entries of both picks, for display.
tuple(best_hyper.get(rank) for rank in ('first', 'second'))

Top picks:
  first : {'folder': 'xgboost', 'name': 'xgb_tuned', 'train': {'rmse': 0.008045408197754638, 'mae': 0.005604441560017746, 'r2': 0.7170392862840006}, 'val': {'rmse': 0.00907279671175083, 'mae': 0.00558345122421248, 'r2': 0.453820241414309}, 'test': {'rmse': 0.004869681452443808, 'mae': 0.0041195188750506005, 'r2': -0.7500920503923685}}
  second: {'folder': 'lightgbm', 'name': 'lgbm_tuned', 'train': {'rmse': 0.009036037548994211, 'mae': 0.0056933769244386125, 'r2': 0.6430675534761805}, 'val': {'rmse': 0.009286497847619708, 'mae': 0.005615033211137025, 'r2': 0.4277877303640396}, 'test': {'rmse': 0.004717498467452408, 'mae': 0.003922820627835008, 'r2': -0.6424165838684288}}


({'best_model_name': 'xgb_tuned',
  'best_params': {'colsample_bytree': 0.7,
   'gamma': 0,
   'learning_rate': 0.03,
   'max_depth': 3,
   'min_child_weight': 4,
   'n_estimators': 300,
   'reg_lambda': 1.5,
   'subsample': 0.8}},
 {'best_model_name': 'lgbm_tuned',
  'best_params': {'colsample_bytree': 0.7,
   'learning_rate': 0.03,
   'max_depth': 4,
   'min_child_samples': 40,
   'n_estimators': 300,
   'num_leaves': 31,
   'reg_alpha': 0.1,
   'reg_lambda': 2.0,
   'subsample': 0.7}})

In [5]:
# helper: metrics + IO (keep it simple)
def compute_metrics(y_true, y_pred):
    """Score predictions against targets.

    Returns a dict with keys 'rmse', 'mae' and 'r2', each converted to a
    plain Python float so the result can be passed to ``json.dump``.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }
    return {metric: float(value) for metric, value in scores.items()}

def load_preds(folder_name: str) -> pd.DataFrame:
    """Load the stored predictions of one model folder.

    Reads ``ARTIFACTS_DIR / folder_name / 'preds.parquet'`` and checks that it
    carries the columns the rest of the notebook relies on.

    Args:
        folder_name: Name of the model folder under ``ARTIFACTS_DIR``.

    Returns:
        The predictions table with at least the columns ``split``, ``y_true``
        and ``y_pred``.

    Raises:
        FileNotFoundError: If the predictions file does not exist.
        ValueError: If any of the expected columns is missing.
    """
    preds_path = ARTIFACTS_DIR / folder_name / 'preds.parquet'
    if not preds_path.exists():
        raise FileNotFoundError(f"Predictions file not found for {folder_name}: {preds_path}")
    df = pd.read_parquet(preds_path)
    # Fail here with a clear message instead of a bare KeyError further down
    # when the file does not follow the expected schema.
    missing = [col for col in ('split', 'y_true', 'y_pred') if col not in df.columns]
    if missing:
        raise ValueError(f"Predictions file {preds_path} is missing expected column(s): {missing}")
    return df

def align_split(pred_a: pd.DataFrame, pred_b: pd.DataFrame, split: str):
    """Put two models' predictions for one split side by side.

    Rows are matched purely by position after filtering on ``split``; the only
    consistency checks are equal row counts and numerically equal ``y_true``.

    Args:
        pred_a: Predictions of the first pick (columns split, y_true, y_pred).
        pred_b: Predictions of the second pick (same columns).
        split: Split label to select, e.g. 'val' or 'test'.

    Returns:
        A DataFrame with ``y_true`` and one ``pred_<folder>`` column per model
        (folder names come from the module-level ``top_first`` /
        ``top_second``), or None when either model has no rows for ``split``.

    Raises:
        ValueError: If the row counts or the ``y_true`` values differ.
    """
    a = pred_a[pred_a['split'] == split].reset_index(drop=True)
    b = pred_b[pred_b['split'] == split].reset_index(drop=True)
    if len(a) == 0 or len(b) == 0:
        return None
    if len(a) != len(b):
        raise ValueError(f'Mismatch in {split} rows between models: {len(a)} vs {len(b)}')
    if not np.allclose(a['y_true'], b['y_true']):
        raise ValueError(f'y_true mismatch for split={split}')

    col_a = f"pred_{top_first['folder']}"
    col_b = f"pred_{top_second['folder'] if top_second else 'second'}"
    if col_a == col_b:
        # Both picks share a folder name. Identical dict keys would silently
        # keep only the second model's predictions, so disambiguate by rank.
        col_a, col_b = f"{col_a}_first", f"{col_b}_second"

    return pd.DataFrame({
        'y_true': a['y_true'].values,
        col_a: a['y_pred'].values,
        col_b: b['y_pred'].values,
    })

In [6]:
# Fetch the stored predictions of both picks and line them up per split.
# Without a second pick there is nothing to align, so both tables stay None.
preds_first = load_preds(top_first['folder'])
preds_second = None
if top_second:
    preds_second = load_preds(top_second['folder'])

if preds_second is None:
    val_table = None
    test_table = None
else:
    val_table = align_split(preds_first, preds_second, 'val')
    test_table = align_split(preds_first, preds_second, 'test')

print('val rows:', 0 if val_table is None else len(val_table))
print('test rows:', 0 if test_table is None else len(test_table))
None if val_table is None else val_table.head()

val rows: 788
test rows: 30


Unnamed: 0,y_true,pred_xgboost,pred_lightgbm
0,0.043646,0.03781,0.041518
1,0.044733,0.037102,0.036367
2,0.045174,0.040993,0.04184
3,0.037856,0.035198,0.034689
4,0.038448,0.034968,0.033638


In [7]:
# Stacking: fit a Ridge meta-learner whose inputs are the base models'
# validation predictions and whose target is the validation y_true.
if val_table is None:
    raise ValueError('Validation predictions missing for one of the top models; cannot build ensemble.')

_meta_feature_cols = [c for c in val_table.columns if c.startswith('pred_')]
X_val = val_table[_meta_feature_cols].values
# Rebinds y_val to the targets stored alongside the predictions.
y_val = val_table['y_true'].values
# NOTE(review): alpha=0.1 is applied to the raw, unscaled prediction columns;
# if those are small in magnitude the penalty may dominate the fit - confirm
# the coefficients are not being shrunk excessively.
meta_model = Ridge(alpha=0.1).fit(X_val, y_val)

# NOTE(review): these metrics are in-sample - the stacker is scored on the
# same rows it was fitted on.
ensemble_val_pred = meta_model.predict(X_val)
ensemble_val_metrics = compute_metrics(y_val, ensemble_val_pred)
print('Ensemble (val) metrics:', ensemble_val_metrics)
ensemble_val_metrics

Ensemble (val) metrics: {'rmse': 0.009929750314505598, 'mae': 0.005936287309028706, 'r2': 0.34577083981396606}


{'rmse': 0.009929750314505598,
 'mae': 0.005936287309028706,
 'r2': 0.34577083981396606}

In [8]:
# Weighted-average blend: score w * first + (1 - w) * second on validation
# for w = 0.00, 0.05, ..., 1.00 and keep the weight with the lowest RMSE.
pred_cols = [c for c in val_table.columns if c.startswith('pred_')]
assert len(pred_cols) == 2, 'Expected exactly two prediction columns for blending'
p1 = val_table[pred_cols[0]].values
p2 = val_table[pred_cols[1]].values
weights = np.linspace(0, 1, 21)  # 0.0 to 1.0 step 0.05

# y_val is whatever is bound when this cell runs; the stacking cell rebinds
# it to val_table['y_true'].
_blend_trials = [
    {'weight': float(w), 'metrics': compute_metrics(y_val, w * p1 + (1 - w) * p2)}
    for w in weights
]
# min() returns the first minimum, so ties go to the smallest weight.
best_blend = min(_blend_trials, key=lambda trial: trial['metrics']['rmse'])

blend_weight = best_blend['weight']
blend_val_metrics = best_blend['metrics']
print('Blend (val) weight:', blend_weight)
print('Blend (val) metrics:', blend_val_metrics)
blend_val_metrics

Blend (val) weight: 1.0
Blend (val) metrics: {'rmse': 0.00907279671175083, 'mae': 0.00558345122421248, 'r2': 0.453820241414309}


{'rmse': 0.00907279671175083,
 'mae': 0.00558345122421248,
 'r2': 0.453820241414309}

In [9]:
# Score the already-fitted stacker on the test split; stays None when the
# aligned test table is unavailable.
ensemble_test_metrics = None
if test_table is not None:
    _test_feature_cols = [c for c in test_table.columns if c.startswith('pred_')]
    X_test = test_table[_test_feature_cols].values
    y_test = test_table['y_true'].values
    ensemble_test_pred = meta_model.predict(X_test)
    ensemble_test_metrics = compute_metrics(y_test, ensemble_test_pred)

print('Ensemble (test) metrics:', ensemble_test_metrics)
ensemble_test_metrics

Ensemble (test) metrics: {'rmse': 0.006828471173684929, 'mae': 0.005975153346164495, 'r2': -2.4411749026122562}


{'rmse': 0.006828471173684929,
 'mae': 0.005975153346164495,
 'r2': -2.4411749026122562}

In [10]:
# Apply the validation-selected blend weight, unchanged, to the test split;
# stays None when the aligned test table is unavailable.
blend_test_metrics = None
if test_table is not None:
    p1_test = test_table[pred_cols[0]].values
    p2_test = test_table[pred_cols[1]].values
    blended_test = blend_weight * p1_test + (1 - blend_weight) * p2_test
    _y_test_true = test_table['y_true'].values
    blend_test_metrics = compute_metrics(_y_test_true, blended_test)

print('Blend (test) metrics:', blend_test_metrics)
blend_test_metrics

Blend (test) metrics: {'rmse': 0.004869681452443808, 'mae': 0.0041195188750506005, 'r2': -0.7500920503923685}


{'rmse': 0.004869681452443808,
 'mae': 0.0041195188750506005,
 'r2': -0.7500920503923685}

In [11]:
# Compare the three candidates: best individual model, Ridge stacker, and
# weighted blend.
# NOTE(review): the stacker and the blend weight are both tuned on the
# validation rows, so their val RMSE is in-sample while the individual
# model's is not - keep that in mind when reading the ranking.
def _candidate(name, source, val, test, train=None, **extra):
    """Build one comparison entry with a fixed key order."""
    return {'name': name, 'source': source, 'train': train, 'val': val, 'test': test, **extra}


def _is_better(cur, best):
    """Return True when `cur` should replace `best`.

    Lower val RMSE wins. When the val RMSEs are numerically close, lower test
    RMSE wins, but only if both entries have a test RMSE.
    """
    cur_val, best_val = cur['val']['rmse'], best['val']['rmse']
    if cur_val < best_val:
        return True
    if not np.isclose(cur_val, best_val):
        return False
    cur_test, best_test = cur.get('test'), best.get('test')
    if cur_test is None or best_test is None:
        return False
    cur_rmse, best_rmse = cur_test.get('rmse'), best_test.get('rmse')
    if cur_rmse is None or best_rmse is None:
        return False
    return cur_rmse < best_rmse


candidates = {
    'best_individual': _candidate(
        top_first['name'],
        f"{top_first['folder']}/{top_first['name']}",
        top_first['val'],
        top_first.get('test'),
        train=top_first.get('train'),
    ),
    # train stays None for both ensembles: they are fitted on val only
    'stacked_ensemble': _candidate(
        'stacked_ensemble',
        'stacking_ridge',
        ensemble_val_metrics,
        ensemble_test_metrics,
    ),
    'blended_ensemble': _candidate(
        'blended_ensemble',
        f"blend_weight_{blend_weight}",
        blend_val_metrics,
        blend_test_metrics,
        blend_weight=blend_weight,
    ),
}

# Walk the candidates in insertion order; the incumbent is only replaced by a
# strictly better entry, so earlier candidates win unresolved ties.
best_key = None
for key, entry in candidates.items():
    if best_key is None or _is_better(entry, candidates[best_key]):
        best_key = key

best_entry = candidates[best_key]
print('Comparison (val RMSE):')
for label, info in candidates.items():
    print(f"  {label}: {info['val']['rmse']}")
print('Winner:', best_key)

best_overall_name = best_entry['name']
best_overall_source = best_entry['source']
best_overall_metrics = best_entry['val']
best_overall_train = best_entry.get('train')
best_overall_test = best_entry.get('test')
comparison = {
    'best_individual_val_rmse': candidates['best_individual']['val']['rmse'],
    'stacked_val_rmse': candidates['stacked_ensemble']['val']['rmse'],
    'blended_val_rmse': candidates['blended_ensemble']['val']['rmse'],
    'winner': best_overall_name,
}
comparison

Comparison (val RMSE):
  best_individual: 0.00907279671175083
  stacked_ensemble: 0.009929750314505598
  blended_ensemble: 0.00907279671175083
Winner: best_individual


{'best_individual_val_rmse': 0.00907279671175083,
 'stacked_val_rmse': 0.009929750314505598,
 'blended_val_rmse': 0.00907279671175083,
 'winner': 'xgb_tuned'}

In [12]:
# Persist the winner to best_overall_model: metrics.json, hyperparams.json and
# a comparison.csv with the val RMSE of every candidate.
from datetime import timezone


def _pick_summary(pick):
    """Return the name/source/metrics summary of one individual pick.

    `pick` is an entry from best_individual_model/metrics.json; None (no such
    pick) is passed through as None.
    """
    if pick is None:
        return None
    return {
        'name': pick['name'],
        'source': f"{pick['folder']}/{pick['name']}",
        'train': pick.get('train'),
        'val': pick['val'],
        'test': pick.get('test'),
    }


# datetime.utcnow() is deprecated since Python 3.12; take an aware UTC time
# and render it in the same '<iso>Z' form as before.
timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
is_stacked = best_key == 'stacked_ensemble'
is_blended = best_key == 'blended_ensemble'

# Both ensemble payloads carry the same description of their base models.
_component_metrics = {
    'first': _pick_summary(top_first),
    'second': _pick_summary(top_second),
}
_component_hyperparams = {
    'first': best_hyper.get('first'),
    'second': best_hyper.get('second') if top_second is not None else None,
}

if is_stacked:
    metrics_payload = {
        'best_overall': {
            'type': 'stacked_ensemble',
            'name': best_overall_name,
            'source': best_overall_source,
            'train': None,  # meta-learner uses val only
            'val': ensemble_val_metrics,
            'test': ensemble_test_metrics,
            'components': _component_metrics,
        },
        'timestamp': timestamp,
    }
    hyperparams_payload = {
        'best_overall': {
            'type': 'stacked_ensemble',
            'meta': {
                'type': 'Ridge',
                'params': meta_model.get_params(),
            },
            'components': _component_hyperparams,
        },
        'timestamp': timestamp,
    }
elif is_blended:
    metrics_payload = {
        'best_overall': {
            'type': 'blended_ensemble',
            'name': best_overall_name,
            'source': best_overall_source,
            'blend_weight': blend_weight,
            'train': None,
            'val': blend_val_metrics,
            'test': blend_test_metrics,
            'components': _component_metrics,
        },
        'timestamp': timestamp,
    }
    hyperparams_payload = {
        'best_overall': {
            'type': 'blended_ensemble',
            'blend': {
                'weight': blend_weight,
            },
            'components': _component_hyperparams,
        },
        'timestamp': timestamp,
    }
else:
    # Neither ensemble won: store the first-ranked individual model as-is.
    metrics_payload = {
        'best_overall': {
            'type': 'individual',
            **_pick_summary(top_first),
        },
        'timestamp': timestamp,
    }
    hyperparams_payload = {
        'best_overall': {
            'type': 'individual',
            'hyperparams': best_hyper.get('first'),
        },
        'timestamp': timestamp,
    }

with open(BEST_OVERALL_DIR / 'metrics.json', 'w') as f:
    json.dump(metrics_payload, f, indent=2)
with open(BEST_OVERALL_DIR / 'hyperparams.json', 'w') as f:
    json.dump(hyperparams_payload, f, indent=2)

# One row per candidate, in a fixed order.
_comparison_order = ['best_individual', 'stacked_ensemble', 'blended_ensemble']
comparison_df = pd.DataFrame({
    'candidate': _comparison_order,
    'val_rmse': [candidates[name]['val']['rmse'] for name in _comparison_order],
})
comparison_df.to_csv(BEST_OVERALL_DIR / 'comparison.csv', index=False)

print(f"Saved best-overall bundle to {BEST_OVERALL_DIR}")
metrics_payload

Saved best-overall bundle to /Users/aayushrijal/Documents/GitHub/volatility_forecast/notebooks/model_evaluation_final/artifacts/best_overall_model


{'best_overall': {'type': 'individual',
  'name': 'xgb_tuned',
  'source': 'xgboost/xgb_tuned',
  'train': {'rmse': 0.008045408197754638,
   'mae': 0.005604441560017746,
   'r2': 0.7170392862840006},
  'val': {'rmse': 0.00907279671175083,
   'mae': 0.00558345122421248,
   'r2': 0.453820241414309},
  'test': {'rmse': 0.004869681452443808,
   'mae': 0.0041195188750506005,
   'r2': -0.7500920503923685}},
 'timestamp': '2026-02-03T22:37:49.935064Z'}

### Notes
- Ranking uses validation RMSE as the primary criterion and test RMSE (if present) as the tie-breaker.
- Base model picks come only from `best_individual_model` (top two). Data sanity pulled from `artifacts/data` (0_data_prep output).
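- Two ensemble flavors: Ridge stacker and weighted blend (grid over 0..1). Winner is whichever beats the others on val RMSE (tie-break on test RMSE). Note that both the Ridge stacker and the blend weight are fit on the same validation rows used for ranking, so their val RMSE is in-sample.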
- Adjust Ridge alpha or blend grid if you want finer search.