# Notebook 06 — Ensemble & Stacking (Per-Distance)
**RaceDayAI ML Prediction Engine (Plan 07)**

Per-distance meta-learners that combine supervised (NB03), Bayesian (NB04), and neural (NB05)
predictions. Adaptive weighting by data-richness tier. Calibration checks.

**Reads:** `model_predictions_{dist}.csv`, `bayesian_predictions_{dist}.csv`,
`neural_predictions_{dist}.csv`, `athlete_profile.csv`
**Writes:** `ensemble_predictions_70.3.csv`, `ensemble_predictions_140.6.csv`,
`ensemble_evaluation.csv`

In [1]:
import pandas as pd
import numpy as np
import warnings
from pathlib import Path
from time import time
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
import lightgbm as lgb
warnings.filterwarnings('ignore')

BASE = Path('.').resolve().parent
CLEANED = BASE / 'data' / 'cleaned'

MODEL_DISTANCES = ['70.3', '140.6']

## 1. Load All Model Predictions (Per Distance)

In [2]:
dist_data = {}

for DIST in MODEL_DISTANCES:
    print(f"\n--- Loading {DIST} ---")
    data = {}

    # Supervised predictions (from NB03)
    try:
        sup = pd.read_csv(CLEANED / f'model_predictions_{DIST}.csv', low_memory=False)
        data['sup'] = sup
        print(f"  Supervised: {len(sup):,}")
    except FileNotFoundError:
        print(f"  ⚠ model_predictions_{DIST}.csv not found")
        continue

    # Bayesian predictions (from NB04)
    try:
        bayes = pd.read_csv(CLEANED / f'bayesian_predictions_{DIST}.csv', low_memory=False)
        data['bayes'] = bayes
        print(f"  Bayesian: {len(bayes):,}")
    except FileNotFoundError:
        print(f"  ⚠ bayesian_predictions_{DIST}.csv not found — will skip Bayesian features")

    # Neural predictions (from NB05)
    try:
        neural = pd.read_csv(CLEANED / f'neural_predictions_{DIST}.csv', low_memory=False)
        data['neural'] = neural
        print(f"  Neural: {len(neural):,}")
    except FileNotFoundError:
        # Fallback: try combined file
        try:
            neural_all = pd.read_csv(CLEANED / 'neural_predictions.csv', low_memory=False)
            neural = neural_all[neural_all['event_distance'] == DIST].copy()
            data['neural'] = neural
            print(f"  Neural (from combined): {len(neural):,}")
        except FileNotFoundError:
            print(f"  ⚠ neural_predictions not found — will skip Neural features")

    dist_data[DIST] = data

# Athlete profiles for data-richness features
profiles = pd.read_csv(CLEANED / 'athlete_profile.csv',
                        usecols=['athlete_hash', 'total_races', 'consistency_cv'],
                        low_memory=False)
print(f"\nProfiles: {len(profiles):,}")


--- Loading 70.3 ---
  Supervised: 204,598
  Bayesian: 2,197,121
  Neural: 2,197,121

--- Loading 140.6 ---
  Supervised: 230,464
  Bayesian: 1,541,692
  Neural: 1,541,692

Profiles: 1,629,366


## 2. Build Meta-Feature Matrix (Per Distance)

In [3]:
meta_dfs = {}

for DIST in MODEL_DISTANCES:
    if DIST not in dist_data or 'sup' not in dist_data[DIST]:
        continue
    print(f"\n{'='*70}")
    print(f"  META-FEATURES: {DIST}")
    print(f"{'='*70}")

    data = dist_data[DIST]
    meta = data['sup'].copy()

    # Merge Bayesian
    if 'bayes' in data:
        b = data['bayes'][['athlete_hash', 'event_year', 'bayes_pred', 'bayes_std']].copy()
        # Deduplicate if needed
        b = b.drop_duplicates(subset=['athlete_hash', 'event_year'])
        if 'event_year' in meta.columns and 'event_year' in b.columns:
            meta = meta.merge(b, on=['athlete_hash', 'event_year'], how='left')
        else:
            meta = meta.merge(b, on='athlete_hash', how='left')
        print(f"  + Bayesian: {meta['bayes_pred'].notna().sum():,} matched")

    # Merge Neural
    if 'neural' in data:
        n = data['neural'][['athlete_hash', 'neural_pred']].drop_duplicates(subset='athlete_hash')
        meta = meta.merge(n, on='athlete_hash', how='left')
        print(f"  + Neural: {meta['neural_pred'].notna().sum():,} matched")

    # Merge profile features
    meta = meta.merge(profiles, on='athlete_hash', how='left')

    # Compute IQR from quantile columns if present
    if 'q25' in meta.columns and 'q75' in meta.columns:
        meta['quantile_iqr'] = meta['q75'] - meta['q25']

    # Data richness
    meta['n_prior_races'] = meta['total_races'].fillna(0)
    meta['input_confidence'] = pd.cut(meta['n_prior_races'],
                                       bins=[-1, 0, 2, 5, 100],
                                       labels=[0, 1, 2, 3]).astype(float)

    meta_dfs[DIST] = meta
    print(f"  Meta-feature matrix: {len(meta):,} rows")
    print(f"  Columns: {list(meta.columns)}")


  META-FEATURES: 70.3
  + Bayesian: 204,598 matched
  + Neural: 204,598 matched
  Meta-feature matrix: 204,598 rows
  Columns: ['athlete_hash', 'event_distance', 'event_year', 'total_sec', 'pred_xgb', 'pred_lgb', 'pred_cat', 'pred_rf', 'pred_ridge', 'pred_xgb_tuned', 'q05', 'q25', 'q50', 'q75', 'q95', 'bayes_pred', 'bayes_std', 'neural_pred', 'total_races', 'consistency_cv', 'quantile_iqr', 'n_prior_races', 'input_confidence']

  META-FEATURES: 140.6
  + Bayesian: 230,464 matched
  + Neural: 230,464 matched
  Meta-feature matrix: 230,464 rows
  Columns: ['athlete_hash', 'event_distance', 'event_year', 'total_sec', 'pred_xgb', 'pred_lgb', 'pred_cat', 'pred_rf', 'pred_ridge', 'pred_xgb_tuned', 'q05', 'q25', 'q50', 'q75', 'q95', 'bayes_pred', 'bayes_std', 'neural_pred', 'total_races', 'consistency_cv', 'quantile_iqr', 'n_prior_races', 'input_confidence']


## 3. Define Meta-Features (Per Distance)

The stacking meta-model sees individual model predictions as features, plus data-richness signals.

In [4]:
meta_features_by_dist = {}

for DIST in MODEL_DISTANCES:
    if DIST not in meta_dfs:
        continue
    meta = meta_dfs[DIST]

    pred_cols = []
    # Supervised model predictions
    for col in ['pred_xgb', 'pred_lgb', 'pred_cat', 'pred_rf', 'pred_ridge',
                'pred_xgb_tuned', 'pred_chained']:
        if col in meta.columns and meta[col].notna().sum() > 100:
            pred_cols.append(col)

    # Bayesian
    if 'bayes_pred' in meta.columns and meta['bayes_pred'].notna().sum() > 100:
        pred_cols.append('bayes_pred')
    if 'bayes_std' in meta.columns and meta['bayes_std'].notna().sum() > 100:
        pred_cols.append('bayes_std')

    # Neural
    if 'neural_pred' in meta.columns and meta['neural_pred'].notna().sum() > 100:
        pred_cols.append('neural_pred')

    # Quantile features
    if 'q50' in meta.columns:
        pred_cols.append('q50')
    if 'quantile_iqr' in meta.columns:
        pred_cols.append('quantile_iqr')

    # Data richness
    pred_cols.extend(['n_prior_races', 'input_confidence'])

    # Fill NaN
    for col in pred_cols:
        if meta[col].isna().any():
            meta[col] = meta[col].fillna(meta[col].median())

    meta_features_by_dist[DIST] = pred_cols
    print(f"\n{DIST} meta-features ({len(pred_cols)}):")
    for col in pred_cols:
        print(f"  {col}: mean={meta[col].mean():.1f}  std={meta[col].std():.1f}")


70.3 meta-features (13):
  pred_xgb: mean=21203.2  std=3077.5
  pred_lgb: mean=21203.5  std=3055.5
  pred_cat: mean=21204.9  std=3053.2
  pred_rf: mean=21201.5  std=3049.8
  pred_ridge: mean=21024.7  std=2531.7
  pred_xgb_tuned: mean=21205.6  std=3115.5
  bayes_pred: mean=21155.9  std=2514.1
  bayes_std: mean=2581.8  std=0.0
  neural_pred: mean=21324.0  std=2675.7
  q50: mean=21007.9  std=2986.1
  quantile_iqr: mean=1434.1  std=1345.6
  n_prior_races: mean=4.9  std=6.1
  input_confidence: mean=1.8  std=0.8

140.6 meta-features (13):
  pred_xgb: mean=44097.8  std=5871.0
  pred_lgb: mean=44098.4  std=5823.5
  pred_cat: mean=44100.2  std=5814.7
  pred_rf: mean=44096.4  std=5786.9
  pred_ridge: mean=44116.7  std=5168.1
  pred_xgb_tuned: mean=44101.4  std=5896.6
  bayes_pred: mean=44132.9  std=4550.0
  bayes_std: mean=4335.1  std=0.0
  neural_pred: mean=44222.8  std=5181.4
  q50: mean=43924.9  std=5651.5
  quantile_iqr: mean=2576.6  std=2545.2
  n_prior_races: mean=4.4  std=6.5
  input_con

## 4. Train Meta-Learners (Per Distance)

In [5]:
ensemble_results = {}
best_models = {}

for DIST in MODEL_DISTANCES:
    if DIST not in meta_dfs or DIST not in meta_features_by_dist:
        continue
    print(f"\n{'='*70}")
    print(f"  META-LEARNER: {DIST}")
    print(f"{'='*70}")

    meta = meta_dfs[DIST]
    META_FEATURES = meta_features_by_dist[DIST]

    # Random split for meta-learner (grouped by athlete)
    meta_athletes = meta['athlete_hash'].unique()
    rng = np.random.RandomState(42)
    rng.shuffle(meta_athletes)
    split_idx = len(meta_athletes) // 2
    train_ath = set(meta_athletes[:split_idx])
    meta_train = meta[meta['athlete_hash'].isin(train_ath)]
    meta_test = meta[~meta['athlete_hash'].isin(train_ath)]

    X_meta_train = meta_train[META_FEATURES].values
    y_meta_train = meta_train['total_sec'].values
    X_meta_test = meta_test[META_FEATURES].values
    y_meta_test = meta_test['total_sec'].values

    print(f"  Meta-train: {len(meta_train):,} | Meta-test: {len(meta_test):,}")

    # Ridge meta-learner
    scaler_meta = StandardScaler()
    X_mt_sc = scaler_meta.fit_transform(X_meta_train)
    X_mtest_sc = scaler_meta.transform(X_meta_test)

    ridge_meta = Ridge(alpha=1.0)
    ridge_meta.fit(X_mt_sc, y_meta_train)
    pred_ridge_meta = ridge_meta.predict(X_mtest_sc)
    mae_ridge = mean_absolute_error(y_meta_test, pred_ridge_meta)
    r2_ridge = r2_score(y_meta_test, pred_ridge_meta)
    print(f"\n  Ridge: MAE={mae_ridge/60:.1f}min  R²={r2_ridge:.4f}")

    # Ridge weights
    print("    Weights:")
    for feat, w in zip(META_FEATURES, ridge_meta.coef_):
        print(f"      {feat:20s}: {w:.4f}")

    # LightGBM meta-learner
    lgb_meta = lgb.LGBMRegressor(
        n_estimators=200, max_depth=4, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, verbose=-1,
    )
    lgb_meta.fit(X_meta_train, y_meta_train)
    pred_lgb_meta = lgb_meta.predict(X_meta_test)
    mae_lgb = mean_absolute_error(y_meta_test, pred_lgb_meta)
    r2_lgb = r2_score(y_meta_test, pred_lgb_meta)
    print(f"  LightGBM: MAE={mae_lgb/60:.1f}min  R²={r2_lgb:.4f}")

    # LightGBM feature importance
    fi_meta = pd.DataFrame({'feature': META_FEATURES, 'importance': lgb_meta.feature_importances_})
    fi_meta = fi_meta.sort_values('importance', ascending=False)
    print("    Feature importance:")
    for _, row in fi_meta.iterrows():
        print(f"      {row['feature']:20s}: {row['importance']:.0f}")

    # Pick best
    best_meta = 'lgb' if mae_lgb < mae_ridge else 'ridge'
    best_pred = pred_lgb_meta if best_meta == 'lgb' else pred_ridge_meta
    best_mae = min(mae_lgb, mae_ridge)
    print(f"\n  Best meta-learner: {best_meta} (MAE={best_mae/60:.1f}min)")

    best_models[DIST] = {
        'type': best_meta,
        'lgb': lgb_meta, 'ridge': ridge_meta, 'scaler': scaler_meta,
        'meta_test': meta_test, 'y_meta_test': y_meta_test,
        'best_pred': best_pred, 'best_mae': best_mae,
    }

    # Individual model comparisons
    comparisons = {}
    for col in ['pred_xgb', 'pred_lgb', 'pred_cat', 'pred_rf', 'pred_ridge',
                'pred_xgb_tuned', 'pred_chained', 'bayes_pred', 'neural_pred']:
        if col in meta_test.columns and meta_test[col].notna().sum() > 100:
            mae_i = mean_absolute_error(y_meta_test, meta_test[col].values)
            comparisons[col] = mae_i
    comparisons[f'ensemble_{best_meta}'] = best_mae

    ensemble_results[DIST] = comparisons


  META-LEARNER: 70.3
  Meta-train: 102,337 | Meta-test: 102,261

  Ridge: MAE=12.1min  R²=0.8606
    Weights:
      pred_xgb            : -671.3576
      pred_lgb            : 154.4814
      pred_cat            : 399.8085
      pred_rf             : 272.7131
      pred_ridge          : -2.0391
      pred_xgb_tuned      : 3018.9578
      bayes_pred          : -36.5237
      bayes_std           : 0.0000
      neural_pred         : 450.4011
      q50                 : -427.6430
      quantile_iqr        : 10.0300
      n_prior_races       : -37.7927
      input_confidence    : 6.7283
  LightGBM: MAE=12.0min  R²=0.8613
    Feature importance:
      pred_xgb_tuned      : 611
      neural_pred         : 445
      quantile_iqr        : 252
      pred_rf             : 225
      bayes_pred          : 216
      n_prior_races       : 199
      pred_ridge          : 193
      q50                 : 172
      pred_xgb            : 139
      pred_cat            : 96
      pred_lgb            : 89
  

## 5. Compare Ensemble vs Individual Models (Per Distance)

In [6]:
for DIST in MODEL_DISTANCES:
    if DIST not in ensemble_results:
        continue
    print(f"\n{'='*70}")
    print(f"  {DIST}: ENSEMBLE vs INDIVIDUAL MODELS")
    print(f"{'='*70}")

    comparisons = ensemble_results[DIST]
    for name, mae_val in sorted(comparisons.items(), key=lambda x: x[1]):
        marker = " ← ENSEMBLE" if 'ensemble' in name else ""
        print(f"  {name:25s}: MAE={mae_val/60:.1f}min{marker}")

    # Simple average baseline
    meta_test = best_models[DIST]['meta_test']
    y_meta_test = best_models[DIST]['y_meta_test']
    avg_cols = [c for c in ['pred_xgb', 'pred_lgb', 'pred_cat']
                if c in meta_test.columns and meta_test[c].notna().sum() > 100]
    if avg_cols:
        simple_avg = meta_test[avg_cols].mean(axis=1).values
        mae_avg = mean_absolute_error(y_meta_test, simple_avg)
        print(f"  {'simple_average':25s}: MAE={mae_avg/60:.1f}min")


  70.3: ENSEMBLE vs INDIVIDUAL MODELS
  ensemble_lgb             : MAE=12.0min ← ENSEMBLE
  pred_xgb_tuned           : MAE=12.2min
  pred_xgb                 : MAE=12.9min
  pred_rf                  : MAE=13.0min
  pred_lgb                 : MAE=13.5min
  pred_cat                 : MAE=13.6min
  neural_pred              : MAE=18.8min
  pred_ridge               : MAE=27.1min
  bayes_pred               : MAE=28.3min
  simple_average           : MAE=13.1min

  140.6: ENSEMBLE vs INDIVIDUAL MODELS
  ensemble_ridge           : MAE=19.8min ← ENSEMBLE
  pred_xgb_tuned           : MAE=19.8min
  pred_xgb                 : MAE=20.8min
  pred_rf                  : MAE=21.7min
  pred_lgb                 : MAE=22.0min
  pred_cat                 : MAE=22.1min
  neural_pred              : MAE=32.6min
  pred_ridge               : MAE=45.1min
  bayes_pred               : MAE=55.0min
  simple_average           : MAE=21.3min


## 6. Adaptive Weighting by Tier (Per Distance)

In [7]:
for DIST in MODEL_DISTANCES:
    if DIST not in best_models:
        continue
    print(f"\n--- {DIST}: Ensemble by data-richness tier ---")

    bm = best_models[DIST]
    meta_test = bm['meta_test']
    y_meta_test = bm['y_meta_test']
    best_pred = bm['best_pred']

    tiers = {
        'Tier 0 (0 races)': meta_test['n_prior_races'] == 0,
        'Tier 1 (1-2 races)': (meta_test['n_prior_races'] >= 1) & (meta_test['n_prior_races'] <= 2),
        'Tier 2 (3-5 races)': (meta_test['n_prior_races'] >= 3) & (meta_test['n_prior_races'] <= 5),
        'Tier 3 (5+ races)': meta_test['n_prior_races'] > 5,
    }

    for tier_name, mask in tiers.items():
        n = mask.sum()
        if n < 50:
            continue
        mae_tier = mean_absolute_error(y_meta_test[mask], best_pred[mask])
        # Best individual for comparison
        best_indiv = float('inf')
        for col in ['pred_xgb', 'pred_lgb', 'pred_cat']:
            if col in meta_test.columns and meta_test[col].notna().sum() > 100:
                mae_i = mean_absolute_error(y_meta_test[mask], meta_test.loc[mask, col].values)
                best_indiv = min(best_indiv, mae_i)
        improvement = (best_indiv - mae_tier) / best_indiv * 100 if best_indiv < float('inf') else 0
        print(f"  {tier_name:25s}: n={n:>6,}  ensemble={mae_tier/60:.1f}min  "
              f"best_indiv={best_indiv/60:.1f}min  {improvement:+.1f}%")


--- 70.3: Ensemble by data-richness tier ---
  Tier 1 (1-2 races)       : n=47,178  ensemble=4.9min  best_indiv=5.4min  +10.9%
  Tier 2 (3-5 races)       : n=27,411  ensemble=16.2min  best_indiv=17.4min  +6.9%
  Tier 3 (5+ races)        : n=27,672  ensemble=20.1min  best_indiv=21.0min  +4.3%

--- 140.6: Ensemble by data-richness tier ---
  Tier 1 (1-2 races)       : n=60,122  ensemble=7.5min  best_indiv=8.2min  +8.5%
  Tier 2 (3-5 races)       : n=28,683  ensemble=28.9min  best_indiv=30.2min  +4.4%
  Tier 3 (5+ races)        : n=27,190  ensemble=37.5min  best_indiv=38.5min  +2.7%


## 7. Calibration Check — Quantile Coverage (Per Distance)

In [8]:
for DIST in MODEL_DISTANCES:
    if DIST not in best_models:
        continue
    meta_test = best_models[DIST]['meta_test']
    y_meta_test = best_models[DIST]['y_meta_test']

    print(f"\n--- {DIST}: Quantile calibration ---")
    for q_val, q_col in [(5, 'q05'), (25, 'q25'), (50, 'q50'), (75, 'q75'), (95, 'q95')]:
        if q_col in meta_test.columns:
            coverage = (y_meta_test <= meta_test[q_col].values).mean()
            expected = q_val / 100
            status = '✓' if abs(coverage - expected) < 0.03 else '✗'
            print(f"  q{q_val:02d}: expected={expected:.2f}  actual={coverage:.3f}  {status}")


--- 70.3: Quantile calibration ---
  q05: expected=0.05  actual=0.044  ✓
  q25: expected=0.25  actual=0.247  ✓
  q50: expected=0.50  actual=0.500  ✓
  q75: expected=0.75  actual=0.749  ✓
  q95: expected=0.95  actual=0.953  ✓

--- 140.6: Quantile calibration ---
  q05: expected=0.05  actual=0.043  ✓
  q25: expected=0.25  actual=0.249  ✓
  q50: expected=0.50  actual=0.499  ✓
  q75: expected=0.75  actual=0.752  ✓
  q95: expected=0.95  actual=0.954  ✓


## 8. Save Outputs (Per Distance)

In [9]:
all_eval_rows = []

for DIST in MODEL_DISTANCES:
    if DIST not in meta_dfs or DIST not in best_models:
        continue
    print(f"\n--- Saving {DIST} ---")

    meta = meta_dfs[DIST]
    bm = best_models[DIST]
    META_FEATURES = meta_features_by_dist[DIST]

    # Generate ensemble predictions for full dataset
    X_all = meta[META_FEATURES].values
    if bm['type'] == 'lgb':
        meta['ensemble_pred'] = bm['lgb'].predict(X_all)
    else:
        meta['ensemble_pred'] = bm['ridge'].predict(bm['scaler'].transform(X_all))

    # Output columns
    out_cols = ['athlete_hash', 'event_distance', 'total_sec', 'ensemble_pred']
    for col in ['pred_xgb', 'pred_lgb', 'pred_xgb_tuned', 'pred_chained',
                'bayes_pred', 'neural_pred', 'n_prior_races',
                'q05', 'q25', 'q50', 'q75', 'q95']:
        if col in meta.columns:
            out_cols.append(col)

    ensemble_out = meta[[c for c in out_cols if c in meta.columns]].copy()
    fname = f'ensemble_predictions_{DIST}.csv'
    ensemble_out.to_csv(CLEANED / fname, index=False)
    print(f"  {fname}: {len(ensemble_out):,}")

    # Evaluation rows
    for model_name, mae_val in ensemble_results.get(DIST, {}).items():
        all_eval_rows.append({'distance': DIST, 'model': model_name, 'MAE_sec': mae_val,
                              'MAE_min': mae_val/60})

# Combined evaluation summary
if all_eval_rows:
    eval_df = pd.DataFrame(all_eval_rows).sort_values(['distance', 'MAE_sec'])
    eval_df.to_csv(CLEANED / 'ensemble_evaluation.csv', index=False)
    print(f"\nensemble_evaluation.csv: {len(eval_df)} rows")
    print(f"\nFinal rankings:")
    for DIST in MODEL_DISTANCES:
        sub = eval_df[eval_df['distance'] == DIST].head(10)
        if len(sub) == 0:
            continue
        print(f"\n  {DIST}:")
        for _, row in sub.iterrows():
            print(f"    {row['model']:25s}  MAE={row['MAE_min']:.1f}min")

print("\n✅ ENSEMBLE COMPLETE (per-distance)")


--- Saving 70.3 ---
  ensemble_predictions_70.3.csv: 204,598

--- Saving 140.6 ---
  ensemble_predictions_140.6.csv: 230,464

ensemble_evaluation.csv: 18 rows

Final rankings:

  70.3:
    ensemble_lgb               MAE=12.0min
    pred_xgb_tuned             MAE=12.2min
    pred_xgb                   MAE=12.9min
    pred_rf                    MAE=13.0min
    pred_lgb                   MAE=13.5min
    pred_cat                   MAE=13.6min
    neural_pred                MAE=18.8min
    pred_ridge                 MAE=27.1min
    bayes_pred                 MAE=28.3min

  140.6:
    ensemble_ridge             MAE=19.8min
    pred_xgb_tuned             MAE=19.8min
    pred_xgb                   MAE=20.8min
    pred_rf                    MAE=21.7min
    pred_lgb                   MAE=22.0min
    pred_cat                   MAE=22.1min
    neural_pred                MAE=32.6min
    pred_ridge                 MAE=45.1min
    bayes_pred                 MAE=55.0min

✅ ENSEMBLE COMPLETE (per-dist