# Stacking & Blending Ensemble

In [1]:
import os
import sys
import tempfile
import warnings
warnings.filterwarnings('ignore')

sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize

import mlflow
import mlflow.sklearn

from sklearn.base import clone
from sklearn.linear_model import ElasticNet, HuberRegressor, Lasso, Ridge
from lightgbm import LGBMRegressor

from src.config import MLFLOW_TRACKING_URI, DATA_DIR
from src.features.preprocessors import load_dataset
from src.modeling.metrics import calculate_metrics

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print('MLflow tracking URI:', MLFLOW_TRACKING_URI)


[32m2026-02-26 11:30:14.324[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m12[0m - [1mPROJ_ROOT path is: C:\Users\admin\Documents\1WBS\energy_prices[0m


MLflow tracking URI: sqlite:///C:/Users/admin/Documents/1WBS/energy_prices/models/mlflow.db


In [2]:
# Config

BASE_MODEL_RUN_IDS = [
    'd091f1c8342b4c649f86931157f07617',
    'd8deb4fefd374fbfbd0fe86ab0c61fc9',
    'ecbb6e460ff3430fb9bb5411ebd09200',
    'fc43d42dd7ea45cd827823ade033d091',
    'daa9083db9664b42979f1793543e642f',
    'cdde162175474a468a8894876a96ca61',
    '31b03ec78a17495a8f00e19c03a3335b',
    'b1dfd4b6ba58496fa6c3449b59de21dc'
]

EXPERIMENT_NAME = 'ensemble_stacking'

# Three-way split fractions (train = 1 − OOF − test)
ENSEMBLE_TEST_FRACTION = 0.05
ENSEMBLE_OOF_FRACTION  = 0.15

# Trailing window for time-varying blend weights
BLEND_ROLLING_WINDOW = 24 * 150  # hours

# Meta-learner hyperparameters
META_RIDGE_ALPHA       = 1.0
META_LASSO_ALPHA       = 1.0
META_ENET_ALPHA        = 1.0
META_ENET_L1_RATIO     = 0.5
META_HUBER_EPSILON     = 1.35
META_LGBM_N_ESTIMATORS = 100
META_LGBM_LR           = 0.05

RANDOM_STATE = 42

train_frac = 1 - ENSEMBLE_OOF_FRACTION - ENSEMBLE_TEST_FRACTION
print(f'Split:  train={train_frac:.0%}  |  OOF={ENSEMBLE_OOF_FRACTION:.0%}  |  test={ENSEMBLE_TEST_FRACTION:.0%}')

Split:  train=80%  |  OOF=15%  |  test=5%


## Load base models from MLflow

In [3]:
base_models = []

for run_id in BASE_MODEL_RUN_IDS:
    run = mlflow.get_run(run_id)
    params = run.data.params

    dataset_run_id = params['dataset_run_id']
    group_size     = int(params.get('group_size', 1))
    model_class    = params['model_class']

    fitted_pipeline = mlflow.sklearn.load_model(f'runs:/{run_id}/model')
    X, y_df, meta = load_dataset(run_id=dataset_run_id)
    y_array = y_df.values

    base_models.append({
        'run_id':           run_id,
        'model_class':      model_class,
        'group_size':       group_size,
        'fitted_pipeline':  fitted_pipeline,
        'X':                X,
        'y_array':          y_array,
        'y_df':             y_df,
        'index':            X.index,
    })
    print(f'{run_id[:8]}: {model_class}, gs={group_size}, X={X.shape}')

print(f'\nLoaded {len(base_models)} base models.')

2026/02/26 11:30:15 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/26 11:30:15 INFO alembic.runtime.migration: Will assume non-transactional DDL.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:15.930[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=9be233856ef149d193d72d77536d1534[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:16.472[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(4055, 159), y=(4055, 24)[0m


d091f1c8: MultiOutputRegressor, gs=1, X=(4055, 159)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:16.650[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=9be233856ef149d193d72d77536d1534[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:16.916[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(4055, 159), y=(4055, 24)[0m


d8deb4fe: MultiOutputRegressor, gs=1, X=(4055, 159)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:17.951[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=247ccd98f2614cc5aa0bf834c1f1835e[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:18.207[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(97320, 85), y=(97320, 1)[0m


ecbb6e46: XGBRegressor, gs=24, X=(97320, 85)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:18.733[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=247ccd98f2614cc5aa0bf834c1f1835e[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:19.074[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(97320, 85), y=(97320, 1)[0m


fc43d42d: XGBRegressor, gs=24, X=(97320, 85)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:19.935[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=8f6fb5a7566d456eae9e3ba24e7e3a9e[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:20.310[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(97320, 61), y=(97320, 1)[0m


daa9083d: CatBoostRegressor, gs=24, X=(97320, 61)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:20.881[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=93706fa242c44a10acda359c4042c19c[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:21.151[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(97320, 30), y=(97320, 1)[0m


cdde1621: CatBoostRegressor, gs=24, X=(97320, 30)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:21.958[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=8f6fb5a7566d456eae9e3ba24e7e3a9e[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:22.432[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(97320, 61), y=(97320, 1)[0m


31b03ec7: LGBMRegressor, gs=24, X=(97320, 61)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2026-02-26 11:30:23.233[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m159[0m - [1mLoading dataset from run_id=93706fa242c44a10acda359c4042c19c[0m


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

[32m2026-02-26 11:30:23.497[0m | [1mINFO    [0m | [36msrc.features.preprocessors[0m:[36mload_dataset[0m:[36m207[0m - [1mLoaded dataset: X=(97320, 30), y=(97320, 1)[0m


b1dfd4b6: LGBMRegressor, gs=24, X=(97320, 30)

Loaded 8 base models.


## Prediction flattening helpers

In [4]:
def flatten_predictions(preds, index, group_size):
    """Convert model predictions to a hourly pd.Series."""
    if isinstance(preds, np.ndarray) and preds.ndim == 2 and preds.shape[1] == 1:
        preds = preds.ravel()

    if group_size == 24:
        return pd.Series(preds, index=index)

    elif group_size == 1:
        if not (isinstance(preds, np.ndarray) and preds.ndim == 2
                and preds.shape[1] == 24):
            raise ValueError(
                f'Expected (N_days, 24) for group_size=1, '
                f'got shape {getattr(preds, "shape", type(preds))}'
            )
        hourly_timestamps = []
        hourly_values = []
        for i, day in enumerate(index):
            for h in range(24):
                hourly_timestamps.append(day + pd.Timedelta(hours=h))
                hourly_values.append(preds[i, h])
        ts_index = pd.DatetimeIndex(hourly_timestamps)
        if ts_index.tz is None:
            ts_index = ts_index.tz_localize('UTC')
        return pd.Series(hourly_values, index=ts_index)

    else:
        raise ValueError(f'Unexpected group_size={group_size}. Expected 1 or 24.')


def build_full_y_series(model_info):
    """Build a full hourly y Series from a base model's dataset."""
    y_df       = model_info['y_df']
    index      = model_info['index']
    group_size = model_info['group_size']

    if group_size == 24:
        return pd.Series(y_df.iloc[:, 0].values, index=index)
    else:
        return flatten_predictions(y_df.values, index, group_size=1)


for model_info in base_models:
    model_info['y_series_full'] = build_full_y_series(model_info)
    ys = model_info['y_series_full']
    if ys.index.duplicated().any():
        model_info['y_series_full'] = ys[~ys.index.duplicated(keep='first')]

print(f'Built y_series_full for {len(base_models)} models, '
      f'length={len(base_models[0]["y_series_full"])}')

Built y_series_full for 8 models, length=97309


## Generate OOF and test predictions

In [5]:
for model_info in base_models:
    run_id          = model_info['run_id']
    group_size      = model_info['group_size']
    X               = model_info['X']
    y_array         = model_info['y_array']
    fitted_pipeline = model_info['fitted_pipeline']

    # Three-way split (group-aligned boundaries)
    oof_start_idx  = int(len(X) * (1 - ENSEMBLE_OOF_FRACTION - ENSEMBLE_TEST_FRACTION))
    oof_start_idx  = (oof_start_idx // group_size) * group_size

    test_start_idx = int(len(X) * (1 - ENSEMBLE_TEST_FRACTION))
    test_start_idx = (test_start_idx // group_size) * group_size

    X_train = X.iloc[:oof_start_idx]
    X_oof   = X.iloc[oof_start_idx:test_start_idx]
    X_test  = X.iloc[test_start_idx:]

    y_train = y_array[:oof_start_idx]
    y_oof   = y_array[oof_start_idx:test_start_idx]

    model_info['X_train'] = X_train
    model_info['X_oof']   = X_oof
    model_info['X_test']  = X_test

    # OOF predictions: clone pipeline, refit on train only, predict on OOF
    cloned = clone(fitted_pipeline)
    cloned.fit(X_train, y_train)
    oof_preds  = cloned.predict(X_oof)
    oof_series = flatten_predictions(oof_preds, X_oof.index, group_size)
    if oof_series.index.duplicated().any():
        oof_series = oof_series[~oof_series.index.duplicated(keep='first')]

    # Test predictions: original pipeline (trained on full training set)
    test_preds  = fitted_pipeline.predict(X_test)
    test_series = flatten_predictions(test_preds, X_test.index, group_size)
    if test_series.index.duplicated().any():
        test_series = test_series[~test_series.index.duplicated(keep='first')]

    model_info['oof_series']  = oof_series
    model_info['test_series'] = test_series

    print(f'{run_id[:8]}: OOF={len(oof_series)}h, test={len(test_series)}h')

d091f1c8: OOF=14614h, test=4872h
d8deb4fe: OOF=14614h, test=4872h
ecbb6e46: OOF=14590h, test=4872h
fc43d42d: OOF=14590h, test=4872h
daa9083d: OOF=14590h, test=4872h
cdde1621: OOF=14590h, test=4872h
31b03ec7: OOF=14590h, test=4872h
b1dfd4b6: OOF=14590h, test=4872h


## Align predictions

In [6]:
for m in base_models:
    for key in ('oof_series', 'test_series'):
        s = m[key]
        n_dup = s.index.duplicated().sum()
        if n_dup > 0:
            print(f"  {m['run_id'][:8]} {key}: dropping {n_dup} duplicate timestamp(s)")
            m[key] = s[~s.index.duplicated(keep='first')]

print('Duplicate check done.')


Duplicate check done.


In [7]:
def _to_utc(s: pd.Series) -> pd.Series:
    """Normalise a Series to UTC and drop any remaining duplicate timestamps."""
    if s.index.tz is not None:
        s = s.tz_convert('UTC')
    else:
        s = s.tz_localize('UTC')
    if s.index.duplicated().any():
        s = s[~s.index.duplicated(keep='first')]
    return s


# Normalise all prediction and target series to UTC before alignment.
# Hourly models carry Europe/Berlin (CET/CEST), daily-pivot models may produce
# a different tz object from Timedelta arithmetic; mixing them confuses pandas.
for m in base_models:
    m['oof_series']    = _to_utc(m['oof_series'])
    m['test_series']   = _to_utc(m['test_series'])
    m['y_series_full'] = _to_utc(m['y_series_full'])

# Inner join: keep only rows where ALL models have OOF predictions
oof_df = pd.DataFrame(
    {m['run_id']: m['oof_series'] for m in base_models}
).dropna()

# Inner join for test predictions
test_df = pd.DataFrame(
    {m['run_id']: m['test_series'] for m in base_models}
).dropna()

print(f'OOF aligned : {oof_df.shape}  '
      f'{oof_df.index[0]} -> {oof_df.index[-1]}')
print(f'Test aligned: {test_df.shape}  '
      f'{test_df.index[0]} -> {test_df.index[-1]}')


def get_actual_prices_at(timestamps):
    """Return actual energy prices (EUR/MWh) at the requested hourly timestamps."""
    sorted_models = sorted(
        base_models, key=lambda m: len(m['y_series_full']), reverse=True
    )
    for m in sorted_models:
        aligned = m['y_series_full'].reindex(timestamps)
        if aligned.isna().sum() == 0:
            return aligned.values

    # Fallback: mean across all models' y series
    combined = pd.concat(
        [m['y_series_full'] for m in base_models], axis=1
    ).mean(axis=1)
    result = combined.reindex(timestamps)
    n_missing = int(result.isna().sum())
    if n_missing > 0:
        print(f'Warning: {n_missing} timestamps have no actual price.')
    return result.values


y_oof_true  = get_actual_prices_at(oof_df.index)
y_test_true = get_actual_prices_at(test_df.index)

print(f'\ny_oof_true : {y_oof_true.shape},  NaNs: {np.isnan(y_oof_true).sum()}')
print(f'y_test_true: {y_test_true.shape}, NaNs: {np.isnan(y_test_true).sum()}')


OOF aligned : (14589, 8)  2023-11-22 23:00:00+00:00 -> 2025-07-22 21:00:00+00:00
Test aligned: (4871, 8)  2025-07-22 22:00:00+00:00 -> 2026-02-10 22:00:00+00:00

y_oof_true : (14589,),  NaNs: 0
y_test_true: (4871,), NaNs: 0


## Blending

In [8]:
n_models = len(base_models)

# Combine OOF + test for leaky upper-bound weights
combined_df     = pd.concat([oof_df, test_df])
y_combined_true = np.concatenate([y_oof_true, y_test_true])

_x0     = np.ones(n_models) / n_models
_bounds = [(0, 1)] * n_models
_cons   = [{'type': 'eq', 'fun': lambda w: w.sum() - 1}]


def make_rmse_obj(vals, y):
    def _obj(w):
        return np.sqrt(np.mean(((vals * w).sum(axis=1) - y) ** 2))
    return _obj


def make_mae_obj(vals, y):
    def _obj(w):
        return np.mean(np.abs((vals * w).sum(axis=1) - y))
    return _obj


# RMSE-optimised (leaky)
res = minimize(make_rmse_obj(combined_df.values, y_combined_true),
               x0=_x0, method='SLSQP', bounds=_bounds, constraints=_cons)
blend_weights     = res.x
y_blend_pred      = (test_df.values * blend_weights).sum(axis=1)
blend_metrics     = calculate_metrics(y_test_true, y_blend_pred)

# MAE-optimised (leaky)
res_mae = minimize(make_mae_obj(combined_df.values, y_combined_true),
                   x0=_x0, method='SLSQP', bounds=_bounds, constraints=_cons)
blend_weights_mae = res_mae.x
y_blend_mae_pred  = (test_df.values * blend_weights_mae).sum(axis=1)
blend_mae_metrics = calculate_metrics(y_test_true, y_blend_mae_pred)

print('Leaky RMSE blend:', {k: f'{v:.4f}' for k, v in blend_metrics.items()})
print('Leaky MAE  blend:', {k: f'{v:.4f}' for k, v in blend_mae_metrics.items()})

Leaky RMSE blend: {'rmse': '18.2913', 'mae': '10.6756', 'me': '-0.6924', 'r2': '0.8416'}
Leaky MAE  blend: {'rmse': '18.2102', 'mae': '10.5312', 'me': '-0.8310', 'r2': '0.8430'}


### Leakage-free blend (OOF only)

In [9]:
_x0_lf   = np.ones(n_models) / n_models
_bnd_lf  = [(0, 1)] * n_models
_cons_lf = [{'type': 'eq', 'fun': lambda w: w.sum() - 1}]


def blend_oof_rmse(weights):
    return np.sqrt(np.mean(((oof_df.values * weights).sum(axis=1) - y_oof_true) ** 2))


def blend_oof_mae(weights):
    return np.mean(np.abs((oof_df.values * weights).sum(axis=1) - y_oof_true))


# RMSE-optimised (leakage-free)
res_lf = minimize(blend_oof_rmse, x0=_x0_lf, method='SLSQP',
                  bounds=_bnd_lf, constraints=_cons_lf)
blend_weights_lf     = res_lf.x
y_blend_lf_pred      = (test_df.values * blend_weights_lf).sum(axis=1)
blend_lf_metrics     = calculate_metrics(y_test_true, y_blend_lf_pred)
blend_lf_oof_metrics = calculate_metrics(y_oof_true,
                                          (oof_df.values * blend_weights_lf).sum(axis=1))

# MAE-optimised (leakage-free)
res_lf_mae = minimize(blend_oof_mae, x0=_x0_lf, method='SLSQP',
                      bounds=_bnd_lf, constraints=_cons_lf)
blend_weights_lf_mae     = res_lf_mae.x
y_blend_lf_mae_pred      = (test_df.values * blend_weights_lf_mae).sum(axis=1)
blend_lf_mae_metrics     = calculate_metrics(y_test_true, y_blend_lf_mae_pred)
blend_lf_mae_oof_metrics = calculate_metrics(y_oof_true,
                                              (oof_df.values * blend_weights_lf_mae).sum(axis=1))

print('LF RMSE — OOF:', {k: f'{v:.4f}' for k, v in blend_lf_oof_metrics.items()})
print('LF RMSE — Test:', {k: f'{v:.4f}' for k, v in blend_lf_metrics.items()})
print('LF MAE  — OOF:', {k: f'{v:.4f}' for k, v in blend_lf_mae_oof_metrics.items()})
print('LF MAE  — Test:', {k: f'{v:.4f}' for k, v in blend_lf_mae_metrics.items()})

LF RMSE — OOF: {'rmse': '24.7070', 'mae': '13.4467', 'me': '3.1905', 'r2': '0.7894'}
LF RMSE — Test: {'rmse': '18.7589', 'mae': '11.0095', 'me': '-0.6307', 'r2': '0.8334'}
LF MAE  — OOF: {'rmse': '24.8655', 'mae': '13.2310', 'me': '2.7210', 'r2': '0.7867'}
LF MAE  — Test: {'rmse': '18.4340', 'mae': '10.7368', 'me': '-0.6292', 'r2': '0.8391'}


### Time-varying blend (recent OOF performance)

In [10]:
# Inverse-MAE weights from trailing OOF window
window      = min(BLEND_ROLLING_WINDOW, len(oof_df))
oof_tail    = oof_df.iloc[-window:]
y_tail_true = y_oof_true[-window:]

recent_mae       = np.abs(oof_tail.values - y_tail_true[:, np.newaxis]).mean(axis=0)
inv_mae          = 1.0 / (recent_mae + 1e-6)
blend_weights_tv = inv_mae / inv_mae.sum()

y_blend_tv_pred  = (test_df.values * blend_weights_tv).sum(axis=1)
blend_tv_metrics = calculate_metrics(y_test_true, y_blend_tv_pred)

print(f'Time-varying blend ({window/24:.0f}-day window):',
      {k: f'{v:.4f}' for k, v in blend_tv_metrics.items()})

Time-varying blend (150-day window): {'rmse': '18.0223', 'mae': '10.2924', 'me': '-1.3482', 'r2': '0.8462'}


### Greedy model addition (time-varying blend)

In [14]:
# Per-model OOF MAE over the trailing window
model_oof_mae = {}
for m in base_models:
    rid = m['run_id']
    mae = np.abs(oof_tail[rid].values - y_tail_true).mean()
    model_oof_mae[rid] = mae

# Sort models best (lowest MAE) to worst
ranked_ids = sorted(model_oof_mae, key=model_oof_mae.get)

# Map run_id -> short label
labels = {m['run_id']: f"{m['model_class']}({m['run_id'][:8]})" for m in base_models}

print(f'Model ranking by OOF MAE ({window/24:.0f}-day window):\n')
for i, rid in enumerate(ranked_ids, 1):
    print(f'  {i}. {labels[rid]:45s} MAE={model_oof_mae[rid]:.4f}')

# Incrementally add models and compute time-varying blend
print(f'\nGreedy addition (test-set metrics):')
print(f'{"N":>3}  {"Added model":45s}  {"RMSE":>8}  {"MAE":>8}  {"R2":>8}')
print('-' * 80)

for n in range(1, len(ranked_ids) + 1):
    subset_ids = ranked_ids[:n]
    oof_sub = oof_tail[subset_ids].values
    test_sub = test_df[subset_ids].values
    y_sub = y_tail_true

    sub_mae = np.abs(oof_sub - y_sub[:, np.newaxis]).mean(axis=0)
    sub_inv = 1.0 / (sub_mae + 1e-6)
    sub_w = sub_inv / sub_inv.sum()

    y_pred = (test_sub * sub_w).sum(axis=1)
    m = calculate_metrics(y_test_true, y_pred)
    added = labels[ranked_ids[n - 1]]
    print(f'{n:3d}  {added:45s}  {m["rmse"]:8.4f}  {m["mae"]:8.4f}  {m["r2"]:8.4f}')

Model ranking by OOF MAE (150-day window):

  1. MultiOutputRegressor(d091f1c8)                MAE=14.5427
  2. CatBoostRegressor(cdde1621)                   MAE=15.5628
  3. LGBMRegressor(b1dfd4b6)                       MAE=16.1601
  4. CatBoostRegressor(daa9083d)                   MAE=16.4702
  5. MultiOutputRegressor(d8deb4fe)                MAE=16.5434
  6. LGBMRegressor(31b03ec7)                       MAE=17.1583
  7. XGBRegressor(ecbb6e46)                        MAE=17.7814
  8. XGBRegressor(fc43d42d)                        MAE=19.6132

Greedy addition (test-set metrics):
  N  Added model                                        RMSE       MAE        R2
--------------------------------------------------------------------------------
  1  MultiOutputRegressor(d091f1c8)                  21.4958   13.4450    0.7812
  2  CatBoostRegressor(cdde1621)                     18.3617   10.9633    0.8404
  3  LGBMRegressor(b1dfd4b6)                         17.8550   10.4536    0.8491
  4  CatBo

## Stacking

In [11]:
stacking_results = {}

# Ridge (positive=True available since sklearn 1.1)
try:
    ridge_meta = Ridge(alpha=META_RIDGE_ALPHA, positive=True)
    ridge_meta.fit(oof_df.values, y_oof_true)
except TypeError:
    ridge_meta = Ridge(alpha=META_RIDGE_ALPHA)
    ridge_meta.fit(oof_df.values, y_oof_true)

y_ridge_pred  = ridge_meta.predict(test_df.values)
ridge_metrics = calculate_metrics(y_test_true, y_ridge_pred)
stacking_results['Stack (Ridge)'] = {
    'metrics': ridge_metrics, 'model': ridge_meta, 'predictions': y_ridge_pred,
}
print('Ridge:', {k: f'{v:.4f}' for k, v in ridge_metrics.items()})

# Lasso
lasso_meta = Lasso(alpha=META_LASSO_ALPHA, max_iter=10_000)
lasso_meta.fit(oof_df.values, y_oof_true)

y_lasso_pred  = lasso_meta.predict(test_df.values)
lasso_metrics = calculate_metrics(y_test_true, y_lasso_pred)
stacking_results['Stack (Lasso)'] = {
    'metrics': lasso_metrics, 'model': lasso_meta, 'predictions': y_lasso_pred,
}
print('Lasso:', {k: f'{v:.4f}' for k, v in lasso_metrics.items()})

# ElasticNet
enet_meta = ElasticNet(alpha=META_ENET_ALPHA, l1_ratio=META_ENET_L1_RATIO, max_iter=10_000)
enet_meta.fit(oof_df.values, y_oof_true)

y_enet_pred  = enet_meta.predict(test_df.values)
enet_metrics = calculate_metrics(y_test_true, y_enet_pred)
stacking_results['Stack (ElasticNet)'] = {
    'metrics': enet_metrics, 'model': enet_meta, 'predictions': y_enet_pred,
}
print('ElasticNet:', {k: f'{v:.4f}' for k, v in enet_metrics.items()})

# Huber (robust to price spikes)
huber_meta = HuberRegressor(epsilon=META_HUBER_EPSILON, max_iter=500)
huber_meta.fit(oof_df.values, y_oof_true)

y_huber_pred  = huber_meta.predict(test_df.values)
huber_metrics = calculate_metrics(y_test_true, y_huber_pred)
stacking_results['Stack (Huber)'] = {
    'metrics': huber_metrics, 'model': huber_meta, 'predictions': y_huber_pred,
}
print('Huber:', {k: f'{v:.4f}' for k, v in huber_metrics.items()})

# LightGBM (shallow trees to limit overfitting on OOF)
lgbm_meta = LGBMRegressor(
    n_estimators=META_LGBM_N_ESTIMATORS,
    learning_rate=META_LGBM_LR,
    max_depth=3,
    random_state=RANDOM_STATE,
    verbose=-1,
)
lgbm_meta.fit(oof_df.values, y_oof_true)

y_lgbm_pred  = lgbm_meta.predict(test_df.values)
lgbm_metrics = calculate_metrics(y_test_true, y_lgbm_pred)
stacking_results['Stack (LightGBM)'] = {
    'metrics': lgbm_metrics, 'model': lgbm_meta, 'predictions': y_lgbm_pred,
}
print('LightGBM:', {k: f'{v:.4f}' for k, v in lgbm_metrics.items()})

Ridge: {'rmse': '18.9769', 'mae': '11.7342', 'me': '-4.7112', 'r2': '0.8295'}
Lasso: {'rmse': '19.1526', 'mae': '11.8759', 'me': '-5.8581', 'r2': '0.8263'}
ElasticNet: {'rmse': '19.1692', 'mae': '11.8988', 'me': '-5.9209', 'r2': '0.8260'}
Huber: {'rmse': '19.6825', 'mae': '11.8917', 'me': '-7.5049', 'r2': '0.8166'}
LightGBM: {'rmse': '20.4502', 'mae': '12.2794', 'me': '-5.1741', 'r2': '0.8020'}


## Results comparison

In [12]:
rows = []

# Base model results (from MLflow, their own test period)
for m in base_models:
    run = mlflow.get_run(m['run_id'])
    rm  = run.data.metrics
    rows.append({
        'Model': f"{m['model_class']} ({m['run_id'][:8]})",
        'RMSE': rm.get('rmse', float('nan')),
        'MAE':  rm.get('mae',  float('nan')),
        'R2':   rm.get('r2',   float('nan')),
        'Note': 'own test period',
    })

# Leaky blends (OOF + test)
for label, metrics in [
    ('Blend RMSE (leaky)',  blend_metrics),
    ('Blend MAE (leaky)',   blend_mae_metrics),
]:
    rows.append({'Model': label, 'RMSE': metrics['rmse'], 'MAE': metrics['mae'],
                 'R2': metrics['r2'], 'Note': 'leakage'})

# Leakage-free blends
for label, metrics in [
    ('Blend RMSE (LF)',     blend_lf_metrics),
    ('Blend MAE (LF)',      blend_lf_mae_metrics),
    ('Blend time-varying',  blend_tv_metrics),
]:
    rows.append({'Model': label, 'RMSE': metrics['rmse'], 'MAE': metrics['mae'],
                 'R2': metrics['r2'], 'Note': 'ensemble test'})

# Stacking
for name, res in stacking_results.items():
    mm = res['metrics']
    rows.append({'Model': name, 'RMSE': mm['rmse'], 'MAE': mm['mae'],
                 'R2': mm['r2'], 'Note': 'ensemble test'})

results_df = pd.DataFrame(rows).sort_values('RMSE').reset_index(drop=True)
results_df.style.format({'RMSE': '{:.2f}', 'MAE': '{:.2f}', 'R2': '{:.4f}'})

Unnamed: 0,Model,RMSE,MAE,R2,Note
0,Blend time-varying,18.02,10.29,0.8462,ensemble test
1,Blend MAE (leaky),18.21,10.53,0.843,leakage
2,Blend RMSE (leaky),18.29,10.68,0.8416,leakage
3,Blend MAE (LF),18.43,10.74,0.8391,ensemble test
4,Blend RMSE (LF),18.76,11.01,0.8334,ensemble test
5,Stack (Ridge),18.98,11.73,0.8295,ensemble test
6,Stack (Lasso),19.15,11.88,0.8263,ensemble test
7,Stack (ElasticNet),19.17,11.9,0.826,ensemble test
8,Stack (Huber),19.68,11.89,0.8166,ensemble test
9,LGBMRegressor (b1dfd4b6),20.06,11.87,0.8452,own test period


## Log to MLflow

In [13]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

base_run_ids_str       = ','.join(BASE_MODEL_RUN_IDS)
base_model_classes_str = ','.join([m['model_class'] for m in base_models])

_split_params = {
    'ensemble_train_fraction': 1 - ENSEMBLE_OOF_FRACTION - ENSEMBLE_TEST_FRACTION,
    'ensemble_oof_fraction':   ENSEMBLE_OOF_FRACTION,
    'ensemble_test_fraction':  ENSEMBLE_TEST_FRACTION,
    'n_base_models':           len(BASE_MODEL_RUN_IDS),
    'base_model_run_ids':      base_run_ids_str,
}


def _log_npy_artifact(arr, filename, subdir='predictions'):
    with tempfile.NamedTemporaryFile(suffix='.npy', delete=False) as f:
        tmp = f.name
    np.save(tmp, arr)
    mlflow.log_artifact(tmp, artifact_path=subdir)
    os.unlink(tmp)


# Log blending runs
blend_run_specs = [
    ('ensemble_blending_rmse',   'blending_leaky',  'OOF+test', 'RMSE',     blend_metrics,         blend_weights,        y_blend_pred),
    ('ensemble_blending_mae',    'blending_leaky',  'OOF+test', 'MAE',      blend_mae_metrics,     blend_weights_mae,    y_blend_mae_pred),
    ('ensemble_blending_lf',     'blending_lf',     'OOF',      'RMSE',     blend_lf_metrics,      blend_weights_lf,     y_blend_lf_pred),
    ('ensemble_blending_lf_mae', 'blending_lf',     'OOF',      'MAE',      blend_lf_mae_metrics,  blend_weights_lf_mae, y_blend_lf_mae_pred),
    ('ensemble_blending_tv',     'blending_tv_lf',  'OOF_tail', 'inv_MAE',  blend_tv_metrics,      blend_weights_tv,     y_blend_tv_pred),
]

for run_name, method_tag, weight_set, objective, metrics, weights, preds in blend_run_specs:
    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_params({
            **_split_params,
            'method':                  'blending',
            'objective':               objective,
            'weight_optimisation_set': weight_set,
            'blend_rolling_window':    BLEND_ROLLING_WINDOW if 'tv' in run_name else 'N/A',
        })
        mlflow.log_metrics(metrics)
        mlflow.set_tags({'model_type': 'ensemble', 'method': method_tag,
                         'base_models': base_model_classes_str})
        mlflow.log_dict({'weights': weights.tolist(), 'run_ids': BASE_MODEL_RUN_IDS},
                        f'{run_name}_weights.json')
        _log_npy_artifact(preds, f'{run_name}_predictions.npy')
        print(f'Logged {run_name}: {run.info.run_id}')

# Log stacking runs
meta_extra_params = {
    'Stack (Ridge)':      {'meta_ridge_alpha':    META_RIDGE_ALPHA},
    'Stack (Lasso)':      {'meta_lasso_alpha':    META_LASSO_ALPHA},
    'Stack (ElasticNet)': {'meta_enet_alpha':     META_ENET_ALPHA,
                           'meta_enet_l1_ratio':  META_ENET_L1_RATIO},
    'Stack (Huber)':      {'meta_huber_epsilon':  META_HUBER_EPSILON},
    'Stack (LightGBM)':   {'meta_lgbm_n_estimators': META_LGBM_N_ESTIMATORS,
                           'meta_lgbm_lr':            META_LGBM_LR,
                           'meta_lgbm_max_depth':     3},
}

for method_name, res in stacking_results.items():
    meta_learner = method_name.split('(')[1].rstrip(')')
    run_name     = f'ensemble_stack_{meta_learner.lower()}'

    with mlflow.start_run(run_name=run_name) as run:
        params = {**_split_params, 'method': 'stacking', 'meta_learner': meta_learner}
        params.update(meta_extra_params.get(method_name, {}))
        mlflow.log_params(params)
        mlflow.log_metrics(res['metrics'])
        mlflow.set_tags({'model_type': 'ensemble', 'method': 'stacking',
                         'base_models': base_model_classes_str})
        mlflow.sklearn.log_model(res['model'], 'meta_model')
        _log_npy_artifact(res['predictions'], f'{run_name}_predictions.npy')
        print(f'Logged {method_name}: {run.info.run_id}')

Logged ensemble_blending_rmse: bf4afc365286479eae3b73acd02c9045
Logged ensemble_blending_mae: fb5f14360c414738927347cc38fa393e
Logged ensemble_blending_lf: a70d1828087644b3ba1fd54a959a14ef




Logged ensemble_blending_lf_mae: 18366c669d88416fba6701fedee6ea9f
Logged ensemble_blending_tv: 2231e492c3b5462fab8df0908dcdddab




Logged Stack (Ridge): d39463ae5b1e4f2f92e4bb11fcf7bf61




Logged Stack (Lasso): d9afc48f1f7743599647ecaa5637f439




Logged Stack (ElasticNet): 4c52187a311f46f1bf578828bc036e17




Logged Stack (Huber): 84c2fc31e0d2425d9cb64e5f65b7833c
Logged Stack (LightGBM): 638f3e6370c7435090c97876e57e7492
