# Task 8 · Machine Learning Models for Demand Forecasting

This notebook trains an XGBoost model on the engineered HEMS dataset, evaluates it under the same split as the statistical benchmark, and prepares artefacts for the report and dashboard.

In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd

# Resolve the project root: use the working directory when it contains `src/`,
# otherwise fall back to its parent (the notebook is then assumed to live one
# level below the root). The root is appended to sys.path so `src.*` imports work.
_cwd = Path.cwd().resolve()
ROOT = _cwd if (_cwd / "src").exists() else _cwd.parent
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.modeling_ml import (
    set_seed,
    build_ml_dataset,
    train_regressor_with_fallback,
    predict_any,
    evaluate_forecast,
)
from src.plotting import (
    plot_feature_importance,
    plot_forecast_overlay_multimodel,
    plot_metrics_comparison,
    plot_learning_curve,
)


In [None]:
# Record the installed XGBoost version in the cell output so runs can be compared.
import xgboost as xgb
print(f"Using XGBoost version: {xgb.__version__}")

# NOTE(review): pandas is already imported as `pd` in the first code cell, so this
# re-import is redundant; consider moving all imports into that cell.
import pandas as pd

def align_by_index(y_true, y_pred, idx_true, idx_pred):
    s_true = pd.Series(np.asarray(y_true), index=pd.Index(idx_true))
    s_pred = pd.Series(np.asarray(y_pred), index=pd.Index(idx_pred))
    common = s_true.index.intersection(s_pred.index)
    return common, s_true.loc[common].astype(float), s_pred.loc[common].astype(float)


In [None]:
# Seed the run and define every input/output location used by the notebook.
set_seed(42)

DATA_PATH = ROOT / "data" / "processed" / "task5_features.parquet"
FIG_PATH = ROOT / "reports" / "figures"
TABLE_PATH = ROOT / "reports" / "tables"
STAT_METRICS_PATH = TABLE_PATH / "model_candidates_metrics.csv"
STAT_PREDICTIONS_PATH = TABLE_PATH / "stats_single_split_predictions.csv"

FIG_PATH.mkdir(parents=True, exist_ok=True)
TABLE_PATH.mkdir(parents=True, exist_ok=True)

# Load the engineered features; the parquet index becomes the `timestamp` column
# (an unnamed index is exported as "index" by reset_index, hence the rename).
features_df = pd.read_parquet(DATA_PATH).reset_index().rename(columns={"index": "timestamp"})
features_df["timestamp"] = pd.to_datetime(features_df["timestamp"], utc=True)
features_df = features_df.sort_values("timestamp")
print(f"Loaded features: {features_df['timestamp'].min()} → {features_df['timestamp'].max()} | Rows: {len(features_df):,}")

# Load the statistical benchmark metrics so the comparison cells further down can
# run on a fresh kernel: they read `stat_metrics` and `best_stat`, which were
# previously never defined anywhere in the notebook (NameError on Run All).
# A missing file degrades to an empty table / no benchmark instead of failing.
if STAT_METRICS_PATH.exists():
    stat_metrics = pd.read_csv(STAT_METRICS_PATH)
else:
    stat_metrics = pd.DataFrame()

# NOTE(review): the benchmark is chosen as the candidate with the lowest RMSE —
# this criterion is an assumption; confirm it matches how the statistical task
# ranks its candidates.
best_stat = None
if {"model_name", "RMSE"}.issubset(stat_metrics.columns):
    _stat_rmse = pd.to_numeric(stat_metrics["RMSE"], errors="coerce")
    if _stat_rmse.notna().any():
        best_stat = str(stat_metrics.loc[_stat_rmse.idxmin(), "model_name"])
print(f"Best statistical benchmark: {best_stat}")


## Feature set and evaluation split

We reuse the engineered time-of-day and weather features from Task 5 and hold out the final seven days as the validation window.

In [None]:
# Model inputs, grouped by origin. The concatenation order below is the column
# order handed to the model, so it must not be changed.
calendar_features = ["hour_sin", "hour_cos", "is_weekend"]
thermal_features = ["cooling_degree", "heating_degree", "temp_irradiance_interaction"]
weather_features = [
    "Temperature",
    "Pressure (hPa)",
    "Cloud_cover (%)",
    "Wind_speed_10m (km/h)",
    "Shortwave_radiation (W/m²)",
    "direct_radiation (W/m²)",
    "diffuse_radiation (W/m²)",
    "direct_normal_irradiance (W/m²)",
]
feature_cols = [*calendar_features, *thermal_features, *weather_features, "Price"]

# Chronological hold-out: rows stamped at or after (last timestamp - 7 days)
# form the validation window; everything earlier is used for training.
timestamps = features_df["timestamp"]
validation_start = timestamps.max() - pd.Timedelta(days=7)
train_mask = timestamps < validation_start
val_mask = timestamps >= validation_start

train_df = features_df.loc[train_mask].copy()
val_df = features_df.loc[val_mask].copy()

print(f"Training window: {train_df['timestamp'].min()} → {train_df['timestamp'].max()} | {len(train_df):,} rows")
print(f"Validation window: {val_df['timestamp'].min()} → {val_df['timestamp'].max()} | {len(val_df):,} rows")


In [None]:
# Turn each split into (features, target, index) via the project helper.
train_parts = build_ml_dataset(train_df, target="Demand", feature_cols=feature_cols)
val_parts = build_ml_dataset(val_df, target="Demand", feature_cols=feature_cols)
X_train, y_train, idx_train = train_parts
X_val, y_val, idx_val = val_parts

# Within each split, all three pieces must describe the same number of samples.
assert len({len(X_train), len(y_train), len(idx_train)}) == 1
assert len({len(X_val), len(y_val), len(idx_val)}) == 1

print(f"Training samples: {len(X_train):,} | Validation samples: {len(X_val):,}")


## Hyperparameters

We favour moderate depth, shrinkage, and subsampling to capture nonlinear load drivers while suppressing overfitting.

In [None]:
# Hyperparameters passed to the training helper.
xgb_params = dict(
    n_estimators=600,
    learning_rate=0.06,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.9,
    reg_lambda=1.2,
    min_child_weight=3,
)

# One rationale per tuned parameter; the reported value is read from xgb_params
# so the exported table always mirrors what is actually passed to the trainer.
param_rationales = {
    "n_estimators": "Sufficient capacity with early stopping to follow daily ramps.",
    "learning_rate": "Small eta to smooth updates and retain stability.",
    "max_depth": "Captures interactions without memorising hourly noise.",
    "subsample": "Introduces diversity and guards against overfitting.",
    "colsample_bytree": "Keeps most weather features while reducing collinearity.",
    "reg_lambda": "L2 regularisation for generalisation across seasons.",
    "min_child_weight": "Controls leaf noise in low-demand overnight periods.",
}
hyperparam_rows = [
    {"parameter": name, "value": xgb_params[name], "rationale": reason}
    for name, reason in param_rationales.items()
]
# NOTE(review): tree_method is reported in the table but is not a key of
# xgb_params — confirm the training helper sets it, otherwise this row is inaccurate.
hyperparam_rows.append(
    {"parameter": "tree_method", "value": "hist", "rationale": "Fast histogram-based GPU/CPU training."}
)

hyperparam_table = pd.DataFrame(hyperparam_rows)
hyperparam_table.to_csv(TABLE_PATH / "ml_hyperparams.csv", index=False)
hyperparam_table


In [None]:
# Train on the training window and score on the held-out validation window.
# The original cell called `train_xgboost` / `predict_xgboost`, which are neither
# imported nor defined in this notebook (NameError on a fresh kernel). The helpers
# actually imported from src.modeling_ml are used instead.
# NOTE(review): assumes train_regressor_with_fallback accepts these keyword
# arguments and returns (model, eval_history), and that predict_any(model, X)
# returns predictions aligned with X — confirm against src.modeling_ml.
model, eval_history = train_regressor_with_fallback(
    X_train,
    y_train,
    X_val=X_val,
    y_val=y_val,
    params=xgb_params,
    seed=42,
)

y_pred = predict_any(model, X_val)

# Both series share idx_val, so alignment here mainly normalises them to float Series.
common_idx, s_true, s_pred = align_by_index(y_val, y_pred, idx_val, idx_val)
metrics = evaluate_forecast(s_true.values, s_pred.values)

# Persist the single-split metrics for the report tables.
ml_split_metrics = pd.DataFrame([{**metrics, "model_name": "XGBoost", "evaluation": "Whole-train split"}])
ml_split_metrics.to_csv(TABLE_PATH / "ml_split_metrics.csv", index=False)

# Persist the aligned predictions; later cells reuse this frame for the overlays.
ml_split_predictions = pd.DataFrame(
    {
        "timestamp": common_idx,
        "Actual": s_true.values,
        "XGBoost": s_pred.values,
    }
)
ml_split_predictions.to_csv(TABLE_PATH / "ml_split_predictions.csv", index=False)

metrics


In [None]:
# Plot training vs validation RMSE per boosting iteration when the trainer
# returned an evaluation history.
# NOTE(review): the "validation_0" / "validation_1" keys are presumed to be the
# training and validation eval sets respectively — confirm against the trainer.
if eval_history:
    history = eval_history.get("validation_1", {})
    train_history = eval_history.get("validation_0", {})
    iterations = range(len(history.get("rmse", [])))
    learning_df = pd.DataFrame(
        {
            "iteration": list(iterations),
            "Training RMSE": train_history.get("rmse", []),
            "Validation RMSE": history.get("rmse", []),
        }
    )
    fig_learning = plot_learning_curve(learning_df, style="academic")
    fig_learning.write_image(str(FIG_PATH / "ml_learning_curve.png"), width=1100, height=600, scale=2)
    fig_learning.write_image(str(FIG_PATH / "ml_learning_curve.pdf"), width=1100, height=600, scale=2)
    # Jupyter only auto-renders a bare expression that is the last top-level
    # statement of a cell; nested inside `if` it is silently discarded, so the
    # figure must be displayed explicitly (`display` is provided by the IPython kernel).
    display(fig_learning)
else:
    print("No validation history recorded.")


In [None]:
# Tabulate per-feature importances; a model that does not expose
# `feature_importances_` is reported with all-zero importances.
zero_importance = np.zeros(len(feature_cols))
importance_values = getattr(model, "feature_importances_", zero_importance)
importance_df = pd.DataFrame({"feature": feature_cols, "importance": importance_values})
importance_df.to_csv(TABLE_PATH / "ml_feature_importance.csv", index=False)

# Export the chart as raster and vector images for the report.
fig_importance = plot_feature_importance(importance_df, style="academic")
for image_name in ("ml_feat_importance.png", "ml_feat_importance.pdf"):
    fig_importance.write_image(str(FIG_PATH / image_name), width=1100, height=700, scale=2)
fig_importance


In [None]:
# Overlay actual vs XGBoost demand across the validation window and export it.
overlay_day = ml_split_predictions.copy()
fig_overlay = plot_forecast_overlay_multimodel(overlay_day, style="academic")
for image_name in ("ml_forecast_overlay.png", "ml_forecast_overlay.pdf"):
    fig_overlay.write_image(str(FIG_PATH / image_name), width=1100, height=600, scale=2)
fig_overlay


In [None]:

# Add the statistical benchmark's predictions to the overlay. When no benchmark
# is selected or its prediction file is missing, the column is filled with NaN.
if not (best_stat and STAT_PREDICTIONS_PATH.exists()):
    stat_series = pd.Series(np.nan, index=common_idx)
else:
    stat_preds = pd.read_csv(STAT_PREDICTIONS_PATH, parse_dates=["timestamp"])
    benchmark_rows = stat_preds.loc[stat_preds["model_name"] == best_stat]
    stat_series = benchmark_rows.set_index("timestamp").sort_index()["y_pred"].astype(float)
    # Restrict to (and order by) the timestamps scored for the ML model.
    stat_series = stat_series.reindex(common_idx)

stat_overlay = ml_split_predictions.assign(Statistical=stat_series.values)

fig_overlay_multi = plot_forecast_overlay_multimodel(stat_overlay, style="academic")
for image_name in ("ml_forecast_overlay_multi.png", "ml_forecast_overlay_multi.pdf"):
    fig_overlay_multi.write_image(str(FIG_PATH / image_name), width=1100, height=600, scale=2)
fig_overlay_multi


In [None]:

# Side-by-side metrics table: XGBoost first, then the statistical benchmark
# when one is selected and present in the metrics table.
xgb_record = {"model": "XGBoost", "evaluation": "Whole-train split", **metrics}
combined_records = [xgb_record]

if best_stat and not stat_metrics.empty:
    stat_row = stat_metrics[stat_metrics["model_name"] == best_stat]
    if not stat_row.empty:
        # Only the first matching row is used.
        row = stat_row.iloc[0]
        stat_record = {"model": best_stat, "evaluation": "Whole-train split"}
        for metric_name in ("MAE", "RMSE", "nRMSE"):
            stat_record[metric_name] = row.get(metric_name, np.nan)
        combined_records.append(stat_record)

combined_df = pd.DataFrame(combined_records)
combined_df.to_csv(TABLE_PATH / "best_stat_vs_ml_metrics.csv", index=False)

# Long format (one row per model/metric pair) for the comparison chart.
metrics_long = combined_df.melt(
    id_vars=["model", "evaluation"],
    value_vars=["MAE", "RMSE", "nRMSE"],
    var_name="metric",
    value_name="value",
)
fig_metrics = plot_metrics_comparison(metrics_long, style="academic")
for image_name in ("ml_metrics_comparison.png", "ml_metrics_comparison.pdf"):
    fig_metrics.write_image(str(FIG_PATH / image_name), width=1100, height=600, scale=2)
fig_metrics


## Report notes

- **Model choice:** XGBoost captures nonlinear interactions between weather and demand while remaining fast to train on hourly data.
- **Hyperparameters:** Moderate depth, a small learning rate, and subsampling mitigate overfitting; early stopping (where enabled by the training helper) prevents unnecessary boosting rounds.
- **Comparison vs statistical model:** XGBoost typically improves ramp predictions and weekend behaviour, whereas the best statistical benchmark (e.g. SARIMA) remains competitive overnight.
- **Limitations & next steps:** Incorporating additional lag features, calibrating prediction intervals, and analysing SHAP values would strengthen interpretability and robustness.