## Goals: Training of the *Final* model

This notebook trains, on the full *baseline_dataset*, the model used for the final predictions on the evaluation data.
Here we train a single model that aims to generalize across the Brazilian and French water stations. You don't have to do the same: you can, for example, train different models for different geographic *areas*.


> Note: this notebook needs the outputs from *01 Preprocessing and Training/02 - Feature Engineering*.


# 1. Data Import and Setup

Imports necessary libraries, sets up environment paths, and includes custom utility functions.



In [None]:
import matplotlib.pyplot as plt
from math import sqrt
import sys
import pandas as pd
import os
import lightgbm as lgb
import numpy as np
from quantile_forest import RandomForestQuantileRegressor

import copy
import joblib
import importlib

from mapie.regression import MapieQuantileRegressor
from mapie.regression import MapieTimeSeriesRegressor

# Import the Explainable Boosting Machine (EBM) regressor and other glassbox models from InterpretML.
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..','..','..')))

from src.utils.model import split_dataset, compare_models_per_station, load_models_auto



Defines constants:
* *INPUT_DIR* must be the same as the one defined in *02 - Feature Engineering*.
* Models will be saved in *MODEL_DIR*.
* *DATASET_DIR* must be the directory where you unzipped the Zenodo dataset.

In [2]:
# --- Paths (all relative) ---
INPUT_DIR = "../../../data/input/"    # engineered features are read from here
MODEL_DIR = "../../../models/"        # trained models are written below this directory
DATASET_DIR = "../../../dataset/"

# --- Experiment settings ---
SEED = 42
# Forecast horizon in weeks; one model is trained per forecast week.
NUMBER_OF_WEEK = 4

# Model families trained by this notebook; each training cell checks its own
# key here before running. Disabled entries are kept as comments.
FINAL_MODELS = [
    "mapie",
    "qrf",
    # "EBM",
    # "aci",
]

# Containers for the fitted models. `mapie` and `qrf` are filled with one
# model per week index by the training cells; the other two are presumably
# meant for the disabled variants — TODO confirm.
mapie_enbpi, mapie, qrf, mapie_aci = {}, {}, {}, {}



# Read data

# 2. Data Loading
Load the baseline dataset and create the directory where the models will be saved.

In [3]:
# Load the baseline dataset and index it by its "ObsDate" column.
dataset_train = pd.read_csv(f"{INPUT_DIR}dataset_baseline.csv").set_index("ObsDate")

# Make sure the output directory for the final models exists.
# exist_ok=True replaces the previous "check, then create" sequence, which
# could fail if the directory appeared between the check and the creation.
os.makedirs(f"{MODEL_DIR}final/", exist_ok=True)

Data pre-processing: removal of unnecessary columns and setup of the targets.

In [4]:
# Columns removed from the feature matrix: the four weekly targets plus the
# "station_code" and "index" columns.
non_feature_columns = [
    "water_flow_week1",
    "water_flow_week2",
    "water_flow_week3",
    "water_flow_week4",
    "station_code",
    "index",
]
X_train = dataset_train.drop(columns=non_feature_columns)

# One target series per forecast week, keyed by the 0-based week index
# (key 0 -> "water_flow_week1", key 1 -> "water_flow_week2", ...).
y_train = {
    week: dataset_train[f"water_flow_week{week + 1}"]
    for week in range(NUMBER_OF_WEEK)
}




# 3. Model training
### a. LGBM + MAPIE

In [None]:
if "mapie" in FINAL_MODELS:
    print("Training Mapie")

    # Settings for this model family. ALPHA is also read by the evaluation
    # cell further down, so this cell has to run before it.
    ALPHA = 0.1
    TIME_VALIDATION = "2000-01-01 00:00:00"
    LGBM_PARAMS = dict(
        max_depth=15,
        learning_rate=0.01,
        n_estimators=500,
        colsample_bytree=0.7,
        objective="quantile",
        alpha=ALPHA,
    )

    # NOTE(review): presumably 0.75 is the training fraction and
    # TIME_VALIDATION a temporal cut-off — confirm against
    # src.utils.model.split_dataset. `val_temporal` is not used in this cell.
    train_mapie, val_mapie, val_temporal = split_dataset(dataset_train, 0.75, TIME_VALIDATION)

    # Targets and identifier columns are removed from both splits.
    mapie_dropped_columns = [
        "water_flow_week1",
        "station_code",
        "water_flow_week2",
        "water_flow_week3",
        "water_flow_week4",
        "index",
    ]

    # Training split: features plus one target series per week index.
    X_train_mapie = train_mapie.drop(columns=mapie_dropped_columns)
    print(len(X_train_mapie.columns))
    y_train_mapie = {
        week: train_mapie[f"water_flow_week{week + 1}"]
        for week in range(NUMBER_OF_WEEK)
    }

    # Validation split, passed to MAPIE as the calibration set below.
    X_val = val_mapie.drop(columns=mapie_dropped_columns)
    y_val = {0: val_mapie["water_flow_week1"]}
    y_val.update(
        {week: val_mapie[f"water_flow_week{week + 1}"] for week in range(1, NUMBER_OF_WEEK)}
    )

    for i in range(NUMBER_OF_WEEK):
        print(f"Training week {i}")

        # Wrap a LightGBM quantile regressor in MAPIE, fit it on the training
        # split and calibrate it on the validation split.
        regressor = lgb.LGBMRegressor(**LGBM_PARAMS)
        mapie[i] = MapieQuantileRegressor(
            estimator=regressor,
            method="quantile",
            cv="split",
            alpha=ALPHA,
        )
        mapie[i].fit(
            X_train_mapie,
            y_train_mapie[i],
            X_calib=X_val,
            y_calib=y_val[i],
        )

        # Persist the fitted model; the file name carries the save time and
        # the week index.
        time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
        model_path = f"{MODEL_DIR}final/mapie_quantile_{time}_week_{i}.pkl"
        joblib.dump(mapie[i], model_path)


### b. QRF

In [None]:
if "qrf" in FINAL_MODELS:
    for i in range(NUMBER_OF_WEEK):
        print(f"Training week {i}")

        # Quantile random forest trained on the full training set for week i.
        # random_state=SEED makes the forest reproducible across
        # "Restart & Run All"; it was previously unseeded.
        qrf[i] = RandomForestQuantileRegressor(
            n_estimators=100,
            max_depth=10,
            min_samples_leaf=10,
            random_state=SEED,
        )
        qrf[i].fit(X_train, y_train[i])

        # Persist the fitted model; the file name carries the save time and
        # the week index.
        time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
        model_path = f"{MODEL_DIR}final/qrf_quantile_{time}_week_{i}.pkl"
        joblib.dump(qrf[i], model_path)

### c. Explainable Boosting Machine

In [7]:
# Case-insensitive check: the configuration cell spells this entry "EBM",
# which the previous exact match on "ebm" never found.
if "ebm" in {model_name.lower() for model_name in FINAL_MODELS}:
    NUM_ENSEMBLES = 5

    # Maps week index -> list of EBM models trained on bootstrap samples.
    ebm_ensembles = {}

    for i in range(NUMBER_OF_WEEK):
        print(f"Training EBM ensemble for week {i}")

        # All bootstrap models for the current week.
        models_i = []

        for seed in range(NUM_ENSEMBLES):
            print(f"Training EBM ensemble {seed} for week {i}")

            # 1. Draw a bootstrap sample (same size, with replacement).
            #    The generator is seeded per ensemble member so each member
            #    sees a different but reproducible sample.
            rng = np.random.default_rng(SEED + seed)
            sample_indices = rng.choice(len(X_train), size=len(X_train), replace=True)
            X_sample = X_train.iloc[sample_indices]
            # Positional selection: y_train[i] is indexed by ObsDate, so
            # plain [] with integer positions is not a reliable lookup.
            y_sample = y_train[i].iloc[sample_indices]

            # 2. Train an EBM; every member shares the same hyper-parameters
            #    and random_state.
            ebm_model = ExplainableBoostingRegressor(
                outer_bags=1,
                inner_bags=1,
                max_bins=128,
                learning_rate=0.05,
                interactions=3,
                early_stopping_rounds=100,
                random_state=SEED,
            )
            ebm_model.fit(X_sample, y_sample)

            models_i.append(ebm_model)

        # Register this week's models BEFORE saving, so the file written for
        # week i actually contains week i (it used to be saved first).
        ebm_ensembles[i] = models_i

        # NOTE(review): as before, the object saved is the cumulative dict of
        # all weeks trained so far, not only this week's list — confirm this
        # is what the model loader expects.
        time = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
        file_path = f"{MODEL_DIR}final/ebm_ensemble_{time}_week_{i}.pkl"

        joblib.dump(ebm_ensembles, file_path)
        print(f"Saved EBM ensembles to {file_path}")

In [None]:
# Sanity check of the fitted models on the data they were trained on.
# Requires both the "mapie" and "qrf" training cells to have run: it reads
# `mapie`, `qrf` and ALPHA (ALPHA is defined in the LGBM + MAPIE cell).

# Station code of every training row, passed to compare_models_per_station.
y_train_stations = dataset_train["station_code"].values

# Naive baseline taken from the "water_flow_lag_1w" column (labelled
# "Week before" below). It does not depend on the week, so compute it once.
baseline_day_before = dataset_train["water_flow_lag_1w"]

for i in range(NUMBER_OF_WEEK):

    print(f"============================== WEEK {i} temporal ===================================")
    y_true = y_train[i].values

    # Point predictions and prediction intervals of both model families.
    y_pred_mapie, y_pis_mapie = mapie[i].predict(X_train)
    y_pred_qrf = qrf[i].predict(X_train, quantiles="mean", aggregate_leaves_first=False)
    y_pis_qrf = qrf[i].predict(X_train, quantiles=[ALPHA/2, 1-ALPHA/2])

    # NOTE(review): the "dataset" label says "test" although the predictions
    # are made on the training data; left unchanged because
    # compare_models_per_station may rely on this value — confirm.
    predictions = [
        {"model": "LGBM+MAPIE", "prediction": y_pred_mapie, "dataset":"test", "stations": y_train_stations, "prediction_interval": y_pis_mapie},
        {"model": "Week before", "prediction": baseline_day_before, "dataset":"test", "stations": y_train_stations, "prediction_interval": None},
        {"model": "QRF", "prediction": y_pred_qrf, "dataset":"test", "stations": y_train_stations, "prediction_interval": y_pis_qrf},
    ]

    nop = compare_models_per_station(y_true,
                                     predictions,
                                     y_train_stations,
                                     column_to_display="log_likelihood" ,
                                     title = f"WEEK {i}")

    # Empirical coverage: share of observations that fall inside the interval.
    # The messages name the model so the two lines can be told apart.
    coverage_mapie = (y_true >= y_pis_mapie[:,0,0]) & (y_true <= y_pis_mapie[:,1,0])
    print(f"Coverage of the LGBM+MAPIE prediction interval for week {i}: {coverage_mapie.mean()}")

    coverage_qrf = (y_true >= y_pis_qrf[:,0]) & (y_true <= y_pis_qrf[:,1])
    print(f"Coverage of the QRF prediction interval for week {i}: {coverage_qrf.mean()}")
