In [0]:
%pip install holidays tqdm skforecast==0.14.0 scikit-learn==1.5.0 xgboost==2.1.3 matplotlib==3.10.0

In [0]:
dbutils.library.restartPython()

In [0]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#from tqdm import tqdm

from src.models.holidays_related.custom_holidays import get_all_holidays

pd.options.display.float_format = "{:,.2f}".format

In [0]:
from xgboost import XGBRegressor
from skforecast.recursive import ForecasterRecursive
from src.models.model_features import window_features, features_exog, apply_functions

In [0]:
from src.data.columns import Columns

DATE_COL = Columns.DATE.value
AMOUNT_COL = Columns.AMOUNT.value

In [0]:
dbutils.widgets.text("file_date", "2023-01-01", "Data del file")
file_date = dbutils.widgets.get("file_date")

In [0]:
INPUT_FOLDER = "/Volumes/ts_catalog/ts_data/ts_input_clean/"
INPUT_FILE = f"serie_{file_date}"

In [0]:
INPUT_DATA_PATH = os.path.join(INPUT_FOLDER, INPUT_FILE)

In [0]:
from src.models.tracking_constants import ModelTracking

EXPERIMENT_NAME = ModelTracking.EXPERIMENT_NAME.value
TUNE_TRAIN_RUN_NAME = ModelTracking.TUNE_TRAIN_RUN_NAME.value
ARTIFACT_PATH = ModelTracking.ARTIFACT_PATH.value
ARTIFACT_NAME = ModelTracking.ARTIFACT_NAME.value

#### data loading

In [0]:
# Read the cleaned series from its Delta location, then collect it into pandas.
delta_reader = spark.read.format("delta")
loaded_spark_df = delta_reader.load(INPUT_DATA_PATH)
data = loaded_spark_df.toPandas()

In [0]:
# Index the frame by its date column (the column itself is kept) and reindex
# to business-day frequency; business days absent from the input become NaN rows.
data = data.set_index(data[DATE_COL]).asfreq("B")

In [0]:
data

#### exogenous features computation

In [0]:
# Exogenous feature matrix: the project's `features_exog` spec run through
# `apply_functions` (both imported from src.models.model_features).
X = apply_functions(data, features_exog)
# Target series: the amount column of the business-day-indexed frame.
# NOTE(review): `asfreq('B')` inserts NaN rows for business days missing from the
# input; confirm the target has no gaps (or that they are handled) before fitting.
y = data[AMOUNT_COL]

In [0]:
X

In [0]:
y

In [0]:
y.index.min(), y.index.max()

#### Train/test split

In [0]:
# Holiday calendar used to count business days back from the end of the series.
# The year window must cover the *data*, not just the wall-clock date: when an old
# `file_date` is processed, a window anchored only on today() can miss the holidays
# around `y.index.max()`, and the train/test threshold is then computed as if those
# holidays did not exist. The window below is a superset of the original
# (today - 5 .. today + 1) that is also guaranteed to contain the last observed year.
current_year = pd.Timestamp.today().year
last_observed_year = y.index.max().year
years_of_interest = list(
    range(
        min(current_year, last_observed_year) - 5,
        max(current_year, last_observed_year) + 2,
    )
)
print(years_of_interest)
list_holidays = [x.strftime("%Y-%m-%d") for x in get_all_holidays(years_of_interest)]

In [0]:
# Size of the hold-out window, in working days (weekends and holidays excluded).
TEST_DAYS = 60
test_window_offset = pd.offsets.CustomBusinessDay(n=TEST_DAYS, holidays=list_holidays)
last_observation = y.index.max()
# First date that belongs to the test set.
date_thr = last_observation - test_window_offset
date_thr

In [0]:
# Chronological split: everything strictly before `date_thr` is training data,
# `date_thr` and later is the hold-out set.
before_threshold = y.index < date_thr
from_threshold = y.index >= date_thr
y_train, y_test = y.loc[before_threshold], y.loc[from_threshold]

# Exogenous features follow the same row selection as the target.
X_train, X_test = X.loc[y_train.index], X.loc[y_test.index]


In [0]:
y_train.shape, y_test.shape

In [0]:
window_features.transform_batch(y).head(10)

#### Hyperparam tuning grids and CV

In [0]:
from skforecast.model_selection import TimeSeriesFold

# Candidate lag counts evaluated by the grid search.
lags_grid = [11, 22]

# XGBoost hyper-parameter grid; the values in trailing comments are extra
# candidates that are currently switched off to keep the search small.
param_grid = dict(
    learning_rate=[0.05],  # also: 0.1
    max_depth=[5, 6],  # also: 7
    n_estimators=[50],  # also: 100, 200
    colsample_bytree=[0.5],  # also: 1
)

# Expanding-window time-series CV: 60 initial training observations, 60-step
# folds, no gap, model refitted on every fold.
cv = TimeSeriesFold(
    initial_train_size=60,
    steps=60,
    gap=0,
    fixed_train_size=False,
    refit=True,
)

#### Forecaster definition and Grid search

In [0]:
from skforecast.model_selection import grid_search_forecaster

In [0]:
# Recursive multi-step forecaster wrapping an XGBoost regressor.
# `lags=100` is only a placeholder needed to build the object: the grid search
# below re-sets the lags to each candidate in `lags_grid` and, with
# `return_best=True`, leaves the forecaster refitted with the best combination.
forecaster = ForecasterRecursive(
    regressor=XGBRegressor(random_state=321),
    lags=100,
    window_features=window_features,
    forecaster_id="forecasting_series_y",
)

# Reference only (not executed): direct fit without hyper-parameter tuning.
# Kept as comments instead of a string literal so the cell does not echo the
# snippet as its output.
# forecaster.fit(
#     y=y_train,
#     exog=X_train,
#     store_last_window=False,
#     store_in_sample_residuals=True,
#     random_state=412,
# )

In [0]:
# Search settings kept separate from the data arguments so they are easy to tweak.
search_options = {
    "cv": cv,
    "param_grid": param_grid,
    "lags_grid": lags_grid,
    "metric": "mean_absolute_error",
    "return_best": True,
    "verbose": False,
    "show_progress": True,
    "n_jobs": 6,
}

# Evaluate every (lags, params) combination on the training data only.
results_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y_train,
    exog=X_train,
    **search_options,
)



### MLflow logging

In [0]:
import mlflow
import os
import pickle
import tempfile

from src.models.model_wrapper import SkforecastWrapper


mlflow.set_registry_uri("databricks") # you need the Model Registry/UC to be enabled 
#model_name_uc = "ts_catalog__models__my_forecaster_pyfunc" # you need the Model Registry/UC to be enabled

# SETTING EXPERIMENT
mlflow.set_experiment(EXPERIMENT_NAME)

# Serialise the tuned forecaster to a temporary pickle that MLflow copies into the
# run as an artifact. `delete=False` keeps the file on disk after the handle is
# closed; the `finally` below guarantees it is removed even if logging fails.
tmp_file = tempfile.NamedTemporaryFile(suffix=".pkl", delete=False)
pickle_path = tmp_file.name

try:
    # Write through the handle we already own instead of re-opening the path
    # (opening the same file twice is redundant and fails on Windows).
    with tmp_file:
        pickle.dump(forecaster, tmp_file)

    with mlflow.start_run(run_name=TUNE_TRAIN_RUN_NAME) as run:
        # The grid-search results are ranked best-first; select the top row by
        # position so the lookup does not depend on the index labels.
        best_params = results_grid["params"].iloc[0]
        best_lags = results_grid["lags"].iloc[0]
        best_metric = results_grid["mean_absolute_error"].iloc[0]
        mlflow.log_params(best_params)
        mlflow.log_param("lags", best_lags)
        mlflow.log_metric("mean_absolute_error", best_metric)

        mlflow.pyfunc.log_model(
            artifact_path=ARTIFACT_PATH,
            python_model=SkforecastWrapper(),
            artifacts={ARTIFACT_NAME: pickle_path},
            # registered_model_name=model_name_uc, # you need the Model Registry/UC to be enabled
            signature=SkforecastWrapper.get_signature()
        )
finally:
    os.remove(pickle_path)


In [0]:
dbutils.notebook.exit("Execution stopped intentionally. Remove this line to continue.")

# BACKUP CELLS for improvements (not to be run now)

#### Predict + prediction interval with bootstrap

https://skforecast.org/0.14.0/user_guides/probabilistic-forecasting.html

In [0]:
def get_coverage(y_true, y_lower, y_upper):
    """Return the share of observations lying inside the interval.

    An observation counts as covered when ``y_lower <= y_true <= y_upper``
    (both bounds inclusive). The three inputs are compared element-wise.

    Returns:
        The mean of the boolean "covered" flags, i.e. a value in [0, 1].
    """
    above_lower = y_true >= y_lower
    below_upper = y_true <= y_upper
    covered = np.asarray(above_lower & below_upper, dtype=bool)
    return covered.mean()

In [0]:
from sklearn.metrics import mean_absolute_error

def plot_pi(y_true, y_pred, y_lb, y_ub, lbl_pi):
    """Plot observed values, point forecast and prediction interval on one axis.

    Args:
        y_true: Observed series.
        y_pred: Point forecast, indexed exactly like ``y_true``.
        y_lb: Lower bound of the prediction interval, same index.
        y_ub: Upper bound of the prediction interval, same index.
        lbl_pi: Legend label for the shaded interval band.

    Raises:
        AssertionError: If any of the series has an index different from
            ``y_true``'s.

    The figure also shows the empirical coverage and the MAE as small text in
    the bottom-right corner, and marks with vertical dashed lines the dates on
    which the observed value falls outside the interval.
    """
    # Every series must be aligned on the very same index as the observations.
    for other_index in (y_pred.index, y_lb.index, y_ub.index):
        assert y_true.index.equals(other_index)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))

    # Observations first, then the interval band.
    y_true.plot(ax=ax, label="true", color="k", linestyle="--")
    ax.fill_between(y_lb.index, y_lb, y_ub, color="deepskyblue", alpha=0.9, label=lbl_pi)

    # Dates where the observation escapes the interval on either side.
    misses = (y_true < y_lb) | (y_true > y_ub)
    miss_line_style = dict(linewidth=0.5, color="b", linestyle="--", zorder=5)
    ax.vlines(
        y_true.index[misses],
        ymin=y_lb.min(),
        ymax=y_ub.max(),
        label="True value outside interval",
        **miss_line_style,
    )

    y_pred.plot(ax=ax, label="prediction", color="g")
    # ax.set_xlim('2024-11-01', '2024-12-10')

    # Summary metrics, anchored in axes coordinates at the bottom-right corner.
    corner_text_style = dict(
        transform=ax.transAxes,
        fontsize=7,
        verticalalignment="bottom",
        horizontalalignment="right",
        color="black",
    )
    coverage_label = f"coverage = {get_coverage(y_true, y_lb, y_ub):.2f}"
    ax.text(0.95, 0.05, coverage_label, **corner_text_style)
    mae_label = f"mae = {mean_absolute_error(y_true, y_pred):.0f}"
    ax.text(0.95, 0.00, mae_label, **corner_text_style)

    fig.legend(loc="lower left")


In [0]:
# NOTE(review): `y_test_1`, `y_test_2` and `bt_prediction` are not defined anywhere
# in the visible notebook, so this backup cell cannot run on a fresh kernel.
# Presumably `y_test_1`/`y_test_2` are two consecutive pieces of the hold-out set
# and `bt_prediction` is a backtesting output with a "pred" column — TODO confirm
# and add the cells that create them before re-enabling this section.
# Register out-of-sample residuals (observed vs. predicted) on the forecaster.
forecaster.set_out_sample_residuals(y_true=y_test_1.loc[bt_prediction.index], y_pred=bt_prediction["pred"])

# Forecast horizon: one step per observation in the second hold-out piece.
predict_steps_2 = len(y_test_2)
# History passed to the forecaster as `last_window`: training data followed by the first hold-out piece.
last_window = pd.concat([y_train, y_test_1], axis=0)

In [0]:
# Bootstrapped prediction interval (10th-90th percentile) over the second hold-out piece.
# NOTE(review): `X_test_2` is not defined in the visible notebook — see the cell above.
pred_2_ord = forecaster.predict_interval(
    steps=predict_steps_2,
    last_window=last_window,
    exog=X_test_2,
    interval=[10, 90],
    n_boot=20,  # default is 250; 1000 is often used
    use_in_sample_residuals=False,  # True -> use the residuals obtained on the training set
    use_binned_residuals=False,  # experimental feature: conditions the residuals on the predicted value
)

In [0]:
plot_pi(
    y_true=y_test_2,
    y_pred=pred_2_ord["pred"],
    y_lb=pred_2_ord["lower_bound"],
    y_ub=pred_2_ord["upper_bound"],
    lbl_pi="ordinary bs PI",
)