In [0]:
%pip install -r requirements.txt

In [0]:
# Restart the Python process; this follows the %pip install cell so that the freshly
# installed packages are the ones imported below.
dbutils.library.restartPython()

In [0]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from src.models.holidays_related.custom_holidays import get_all_holidays
from src.data.columns import Columns
from src.models.model_features import features_exog, apply_functions

# Show floats with thousands separators and two decimals in displayed frames.
pd.set_option("display.float_format", "{:,.2f}".format)

In [0]:
from src.data.columns import Columns

# Pull the plain column names off the shared `Columns` members once, so the
# rest of the notebook can use short module-level constants.
DATE_COL, AMOUNT_COL = Columns.DATE.value, Columns.AMOUNT.value
PRED_UB_COL, PRED_LB_COL = Columns.PRED_UB_COL.value, Columns.PRED_LB_COL.value

In [0]:
# Notebook parameter: declare the "file_date" text widget and read back its current value.
_FILE_DATE_WIDGET = "file_date"
dbutils.widgets.text(_FILE_DATE_WIDGET, "2023-01-01", "Data del file")
file_date = dbutils.widgets.get(_FILE_DATE_WIDGET)

In [0]:
from config.paths import get_clean_input_file_path, get_output_file_path

# Input and output locations are both derived from the same `file_date` parameter.
INPUT_DATA_PATH, OUTPUT_DATA_PATH = (
    get_clean_input_file_path(file_date),
    get_output_file_path(file_date),
)

In [0]:
from src.models.tracking_constants import ModelTracking

# Unwrap the tracking constants into plain module-level values.
# EXPERIMENT_NAME, TUNE_TRAIN_RUN_NAME and ARTIFACT_PATH are used below to find and
# load the model; ARTIFACT_NAME is not referenced in the cells visible here.
EXPERIMENT_NAME, TUNE_TRAIN_RUN_NAME = (
    ModelTracking.EXPERIMENT_NAME.value,
    ModelTracking.TUNE_TRAIN_RUN_NAME.value,
)
ARTIFACT_PATH, ARTIFACT_NAME = (
    ModelTracking.ARTIFACT_PATH.value,
    ModelTracking.ARTIFACT_NAME.value,
)

In [0]:
# Number of future dates to generate (passed as `periods` when the future date range is built).
PRED_HORIZON: int = 120

#### Data loading

In [0]:
# Read the cleaned input table stored in Delta format, then convert it to a
# pandas DataFrame for the rest of the notebook.
delta_reader = spark.read.format("delta")
loaded_spark_df = delta_reader.load(INPUT_DATA_PATH)
data = loaded_spark_df.toPandas()

In [0]:
# asfreq() requires a DatetimeIndex. A Spark DateType column comes back from
# toPandas() as object-dtype datetime.date values, so coerce explicitly; this is a
# no-op when the column is already datetime64.
date_index = pd.DatetimeIndex(pd.to_datetime(data[DATE_COL]), name=DATE_COL)

# Index by date (the DATE_COL column itself is kept) and conform to the business-day
# calendar: business days missing from the input appear as all-NaN rows.
# set_index() returns a new frame, so the loaded frame is no longer mutated in place.
data = data.set_index(date_index).asfreq("B")

In [0]:
# Display the frame for a visual check (notebook output only; nothing is assigned).
data

#### Preparation of the last window (y)

In [0]:
# Target series later handed to the model as `last_window`: the full amount
# column of the loaded history, not a trailing slice.
last_window = data.loc[:, AMOUNT_COL]

In [0]:
# (first, last) index value of the history, shown as the cell output.
window_index = last_window.index
window_index.min(), window_index.max()

#### Preparation of the future series

In [0]:
# The holiday list feeds the custom business-day calendar used to generate the
# forecast dates, so it must cover the forecast window. Anchoring the year range
# only to the wall clock breaks re-runs on an old `file_date`: the window's years
# could fall outside "today - 5 .. today + 1" and holidays would silently not be
# skipped. The range is therefore widened to also include the years spanned by the
# data's last date plus the horizon. For recent data this is exactly the original range.
current_year = pd.Timestamp.today().year  # read the clock once, not twice
forecast_start = last_window.index.max()
# Earliest possible end of the forecast window; skipped holidays push the real end
# later, hence the extra year of margin added below.
forecast_end = forecast_start + pd.offsets.BDay(PRED_HORIZON)

first_year = min(current_year - 5, forecast_start.year)
last_year = max(current_year + 1, forecast_end.year + 1)

years_of_interest = list(range(first_year, last_year + 1))
print(years_of_interest)
# Holidays as ISO date strings, the form passed to CustomBusinessDay(holidays=...).
list_holidays = [holiday.strftime("%Y-%m-%d") for holiday in get_all_holidays(years_of_interest)]

In [0]:
from pandas.tseries.offsets import CustomBusinessDay

# Business-day offset that skips weekends and the listed holidays.
custom_bday = CustomBusinessDay(holidays=list_holidays)

# PRED_HORIZON working dates, starting at the first non-holiday business day after
# the end of the history.
# NOTE(review): if the plain business day right after the history is a holiday, this
# start is later than "last date + 1 business day" - confirm the forecaster accepts
# an exog frame that does not begin exactly one step after `last_window`.
future_dates = pd.date_range(
    start=last_window.index.max() + custom_bday,
    periods=PRED_HORIZON,
    freq=custom_bday,
)

# Conform to the plain business-day calendar ("B"). This re-inserts the skipped
# holidays as rows, so the frame can hold more than PRED_HORIZON rows.
future_exog = pd.DataFrame(index=future_dates.rename(DATE_COL)).asfreq("B")

# Build the date column from the final index. Previously the column was created
# before asfreq(), which left NaT in every re-inserted row.
future_exog[DATE_COL] = future_exog.index

future_exog

In [0]:
# Pass the future frame through `apply_functions` with the `features_exog` definitions.
# Presumably this adds the exogenous feature columns the model expects - confirm in
# src.models.model_features.
# NOTE(review): the result overwrites `future_exog`, so re-running this cell alone
# feeds already-processed data back in; re-run the previous cell first.
future_exog = apply_functions(future_exog, features_exog)

#### Loading the forecaster from MLflow

In [0]:
import mlflow

mlflow.set_registry_uri("databricks")  # you need the Model Registry/UC to be enabled

# get_experiment_by_name() returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id`.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise RuntimeError(f"MLflow experiment '{EXPERIMENT_NAME}' was not found.")
experiment_id = experiment.experiment_id

# Search for runs with the given run_name, newest first.
# NOTE(review): the run status is not filtered, so a failed or still-running latest
# run would be selected - confirm whether only FINISHED runs should be considered.
runs = mlflow.search_runs(
    experiment_ids=[experiment_id],
    filter_string=f"tags.mlflow.runName = '{TUNE_TRAIN_RUN_NAME}'",
    order_by=["start_time DESC"],
)

# Fail fast: previously this case only printed a message and left `model` undefined,
# so the notebook crashed later with an unrelated NameError at prediction time.
if runs.empty:
    raise RuntimeError(
        f"No runs found with run_name '{TUNE_TRAIN_RUN_NAME}' "
        f"in experiment '{EXPERIMENT_NAME}'."
    )

last_run_id = runs.iloc[0].run_id
print(f"Last run_id for run_name: {last_run_id}")

# Load the model logged under ARTIFACT_PATH in the most recent matching run.
model = mlflow.pyfunc.load_model(f"runs:/{last_run_id}/{ARTIFACT_PATH}")

In [0]:
# Payload for the loaded model's predict().
# `steps` is the number of rows in the future frame, which can differ from
# PRED_HORIZON because that frame was re-indexed to plain business days.
# NOTE(review): `interval` and `n_boot` are presumably the percentile bounds and the
# number of bootstrap iterations for the prediction interval - confirm in the wrapper.
input_dict = dict(
    last_window=last_window,
    exog=future_exog,
    steps=len(future_exog),
    interval=[10, 90],
    n_boot=10,
)

y_pred = model.predict(input_dict)

In [0]:
# Turn the forecast index into a regular column, then rename all four columns
# positionally: date, point forecast, lower bound, upper bound.
# A column-count mismatch raises here.
forecast_columns = [DATE_COL, AMOUNT_COL, PRED_LB_COL, PRED_UB_COL]
y_pred_df = y_pred.reset_index()
y_pred_df.columns = forecast_columns

In [0]:
# Forecast with its interval band, drawn together with the full history.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))

forecast_dates = y_pred_df[DATE_COL]
ax.plot(forecast_dates, y_pred_df[AMOUNT_COL], color="k", label="prediction")
ax.fill_between(
    forecast_dates,
    y_pred_df[PRED_LB_COL],
    y_pred_df[PRED_UB_COL],
    color="deepskyblue",
    alpha=0.9,
    label="prediction interval",
)
last_window.plot(ax=ax, label="last_window", color="b")
fig.legend()
fig

In [0]:
# Same forecast and interval band, without the history, for a closer look.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))

forecast_dates = y_pred_df[DATE_COL]
ax.plot(forecast_dates, y_pred_df[AMOUNT_COL], color="k", label="prediction")
ax.fill_between(
    forecast_dates,
    y_pred_df[PRED_LB_COL],
    y_pred_df[PRED_UB_COL],
    color="deepskyblue",
    alpha=0.9,
    label="prediction interval",
)
fig.legend()
fig

In [0]:
# Write the forecast table to the output location as CSV, leaving out the row index.
y_pred_df.to_csv(path_or_buf=OUTPUT_DATA_PATH, index=False)