# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **stock_quantity**
- Target Variable for Demand Forecasting: **y**

### Metrics for models avaliation
- RMSE - Root Mean Squared Error
- MAE - Mean Absolute Error

In [None]:
# Flag to control baseline training
TRAIN_BASE = False

# Flag to control hyperparameter search
TRAIN_SEARCH = False

# DATA ACQUISITION
## Import Libraries

In [2]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import plotly.express as px
from functools import reduce

# Specialized Libraries
import mlflow
import mlflow
import mlflow.pyfunc
from mlflow.models import infer_signature
from mlflow.client import MlflowClient

from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

from mlforecast import MLForecast
from mlforecast.target_transforms import AutoDifferences, LocalRobustScaler 
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.feature_engineering import transform_exog
from mlforecast.auto import AutoLinearRegression, AutoRidge, AutoRandomForest, AutoXGBoost
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import smape, rmse, mae
from utilsforecast.plotting import plot_series

from xgboost import XGBRegressor

# To import customized class and function
from smart_supply_chain_ai.utils.preprocess_functions import SimplePreprocessor, XDFPreparator

# Notebook mlflow Loggings
import logging
import warnings
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.ERROR)
pd.set_option('display.max_columns', None)


## Load Data

In [3]:
# Define data paths
data_path = os.path.join('..', 'data', 'processed')
docs_path = os.path.join('..', 'docs')
path_models = os.path.join('..', 'models')
json_path = os.path.join('../src','smart_supply_chain_ai' , 'utils/')

In [4]:
# Load Pickle file
compare_data = pd.read_pickle(data_path + '/data_for_compare.pkl')
read_data = pd.read_pickle(data_path + '/data_for_train.pkl')

In [5]:
# Create copies of the data
df = read_data.copy()
df_compare = compare_data.copy()

In [6]:
# View historical data
df.tail(3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70876,2025-05-09,2025-05-04,False,All-Purpose Flour,1940872|P,Pantry,Baking Supplies,365.0,60.0,lb,3,BakeWell Supplies,1552913|S,90.0,50.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,9,5,44,55,51,5,Safe,28.095371,36.0
70877,2025-05-09,2025-05-02,False,Popcorn Kernels,1992802|P,Pantry,Snacks,365.0,90.0,lb,4,SnackTime Distributors,1414750|S,80.0,110.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,135,4,848,1060,1342,7,Safe,25.934508,39.0
70878,2025-05-09,2025-05-07,False,Plum,1998069|P,Fresh Foods,Fruits,5.0,2.0,lb,4,Stone Fruit Specialists,1405032|S,165.0,38.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,754,6,6156,7182,6187,2,Safe,19.17862,53.0


In [7]:
# View compare data
df_compare.head(3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70879,2025-05-11,2025-05-01,False,Arborio Rice,1003530|P,Pantry,Grains & Rice,730.0,180.0,lb,4,GrainWorld Distributors,1792439|S,150.0,200.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,186,5,468,585,662,10,Safe,28.680627,35.0
70880,2025-05-11,2025-05-06,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,Wholesale Warehouse,1363063|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,243,4,285,380,308,5,Safe,64.442801,15.0
70881,2025-05-11,2025-05-05,False,Canned Tuna,1017723|P,Pantry,Canned Fish,1095.0,90.0,unit,4,PantryEssentials Ltd.,1141220|S,95.0,130.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,25,4,56,130,89,6,Safe,17.540236,58.0


# Prepare Data for Time Series Analysis

In [8]:
# Rename columns for mlforecast compatibility
df.rename(columns={'received_date':'ds', 'product_id': 'unique_id'}, inplace=True)

In [9]:
# Initialize LabelEncoder
le = LabelEncoder()
# Encode categorical columns
df[['category_encoded', 'sub_category_encoded', 'weather_severity_encoded']] = df[['category', 'sub_category', 'weather_severity']].apply(lambda col: le.fit_transform(col))

In [10]:
# Select relevant columns for sales volume forecasting
sales_df = df[['ds',
 'unique_id',
 'in_season',
 'category_encoded',
 'sub_category_encoded',
 'shelf_life_days',
 'stock_quantity',
 'sales_volume',
 ]].copy()

In [11]:
# Rename target column
sales_df.rename(columns={'sales_volume': 'y'}, inplace=True)

In [12]:
# Convert 'ds' column to datetime format
sales_df['ds'] = pd.to_datetime(sales_df['ds'], errors='coerce')

In [13]:
# Calculate the correlation matrix for all numeric columns
correlation_matrix = sales_df.select_dtypes(exclude=['category', 'object', 'datetime']).corr()

In [14]:
# Create a heatmap visualization of the correlation matrix
corr1 = correlation_matrix[correlation_matrix >= 0.7].replace(1.0, np.nan).dropna(how='all', axis=1)
corr1.dropna(how='all').replace(np.nan, '')

In [15]:
# Hide the top half of the matrix to avoid repeating values
mask = np.tril(np.ones(correlation_matrix.shape), k=-1)
masked_corr = correlation_matrix.where(mask == 1)

# Create a heatmap visualization of the correlation matrix
fig_corr = px.imshow(masked_corr,
                    title='Correlation Matrix - Sales Volume',
                    color_continuous_scale='RdBu_r',  # Red-Blue reversed color scale
                    aspect="auto",                   # Automatic aspect ratio
                    text_auto=True,                 # Display correlation values on cells
                    zmin=-1, zmax=1)                 # Fix color scale from -1 to +1

# Adjust the figure dimensions
fig_corr.update_layout(width=800, height=800)

# Display the interactive heatmap
fig_corr.show()

# Setup Machine Learning Model's

## Applying Feature Engineering and Partitioning Data for Model Training

In [16]:
# Define columns to maintain
maintain_cols = ['received_date', 'product', 'product_id', 'sales_volume', 'category', 'sub_category', 'is_holiday', 'in_season', 'shelf_life_days', 'stock_quantity']

# Define column renaming pattern
pattern = {'received_date': 'ds', 'product_id': 'unique_id', 'sales_volume': 'y'}

# Select relevant columns from the dataset
select_df = read_data[maintain_cols].rename(columns=pattern)
df_future_exog = compare_data[maintain_cols].rename(columns=pattern)


In [17]:
# Count how many rows have the same ds and unique_id
select_df.duplicated(subset=['ds', 'unique_id']).sum()

5380

In [18]:
# Sort the DataFrame by the 'ds' column and reset the index, dropping the old index
select_df = select_df.sort_values(by='ds').reset_index(drop=True)

In [19]:
# Group both DataFrames by ds and unique_id, then apply aggregation rules to each column
agg_rules = {
    'y': 'sum',
    'category': 'last',
    'sub_category': 'last',
    'is_holiday': 'last',
    'in_season': 'last',
    'shelf_life_days': 'last',
    'stock_quantity': 'first',
    'product': 'first',
}

select_df = select_df.groupby(['ds', 'unique_id'], as_index=False).agg(agg_rules).reset_index(drop=True)
df_future_exog = df_future_exog.groupby(['ds', 'unique_id'], as_index=False).agg(agg_rules).reset_index(drop=True)

In [20]:
# Verify duplicates after Partitioning
print(f"Duplicates in select_df: {select_df.duplicated(subset=['ds', 'unique_id']).sum()}")
print(f"Duplicates in compare_data: {df_future_exog.duplicated(subset=['ds', 'unique_id']).sum()}")

Duplicates in select_df: 0
Duplicates in compare_data: 0


In [21]:
def transform_exogenous_features(data: pd.DataFrame) -> pd.DataFrame:
    # Generate lag-based features from the selected dataframe
    exog_feat = transform_exog(
        data[['ds', 'unique_id', 'stock_quantity']],  # Use only the relevant columns
        lags=[1, 2, 7],  # Create lag features for 1, 2, and 7 time steps
        lag_transforms={
            1: [ExpandingMean()],         # Apply expanding mean on lag 1
            7: [RollingMean(window_size=7)]  # Apply 7-day rolling mean on lag 7
        }
    )

    # Remove the original 'stock_quantity' column to avoid duplication
    exog_feat = exog_feat.drop(columns='stock_quantity')

    # Merge the generated exogenous features back into the main dataframe
    data_merged = data.merge(exog_feat, on=['ds', 'unique_id'], how='left')

    # Drop NaNs
    return data_merged.dropna().drop(columns='stock_quantity')

In [22]:
select_df = transform_exogenous_features(select_df)

## Set Up MLForecast Models to Predict Sales Volume on Train and Test Sets

In [23]:
# +++ Prepare MLForecast +++
# Initialize the temporal preprocessing pipeline with a specified pattern and static features
preprocessor = SimplePreprocessor()

# Configure model parameters
random_state = 42
params_rf = {
    "n_estimators": 200,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
}
params_xgb = {
    'objective':'reg:squarederror',  
    'n_estimators':200,              
    'learning_rate':0.1,             
    'max_depth':6,                   
    'subsample':0.8,                 
    'colsample_bytree':0.8,          
}

# Define Models Pipelines
pipes_models = {
    'linear': make_pipeline(preprocessor, LinearRegression()),
    'ridge': make_pipeline(preprocessor, Ridge()),
    'rf': make_pipeline(preprocessor, RandomForestRegressor(**params_rf, random_state=random_state)),
    'xgb': make_pipeline(preprocessor, XGBRegressor(**params_xgb, random_state=random_state))
}

# Define target transformations
target_transforms = [
    AutoDifferences(3),           
    LocalRobustScaler(scale='iqr')
    ]

# Define series transformations
lag_transforms = {
    1: [ExpandingMean()],              # Aplica ExpandingMean ao lag 1
    2: [RollingMean(window_size=2)],   # Aplica RollingMean(2) ao lag 2
    7: [RollingMean(window_size=7)],   # Aplica RollingMean(7) ao lag 7
}

lags=[1, 2, 7]
date_features=['dayofweek', 'month']
num_threads=4

# Configure MLforecast for model training with temporal feature engineering
fcst = MLForecast(
    models=pipes_models,
    freq='D',
    lags=lags,
    lag_transforms=lag_transforms,
    date_features=date_features,
    num_threads=num_threads,
    target_transforms=target_transforms,
)

# Start Mlflow

In [24]:
# MLflow experiment setup
mlflow.set_tracking_uri("file:../models/mlflow_data")
mlflow.set_experiment("Demand_Forecasting_Base_Line")

2025/11/19 19:35:21 INFO mlflow.tracking.fluent: Experiment with name 'Demand_Forecasting_Base_Line' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///home/rb/Projects/portfolio/smart-supply-chain-ai/notebooks/../models/mlflow_data/630183622070888025', creation_time=1763591721374, experiment_id='630183622070888025', last_update_time=1763591721374, lifecycle_stage='active', name='Demand_Forecasting_Base_Line', tags={}>

### Data setup

In [25]:
# Prediction length
horizon = 28

# Frequency (month, year, day, min, e.g.)
freq = 'D'

# Define static columns
static_ = ['category', 'sub_category', 'shelf_life_days']

# Split data into train and validation
splitted_df = select_df.groupby('unique_id').tail(horizon)
X_df = splitted_df.drop(columns=['y'] + static_)
valid = splitted_df[['ds', 'unique_id', 'y']]
train = select_df.drop(splitted_df.index).drop(columns=['product'])
# df_future_exog


### Parameters MLFlow

In [26]:
# Models parameters
model_params = {
    'rf': params_rf,
    'xgb': params_xgb,
    'linear': {},
    'ridge': {},
}

# MLforecast parameters initialization
lag_transforms_log = {
    k: [t.__class__.__name__ for t in v] 
    for k, v in lag_transforms.items()
}
target_transforms_log = [t.__class__.__name__ for t in target_transforms]

init_params = {
    'freq': fcst.freq,
    'lags': lags,
    'lag_transforms': lag_transforms_log,
    'date_features': date_features,
    'target_transforms': target_transforms_log,
    'num_threads': num_threads,
}

# Preprocessor parameters
preprocessor_params = {
    'static_features': preprocessor.static_features,
    'imputer_cat': 'most_frequent',
    'imputer_num': 'median',
    'scaler_num': 'PowerTransformer(yeo-johnson)',
}

# Concat for MLflow
mlflow_params = {
    'MLForecast_init': init_params,
    'Preprocessor': preprocessor_params,
    'Modelos': model_params,
}

In [27]:
# Best model registration name
REGISTERED_MODEL_NAME = "BestBaselineModel"

if TRAIN_BASE:
    # Start an MLflow run
    with mlflow.start_run(run_name='MLForescast_Base_line_model') as run:
        ## Parameters Log
        # Log Generic parameters
        mlflow.log_params(mlflow_params['MLForecast_init'])
        mlflow.log_params(mlflow_params['Preprocessor'])

        # Log specific parameter for each model
        for model_name, params in mlflow_params['Modelos'].items():
            # Add prefix in MLflow UI (e.g.: 'model_rf_n_estimators')
            prefixed_params = {f"model_{model_name}_{k}": v for k, v in params.items()}
            mlflow.log_params(prefixed_params)

        ## Train and Predict
        # Train models
        print('Training models...')
        fcst.fit(train, static_features=static_)

        # Log expected features
        first_model_name = list(fcst.models_.keys())[0]
        model_pipeline = fcst.models_[first_model_name]

        preprocessor = None
        if hasattr(model_pipeline, 'named_steps'):
            for step_name in ['preprocessor', 'columntransformer', 'simplepreprocessor']:
                preprocessor = model_pipeline.named_steps.get(step_name)
                if preprocessor:
                    break

        feature_source = preprocessor or model_pipeline
        if hasattr(feature_source, 'get_feature_names_out'):
            feature_names = feature_source.get_feature_names_out().tolist()
            mlflow.log_param("expected_features", feature_names)
        else:
            original_features = train.drop(columns=['y']).columns.tolist()
            mlflow.log_param("original_features", original_features)

        # Make Future DataFrame
        print('Creating future DataFrame...')
        future_df = fcst.make_future_dataframe(h=horizon)

        # Initialise Preparator
        preparator = XDFPreparator(exog_df=X_df, json_path=json_path)

        # Create Exogenous Features
        X_df = preparator.create_future_df(future_df)

        # Predict
        print('Predicting...')
        predictions_df = fcst.predict(h=horizon, X_df=X_df)

        # Evaluation
        print('Evaluating models...')
        evaluation_df = predictions_df.merge(valid[['unique_id', 'ds', 'y']], on=['unique_id', 'ds'])

        # Consolidate Evaluation
        metrics = evaluate(
            df=evaluation_df,
            metrics=[rmse, mae, smape],
            models=predictions_df.columns.drop(['unique_id', 'ds']).tolist(),
            id_col='unique_id',
            time_col='ds',
            target_col='y'
        )


        print('Metrics summary:')
        print(metrics)

        # Metric calculation
        best_rmse = float('inf')
        best_model_name = None

        # Wide to long
        metrics_long = pd.melt(metrics, 
                            id_vars=['unique_id', 'metric'], 
                            value_vars=['linear', 'ridge', 'rf', 'xgb'],
                            var_name='model', 
                            value_name='value')


        # Pivot for metrics in columns
        metrics_pivot = metrics_long.pivot_table(index=['unique_id', 'model'], 
                                                columns='metric', 
                                                values='value').reset_index()

        # Define the aggregation functions we want to apply (mean, std, median)
        agg_funcs = {
            "mean": "mean",
            "std": "std",
            "median": "median"
        }

        # Store intermediate aggregated DataFrames
        results = []

        # Loop through each aggregation function
        for name, func in agg_funcs.items():
            metric_ = (
                metrics_pivot
                # Group by model and calculate metrics (rmse, mae, smape)
                .groupby("model")[["rmse", "mae", "smape"]]
                .agg(func)  # Apply the aggregation function
                # Rename columns to indicate the aggregation type (e.g., rmse_mean_agg)
                .rename(columns=lambda c: f"{c}_{name}_agg")
                .reset_index()  # Reset index so 'model' becomes a column again
            )
            # Append the aggregated DataFrame to the results list
            results.append(metric_)

        # Merge all aggregated DataFrames together on 'model'
        metric_agg = reduce(lambda left, right: pd.merge(left, right, on="model"), results)

        # Merge the aggregated metrics back with the original pivot table
        metric_summary = metrics_pivot.merge(metric_agg, on='model', how='left')

        # Save Summary
        filename = "/base_line_metrics_summary.csv"
        metric_summary.to_csv(path_models + filename, index=False)

        # Log all metrics in MLflow
        metrics_list = ['rmse', 'mae', 'smape']
        agg_types = list(agg_funcs.keys()) # ['mean', 'std', 'median']

        for _, row in metric_agg.iterrows():
            model_name = row['model']
            
            print(f"\n--- Log para o Modelo: {model_name} ---")

            for metric in metrics_list:
                for agg in agg_types:
                    col_name = f"{metric}_{agg}_agg"
                    log_key = f"{model_name}_{col_name}"
                    log_value = row[col_name]
                    
                    # Log no MLflow
                    mlflow.log_metric(log_key, log_value)
                    # print(f"  {log_key}: {log_value:.4f}")

            # print(f"Metrics for {model_name}: RMSE={rmse_val:.2f}, MAE={mae_val:.2f}, SMAPE={smape_val:.2f}%")

        # Best Model 
        best_model_name = metric_agg.loc[metric_agg['rmse_mean_agg'].idxmin(), 'model']
        best_rmse = metric_agg['rmse_mean_agg'].min()

        # Log best model in mlflow
        mlflow.log_metric("Final_Best_RMSE_Mean_Agg", best_rmse)
        mlflow.set_tag("Best_Model_Selected", best_model_name)
        print(f"Best Model: {best_model_name} with RMSE: {best_rmse:.2f}")

        # Access best pipeline
        model_pipeline = fcst.models_[best_model_name]

        # Features names extraction
        input_data = fcst.preprocess(train, static_features=static_).head()

        # Infer the model signature and dtypes
        signature = infer_signature(input_data, model_pipeline.predict(input_data))

        # Save and best model registry
        mlflow.sklearn.log_model(
                sk_model=model_pipeline, 
                name=f"best_pipeline_{best_model_name}", 
                registered_model_name=REGISTERED_MODEL_NAME, 
                signature=signature,
                input_example=input_data.head(5)
            )

else:
    loaded_model_uri = f"models:/{REGISTERED_MODEL_NAME}/latest"

    # Load model
    print(f"Loading model from MLflow Model Registry: {loaded_model_uri}")
    loaded_model = mlflow.sklearn.load_model(loaded_model_uri)
    
    print("Model loaded.")

Training models...
Creating future DataFrame...
Predicting...
Evaluating models...
Metrics summary:
     unique_id metric     linear      ridge         rf        xgb
0    1003530|P   rmse  73.103644  73.103867  76.973090  76.356942
1    1007004|P   rmse  86.357853  86.359719  83.526162  82.255408
2    1009699|P   rmse  34.207649  34.207791  30.824450  31.976865
3    1017723|P   rmse   7.977720   7.977247   6.295355   6.628097
4    1018159|P   rmse  71.205105  71.227422  82.966885  82.227393
..         ...    ...        ...        ...        ...        ...
505  1945145|P  smape   0.293177   0.293177   0.215536   0.232225
506  1964630|P  smape   0.235770   0.235823   0.163848   0.182403
507  1992802|P  smape   0.151282   0.151270   0.159906   0.161418
508  1996239|P  smape   0.180934   0.180931   0.129989   0.130294
509  1998069|P  smape   0.150584   0.150596   0.142048   0.140978

[510 rows x 6 columns]

--- Log para o Modelo: linear ---

--- Log para o Modelo: rf ---

--- Log para o Mo

Successfully registered model 'BestBaselineModel'.
Created version '1' of model 'BestBaselineModel'.


## Structure Future Dates

In [28]:
future_df = fcst.make_future_dataframe(h=horizon)

regressor_cols = ['ds', 'unique_id', 'is_holiday', 'in_season', 'stock_quantity_lag1', 'stock_quantity_lag2', 'stock_quantity_lag7', 'stock_quantity_expanding_mean_lag1', 'stock_quantity_rolling_mean_lag7_window_size7']

X_regressor = valid[regressor_cols].copy()

X_df_final = future_df.merge(X_regressor, on=['unique_id', 'ds'], how='left')

if X_df_final.isnull().any().any():
    print("ALERTA: O DataFrame X_df_final cont√©m NaNs! Verifique a completude do seu DataFrame 'valid'.")

KeyError: "['is_holiday', 'in_season', 'stock_quantity_lag1', 'stock_quantity_lag2', 'stock_quantity_lag7', 'stock_quantity_expanding_mean_lag1', 'stock_quantity_rolling_mean_lag7_window_size7'] not in index"

### Holidays imputation 

In [None]:
country_holidays = holidays.country_holidays('Brazil')

def classify_holiday(date):
    if date in country_holidays:
        return True
    else:
        return False

In [None]:
X_df_final['is_holiday'] = X_df_final['ds'].apply(classify_holiday)

In [None]:
X_df_final[X_df_final['is_holiday'].isna()]

### Products Seasonality Imputation

In [None]:
# List of JSON filenames (without extension) to be loaded
arch_json = ['products','products_categories', 'suppliers']

# Dictionary to store the loaded JSON content
store_catalog = {}

# Loop through each filename, build the full path, and load the JSON data
for name in arch_json:
    file_path = os.path.join(json_path, f"{name}.json")  # Construct full file path
    with open(file_path, "r", encoding="utf-8") as f:     # Open the JSON file
        store_catalog[name] = json.load(f)                        # Load and store the data under its name

In [None]:
# Create a DataFrame of products with product names as a column
products = pd.DataFrame.from_dict(store_catalog['products']).T.reset_index().rename(columns={'index': 'product'})

# Select variables
products = products[['product', 'seasonality']]

In [None]:
# Create seasonal data
pid = read_data.copy()
pid = pid[['product', 'product_id']]
pid = pid.drop_duplicates().reset_index(drop=True)
season_products = products.merge(pid, on=['product'], how='right')
season_products.drop(columns=['product'], inplace=True)

In [None]:
# Merge seasonal products with Future Dataframe
X_df_final = X_df_final.merge(season_products, left_on= ['unique_id'], right_on=['product_id'], how='left')

In [None]:
def check_seasonality(row):
    """
    Checks whether the received month of a product aligns with its seasonal availability.
    """
    received_month = row['month_name']
    seasonality_list = row['seasonality']
    
    return received_month in seasonality_list


In [None]:
# Extracts the full month name from 'received_date' to support seasonality checks
X_df_final['month_name'] = X_df_final['ds'].dt.month_name()

# Apply function
X_df_final['in_season'] = X_df_final.apply(check_seasonality, axis=1)

# Removes the temporary column after seasonality classification is complete
drop_cols = ['month_name', 'seasonality', 'product_id']
X_df_final.drop(columns=drop_cols, inplace=True)

### Stock Quantity Imputation

In [None]:
X_df_final = X_df_final.groupby('unique_id').apply(lambda g: g.ffill()).reset_index(drop=True)

In [None]:
X_df_final

In [None]:
X_df_final = X_df_final.groupby('unique_id').apply(lambda g: g.bfill()).reset_index(drop=True)

X_df_final = X_df_final.sort_values(['unique_id', 'ds'])

In [None]:
X_df_final.isna().sum()

In [None]:
preds = fcst.predict(h=horizon, X_df=X_df_final)

In [None]:
df_valid = valid[['ds', 'unique_id', 'y', 'category', 'sub_category']]

In [None]:
df_valid.merge(preds, on=['unique_id', 'ds'], how='inner')

In [None]:
import pandas as pd
import holidays
import os
import json

class XDFPreparator:
    """
    Class to prepare the X_df_final DataFrame (regressors) for mlforecast.
    """
    def __init__(self, valid_df, json_path):
        """
        Initializes the preparator with validation data and the path to the JSONs.
        
        Args:
            valid_df (pd.DataFrame): The validation DataFrame containing already computed regressors
                                     (lags, rolling means, etc.).
            json_path (str): Path to the directory containing catalog JSON files 
                             (products, products_categories, etc.).
        """
        self.valid_df = valid_df
        self.json_path = json_path
        self.store_catalog = self._load_catalog_jsons()
        self.country_holidays = holidays.country_holidays('Brazil')
        self.products_seasonality = self._prepare_seasonality_data()

    def _load_catalog_jsons(self):
        """Loads the catalog JSON files."""
        arch_json = ['products', 'products_categories', 'suppliers']
        catalog = {}
        for name in arch_json:
            file_path = os.path.join(self.json_path, f"{name}.json")
            with open(file_path, "r", encoding="utf-8") as f:
                catalog[name] = json.load(f)
        return catalog

    def _prepare_seasonality_data(self):
        """Prepares the DataFrame with product seasonality information."""
        # 1. Extract seasonality from the product catalog
        products = pd.DataFrame.from_dict(self.store_catalog['products']).T.reset_index().rename(columns={'index': 'product'})
        products = products[['product', 'seasonality']]
        
        # 2. Merge with product_id from valid_df
        # Assuming 'valid_df' has the columns 'product' and 'product_id'
        pid = self.valid_df[['product', 'product_id']].drop_duplicates().reset_index(drop=True)
        season_products = products.merge(pid, on=['product'], how='right')
        season_products.drop(columns=['product'], inplace=True)
        return season_products

    def _classify_holiday(self, date):
        """Classifies whether a date is a national holiday."""
        return date in self.country_holidays

    def _check_seasonality(self, row):
        """Checks if the row's month is in the product's seasonality list."""
        received_month = row['month_name']
        seasonality_list = row['seasonality']
        return received_month in seasonality_list

    def _impute_holidays(self, df):
        """Imputes the 'is_holiday' column."""
        df['is_holiday'] = df['ds'].apply(self._classify_holiday)
        return df

    def _impute_seasonality(self, df):
        """Imputes the 'in_season' column."""
        # Merge with seasonality data
        df = df.merge(self.products_seasonality, 
                      left_on=['unique_id'], 
                      right_on=['product_id'], 
                      how='left')
        
        # Create month name column
        df['month_name'] = df['ds'].dt.month_name()
        
        # Apply seasonality check function
        df['in_season'] = df.apply(self._check_seasonality, axis=1)
        
        # Clean up temporary columns
        drop_cols = ['month_name', 'seasonality', 'product_id']
        df.drop(columns=drop_cols, inplace=True)
        return df

    def _impute_stock_quantity_lags(self, df):
        """Imputes lag and rolling mean columns for stock_quantity."""
        # Forward-fill for middle values
        df = df.groupby('unique_id').apply(lambda g: g.ffill()).reset_index(drop=True)
        # Backward-fill for initial values (that couldn't be filled)
        df = df.groupby('unique_id').apply(lambda g: g.bfill()).reset_index(drop=True)
        # Reorder at the end
        df = df.sort_values(['unique_id', 'ds'])
        return df

    def create_future_df(self, future_df):
        """
        Creates the final regressor DataFrame for mlforecast.

        Args:
            future_df (pd.DataFrame): Future DataFrame generated by fcst.make_future_dataframe(h=horizon).
            
        Returns:
            pd.DataFrame: The X_df_final with all regressors filled.
        """
        regressor_cols = [
            'ds', 'unique_id', 'is_holiday', 'in_season', 
            'stock_quantity_lag1', 'stock_quantity_lag2', 'stock_quantity_lag7', 
            'stock_quantity_expanding_mean_lag1', 'stock_quantity_rolling_mean_lag7_window_size7'
        ]
        
        # 1. Select and perform initial merge with existing regressors
        X_regressor = self.valid_df[regressor_cols].copy()
        X_df_final = future_df.merge(X_regressor, on=['unique_id', 'ds'], how='left')

        if X_df_final.isnull().any().any():
             print("Warning: Lag/rolling mean regressors will have NaNs in the future portion.")

        # 2. Impute Holidays
        X_df_final = self._impute_holidays(X_df_final)

        # 3. Impute Seasonality
        X_df_final = self._impute_seasonality(X_df_final)
        
        # 4. Impute Lags and Rolling Means (stock_quantity)
        X_df_final = self._impute_stock_quantity_lags(X_df_final)
        
        print(f"X_df_final completed. Remaining NaNs: {X_df_final.isnull().sum().sum()}")
        return X_df_final
