# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **stock_quantity**
- Target Variable for Demand Forecasting: **y**

### Metrics for model evaluation
- RMSE - Root Mean Squared Error
- MAE - Mean Absolute Error

# DATA ACQUISITION
## Import Libraries

In [1]:
# Standard Libraries
import os
import dill
import shutil
import numpy as np
import pandas as pd
import plotly.express as px
from functools import reduce
from typing import Dict, List, Any

# Specialized Libraries
import mlflow
import mlflow.pyfunc
from mlflow.models import infer_signature
from mlflow.client import MlflowClient
import mlflow.sklearn

from sklearn.linear_model import LinearRegression, Ridge 
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

from mlforecast import MLForecast
from mlforecast.target_transforms import AutoDifferences, LocalRobustScaler 
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.auto import AutoLinearRegression, AutoRidge, AutoRandomForest, AutoXGBoost
from utilsforecast.evaluation import evaluate
from utilsforecast.losses import smape, rmse, mae
from utilsforecast.plotting import plot_series

from xgboost import XGBRegressor

# To import customized class and function
from smart_supply_chain_ai.utils.preprocess_functions import TrainPredictPreprocessor, SimplePreprocessor, XDFPreparator, MLflowArtifactManager
from smart_supply_chain_ai.utils.process_class import MLflowForecastManager


# Notebook-wide warning, logging and display settings
import logging
import warnings

# Suppress every warning category for the rest of the session
warnings.simplefilter('ignore')
# Let only ERROR-level (and above) records through the root logger
logging.root.setLevel(logging.ERROR)
# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)


## Load Data

In [2]:
# Folder locations used by the notebook, all relative to its working directory
path_models = os.path.join('..', 'models')
docs_path = os.path.join('..', 'docs')
data_path = os.path.join('..', 'data', 'processed')
# Passed to the forecast manager as `json_path`; note the trailing separator in 'utils/'
json_path = os.path.join('../src', 'smart_supply_chain_ai', 'utils/')

In [3]:
# Load the pickled frames: `read_data` feeds training, `compare_data` is kept for comparison.
# Paths are built with os.path.join, consistent with how `data_path` itself is defined.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
compare_data = pd.read_pickle(os.path.join(data_path, 'data_for_compare.pkl'))
read_data = pd.read_pickle(os.path.join(data_path, 'data_for_train.pkl'))

# Load the product seasonality table from CSV
product_seasonality = pd.read_csv(os.path.join(data_path, 'product_seasonality.csv'))

In [4]:
# Work on independent copies so the frames loaded from disk stay untouched
df, df_compare = read_data.copy(), compare_data.copy()

In [5]:
# Peek at the last three rows of the historical data
df.tail(n=3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70876,2025-05-09,2025-05-04,False,All-Purpose Flour,1940872|P,Pantry,Baking Supplies,365.0,60.0,lb,3,BakeWell Supplies,1552913|S,90.0,50.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,9,5,44,55,51,5,Safe,28.095371,36.0
70877,2025-05-09,2025-05-02,False,Popcorn Kernels,1992802|P,Pantry,Snacks,365.0,90.0,lb,4,SnackTime Distributors,1414750|S,80.0,110.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,135,4,848,1060,1342,7,Safe,25.934508,39.0
70878,2025-05-09,2025-05-07,False,Plum,1998069|P,Fresh Foods,Fruits,5.0,2.0,lb,4,Stone Fruit Specialists,1405032|S,165.0,38.0,Refrigerated,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Weekdays,False,False,Normal,754,6,6156,7182,6187,2,Safe,19.17862,53.0


In [6]:
# Peek at the first three rows of the comparison data
df_compare.head(n=3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status,inventory_turnover_rate,doi_inventory_turnover
70879,2025-05-11,2025-05-01,False,Arborio Rice,1003530|P,Pantry,Grains & Rice,730.0,180.0,lb,4,GrainWorld Distributors,1792439|S,150.0,200.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,186,5,468,585,662,10,Safe,28.680627,35.0
70880,2025-05-11,2025-05-06,False,Canned Tomatoes,1007004|P,Pantry,Canned Goods,1095.0,90.0,unit,2,Wholesale Warehouse,1363063|S,25.0,300.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,243,4,285,380,308,5,Safe,64.442801,15.0
70881,2025-05-11,2025-05-05,False,Canned Tuna,1017723|P,Pantry,Canned Fish,1095.0,90.0,unit,4,PantryEssentials Ltd.,1141220|S,95.0,130.0,Room Temperature,Mild to Temperate,No precipitation,Calm / Light Breeze,Normal,Sunday,False,True,High,25,4,56,130,89,6,Safe,17.540236,58.0


# Prepare Data for Time Series Analysis

In [7]:
# Use the column names mlforecast works with: 'ds' for the date, 'unique_id' for the series key
mlforecast_names = {'received_date': 'ds', 'product_id': 'unique_id'}
df = df.rename(columns=mlforecast_names)

In [8]:
# Encode each categorical column with its OWN LabelEncoder.
# A single shared encoder is refit on every column and ends up holding only the
# classes of the last one, so the earlier columns could no longer be decoded
# with inverse_transform. Keeping one fitted encoder per column avoids that.
categorical_cols = ['category', 'sub_category', 'weather_severity']
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[f'{col}_encoded'] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Backward compatibility: `le` used to end up fitted on the last encoded column
le = label_encoders[categorical_cols[-1]]

In [9]:
# Columns kept for sales-volume forecasting (date, series key, features and the target)
sales_columns = [
    'ds',
    'unique_id',
    'in_season',
    'category_encoded',
    'sub_category_encoded',
    'shelf_life_days',
    'stock_quantity',
    'sales_volume',
]
sales_df = df.loc[:, sales_columns].copy()

In [10]:
# The forecasting target (sales volume) is named 'y'
sales_df = sales_df.rename(columns={'sales_volume': 'y'})

In [11]:
# Parse 'ds' as datetimes; values that cannot be parsed become NaT (errors='coerce')
parsed_ds = pd.to_datetime(sales_df['ds'], errors='coerce')
sales_df = sales_df.assign(ds=parsed_ds)

In [12]:
# Pairwise correlations over every column that is not category, object or datetime typed
excluded_dtypes = ['category', 'object', 'datetime']
numeric_view = sales_df.select_dtypes(exclude=excluded_dtypes)
correlation_matrix = numeric_view.corr()

In [13]:
# Table of strong positive correlations: keep coefficients >= 0.7, blank every exact 1.0
# (the self-correlations on the diagonal) and drop columns/rows left completely empty
is_strong = correlation_matrix >= 0.7
corr1 = correlation_matrix.where(is_strong).replace(1.0, np.nan).dropna(how='all', axis=1)
corr1.dropna(how='all').replace(np.nan, '')

In [14]:
# Keep only the cells strictly below the diagonal so every pair is shown once
mask = np.tril(np.ones(correlation_matrix.shape), k=-1)
masked_corr = correlation_matrix.where(mask == 1)

# Heatmap appearance: reversed Red-Blue scale pinned to [-1, 1], values printed on the cells
heatmap_options = dict(
    title='Correlation Matrix - Sales Volume',
    color_continuous_scale='RdBu_r',
    aspect="auto",
    text_auto=True,
    zmin=-1,
    zmax=1,
)
fig_corr = px.imshow(masked_corr, **heatmap_options)

# Square 800x800 figure, then render the interactive heatmap
fig_corr.update_layout(width=800, height=800)
fig_corr.show()

# Set Up Machine Learning Models

## Applying Feature Engineering and Partitioning Data for Model Training

In [15]:
# Raw column names -> names used for forecasting (date, series key, target)
pattern = {'received_date': 'ds', 'product_id': 'unique_id', 'sales_volume': 'y'}

# Raw columns carried into the modelling frames
maintain_cols = [
    'received_date', 'product_id', 'sales_volume',
    'category', 'sub_category', 'is_holiday',
    'in_season', 'shelf_life_days', 'stock_quantity',
]

# Same selection and renaming for the training data and the comparison data
select_df, predict_df = (
    frame[maintain_cols].rename(columns=pattern)
    for frame in (read_data, compare_data)
)

In [16]:
# Count the rows in the prediction data that repeat an earlier (ds, unique_id) pair
predict_df.duplicated(['ds', 'unique_id']).sum()

651

In [17]:
# Count the rows in the training data that repeat an earlier (ds, unique_id) pair
select_df.duplicated(['ds', 'unique_id']).sum()

5380

In [18]:
# Order both frames chronologically and rebuild a clean 0..n-1 index.
# A stable sort keeps rows sharing the same 'ds' in their original relative order,
# so the 'first'/'last' aggregations applied afterwards to duplicated
# (ds, unique_id) rows do not depend on the internals of the sorting algorithm.
select_df = select_df.sort_values(by='ds', kind='stable').reset_index(drop=True)
predict_df = predict_df.sort_values(by='ds', kind='stable').reset_index(drop=True)

In [19]:
# How rows sharing the same (ds, unique_id) are collapsed into one:
# the target is summed, descriptive columns take the last value seen,
# and stock_quantity takes the first value seen.
agg_rules = {
    'y': 'sum',
    **dict.fromkeys(
        ['category', 'sub_category', 'is_holiday', 'in_season', 'shelf_life_days'],
        'last',
    ),
    'stock_quantity': 'first',
}


def _collapse_duplicates(frame):
    """Return `frame` with one row per (ds, unique_id), aggregated with `agg_rules`."""
    grouped = frame.groupby(['ds', 'unique_id'], as_index=False)
    return grouped.agg(agg_rules).reset_index(drop=True)


select_df = _collapse_duplicates(select_df)
predict_df = _collapse_duplicates(predict_df)

In [20]:
# Confirm the aggregation left at most one row per (ds, unique_id)
train_dupes = select_df.duplicated(subset=['ds', 'unique_id']).sum()
predict_dupes = predict_df.duplicated(subset=['ds', 'unique_id']).sum()
print(f"Duplicates in data for train: {train_dupes}")
print(f"Duplicates in data for predict: {predict_dupes}")

Duplicates in data for train: 0
Duplicates in data for predict: 0


## Set Up MLForecast Models to Predict Sales Volume on Train and Test Sets

In [21]:
# +++ Prepare MLForecast +++
# Build the feature preprocessor with its default arguments.
# NOTE(review): this single instance is passed to all four pipelines below —
# confirm that MLForecast clones each model before fitting so they do not share state.
pipeline_preprocessor = SimplePreprocessor()

# Configure model parameters (the seed is shared by the random forest and XGBoost models)
random_state = 42
params_rf = {
    "n_estimators": 400,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
}
params_xgb = {
    'objective':'reg:squarederror',  
    'n_estimators':400,              
    'learning_rate':0.05,             
    'max_depth':None,                   
    'subsample':0.8,                 
    'colsample_bytree':0.8,          
}

# Define model pipelines: each one is the preprocessor followed by a regressor.
# The dictionary keys are the model names that appear in the metric tables.
pipes_models = {
    'linear': make_pipeline(pipeline_preprocessor, LinearRegression()),
    'ridge': make_pipeline(pipeline_preprocessor, Ridge()),
    'rf': make_pipeline(pipeline_preprocessor, RandomForestRegressor(**params_rf, random_state=random_state)),
    'xgb': make_pipeline(pipeline_preprocessor, XGBRegressor(**params_xgb, random_state=random_state))
}

# Define target transformations, applied in list order.
# presumably AutoDifferences(3) allows up to 3 differences per series and
# LocalRobustScaler(scale='iqr') scales each series by its IQR — confirm in the mlforecast docs
target_transforms = [
    AutoDifferences(3),           
    LocalRobustScaler(scale='iqr')
    ]

# Define series transformations, keyed by the lag they are computed on
lag_transforms = {
    1: [ExpandingMean()],              # Apply ExpandingMean to lag 1
    2: [RollingMean(window_size=2)],   # Apply RollingMean(2) to lag 2
    7: [RollingMean(window_size=7)],   # Apply RollingMean(7) to lag 7
}

# Plain lag features, calendar features and thread count handed to MLForecast
lags=[1, 2, 7]
date_features=['dayofweek', 'month']
num_threads=4

# Configure MLforecast for model training with temporal feature engineering (daily frequency)
fcst = MLForecast(
    models=pipes_models,
    freq='D',
    lags=lags,
    lag_transforms=lag_transforms,
    date_features=date_features,
    num_threads=num_threads,
    target_transforms=target_transforms,
)

# Start MLflow

In [22]:
# Track MLflow runs in a local file store under ../models
tracking_uri = "file:../models/mlflow_data"
mlflow.set_tracking_uri(tracking_uri)

### MLflow Parameters

In [23]:
# Hyperparameters per model name; the linear models are created without arguments
model_params = {'rf': params_rf, 'xgb': params_xgb, 'linear': {}, 'ridge': {}}

# Transform objects are recorded by class name rather than as objects
target_transforms_log = [type(transform).__name__ for transform in target_transforms]
lag_transforms_log = {
    lag: [type(transform).__name__ for transform in transforms]
    for lag, transforms in lag_transforms.items()
}

# Settings the MLForecast object was initialised with
init_params = dict(
    freq=fcst.freq,
    lags=lags,
    lag_transforms=lag_transforms_log,
    date_features=date_features,
    target_transforms=target_transforms_log,
    num_threads=num_threads,
)

# Preprocessor description.
# NOTE(review): the imputer/scaler entries are hard-coded labels — confirm they
# match what SimplePreprocessor actually does.
preprocessor_params = dict(
    static_features=pipeline_preprocessor.static_features,
    imputer_cat='most_frequent',
    imputer_num='median',
    scaler_num='PowerTransformer(yeo-johnson)',
)

# Everything bundled into the single dictionary handed to the forecast manager
mlflow_params = {
    'MLForecast_init': init_params,
    'Preprocessor': preprocessor_params,
    'Modelos': model_params,
}

### Prepare data for training, validation, and prediction

In [24]:
# Number of trailing rows per series held out for validation (also the forecast horizon)
horizon = 28

# Series frequency code ('D', matching the freq given to MLForecast)
frequency = 'D'

# Columns passed to the manager as static features
features_static = ['category', 'sub_category', 'shelf_life_days']

# Hold out the last `horizon` rows of every series for validation and train on the rest
valid_df = select_df.groupby('unique_id').tail(horizon)
train_df = select_df.drop(index=valid_df.index)

In [25]:
# 1. Instantiate the forecast manager.
# The experiment name is spelled 'Forescast_base'; it is kept as-is because MLflow
# creates a new experiment whenever the name does not already exist.
manager_config = dict(
    model_name='Linear_Ridge_Forest_Xgb_Base',
    experiment_name='Forescast_base',
    path_models=path_models,
    json_path=json_path,
    horizon=horizon,
    product_seasonality=product_seasonality,
    fcst=fcst,
)
base_manager = MLflowForecastManager(**manager_config)

2025/12/09 19:26:55 INFO mlflow.tracking.fluent: Experiment with name 'Forescast_base' does not exist. Creating a new experiment.


Manager initialized. MLflow Experiment: Forescast_base


In [26]:
# Set to False to load a previously logged run instead of retraining
TRAIN_BASE = True

# Bind `loaded_manager` in BOTH branches. It used to be assigned only in the
# loading branch, so a later cell calling `loaded_manager.predict(...)` raised
# NameError after a fresh "Run All" with TRAIN_BASE = True.
loaded_manager = base_manager

# Train or load model
if TRAIN_BASE:
    # 1. Train the models; the returned value is the MLflow run ID
    run_id = base_manager.train(
        train_df,
        run_name="First_Training",
        mlflow_params=mlflow_params,
        features_static=features_static,
    )

    # 2. Forecast over the validation window
    predictions = base_manager.predict(df_predict=valid_df)

    # 3. Compute the metrics against the validation data and save/log the artifacts
    base_manager.log_and_save(df_validation=valid_df, df_predict=predictions, run_id=run_id)
else:
    # Loading (in another script/session)
    # TODO: fill in the run ID of the MLflow run to load — it is empty here
    loaded_manager.load(run_id="")


Applying pre-processing and feature extraction...
Training models...
Training complete. Run ID: 07b158f5aba54377b4c1ca74d038eba8
Creating future DataFrame...
Applying XDFPreparator...
Applying TrainPredictPreprocessor (transform) to create lags...
Predicting...
Prediction finished.
Calculating and logging metrics...
--- Result of metric_agg (Models and Mean RMSE) ---
metric   model  rmse_mean_agg
1           rf     258.622493
2        ridge     262.897638
0       linear     262.903469
3          xgb     271.987969
---------------------------------------------------
Metrics successfully logged. Best Model: rf
Saving MLForecast object and artifacts...
Artifacts successfully logged to MLflow!
Temporary artifact directory removed.


In [None]:
# 2. Prediction with loaded model
# Relies on `loaded_manager` having been bound by the TRAIN_BASE cell.
# NOTE(review): `df_predict_today` is not defined in any visible cell, so this
# cell cannot run on a fresh kernel as written — define it (or confirm which
# prepared frame is meant) before executing.
today_forecast = loaded_manager.predict(df_predict_today)