# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **stock_quantity**
- Target Variable for Demand Forecasting: **sales_volume**

### Metrics for model evaluation
- RMSE - Root Mean Squared Error
- MAE - Mean Absolute Error

# DATA ACQUISITION
## Import Libraries

In [1]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import subprocess
import json

# Specialized Libraries
import mlflow
import logging
from sktime.forecasting.compose import ColumnEnsembleForecaster, TransformedTargetForecaster, ForecastingPipeline, make_reduction
from sktime.transformations.series.boxcox import BoxCoxTransformer  # Log-Transform (Target)
from sktime.transformations.series.impute import Imputer
from sktime.transformations.compose import FeatureUnion
from sktime.transformations.panel.reduce import Tabularizer
from sktime.transformations.series.holiday import HolidayFeatures  # Date/Time Features
from sktime.split import temporal_train_test_split
from holidays import country_holidays
from xgboost import XGBRegressor

# To handle heterogeneous columns in sktime
from sktime.transformations.compose import ColumnEnsembleTransformer, TransformerPipeline
from sktime.transformations.series.dummies import SeasonalDummiesOneHot


# Notebook-wide configuration: warnings, root-logger level and pandas display options
import warnings
warnings.filterwarnings('ignore')  # silences every Python warning for the rest of the session
logging.getLogger().setLevel(logging.WARNING)  # root logger: emit WARNING and above only
pd.set_option('display.max_columns', None)  # never truncate columns when a DataFrame is displayed


## Load Data

In [2]:
# Relative locations of the processed data, documentation and model folders
data_path, docs_path, path_models = (
    os.path.join('../data', 'processed/'),
    os.path.join('../docs/'),
    os.path.join('../models/'),
)

In [3]:
# Load the preprocessed DataFrame from a pickle file and create a copy for further manipulation.
# NOTE(review): unpickling executes arbitrary code — only load files produced by this project's
# own preprocessing step.
df_complete = pd.read_pickle(data_path + 'full_data_cleaned.pkl')
# Work on a copy so df_complete stays untouched by the in-place edits below
df = df_complete.copy()


In [4]:
# Preview the first three rows of the working copy
df.head(3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status
0,2022-12-09,2022-12-07,False,Dijon Mustard,1070686|P,Pantry,Condiments,365,90,unit,3,Condiment Masters,1184993|S,75,60,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,50,4,240,300,317,2,Safe
1,2022-12-09,2022-12-07,True,Orange,1362741|P,Fresh Foods,Fruits,14,7,lb,3,OrchardBest Fruits,1677419|S,200,120,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,590,6,3678,4291,4608,2,Safe
2,2022-12-09,2022-12-06,False,Asparagus,1308864|P,Fresh Foods,Vegetables,5,2,lb,5,Asparagus Experts,1197859|S,115,18,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,532,4,4652,5815,6765,3,Nearing


# Feature Engineering – Removing data leakage

In [5]:
# Extract key identifier and name columns for products and suppliers
# df_ids_names = df[['product', 'product_id', 'supplier', 'supplier_id']]


In [6]:
# NOTE(review): every drop() call in this cell is commented out, so no column is removed here;
# the only statement that runs is the distance_km cast.

# Drop columns derived from statistical and business-specific calculations (numeric features)
# df.drop(columns=['maximum_days_on_sale', 'min_stock', 'max_stock', 'delivery_lag'], inplace=True)

# Drop categorical columns related to product classification and status
# df.drop(columns=['unit_of_measurement', 'temperature_classification', 'precipitation_classification', 'wind_classification', 'expiration_status'], inplace=True)

# Convert distance column to integer format
# (astype(int) raises if the column contains NaN — presumably cleaned upstream; verify)
df['distance_km'] = df['distance_km'].astype(int)

# Drop datetime, boolean and string-based descriptive columns
# df.drop(columns=['lpo', 'product_id', 'supplier_id', 'storage_recommendation'], inplace=True)


# 1. Feature Definition

In [7]:
# Names of the numeric exogenous columns, and the matching slice of df
numeric_columns = [
    'shelf_life_days',
    'distance_km',
    'moq',
    'lead_time',
    'stock_quantity',
]
numeric_features = df.loc[:, numeric_columns]

In [8]:
# Names of the categorical / boolean exogenous columns, and the matching slice of df
categorical_columns = [
    'in_season',
    'product',
    'category',
    'sub_category',
    'supplier_rating',
    'supplier',
    'weather_severity',
    'day_classification',
    'is_holiday',
    'is_weekend',
    'sales_demand',
]
categorical_features = df.loc[:, categorical_columns]

# 2. Feature Transformation Pipeline (X)

In [9]:
# Numeric Pipeline: fill missing numeric values using the 'mean' imputation method
numeric_transformer = Imputer(method='mean')

In [10]:
# Categorical Pipeline (sktime-compatible): forward-fill gaps, then add dummy columns.
# NOTE(review): SeasonalDummiesOneHot looks like it derives seasonal dummies from the time
# index (freq='D') rather than one-hot encoding the category values in these columns —
# confirm against the sktime docs; a real categorical encoder may be needed here.
categorical_transformer = TransformerPipeline(steps=[
    ('imputer', Imputer(method='ffill')),  # carry the last seen value forward
    ('onehot', SeasonalDummiesOneHot(freq='D'))
])

In [11]:
# Combine columns: route each group of columns through its own transformer.
# The third element of every tuple is a column selector, so the column-name lists are
# passed here — not the DataFrame slices (numeric_features / categorical_features).
preprocessor = ColumnEnsembleTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

# 3. Forecasting Pipeline (Target Y + X + XGBoost)

In [12]:
# Target Transformation (Log-Transformation to stabilize variance)
# Missing target values are mean-imputed first, then a Box-Cox transform with lambda fixed
# at 0 is applied, which is the natural log of y.
# NOTE(review): no explicit offset is configured here, and log is undefined for y <= 0 —
# confirm sales_volume is strictly positive (or how the transformer handles zeros).
target_transformer = TransformerPipeline(steps=[
    ('imputer', Imputer(method="mean")),
    ('log_transform', BoxCoxTransformer(method="fixed", lambda_fixed=0))  # log(y)
])


In [None]:
# Hyperparameters are defined once and reused both by the estimator below and by the
# MLflow parameter logging, so the logged values can never drift from the trained ones.
n_est = 200  # Increased to start with a slightly stronger model
rate = 0.05

# Base Estimator (XGBoost)
xgb_estimator = XGBRegressor(
    n_estimators=n_est,
    learning_rate=rate,
    random_state=42,  # fixed seed for reproducible runs
    n_jobs=-1,        # use all available cores
)


In [14]:
# Forecasting Pipeline (Combines X with XGBoost)
# This step uses the time index (received_date) to create time-based features
# and combines them with the exogenous X features before the reduced XGBoost forecaster.
xgb_forecaster = ForecastingPipeline(
    steps=[
        # 1. Creates holiday/weekend features from the index.
        #    keep_original_columns=True is required: by default HolidayFeatures returns
        #    only the generated columns, which would remove the numeric/categorical
        #    columns that the preprocessor step selects by name.
        # NOTE(review): the calendar is Brazil ("BR") — confirm this matches the market
        # the sales data comes from.
        ("time_features", HolidayFeatures(
            calendar=country_holidays(country="BR"),
            include_weekend=True,
            keep_original_columns=True,
        )),
        # 2. Transforms X columns into regression-ready format.
        # NOTE(review): verify whether the preprocessor passes the generated holiday
        # columns through or drops them (it only lists the original feature columns).
        ("preprocessor", preprocessor),
        # 3. Regressor (XGBoost) wrapped as a forecaster via reduction
        ("xgb", make_reduction(
            xgb_estimator,
            strategy="direct",
            window_length=28,  # 28 daily lags = 4 full weekly cycles
        )),
    ]
)


In [15]:
# Per-series pipeline: the target (y) transformation wrapped around the XGBoost
# forecasting pipeline, with the same step names as before
single_series_forecaster = TransformedTargetForecaster(
    [
        ("target_transformer", target_transformer),
        ("forecaster", xgb_forecaster),
    ]
)


# 4. Final Ensemble

In [16]:
# NOTE: despite its name, this list holds the exogenous *feature* column names, not the
# per-product target series. It is kept only in case other cells reference it and must
# not be used to key the ensemble (none of these names are columns of y).
sales_target_columns = numeric_columns + categorical_columns

# One forecaster per target series (product).
# Passing a single forecaster (instead of a list of (name, forecaster, column) tuples)
# makes ColumnEnsembleForecaster fit an independent clone on every column of the wide
# target frame passed to fit(), so the series names need not be known at this point.
fc_xgb_final = ColumnEnsembleForecaster(forecasters=single_series_forecaster)


# Transform for Wide Format
## 1. Data Preparation (Target - y)

In [None]:
# Build the wide target frame from the working DataFrame `df`
# (the previously referenced `df_original` is never defined in this notebook).

# Product identifier column: the key that distinguishes the individual series
ID_COLUMN = 'product_id'  # or 'product', whichever is used to tell the series apart

# Target column
TARGET_COLUMN = 'sales_volume'

# Pivot sales to wide format: one row per received_date, one column per product
y_sales_train = (
    df.pivot_table(
        index='received_date',
        columns=ID_COLUMN,
        values=TARGET_COLUMN,
        # Aggregate when a date/product pair has several rows; sum because it is demand
        aggfunc='sum',
    )
)

Remove data with dates on or after May 30, 2025, due to inconsistencies identified during exploratory data analysis (EDA).


# 1. Make sure the index is a datetime index, as sktime expects
y_sales_train.index = pd.to_datetime(y_sales_train.index)

# 2. Prefix every product column so the series are easy to reference in the
#    ColumnEnsembleForecaster (optional, but recommended)
y_sales_train.columns = list(map(lambda c: f"series_{c}", y_sales_train.columns))

# y_sales_train is the 'y' input for fit

## 2. Processing of Exogenous Features (X)

In [None]:
# 1. Select the feature columns by NAME (plus the date and product id used for the index).
#    numeric_features / categorical_features are DataFrame slices and cannot be
#    concatenated with a list; the name lists numeric_columns / categorical_columns are
#    what is needed here. 'received_date' must be selected too because it is used below.
FEATURES = ['received_date', 'product_id'] + numeric_columns + categorical_columns

# 2. Build the feature DataFrame from the working frame `df`
#    (the previously referenced `df_original` is never defined in this notebook)
X_sales_train_long = df[FEATURES].copy()

# 3. Set the composite (date, product) index
X_sales_train_long['received_date'] = pd.to_datetime(X_sales_train_long['received_date'])
X_sales_train_long = X_sales_train_long.set_index(['received_date', 'product_id'])

# X_sales_train_long is the 'X' input for fit.
# NOTE(review): X is long (date, product) while y is wide (one column per product);
# confirm this layout is what the forecaster expects before fitting.

## Split data

In [None]:
# Chronological hold-out split of the wide target frame.
# temporal_train_test_split requires at least y; calling it with no arguments raises a
# TypeError. With neither test_size nor train_size given, sktime's default split
# fraction is used.
# NOTE(review): the exogenous frame is not split here because it is in long format;
# split it on the same cut-off date once its layout is aligned with y.
y_train, y_test = temporal_train_test_split(y_sales_train)

# Start Mlflow

In [None]:
# The sktime MLflow flavor ships with sktime itself (there is no `mlflow.sktime` module)
from sktime.utils import mlflow_sktime

with mlflow.start_run(run_name="XGBoost_Initial_Baseline"):
    # 1. Log XGBoost hyperparameters
    mlflow.log_params({"n_estimators": n_est, "learning_rate": rate})

    # 2. Log the features used, as JSON lists of column NAMES.
    #    (numeric_features / categorical_features are whole DataFrames; logging them
    #    would store a truncated table dump instead of the feature list.)
    mlflow.log_param("numeric_features", json.dumps(numeric_columns))
    mlflow.log_param("categorical_features", json.dumps(categorical_columns))
    mlflow.log_param("target_transformation", "Log_Fixed_0")

    # 3. Train the model
    # TODO: fit on the training split once y and X are aligned
    # fc_xgb_final.fit(y=y_sales_train, X=X_sales_train)

    # 4. Predict and Evaluate
    # ... y_pred = fc_xgb_final.predict(fh=fh)
    # ... rmse_overall = root_mean_squared_error(...)

    # 5. Log metrics
    # NOTE(review): rmse_overall / mae_overall only exist once steps 3-4 are implemented;
    # until then these two lines raise NameError.
    mlflow.log_metric("RMSE_Overall", rmse_overall)
    mlflow.log_metric("MAE_Overall", mae_overall)

    # 6. Save the model (THE MOST IMPORTANT PART!)
    # Saves the entire pipeline (transformers, XGBoost, ensemble)
    mlflow_sktime.log_model(
        sktime_model=fc_xgb_final,
        artifact_path="xgb_sktime_pipeline",
        # registered_model_name="Demand_Forecasting_XGBoost"
    )
