# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **stock_quantity**
- Target Variable for Demand Forecasting: **sales_volume**

### Metrics for model evaluation
- RMSE - Root Mean Squared Error
- MAE - Mean Absolute Error

# DATA ACQUISITION
## Import Libraries

In [1]:
# Standard Libraries
import pandas as pd
import numpy as np
import os


# Specialized Libraries
import mlflow
from mlflow.models import infer_signature
import logging
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

# To import customized class and function
from smart_supply_chain_ai.utils.create_data_functions import DateFeatureExtractor, LagFeatureCreator


# Notebook display, warning and logging configuration
import warnings
# Keep cell output readable: hide library warnings, limit the root logger to
# WARNING and above, and show every column when a DataFrame is displayed.
warnings.filterwarnings(action='ignore')
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)
pd.set_option('display.max_columns', None)

# Local MLflow store: runs and models are written under ../models/mlflow_data
relative_path = os.path.join("..", "models", "mlflow_data")

# MLflow is pointed at the absolute form of that folder through a file: URI
tracking_uri = "file:" + os.path.abspath(relative_path)
mlflow.set_tracking_uri(tracking_uri)


## Load Data

In [2]:
# Project folders, all resolved relative to the parent of the working directory
project_root = '..'
data_path = os.path.join(project_root, 'data', 'processed')   # cleaned datasets
docs_path = os.path.join(project_root, 'docs')                # documentation
path_models = os.path.join(project_root, 'models')            # model artifacts

In [3]:
# Load the preprocessed DataFrame from a pickle file and keep the loaded frame
# untouched by working on a copy.
# The file path is built with os.path.join (as the other project paths are)
# instead of a hard-coded '/' separator.
# NOTE: unpickling can execute arbitrary code - only load files produced by
# this project's own preprocessing step.
df_complete = pd.read_pickle(os.path.join(data_path, 'full_data_cleaned.pkl'))
df = df_complete.copy()


In [4]:
# Preview the first three rows to sanity-check the loaded columns and values
df.head(3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status
0,2022-12-09,2022-12-07,False,Dijon Mustard,1070686|P,Pantry,Condiments,365,90,unit,3,Condiment Masters,1184993|S,75,60,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,50,4,240,300,317,2,Safe
1,2022-12-09,2022-12-07,True,Orange,1362741|P,Fresh Foods,Fruits,14,7,lb,3,OrchardBest Fruits,1677419|S,200,120,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,590,6,3678,4291,4608,2,Safe
2,2022-12-09,2022-12-06,False,Asparagus,1308864|P,Fresh Foods,Vegetables,5,2,lb,5,Asparagus Experts,1197859|S,115,18,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,532,4,4652,5815,6765,3,Nearing


# Feature Engineering

In [5]:
# Store the distance column with an integer dtype (re-runnable: builds a new frame
# instead of writing into the existing one)
df = df.assign(distance_km=df['distance_km'].astype(int))

# Model for Training

In [6]:
# Boosting hyperparameters: number of trees in the ensemble and the learning
# rate that scales each tree's contribution
n_est, rate = 200, 0.05

In [7]:
# XGBoost regressor configuration
xgb_params = {
    'n_estimators': n_est,    # number of boosted trees
    'learning_rate': rate,    # shrinkage applied to each tree
    'random_state': 42,       # fixed seed for reproducible fits
    'n_jobs': -1,             # use all available cores
}
xgb_estimator = XGBRegressor(**xgb_params)

# Pipeline

In [8]:
# Stage 1: feature engineering only (no scaling or encoding happens here)

# Lagged copies of stock_quantity, computed per product_id
lag_step = LagFeatureCreator(
    shift_column='stock_quantity',
    group_column='product_id',
    lags=[1, 7, 14, 28],
)

# Calendar features derived from received_date
date_step = DateFeatureExtractor(date_column='received_date')

feature_engineering_pipeline = Pipeline(steps=[
    ('lag_creator', lag_step),
    ('date_features', date_step),
])

# Stage 2: scaling and encoding of the engineered frame.
# Any column not listed in one of the groups below is dropped (remainder='drop').

# ----- numerical column groups -----
minmax_columns = ['shelf_life_days', 'moq']
# presumably year/month/day/day_of_week are produced by the date feature step - verify
standard_columns = ['distance_km', 'lead_time', 'year', 'month', 'day', 'day_of_week']

# ----- lag columns created in stage 1: median-impute, then robust-scale -----
lag_columns = ['stock_quantity_lag_1', 'stock_quantity_lag_7', 'stock_quantity_lag_14', 'stock_quantity_lag_28']
lag_scaling_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler(quantile_range=(25, 75))),
])

# ----- categorical column group (one-hot encoded, unseen categories ignored) -----
context_category_columns = ['category', 'sub_category', 'weather_severity', 'sales_demand']

final_preprocessor = ColumnTransformer(
    transformers=[
        ('minmax', MinMaxScaler(), minmax_columns),
        ('standard', StandardScaler(), standard_columns),
        ('robust', RobustScaler(quantile_range=(25, 75)), ['stock_quantity']),
        ('standard_rating', StandardScaler(), ['supplier_rating']),
        ('lag_scaler', lag_scaling_steps, lag_columns),
        ('ohe_nominal', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['product_id']),
        ('ohe_cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), context_category_columns),
        # in_season is forwarded unchanged
        ('pass', 'passthrough', ['in_season']),
    ],
    remainder='drop',
)

# End-to-end pipeline: feature engineering -> preprocessing -> XGBoost model
pipeline_steps = [
    ('feature_engineering', feature_engineering_pipeline),
    ('final_preprocessing', final_preprocessor),
    ('model', xgb_estimator),
]
complete_pipeline = Pipeline(steps=pipeline_steps)


# Split data

In [9]:
# First date that belongs to the test period of the temporal split
CUTOFF_DATE = pd.Timestamp('2025-04-01')

# Rationale for cutoff selection:
# - Based on EDA findings identifying data inconsistencies from May 2025 onward
# - Ensures model training on reliable, consistent historical data
# - Maintained for test set evaluation to simulate real-world forecasting scenario
# - Preserves data integrity while utilizing available data through September 2025

# Validation Approach:
# Train: Data before April 2025 (consistent period)
# Test: Data from April 2025 onward (assesses model performance on more recent, though partially inconsistent, data)

In [10]:
# Order rows from oldest to newest received_date and renumber the index from 0
df = df.sort_values('received_date', ignore_index=True)

In [11]:
# Temporal split: rows dated before CUTOFF_DATE train the model, rows on or
# after it are held out for testing
target_column = 'sales_volume'
received = df['received_date']
train_mask = received < CUTOFF_DATE
test_mask = received >= CUTOFF_DATE

# Slice each period once, then separate the features from the target
train_rows = df.loc[train_mask]
test_rows = df.loc[test_mask]

X_train = train_rows.drop(columns=target_column).copy()
y_train = train_rows[target_column].copy()

X_test = test_rows.drop(columns=target_column).copy()
y_test = test_rows[target_column].copy()

In [12]:
# Apply log(1 + y) to the training target; the model is fitted on this scale
# and predictions are mapped back with np.expm1 after prediction.
# NOTE(review): presumably done to reduce skew in sales_volume - confirm against the EDA.
y_train_log = np.log1p(y_train)

# Start MLflow

In [13]:

# ROBUST SMAPE FUNCTION
def safe_smape(y_true, y_pred):
    """Symmetric mean absolute percentage error (sMAPE), in percent.

    Each pair contributes ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``;
    the mean of those terms is multiplied by 100.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    values are always paired by position. Without this, two pandas Series
    with different indexes are aligned by label (yielding NaN), and plain
    Python lists fail on the subtraction.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, broadcast-compatible with ``y_true``.

    Returns
    -------
    float
        sMAPE in percent. Pairs where both values are zero contribute 0.
        An empty input yields NaN (mean of no elements).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    denominator = np.abs(actual) + np.abs(predicted)
    # Avoid division by zero - a zero denominator means both values are zero,
    # so the numerator is zero too and dividing by 1 gives a 0 contribution
    denominator = np.where(denominator == 0, 1, denominator)
    return 100 * np.mean(2 * np.abs(predicted - actual) / denominator)

# Train and evaluate the baseline inside a single MLflow run: log parameters
# and tags, fit the pipeline on the log1p-transformed target, score the
# hold-out period on the original scale, then log and register the pipeline.
with mlflow.start_run(run_name="XGBoost_Initial_Baseline") as run:
    
    # 1. Log main parameters
    mlflow.log_params({
        "n_estimators": n_est,
        "learning_rate": rate,
        "target_transformation": "Log1p_to_Expm1",
        "model_type": "XGBoost",
        "train_samples": len(X_train),
        "test_samples": len(X_test),
        # NOTE(review): this is the test-set row count, i.e. the same value as
        # "test_samples" - confirm this is the intended meaning of "horizon"
        "forecast_horizon": X_test.shape[0]
    })
    
    # 2. Log tags for organization
    mlflow.set_tags({
        "project": "demand_forecasting",
        "model_family": "XGBoost", 
        "stage": "baseline",
        "pipeline_type": "feature_engineering_plus_preprocessing"
    })
    
    # 3. Train the model on the log1p-transformed target
    print("Training pipeline...")
    complete_pipeline.fit(X_train, y_train_log)
    
    # 4. Infer the model signature and input example from the raw training frame.
    # The signature's output side comes from the pipeline's own predictions,
    # which are on the log1p scale (the inverse transform is applied outside
    # the pipeline, in step 5.a).
    signature = infer_signature(X_train, complete_pipeline.predict(X_train))
    input_example = X_train.iloc[:5] 

    # 5. Prediction and evaluation on the hold-out period
    print("Making predictions...")
    y_pred_log = complete_pipeline.predict(X_test)

    # 5.a. Inverse of log1p: map predictions back to the original scale
    y_pred = np.expm1(y_pred_log)

    # 5.b. Replace negative predictions with zero (expm1 can return values
    # between -1 and 0 when the log-scale prediction is negative)
    y_pred = np.maximum(y_pred, 0)  
    
    # 6. Compute evaluation metrics on the original (non-log) scale
    metrics = {
        "RMSE": root_mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "sMAPE": safe_smape(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred)
    }
    
    # Log each metric individually under its own name
    for metric_name, metric_value in metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    
    # 7. Print the first (up to five) test rows as an actual-vs-predicted
    # spot check; this is console output only and is not a per-product breakdown
    print("Sample predictions vs actual:")
    for i in range(min(5, len(y_test))):
        print(f"  Actual: {y_test.iloc[i]:.1f}, Pred: {y_pred[i]:.1f}")
    
    # 8. Log the fitted pipeline and register it under a model name.
    # NOTE(review): the logged pipeline returns log1p-scale predictions;
    # anything loading this model must apply np.expm1 (and the zero clamp)
    # itself to obtain sales_volume values.
    mlflow.sklearn.log_model(
        sk_model=complete_pipeline, 
        name="demand_forecasting_model",
        signature=signature,
        input_example=input_example,
        registered_model_name="Demand_Forecasting_XGBoost"
    )
    
    print(f"âœ… Run completed successfully: {run.info.run_id}")
    print(f"ðŸ“Š Metrics: {metrics}")


Training pipeline...
Making predictions...
Sample predictions vs actual:
  Actual: 989.0, Pred: 853.0
  Actual: 152.0, Pred: 117.4
  Actual: 17.0, Pred: 35.0
  Actual: 1013.0, Pred: 434.1
  Actual: 749.0, Pred: 906.2




âœ… Run completed successfully: 018c54fe1200437798ec2e44fee3c9b7
ðŸ“Š Metrics: {'RMSE': 189.43077087402344, 'MAE': 96.34623718261719, 'sMAPE': 26.422185703832923, 'MSE': 35884.01953125}


Successfully registered model 'Demand_Forecasting_XGBoost'.
Created version '1' of model 'Demand_Forecasting_XGBoost'.
