# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **stock_quantity**
- Target Variable for Demand Forecasting: **sales_volume**

### Metrics for model evaluation
- RMSE - Root Mean Squared Error
- MAE - Mean Absolute Error

# DATA ACQUISITION
## Import Libraries

In [None]:
# Standard Libraries
import pandas as pd
import numpy as np
import os


# Specialized Libraries
import mlflow
import logging
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# To import customized class and function
from smart_supply_chain_ai.utils.create_data_functions import DateFeatureExtractor, LagFeatureCreator


# Notebook housekeeping: quieter output and wider DataFrame display
import warnings

pd.set_option('display.max_columns', None)   # never truncate columns when a frame is displayed
warnings.filterwarnings('ignore')            # suppress every Python warning in this notebook
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)        # root logger emits WARNING and above only

# MLflow tracking: runs are written to a local file store under ../models/mlflow_data/
# (a relative path, so it depends on the notebook's working directory)
relative_path = os.path.join("..", "models", "mlflow_data/")
mlflow.set_tracking_uri("file:" + relative_path)


## Load Data

In [2]:
# Project directories, expressed relative to the notebook's working directory.
# Each path keeps its trailing slash so a file name can be appended directly.
_PARENT_DIR = '..'
data_path = os.path.join(_PARENT_DIR, 'data', 'processed/')   # cleaned datasets
docs_path = os.path.join(_PARENT_DIR, 'docs/')                # documentation outputs
path_models = os.path.join(_PARENT_DIR, 'models/')            # model artifacts

In [3]:
# Read the cleaned dataset from its pickle file. `df_complete` is kept untouched as the
# reference copy; all further manipulation happens on `df`.
pickle_file = f"{data_path}full_data_cleaned.pkl"
df_complete = pd.read_pickle(pickle_file)
df = df_complete.copy()


In [4]:
# Preview the first three rows to sanity-check the loaded columns
df.head(n=3)

Unnamed: 0,received_date,lpo,in_season,product,product_id,category,sub_category,shelf_life_days,maximum_days_on_sale,unit_of_measurement,supplier_rating,supplier,supplier_id,distance_km,moq,storage_recommendation,temperature_classification,precipitation_classification,wind_classification,weather_severity,day_classification,is_holiday,is_weekend,sales_demand,sales_volume,lead_time,min_stock,max_stock,stock_quantity,delivery_lag,expiration_status
0,2022-12-09,2022-12-07,False,Dijon Mustard,1070686|P,Pantry,Condiments,365,90,unit,3,Condiment Masters,1184993|S,75,60,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,50,4,240,300,317,2,Safe
1,2022-12-09,2022-12-07,True,Orange,1362741|P,Fresh Foods,Fruits,14,7,lb,3,OrchardBest Fruits,1677419|S,200,120,Room Temperature,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,590,6,3678,4291,4608,2,Safe
2,2022-12-09,2022-12-06,False,Asparagus,1308864|P,Fresh Foods,Vegetables,5,2,lb,5,Asparagus Experts,1197859|S,115,18,Refrigerated,Warm,No precipitation,Gentle to Fresh Breeze,Moderate,Weekdays,False,False,Normal,532,4,4652,5815,6765,3,Nearing


# Feature Engineering

In [5]:
# Store the supplier distance as an integer column
distance_as_int = df['distance_km'].astype(int)
df['distance_km'] = distance_as_int

# 1. Date Feature Extraction

In [6]:
# Initialize the DateFeatureExtractor to generate time-based features from the 'received_date' column.
# DateFeatureExtractor is a project class imported from
# smart_supply_chain_ai.utils.create_data_functions; this instance is used as the first
# step of the modelling pipeline.
# NOTE(review): presumably this step produces the calendar columns that the ColumnTransformer
# selects later ('year', 'month', 'day', 'day_of_week', 'is_business_day') - confirm against
# the class implementation.
date_features = DateFeatureExtractor(date_column='received_date')


# 2. Lag Feature Creation per Product

In [7]:
# Initialize the LagFeatureCreator to generate lagged features (lags 7, 14, and 28)
# for the 'stock_quantity' column within each 'product_id' group.
# LagFeatureCreator is a project class imported from
# smart_supply_chain_ai.utils.create_data_functions; this instance is the second pipeline step.
# NOTE(review): whether a lag of 7 means 7 days or 7 rows within the group depends on the
# class implementation - confirm. The output columns are presumably named
# 'stock_quantity_lag_7' / '_14' / '_28', since those names are selected by the
# ColumnTransformer - verify.
lags = LagFeatureCreator(
    group_column='product_id', 
    shift_column='stock_quantity', 
    lags=[7, 14, 28])


# 3. Column Transformer Preprocessing

In [8]:
# ----- Column groups: one list per preprocessing strategy -----
minmax_cols = ['shelf_life_days', 'moq']                      # shelf life and minimum order quantity
standard_cols = ['distance_km', 'lead_time',
                 'year', 'month', 'day', 'day_of_week']       # distance, lead time and calendar parts
robust_cols = ['stock_quantity']                              # outlier-prone stock level
product_id_cols = ['product_id']                              # nominal product identifier
categorical_cols = ['category', 'sub_category',
                    'weather_severity', 'sales_demand']       # product categories and external factors
rating_cols = ['supplier_rating']                             # ordinal rating, already numeric
flag_cols = ['in_season', 'is_weekend',
             'is_holiday', 'is_business_day']                 # binary flags, used unchanged
lag_cols = ['stock_quantity_lag_7', 'stock_quantity_lag_14',
            'stock_quantity_lag_28']                          # historical stock-quantity lags

# Lag columns can contain missing values, so they are mean-imputed before being standardized.
lag_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Each entry is (name, transformer, columns). Columns not listed in any entry are dropped,
# which is the ColumnTransformer default (remainder='drop').
preprocess = ColumnTransformer(transformers=[
    # Numerical features
    ('minmax', MinMaxScaler(), minmax_cols),          # rescale to the [0, 1] range
    ('standard', StandardScaler(), standard_cols),    # zero mean, unit variance
    ('robust', RobustScaler(), robust_cols),          # median/IQR scaling, less sensitive to extremes

    # Categorical features: one-hot encoded as dense output; categories unseen during
    # fit are encoded as all zeros instead of raising
    ('ohe_nominal', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), product_id_cols),
    ('ohe_cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols),

    # Ordinal feature treated as a plain number
    ('standard_rating', StandardScaler(), rating_cols),

    # Boolean features are forwarded without transformation
    ('passthrough', 'passthrough', flag_cols),

    # Time-series lag features: impute, then standardize
    ('lag_scaler', lag_pipeline, lag_cols),
])

# 4. Model for Training

In [9]:
# XGBoost hyperparameters, shared by the estimator and the MLflow parameter log
n_est: int = 200      # boosting rounds, i.e. number of trees in the ensemble
rate: float = 0.05    # learning rate: shrinks the contribution of each tree

In [10]:
# XGBoost regressor configuration, collected in one dict and unpacked into the constructor
xgb_params = {
    'n_estimators': n_est,    # number of trees, from the hyperparameter cell
    'learning_rate': rate,    # learning rate, from the hyperparameter cell
    'random_state': 42,       # fixed seed for reproducible results
    'n_jobs': -1,             # use all available cores
}
xgb_estimator = XGBRegressor(**xgb_params)

# 5. Combining All Steps into a Single Pipeline

In [11]:
# End-to-end ML pipeline. Steps are listed in execution order; make_pipeline derives each
# step name from the lowercased class name of the step.
pipeline_steps = (
    date_features,   # temporal feature extraction
    lags,            # time-series lag creation
    preprocess,      # feature scaling and encoding
    xgb_estimator,   # XGBoost prediction model
)
pipeline = make_pipeline(*pipeline_steps)

# Split data

In [12]:
# Boundary between the training and test periods of the temporal validation split
CUTOFF_DATE = pd.to_datetime('2025-04-01', format='%Y-%m-%d')

# Why this cutoff:
# - EDA identified data inconsistencies from May 2025 onward
# - Training is therefore restricted to reliable, consistent historical data
# - The later period is kept for test-set evaluation to simulate a real-world forecasting scenario
# - Data integrity is preserved while still using the data available through September 2025
# NOTE(review): the rationale cites May 2025 while the cutoff is 1 April 2025 - confirm
# the one-month gap is intentional.

# Validation approach:
# - Train: rows dated before April 2025 (consistent period)
# - Test:  rows dated from April 2025 onward (more recent, though partially inconsistent, data)

In [None]:
# Put rows in chronological order (ascending 'received_date') and renumber the index
# from 0, discarding the old index, before any time-series processing.
df = df.sort_values(by='received_date')
df = df.reset_index(drop=True)

In [14]:
def _features_and_target(rows):
    """Return independent copies of (features, target) for *rows*; the target is 'sales_volume'."""
    features = rows.drop(columns='sales_volume').copy()
    target = rows['sales_volume'].copy()
    return features, target


# Row masks for the temporal split: strictly before the cutoff is train, on or after is test
train_mask = df['received_date'] < CUTOFF_DATE
test_mask = df['received_date'] >= CUTOFF_DATE

# Training data
X_train, y_train = _features_and_target(df.loc[train_mask])

# Testing data
X_test, y_test = _features_and_target(df.loc[test_mask])

In [None]:
# Put both targets on the log1p scale, i.e. log(1 + y), which is defined for zero sales
y_train_log, y_test_log = (np.log1p(target) for target in (y_train, y_test))

# Start MLflow Run

In [None]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each row contributes 2 * |pred - true| / (|pred| + |true|). Rows where both
    the actual and the predicted value are zero contribute 0 rather than a
    0/0 NaN, which would otherwise turn the whole metric into NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = np.abs(y_pred) + np.abs(y_true)
    ratio = np.zeros_like(denominator)
    nonzero = denominator != 0
    ratio[nonzero] = 2 * np.abs(y_pred - y_true)[nonzero] / denominator[nonzero]
    return 100 * np.mean(ratio)


# Train the baseline pipeline, evaluate it on the test period and record parameters,
# metrics and the fitted pipeline in a single MLflow run.
with mlflow.start_run(run_name="XGBoost_Initial_Baseline") as run:

    # 1. Log main parameters
    mlflow.log_params({
        "n_estimators": n_est,
        "learning_rate": rate,
        "target_transformation": "Log1p_to_Expm1",
        "model_type": "XGBoost"
    })

    # 2. Log tags for organization
    mlflow.set_tags({
        "project": "demand_forecasting",
        "model_family": "XGBoost",
        "stage": "baseline"
    })

    # 3. Train the model on the log1p-transformed target
    pipeline.fit(X_train, y_train_log)

    # 4. Prediction and evaluation
    # The pipeline was fitted on log1p(y), so its raw output is on the log scale.
    y_pred_log = pipeline.predict(X_test)

    # 4.a. Inverse log transformation (exponential) back to sales-volume units
    y_pred = np.expm1(y_pred_log)

    # 4.b. Replace negative predictions with zero (expm1 of a negative log-prediction is negative)
    y_pred[y_pred < 0] = 0

    # 5. Log evaluation metrics, computed against the untransformed test target
    metrics = {
        "RMSE": root_mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "sMAPE": smape(y_test, y_pred)
    }
    mlflow.log_metrics(metrics)

    # 6. Log dataset details
    # "forecast_horizon" is recorded as the number of test rows (same value as "test_samples").
    mlflow.log_params({
        "train_samples": len(X_train),
        "test_samples": len(X_test),
        "forecast_horizon": X_test.shape[0]
    })

    # 7. Save the fitted pipeline as a run artifact and register it in the model registry
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="xgb_pipeline",
        registered_model_name="Demand_Forecasting_XGBoost"
    )

    print(f"Run completed successfully: {run.info.run_id}")
