# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **Stock_Quantity**
- Target Variable for Demand Forecasting: **Sales_Volume**

### Metrics for model evaluation
- RMSE
- MAE

# DATA ACQUISITION
## Import Libraries

In [None]:
# Standard Libraries
import pandas as pd
import numpy as np
import os

# Specialized Libraries
import mlflow

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Class and functions 
from smart_supply_chain_ai.utils.functions import DateFeatureExtractor, Differentiator


# Notebook Configurations
import warnings
warnings.filterwarnings('ignore')


## Load Data

In [50]:
# Define data paths
# Directory with the processed datasets, given relative to the notebook's working directory.
data_path = os.path.join('../data', 'processed')

In [51]:
# Load the processed grocery dataset.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files produced by this project.
df = pd.read_pickle(data_path + '/grocery.pkl')

In [52]:
df

Unnamed: 0,Product_ID,Product_Name,Category,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,...,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status,Stock_Value,Days_For_Expiration,Expiration_Status,Stock_Coverage_Days,Purchase_Order,Delivery_Lag
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,4.5,2024-08-16,...,48 Del Sol Trail,32,19,Discontinued,99.0,34,Safe,19,315.0,48
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,20.0,2024-11-01,...,36 3rd Place,85,1,Discontinued,900.0,-177,Expired,365,40.0,156
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,6.0,2024-08-03,...,3296 Walton Court,31,34,Backordered,180.0,50,Safe,10,498.0,54
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,1.5,2024-12-08,...,3 Westerfield Crossing,95,99,Active,18.0,-235,Expired,3,93.0,0
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,4.0,2024-07-03,...,15068 Scoville Court,62,25,Backordered,148.0,94,Safe,14,296.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,82-977-7752,Spinach,Fruits & Vegetables,57-473-8672,Shuffledrive,88,78,17,2.5,2024-09-06,...,58 Corscot Terrace,58,21,Active,220.0,59,Safe,17,42.5,0
986,62-393-9939,Cheddar Cheese,Dairy,93-877-9384,Gabcube,60,9,89,9.0,2024-06-01,...,5 Oxford Pass,95,63,Active,540.0,126,Safe,5,801.0,0
987,31-745-6850,Cabbage,Fruits & Vegetables,96-215-2767,Lajo,94,90,12,0.9,2024-10-03,...,081 Jana Lane,98,71,Active,84.6,29,Nearing,5,10.8,0
988,86-692-2312,Avocado Oil,Oils & Fats,77-783-4107,Dazzlesphere,30,48,52,10.0,2024-06-11,...,00616 Manitowish Parkway,22,78,Active,300.0,-42,Expired,4,520.0,0


# Feature Engineering

In [53]:
# Calendar features derived from the receipt date.
received = df['Date_Received'].dt

# Cyclic / discrete calendar parts are stored as categoricals.
categorical_parts = {
    'day_of_week': received.dayofweek,
    'month': received.month,
    'year': received.year,
}
for feature_name, feature_values in categorical_parts.items():
    df[feature_name] = feature_values.astype('category')

# Day of year is kept as a plain integer.
df['day_of_year'] = received.dayofyear

In [54]:
# Applying differentiation in non stationary variables
df['Delivery_Lag_diff'] = df['Delivery_Lag'].diff().fillna(0)
df['Days_For_Expiration_diff'] = df['Days_For_Expiration'].diff().fillna(0)

In [55]:
# Ascending date
df = df.sort_values(by='Date_Received').reset_index(drop=True)

## Split data

In [56]:
# Two regression targets: demand (Sales_Volume) and inventory (Stock_Quantity).
target_columns = ['Sales_Volume', 'Stock_Quantity']
y = df[target_columns]

In [57]:
# Columns left out of the feature matrix: unique identifiers and highly correlated
# columns that add no predictive value, followed by the target columns themselves.
non_feature_columns = [
    'Product_ID', 'Supplier_ID', 'Last_Order_Date', 'Expiration_Date',
    'Warehouse_Location', 'Stock_Value', 'Days_For_Expiration',
    'Purchase_Order', 'Delivery_Lag',
]
drop_columns = [*non_feature_columns, *y.columns]

In [58]:
# Feature matrix: everything that is not an identifier, a redundant column or a target.
X = df.drop(drop_columns, axis=1)

In [59]:
# Split data in train test
# shuffle=False keeps the row order, so the last 20% of the (date-sorted) rows become the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

## Encode Features

### One Hot

In [60]:
# Encode non-numeric variables
# handle_unknown='ignore': categories unseen during fit are encoded as all zeros instead of raising.
# sparse_output=False: return a dense array so it can be wrapped in a DataFrame.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [61]:
# Categorical columns to one-hot encode
columns_ = ['Category', 'Status', 'Expiration_Status']

# Learn the categories from the training split only and encode it in the same step
X_train_encoded = encoder.fit_transform(X_train[columns_])

# Encode the test split with the categories learned from train
X_test_encoded = encoder.transform(X_test[columns_])

In [62]:
# Wrap the encoded arrays in DataFrames that reuse each split's index,
# so they line up row-by-row with the remaining features.
encoded_columns_name = encoder.get_feature_names_out(columns_)
X_train_encoded, X_test_encoded = (
    pd.DataFrame(encoded_values, columns=encoded_columns_name, index=split.index)
    for encoded_values, split in ((X_train_encoded, X_train), (X_test_encoded, X_test))
)

In [63]:
# Replace the raw categorical columns with their one-hot encoded counterparts
train_without_categoricals = X_train.drop(columns=columns_)
test_without_categoricals = X_test.drop(columns=columns_)

X_train = pd.concat([train_without_categoricals, X_train_encoded], axis='columns')
X_test = pd.concat([test_without_categoricals, X_test_encoded], axis='columns')

### Feature Extraction

In [64]:
# One [product, supplier] token list per row, the layout FeatureHasher consumes
hash_source_columns = ['Product_Name', 'Supplier_Name']
X_train_to_hash = X_train[hash_source_columns].values.tolist()
X_test_to_hash = X_test[hash_source_columns].values.tolist()

In [65]:
# Create the feature hasher
# Each sample is a list of string tokens that is hashed into a fixed-width vector of n_features columns.
n_features=100
hasher = FeatureHasher(n_features=n_features, input_type="string")

In [66]:
# Apply the hashing (FeatureHasher is stateless, so no fit step is needed)
X_train_hashed = hasher.transform(X_train_to_hash)
X_test_hashed = hasher.transform(X_test_to_hash)

In [67]:
# Column names for the hashed features: hashed_feature_0 ... hashed_feature_{n_features - 1}
hashed_column_names = [f'hashed_feature_{i}' for i in range(n_features)]

In [68]:
# Create DataFrames from the hashed matrices
# toarray() densifies the sparse hasher output; reusing each split's index keeps the rows
# aligned for the column-wise concatenation that follows.
X_train_hashed_df = pd.DataFrame(X_train_hashed.toarray(), columns=hashed_column_names, index=X_train.index)
X_test_hashed_df = pd.DataFrame(X_test_hashed.toarray(), columns=hashed_column_names, index=X_test.index)


In [69]:
# Create the final DataFrames
# The raw text columns are replaced by their hashed representation.
columns_rm = ['Product_Name', 'Supplier_Name']

# Concatenate with the other DataFrames, column-wise
train_without_text = X_train.drop(columns=columns_rm)
X_train_final = pd.concat([train_without_text, X_train_hashed_df], axis='columns')

test_without_text = X_test.drop(columns=columns_rm)
X_test_final = pd.concat([test_without_text, X_test_hashed_df], axis='columns')

In [70]:
X_test_final

Unnamed: 0,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Inventory_Turnover_Rate,Stock_Coverage_Days,day_of_week,month,year,day_of_year,...,hashed_feature_90,hashed_feature_91,hashed_feature_92,hashed_feature_93,hashed_feature_94,hashed_feature_95,hashed_feature_96,hashed_feature_97,hashed_feature_98,hashed_feature_99
792,90,57,25.00,2024-12-11,68,5,2,12,2024,346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
793,68,90,2.00,2024-12-12,13,28,3,12,2024,347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
794,69,90,4.00,2024-12-12,35,10,3,12,2024,347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
795,36,96,6.50,2024-12-12,73,5,3,12,2024,347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
796,94,18,6.00,2024-12-12,11,33,3,12,2024,347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,69,64,2.75,2025-02-23,6,60,6,2,2025,54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
986,61,19,2.00,2025-02-23,69,5,6,2,2025,54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
987,40,29,15.00,2025-02-24,87,4,0,2,2025,55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
988,50,80,8.00,2025-02-24,5,73,0,2,2025,55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
X_train_final

Unnamed: 0,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Inventory_Turnover_Rate,Stock_Coverage_Days,day_of_week,month,year,day_of_year,...,hashed_feature_90,hashed_feature_91,hashed_feature_92,hashed_feature_93,hashed_feature_94,hashed_feature_95,hashed_feature_96,hashed_feature_97,hashed_feature_98,hashed_feature_99
0,89,25,21.0,2024-02-25,62,5,6,2,2024,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,29,76,2.4,2024-02-26,4,91,0,2,2024,57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,90,77,4.0,2024-02-26,19,19,0,2,2024,57,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,96,85,2.5,2024-02-26,3,121,0,2,2024,57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60,21,8.9,2024-02-27,46,7,1,2,2024,58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,93,61,6.0,2024-12-10,100,3,1,12,2024,345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
788,12,85,5.0,2024-12-10,46,7,1,12,2024,345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
789,98,97,6.0,2024-12-10,21,17,1,12,2024,345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
790,48,25,3.0,2024-12-11,65,5,2,12,2024,346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0


# Pipeline

In [72]:
# Load data
# Reload the processed dataset so the pipeline section starts from the untouched frame,
# without the columns added manually in the sections above.
df = pd.read_pickle(data_path + '/grocery.pkl')

In [73]:
df.columns

Index(['Product_ID', 'Product_Name', 'Category', 'Supplier_ID',
       'Supplier_Name', 'Stock_Quantity', 'Reorder_Level', 'Reorder_Quantity',
       'Unit_Price', 'Date_Received', 'Last_Order_Date', 'Expiration_Date',
       'Warehouse_Location', 'Sales_Volume', 'Inventory_Turnover_Rate',
       'Status', 'Stock_Value', 'Days_For_Expiration', 'Expiration_Status',
       'Stock_Coverage_Days', 'Purchase_Order', 'Delivery_Lag'],
      dtype='object')

In [74]:
# Split data in train test
# The reloaded frame is in file order. Sort it chronologically so that, with
# shuffle=False, the hold-out set is the most recent 20% of the records
# (the same time-based split used in the manual section above).
df = df.sort_values(by='Date_Received').reset_index(drop=True)

# Targets, identifiers and redundant columns are excluded from the features.
X = df.drop(columns=["Sales_Volume", "Stock_Quantity", 'Product_ID', 'Supplier_ID',
                     'Stock_Value', 'Purchase_Order', 'Last_Order_Date', 'Expiration_Date', 'Warehouse_Location'])
y = df[["Sales_Volume", "Stock_Quantity"]]

# random_state is omitted on purpose: it has no effect when shuffle=False.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [75]:
# Instantiate the custom transformers
# Both classes are imported by name from smart_supply_chain_ai.utils.functions; there is no
# `functions` module object in scope, so they must be referenced directly.
extractor = DateFeatureExtractor(date_column='Date_Received')
diff = Differentiator(columns=['Delivery_Lag', 'Days_For_Expiration'])

In [76]:
# Transform data
# Applies the date-feature extraction and then the differencing to each split separately.
# NOTE(review): transform() is called without a prior fit(), which presumes both transformers
# are stateless — confirm against their implementation.
# NOTE(review): the pipelines further down are fitted on the raw X_train, not on these frames.
X_train_processed = extractor.transform(X_train)
X_train_processed = diff.transform(X_train_processed)

X_test_processed = extractor.transform(X_test)
X_test_processed = diff.transform(X_test_processed)

In [77]:
y_test.select_dtypes(['int', 'float']).columns

Index(['Sales_Volume', 'Stock_Quantity'], dtype='object')

In [78]:
# Select columns
# The column groups must describe the frame the preprocessor actually receives inside the
# pipelines, i.e. the output of the date-feature step. Selecting them from the raw X would
# leave any generated date features out, and remainder='drop' would silently discard them.
# NOTE(review): assumes DateFeatureExtractor.transform works without fit, as the notebook
# already relies on when transforming the splits by hand — confirm.
pipeline_input = DateFeatureExtractor(date_column='Date_Received').transform(X_train)

category_columns = pipeline_input.select_dtypes('category').columns.to_list()
string_columns = pipeline_input.select_dtypes('object').columns.to_list()
# Only feature columns belong here: the targets are not part of X, and listing them makes
# ColumnTransformer fail at fit time because the columns cannot be found.
numeric_columns = pipeline_input.select_dtypes(['int', 'float']).columns.to_list()

In [79]:
# Create Preprocessors
from sklearn.preprocessing import FunctionTransformer


def _rows_to_tokens(frame):
    """Return one list of string tokens per row of *frame*.

    FeatureHasher(input_type='string') expects an iterable of token iterables.
    Iterating a DataFrame yields its column labels instead of its rows, so the
    selected columns have to be converted explicitly before hashing.
    """
    return frame.astype(str).values.tolist()


preprocessor = ColumnTransformer(
    transformers=[
        # Text columns: rows -> token lists -> fixed-width hashed vector
        ('hash', Pipeline(steps=[
            ('to_tokens', FunctionTransformer(_rows_to_tokens)),
            ('hasher', FeatureHasher(n_features=100, input_type='string')),
        ]), string_columns),
        # Categorical columns: one-hot, categories unseen at fit time become all zeros
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), category_columns),
        # Numeric columns: zero mean / unit variance
        ('scale', StandardScaler(), numeric_columns)
    ],
    remainder='drop'  # Remove columns that aren't in the lists above.
)

In [80]:
# Algorithms to train
# MultiOutputRegressor is used because there is more than one target: it fits one
# independent copy of the wrapped regressor per target column.
from sklearn.base import clone

date_column = 'Date_Received'

# Candidate base regressors, keyed by the name used for the MLflow runs.
regressors = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
}

pipelines = {
    model_name: Pipeline(steps=[
        ('date_features', DateFeatureExtractor(date_column=date_column)),
        # Pipeline.fit fits its steps in place without cloning them, so every pipeline gets
        # its own unfitted copy of the preprocessor instead of sharing one fitted object.
        ('preprocessor', clone(preprocessor)),
        ('regressor', MultiOutputRegressor(regressor)),
    ])
    for model_name, regressor in regressors.items()
}


In [None]:
# Train Models
# MLflow configurations
mlflow.set_experiment('Model_Base:Demand_Forecasting')

for model_name, current_pipeline in pipelines.items():
    with mlflow.start_run(run_name=f"Training_{model_name}"):
        print(f"Training {model_name}...")

        # Train the pipeline
        current_pipeline.fit(X_train, y_train)

        # Make predictions and evaluate
        preds = current_pipeline.predict(X_test)

        # Aggregate metrics: errors averaged uniformly over both targets
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        mae = mean_absolute_error(y_test, preds)

        # Per-target metrics: the two targets serve different objectives, so an average
        # alone can hide a model that is good on one target and poor on the other.
        mse_by_target = mean_squared_error(y_test, preds, multioutput='raw_values')
        mae_by_target = mean_absolute_error(y_test, preds, multioutput='raw_values')
        per_target_metrics = {}
        for target, target_mse, target_mae in zip(y_test.columns, mse_by_target, mae_by_target):
            per_target_metrics[f"test_rmse_{target}"] = float(np.sqrt(target_mse))
            per_target_metrics[f"test_mae_{target}"] = float(target_mae)

        # Log metrics and model with MLflow
        mlflow.log_metric("test_rmse", rmse)
        mlflow.log_metric("test_mae", mae)
        mlflow.log_metrics(per_target_metrics)
        mlflow.sklearn.log_model(current_pipeline, f"{model_name}_model")

        print(f"  Test RMSE for {model_name}: {rmse:.2f}")

print("\nModel comparison completed. See the results in the MLflow UI.")