# Base Model for Demand Forecasting

The main objective is to optimize inventory and purchasing management, with a target of **reducing overstocking by 20%** within 6 months.

- Target Variable for Inventory Optimization: **Stock_Quantity**
- Target Variable for Demand Forecasting: **Sales_Volume**

### Metrics for model evaluation
- RMSE - Root Mean Squared Error
- MAE - Mean Absolute Error

# DATA ACQUISITION
## Import Libraries

In [None]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import subprocess

# Specialized Libraries
import mlflow
import logging

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from statsmodels.tsa.api import VAR

# Class and functions 
from smart_supply_chain_ai.utils.functions import DateFeatureExtractor, Differentiator, TextTokenizer

# Notebook output hygiene (MLflow and library messages): silence Python warnings
# and limit the root logger to WARNING and above so cell outputs stay readable.
import warnings
warnings.filterwarnings('ignore')  # ignore every warning for the rest of the session
logging.getLogger().setLevel(logging.WARNING)  # root logger emits WARNING and above only

## Load Data

In [None]:
# Directory that holds the processed datasets, relative to the notebook folder
_data_root, _processed_subdir = '../data', 'processed'
data_path = os.path.join(_data_root, _processed_subdir)

In [None]:
df = pd.read_pickle(data_path + '/grocery.pkl')

In [None]:
df

# Feature Engineering

In [None]:
# Calendar features derived from the receiving date.
# Day of week, month and year are stored as categoricals; day of year stays numeric.
received_on = df['Date_Received'].dt
df = df.assign(
    day_of_week=received_on.dayofweek.astype('category'),
    month=received_on.month.astype('category'),
    year=received_on.year.astype('category'),
    day_of_year=received_on.dayofyear,
)

In [None]:
# Applying differentiation in non stationary variables
df['Delivery_Lag_diff'] = df['Delivery_Lag'].diff().fillna(0)
df['Days_For_Expiration_diff'] = df['Days_For_Expiration'].diff().fillna(0)

In [None]:
# Ascending date
df = df.sort_values(by='Date_Received').reset_index(drop=True)

## Split data

In [None]:
# Target Columns
y = df[['Sales_Volume', 'Stock_Quantity']]

In [None]:
# For modeling, we removed highly correlated columns and unique identifiers that did not add predictive value.
drop_columns = ['Product_ID', 'Supplier_ID', 'Last_Order_Date', 'Expiration_Date',
       'Warehouse_Location', 'Stock_Value', 'Days_For_Expiration', 
       'Purchase_Order', 'Delivery_Lag'] + y.columns.to_list()

In [None]:
# Feature Columns
X= df.drop(columns=drop_columns)

In [None]:
# Split data in train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

## Encode Features

### One Hot

In [None]:
# Encode Non numeric Variables
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [None]:
# Columns for One Hot
columns_ = ['Category', 'Status', 'Expiration_Status']

# Fit in X_train
encoder.fit(X_train[columns_])

# Transform X 
X_train_encoded = encoder.transform(X_train[columns_])
X_test_encoded = encoder.transform(X_test[columns_])

In [None]:
# Create encoded dataframes
encoded_columns_name = encoder.get_feature_names_out(columns_)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoded_columns_name, index=X_train.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoded_columns_name, index=X_test.index)

In [None]:
# Union datasets
X_train = pd.concat([X_train.drop(columns=columns_), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=columns_), X_test_encoded], axis=1)

### Feature Extraction

In [None]:
# Combine the columns in list of list
X_train_to_hash = [[prod, supp] for prod, supp in zip(X_train['Product_Name'], X_train['Supplier_Name'])]
X_test_to_hash = [[prod, supp] for prod, supp in zip(X_test['Product_Name'], X_test['Supplier_Name'])]

In [None]:
# Create Feature Hashing
n_features=100
hasher = FeatureHasher(n_features=n_features, input_type="string")

In [None]:
# Appling
X_train_hashed = hasher.transform(X_train_to_hash)
X_test_hashed = hasher.transform(X_test_to_hash)

In [None]:
# Columns Names for data
hashed_column_names = [f'hashed_feature_{i}' for i in range(n_features)]

In [None]:
# Create DataFrame with dense matrix
X_train_hashed_df = pd.DataFrame(X_train_hashed.toarray(), columns=hashed_column_names, index=X_train.index)
X_test_hashed_df = pd.DataFrame(X_test_hashed.toarray(), columns=hashed_column_names, index=X_test.index)


In [None]:
# Create Final DataFrame
# Columns to remove
columns_rm = X_train[['Product_Name', 'Supplier_Name']].columns.to_list()

# Concatenate wit others DataFrames
X_train_final = pd.concat([
    X_train.drop(columns=columns_rm),
    X_train_hashed_df
], axis=1)

X_test_final = pd.concat([
    X_test.drop(columns=columns_rm),
    X_test_hashed_df
], axis=1)

In [None]:
X_test_final

In [None]:
X_train_final

# Pipeline

In [None]:
# Reload the processed dataset so the pipeline section starts from the stored data
# NOTE: only unpickle files from a trusted source.
grocery_file = data_path + '/grocery.pkl'
df = pd.read_pickle(grocery_file)

In [None]:
# Split data in train test
# Order the rows chronologically first: the unshuffled hold-out split below and the
# time-series cross-validation used later both rely on time-ordered rows.
# kind='stable' leaves an already-sorted frame exactly as it is.
df = df.sort_values(by='Date_Received', kind='stable').reset_index(drop=True)

# Features: drop the targets, identifiers and the other columns excluded from modeling
X = df.drop(columns=["Sales_Volume", "Stock_Quantity", 'Product_ID', 'Supplier_ID',
                     'Stock_Value', 'Purchase_Order', 'Last_Order_Date', 'Expiration_Date', 'Warehouse_Location'])
y = df[["Sales_Volume", "Stock_Quantity"]]

# shuffle=False keeps the row order, so the last 20% of the rows form the test set.
# random_state is omitted because it only controls shuffling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Instatiate Class
extractor = DateFeatureExtractor(date_column='Date_Received')
diff = Differentiator(columns=['Delivery_Lag', 'Days_For_Expiration'])

In [None]:
# Transform data
X_train_processed = extractor.transform(X_train)
X_train_processed = diff.transform(X_train_processed)

X_test_processed = extractor.transform(X_test)
X_test_processed = diff.transform(X_test_processed)

In [None]:
# Group the processed training columns by dtype; each group gets its own preprocessing
def _columns_of(dtypes):
    """Return the names of the X_train_processed columns matching the given dtype(s)."""
    return X_train_processed.select_dtypes(dtypes).columns.to_list()


category_columns = _columns_of('category')
string_columns = _columns_of('object')
numeric_columns = _columns_of(['int', 'float'])
# + (y_test.select_dtypes(['int', 'float']).columns.to_list())

In [None]:
# Text columns: TextTokenizer followed by FeatureHasher, chained in one sub-pipeline
text_pipeline = Pipeline([
    ('tokenizer', TextTokenizer()),
    ('hasher', FeatureHasher(n_features=100, input_type='string')),
])

# Column-wise preprocessing: hashed text, one-hot categoricals, standardized numerics
preprocessor = ColumnTransformer(
    transformers=[
        ('text_pipeline', text_pipeline, string_columns),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), category_columns),
        ('scale', StandardScaler(), numeric_columns),
    ],
    remainder='drop',  # columns not listed in any group are removed
)

# Configure Pipelines and Parameters

In [None]:
# Global configurations
# Random seed passed as random_state to the regressors, so runs are reproducible
seed_std = 67

In [None]:
# Algorithms to train
# Each regressor is wrapped in MultiOutputRegressor because there is more than one target.
# Every pipeline uses the same preprocessing definition followed by its own regressor.
pipelines = {
    'RandomForest': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', MultiOutputRegressor(RandomForestRegressor(random_state=seed_std, oob_score=True)))
    ]),
    'LightGBM': Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', MultiOutputRegressor(LGBMRegressor(random_state=seed_std)))
    ]),
    'XGBoost': Pipeline(steps=[
        ('preprocessor', preprocessor),
        # `silent` is no longer an XGBoost parameter; `verbosity=0` is the supported way to mute output
        ('regressor', MultiOutputRegressor(XGBRegressor(random_state=seed_std, verbosity=0)))
    ]),
}

In [None]:
# Hyperparameter search spaces, one per model name.
# Keys use the `regressor__estimator__` prefix so each value reaches the estimator
# wrapped inside the pipeline's 'regressor' step.
_random_forest_grid = {
    'regressor__estimator__n_estimators': [200, 300, 400],
    'regressor__estimator__max_depth': [None, 10, 20, 30],
    'regressor__estimator__min_samples_split': [2, 5, 10],
    'regressor__estimator__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'regressor__estimator__max_features': ['sqrt', 'log2', None],
}

_lightgbm_grid = {
    'regressor__estimator__n_estimators': [200, 400, 600, 800],
    'regressor__estimator__max_depth': [-1, 10, 20],
    'regressor__estimator__learning_rate': [0.05, 0.1, 0.15],
    'regressor__estimator__num_leaves': [31, 50, 70],
    'regressor__estimator__colsample_bytree': [0.7, 0.8, 1.0],
}

_xgboost_grid = {
    'regressor__estimator__n_estimators': [200, 400, 600, 800],
    'regressor__estimator__learning_rate': [0.05, 0.1, 0.15],
    'regressor__estimator__max_depth': [3, 5, 10],  # usually smaller values than for LightGBM
    'regressor__estimator__subsample': [0.7, 0.8, 1.0],  # row subsampling
    'regressor__estimator__colsample_bytree': [0.7, 0.8, 1.0],  # column subsampling
    'regressor__estimator__reg_alpha': [0, 0.1, 0.5],  # L1 regularization
}

param_grids = {
    'RandomForest': _random_forest_grid,
    'LightGBM': _lightgbm_grid,
    'XGBoost': _xgboost_grid,
}

In [None]:
# Cross-validation splitter for time-ordered data
n_cv_splits = 5
tscv = TimeSeriesSplit(n_splits=n_cv_splits)

---

# Running MLflow

To **start the MLflow** user interface, open your terminal and execute the following command: 

```
cd notebooks
pdm run mlflow ui
```

---

In [None]:
def TrainModelPipeline(pipelines: dict, param_grids: dict, tscv, X, y, experiment_name: str):
    """
    Tune every pipeline with a grid search and log the results to MLflow.

    All searches are grouped under one parent MLflow run; each model gets its own
    nested run named ``GridSearch_<model_name>``. A model is skipped when a finished
    run with that name already exists in the experiment, or when no parameter grid
    is provided for it.

    Args:
        pipelines (dict): Dictionary where keys are model names and values are scikit-learn pipelines.
        param_grids (dict): Dictionary with the hyperparameter grid for searching, keyed by model name.
        tscv: Cross-validation splitter (e.g. TimeSeriesSplit) passed to GridSearchCV.
        X: DataFrame with the training features.
        y: DataFrame or array with the training targets (multi-output).
        experiment_name (str): MLflow experiment in which the runs are stored and looked up.
            It is made the active experiment (and created if missing) before training starts.

    Returns:
        None. Results are logged to MLflow and summarized on stdout.

    Example of use:
        Assuming you already have X_train_processed, y_train, and tscv:
            TrainModelPipeline(
                pipelines=pipelines, 
                param_grids=param_grids, 
                tscv=tscv, 
                X=X_train_processed, 
                y=y_train, 
                experiment_name="Demand_Forecasting_Experiment"
            )
    """
    # Imported locally so the function does not depend on make_scorer being
    # imported at the top of the notebook.
    from sklearn.metrics import make_scorer

    # Activate the experiment so new runs are stored in the same experiment that
    # the "already finished?" lookup below searches.
    mlflow.set_experiment(experiment_name)

    # Defines the scorer using RMSE, averaged uniformly over the targets.
    # greater_is_better=False negates the value because GridSearchCV maximizes its score.
    rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False, multioutput='uniform_average')

    # Starts the main run that will group all trainings
    with mlflow.start_run(run_name="Hyperparameter_Optimization_Experiment"):
        print("Starting main MLflow run for hyperparameter optimization.")

        # Iterates over each model and its set of parameters
        for model_name, current_pipeline in pipelines.items():
            print(f'Starting process for model: {model_name}')

            # Defines the name of the nested run
            run_name = f"GridSearch_{model_name}"

            # Checks if the run is already finished to avoid reprocessing
            existing_runs = mlflow.search_runs(
                experiment_names=[experiment_name],
                filter_string=f"tags.mlflow.runName = '{run_name}' and status = 'FINISHED'"
            )

            if not existing_runs.empty:
                print(f"Skipping {model_name}... Run '{run_name}' already exists and is finished.")
                continue

            # Gets the specific parameter dictionary for the current model
            current_param_grid = param_grids.get(model_name, {})
            if not current_param_grid:
                print(f"Warning: No param_grid found for {model_name}. Skipping...")
                continue

            # Starts the nested run for the current model
            with mlflow.start_run(run_name=run_name, nested=True):
                # Activates autologging for sklearn within the run context
                mlflow.sklearn.autolog()

                # Initializes and fits GridSearchCV
                grid_search = GridSearchCV(
                    estimator=current_pipeline,
                    param_grid=current_param_grid,
                    scoring=rmse_scorer,
                    cv=tscv,
                    n_jobs=-1,
                    verbose=1,
                    return_train_score=False
                )

                print(f"Fitting GridSearchCV for {model_name}...")
                grid_search.fit(X, y)

                # Autologging already logs most results; log the headline numbers explicitly.
                # best_score_ holds the negated RMSE, so flip the sign back before logging.
                mlflow.log_metric("best_validation_rmse", -grid_search.best_score_)
                mlflow.log_params(grid_search.best_params_)

                print(f"Best RMSE for {model_name}: {-grid_search.best_score_}")
                print(f"Best parameters for {model_name}: {grid_search.best_params_}")



In [None]:
# Run the hyperparameter search for every configured pipeline
TrainModelPipeline(
    pipelines=pipelines,
    param_grids=param_grids,
    tscv=tscv,
    X=X_train_processed,
    y=y_train,
    experiment_name="Demand_Forecasting_Experiment",
)

## First Training Results Without Optimization

Random Forest:  
> MAE: ~21.8839 / RMSE: ~25.3878

LightGBM:  
> MAE: ~22.5347 /  RMSE: ~26.6251

XGBoost: 
> MAE: ~23.4955 / RMSE: ~27.9475

In [None]:
# # Train Models
# # MLflow configurations
# experiment_name = 'Base_models:Demand_Forecasting'
# mlflow.set_experiment(experiment_name)
# # Experiment: 2_Hyperparameter_Tuning  
# # setup_experiment('hyperparameter_tuning')
# # Experiment: 3_Advanced_Models_DL
# # setup_experiment('advanced_models')
# # Experiment: 4_Final_Ensemble  
# # setup_experiment('final_ensemble')


# for model_name, current_pipeline in pipelines.items():
#     run_name = f"Training_{model_name}"

#     # Verify name and status
#     existing_runs = mlflow.search_runs(
#         experiment_names= [experiment_name],
#         filter_string=f"tags.mlflow.runName = '{run_name}' and status = 'FINISHED'"
#     )

#     if not existing_runs.empty:
#         print(f"Skipping {model_name}... Run '{run_name}' already exists and is finished.")
#         continue

#     with mlflow.start_run(run_name=run_name):
#         print(f"Training {model_name}...")
        
#         # Train the pipeline
#         current_pipeline.fit(X_train_processed, y_train)

#         # Make predictions and evaluate
#         preds = current_pipeline.predict(X_test_processed)
#         rmse = root_mean_squared_error(y_test, preds)
#         mae = mean_absolute_error(y_test, preds)

#         # Log metrics and model with MLflow
#         mlflow.log_metric("test_rmse", rmse)
#         mlflow.log_metric("test_mae", mae)
#         mlflow.sklearn.log_model(current_pipeline, f"{model_name}_model")

#         print(f"  Test RMSE for {model_name}: {rmse:.2f}")

# print("\nModel comparison completed. See the results in the MLflow UI.")

In [None]:
# List the recorded MLflow runs as a DataFrame
# (with no arguments this presumably targets the active experiment — verify against the MLflow docs)
mlflow.search_runs()

#### LightGBM

#### XGBoost