<a href="https://colab.research.google.com/github/sabin74/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform/blob/main/notebooks/05_ml_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Advanced Machine Learning Models
### Goal: Achieve strong performance
### Models
 - Random Forest
 - XGBoost
 - LightGBM (primary model)
 - CatBoost
### Techniques
 - Time-series cross-validation
 - Feature importance
 - Hyperparameter tuning (Optuna)
### Output: Best ML model


# Notebook Setup & Data Preparation

 - Load feature-engineered dataset
 - Memory optimization (critical for Colab)
 - Final NaN handling (lag-safe)
### Goal:
 - target (sales_log)
 - feature list
 - Time-based train/validation split

In [None]:
# Clone GitHub Repository
!git clone https://github.com/sabin74/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform.git

In [None]:
# Environment Setup - Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error

import os
from pathlib import Path
import gc

In [None]:
# Set Project Root
# Switch the working directory to the cloned repository so that the relative
# paths used further down ("data/...", "models/...") resolve inside it.
# NOTE(review): the absolute /content/... path assumes a Colab runtime — confirm before running elsewhere.
os.chdir("/content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform")
print("Current Directory: ", os.getcwd())

In [None]:
# Load Feature-Engineered Data
# Folder holding the feature tables, relative to the current working directory
DATA_DIR = Path("data") / "features"

# Read the training feature table into memory
train = pd.read_parquet(DATA_DIR.joinpath("train_features.parquet"))

In [None]:
# Memory Optimization (reduce memory usage)
def reduce_mem_usage(df):
  """Downcast 64-bit numeric columns of ``df`` to 32-bit, in place.

  float64 columns become float32 (precision is reduced). int64 columns become
  int32 only when every value fits in the int32 range; columns holding larger
  values are left as int64 instead of being silently wrapped by the cast.
  Columns of any other dtype are not touched.

  Args:
    df: pandas DataFrame to shrink. It is modified in place.

  Returns:
    The same DataFrame object, for call chaining.
  """
  int32_range = np.iinfo(np.int32)
  for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == "float64":
      df[col] = df[col].astype("float32")
    elif col_dtype == "int64":
      values = df[col]
      # A blind astype("int32") wraps out-of-range integers without warning,
      # so narrow the column only when min and max are both representable.
      fits_int32 = values.empty or (
          values.min() >= int32_range.min and values.max() <= int32_range.max
      )
      if fits_int32:
        df[col] = values.astype("int32")
  return df

# Downcast the training frame's numeric columns, then ask the garbage
# collector to release whatever is no longer referenced.
train = train.pipe(reduce_mem_usage)
gc.collect()

In [None]:
# Data Handling and Sorting
# Parse the date column, then order rows by store, family and date and
# renumber the index from zero.
train['date'] = pd.to_datetime(train['date'])
train = train.sort_values(by=['store_nbr', 'family', 'date'], ignore_index=True)

In [None]:
# Drop NaN values in Lag/Roll Features
# Lag / rolling-window columns are identified purely by name.
lag_cols = [col for col in train.columns if "lag" in col or "roll" in col]

# Remember the size before filtering so the loss can be reported
initial_rows = len(train)

# Remove every row that has a missing value in any lag/roll column
train = train.dropna(subset=lag_cols)

rows_dropped = initial_rows - len(train)
# Guard the ratio so an empty input frame reports 0% instead of dividing by zero
loss_pct = 100 * rows_dropped / initial_rows if initial_rows else 0.0

print(f"Rows dropped: {rows_dropped}")
print(f"Remaining Rows: {len(train)}")
print(f"Loss Percentage: {loss_pct:.2f}%")

In [None]:
# Define Target and Features
# Name of the column the models are trained to predict
TARGET = 'sales_log'

# Target vector, taken from the filtered training frame
y = train.loc[:, TARGET]

In [None]:
# Features Selection
# Columns excluded from the model inputs: the row id, the date, and the two sales columns
Drop_cols = ['id', 'date', 'sales', 'sales_log']

# Every other column, kept in its original order, is a model feature
FEATURES = train.columns[~train.columns.isin(Drop_cols)].tolist()

# Feature matrix
X = train[FEATURES]

In [None]:
# Categorical Features
# Names of all columns in the training frame that use the pandas category dtype
CAT_COLS = list(train.select_dtypes(include=['category']).columns)
CAT_COLS

In [None]:
# Time Based Train / Validation Split
# Rows dated on or before the cutoff are used for training; later rows are held out.
TRAIN_END_DATE = pd.Timestamp('2017-07-15')

train_model = train.copy()
train_mask = train_model['date'].le(TRAIN_END_DATE)
valid_mask = train_model['date'].gt(TRAIN_END_DATE)

# Apply the same boolean masks to features and target so they stay aligned
X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_valid, y_valid = X.loc[valid_mask], y.loc[valid_mask]

In [None]:
# Data Split Summary
# Validation rows as a standalone frame with a fresh index
train_valid = train_model.loc[valid_mask].reset_index(drop=True)

# Date columns of each split, used for the range report below
train_dates = train_model.loc[train_mask, 'date']
valid_dates = train_model.loc[valid_mask, 'date']

print("DATA SPLIT SUMMARY:\n")
print(f"Train dates: {train_dates.min().date()} to {train_dates.max().date()}")
print(f"Validation dates: {valid_dates.min().date()} to {valid_dates.max().date()}")

print(f"\nTrain Shape: {X_train.shape}")
print(f"Validation Shape: {X_valid.shape}")

In [None]:
# RMSLE Evaluation Function
def rmsle(y_true, y_pred):
  """Root mean squared logarithmic error for values given in log1p space.

  Both arguments are log1p-transformed quantities. Mapping them back with
  expm1, flooring predictions at 0 and applying log1p again is the same as
  flooring the log-space predictions at 0, so the error is computed directly
  in log space. This avoids the expm1 overflow to inf for large float32
  values and the precision lost in the round trip.

  Args:
    y_true: array-like of true targets in log1p space (expected to be >= 0).
    y_pred: array-like of predictions in log1p space; negative values are
      treated as 0.

  Returns:
    The RMSLE as a NumPy float.

  Raises:
    ValueError: if the inputs are empty or differ in length.
  """
  y_true = np.asarray(y_true, dtype=np.float64).ravel()
  y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
  if y_true.size == 0 or y_true.size != y_pred.size:
    raise ValueError(
        f"y_true and y_pred must be non-empty and equally long, "
        f"got {y_true.size} and {y_pred.size}"
    )
  # max(expm1(p), 0) followed by log1p equals max(p, 0)
  y_pred = np.maximum(y_pred, 0)
  return np.sqrt(np.mean(np.square(y_true - y_pred)))

## Model 1. Random Forest Model
 - Captures non-linearity
 - Strong improvement over linear models
 - Gives feature importance intuition

In [None]:
# Import Libraries
!pip install category_encoders

from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder

In [None]:
# Handling Categorical Features

# The encoder is fitted on the training split (features + target) only; the
# validation split is transformed with the already-fitted encoder, so its
# targets are never passed to the encoder.
# NOTE(review): no `cols` argument is given, so the encoder decides by itself
# which columns to encode — presumably the object/category ones; confirm
# against the category_encoders documentation.
te = TargetEncoder()
X_train_te = te.fit_transform(X_train, y_train)
X_valid_te = te.transform(X_valid)


In [None]:
# Define RF Model
rf_model = RandomForestRegressor(
    # ensemble size, parallelism and reproducibility
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    # limits on the shape of each tree
    max_depth=12,
    min_samples_split=50,
    min_samples_leaf=5,
    max_features="sqrt",
)

In [None]:
# Train RF Model
# Fit on the target-encoded training features; being the last expression,
# the call's return value is what the cell displays.
rf_model.fit(X_train_te, y_train)

In [None]:
# Validation Prediction
# Predict the held-out period from its target-encoded features and score it
rf_valid_pred = rf_model.predict(X_valid_te)
rf_rmsle = rmsle(y_true=y_valid, y_pred=rf_valid_pred)
print(f"Random Forest RMSLE: {rf_rmsle:.4f}")

In [None]:
# Feature Importances
# Pair every encoded feature name with the forest's importance score and
# order the table from most to least important (sorted once).
rf_importance = pd.DataFrame({
    'feature': X_train_te.columns,
    'importance': rf_model.feature_importances_
}).sort_values(by='importance', ascending=False)

In [None]:
# Top 20 rows of the importance table, reversed so the largest bar is drawn at the top
rf_top20 = rf_importance.head(20).iloc[::-1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(rf_top20["feature"], rf_top20["importance"])
ax.set_title("Random Forest Feature Importance (Top 20)")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()


In [None]:
# Save Feature Importances
# Written to the current working directory (no subfolder), without the index column.
rf_importance.to_csv("rf_feature_importance.csv", index=False)

In [None]:
# Persist the fitted encoder and forest for reuse outside this notebook
import joblib

# joblib.dump does not create missing folders, so make sure the target exists
os.makedirs("models", exist_ok=True)

# Save Target Encoder
joblib.dump(te, "models/target_encoder.pkl")

# Save Random Forest Model
joblib.dump(rf_model, "models/random_forest.pkl")

## Model 2. XGBoost
 -  Industry-standard GBM
 -  Excellent with lag + sparse features
 -  Strong bias–variance tradeoff
 -  Much faster & better than Random Forest

In [None]:
# Import Library
!pip install -q xgboost

import xgboost as xgb

In [None]:
# Prepare XGBoost Dataset
# Wrap the target-encoded splits, with their labels, in XGBoost's DMatrix container
dtrain = xgb.DMatrix(data=X_train_te, label=y_train)
dvalid = xgb.DMatrix(data=X_valid_te, label=y_valid)

In [None]:
# Define XGB Parameters
xgb_params = dict(
    # learning task and the metric reported during training
    objective="reg:squarederror",
    eval_metric="rmse",
    # boosting step size and tree depth
    learning_rate=0.05,
    max_depth=8,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # histogram-based tree construction: fast & memory efficient
    tree_method="hist",
    seed=42,
)

In [None]:
# Data sets whose metric is reported during training, each paired with a display name
evals = list(zip((dtrain, dvalid), ("train", "valid")))

# Boost for at most 500 rounds, with early stopping after 50 rounds without
# improvement; progress is printed every 50 rounds.
xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=500,
    early_stopping_rounds=50,
    evals=evals,
    verbose_eval=50,
)


In [None]:
# Validation prediction
# The native Booster.predict scores with every boosted round, including the
# ones trained after the early-stopping optimum. Restrict prediction to the
# rounds up to and including the best iteration (the range end is exclusive).
xgb_valid_pred = xgb_model.predict(
    dvalid,
    iteration_range=(0, xgb_model.best_iteration + 1)
)
xgb_rmsle = rmsle(y_valid, xgb_valid_pred)
print(f"XGBoost RMSLE: {xgb_rmsle:.4f}")

In [None]:
# Overfitting Check - Train vs Valid
# Score the training split with the rounds up to the early-stopping optimum
# only; the native Booster.predict would otherwise use every boosted round.
xgb_train_pred = xgb_model.predict(
    dtrain,
    iteration_range=(0, xgb_model.best_iteration + 1)
)
train_rmsle = rmsle(y_train, xgb_train_pred)
valid_rmsle = xgb_rmsle

print(f"Train RMSLE: {train_rmsle:.4f}")
print(f"Validation RMSLE: {valid_rmsle:.4f}")

In [None]:
# Feature Importance - Top 20
# Mapping of feature name -> score, using the "weight" importance type
importances = xgb_model.get_score(importance_type="weight")

# One row per scored feature, highest score first, index renumbered from zero
xgb_importance = (
    pd.DataFrame({
        "feature": [name for name in importances],
        "importance": [importances[name] for name in importances],
    })
    .sort_values(by="importance", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Top 20 rows of the importance table, reversed so the largest bar is drawn at the top
xgb_top20 = xgb_importance.head(20).iloc[::-1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(xgb_top20["feature"], xgb_top20["importance"])
ax.set_title("XGBoost Feature Importance (Top 20)")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()

In [None]:
# Save Feature Importances
# Use an XGBoost-specific file name: the LightGBM section writes
# "lgb_feature_importance.csv" and would otherwise overwrite this table.
xgb_importance.to_csv("xgb_feature_importance.csv", index=False)

In [None]:
# Save XGBoost
# Stores the booster as JSON under models/.
# NOTE(review): presumably this writes every boosted round, including those
# trained after the early-stopping optimum — confirm that is intended.
xgb_model.save_model("models/xgboost.json")

## Model 3 - LightGBM (PRIMARY MODEL)
-  Built for large tabular time-series
-  Extremely fast
-  Handles non-linearity + interactions
-  Kaggle favorite for this dataset
-  Best balance of accuracy + speed

In [None]:
# import Library
!pip install -q lightgbm

import lightgbm as lgb

In [None]:
# Prepare LightGBM Dataset
# LightGBM is given the un-encoded feature frames plus the list of categorical columns.
lgb_train = lgb.Dataset(
    data=X_train,
    label=y_train,
    categorical_feature=CAT_COLS,
    free_raw_data=False,
)

# The validation set references the training set
lgb_valid = lgb.Dataset(
    data=X_valid,
    label=y_valid,
    categorical_feature=CAT_COLS,
    reference=lgb_train,
    free_raw_data=False,
)

In [None]:
# Baseline LightGBM parameters.
# Each sampling ratio is given once under its primary LightGBM name:
# "subsample" and "colsample_bytree" are aliases of "bagging_fraction" and
# "feature_fraction", so listing both only repeated the same setting.
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": -1,
    # column sampling
    "feature_fraction": 0.8,
    # row sampling, re-drawn every 5 iterations
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    # L1 / L2 regularisation
    "lambda_l1": 0.1,
    "lambda_l2": 0.1,
    "seed": 42,
    "verbosity": -1
}

In [None]:
# Train the baseline LightGBM model: at most 1000 rounds, both splits are
# evaluated, with an early-stopping callback using a patience of 50 rounds.
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=1000,
    valid_names=["train", "valid"],
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

In [None]:
# Validation Prediction
# Predict the held-out period with the best iteration found during training
best_round = lgb_model.best_iteration
lgb_valid_pred = lgb_model.predict(X_valid, num_iteration=best_round)

lgb_rmsle = rmsle(y_true=y_valid, y_pred=lgb_valid_pred)
print(f"LightGBM RMSLE: {lgb_rmsle:.4f}")

In [None]:
# Feature Importance
# One row per training feature with the booster's importance value, highest first
lgb_importance = (
    pd.DataFrame({
        "feature": X_train.columns,
        "importance": lgb_model.feature_importance(),
    })
    .sort_values(by="importance", ascending=False)
)

In [None]:
# Plot top 20 Features
# Top 20 rows of the importance table, reversed so the largest bar is drawn at the top
lgb_top20 = lgb_importance.head(20).iloc[::-1]

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(lgb_top20["feature"], lgb_top20["importance"])
ax.set_title("LightGBM Feature Importance (Top 20)")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()

In [None]:
# Save Feature Importance
# Written to the current working directory (no subfolder), without the index column.
lgb_importance.to_csv("lgb_feature_importance.csv", index=False)

In [None]:
# Save LightGBM
# Stores the baseline (untuned) booster as a text file under models/.
lgb_model.save_model("models/baseline_lightgbm.txt")

## Model 4 - CatBoost (practical view)

 - Strong with categorical features
 - Handles non-linearity well
 - Stable training, less tuning required

In [None]:
# Import Library
!pip install -q catboost

from catboost import CatBoostRegressor

In [None]:
# Define CatBoost Model
cat_model = CatBoostRegressor(
    # optimisation target and the metric tracked on the evaluation set
    loss_function="RMSE",
    eval_metric="RMSE",
    # boosting budget, step size and early-stopping patience
    iterations=1000,
    learning_rate=0.05,
    early_stopping_rounds=50,
    # tree depth and bagging temperature
    depth=8,
    bagging_temperature=0.2,
    # reproducibility and logging interval
    random_seed=42,
    verbose=100,
)

In [None]:
# Fit CatBoost on the un-encoded training features; the categorical columns
# are passed by name and the held-out period serves as the evaluation set.
# NOTE(review): use_best_model=True presumably keeps the iteration that scored
# best on eval_set — confirm against the CatBoost documentation.
cat_model.fit(
    X_train, y_train,
    cat_features=CAT_COLS,
    eval_set=(X_valid, y_valid),
    use_best_model=True
)

In [None]:
# Validation Prediction
# Predict the held-out period and score it with the shared RMSLE helper
cat_valid_pred = cat_model.predict(X_valid)
cat_rmsle = rmsle(y_true=y_valid, y_pred=cat_valid_pred)

print(f"CatBoost RMSLE: {cat_rmsle:.4f}")

In [None]:
# Overfitting Check
# Compare the error on the training split with the validation error computed earlier
cat_train_pred = cat_model.predict(X_train)
train_rmsle, valid_rmsle = rmsle(y_train, cat_train_pred), cat_rmsle

print(f"Train RMSLE: {train_rmsle:.4f}")
print(f"Validation RMSLE: {valid_rmsle:.4f}")

In [None]:
# Save CatBoost
# Stores the fitted model under models/ using the "cbm" format.
cat_model.save_model("models/catboost.cbm", format="cbm")

## Model Comparison
Compare the models based on their validation RMSLE.

In [None]:
# Model Comparison
# One row per model with its validation RMSLE, best (lowest) first
model_rmsle = pd.DataFrame(
    [
        ("Random Forest", rf_rmsle),
        ("XGBoost", xgb_rmsle),
        ("LightGBM", lgb_rmsle),
        ("CatBoost", cat_rmsle),
    ],
    columns=["Model", "RMSLE"],
).sort_values(by="RMSLE", ascending=True)

# Bar chart of the scores in that order
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x="Model", y="RMSLE", data=model_rmsle, ax=ax)
ax.set_title("Model RMSLE Comparison")
ax.set_xlabel("Model")
ax.set_ylabel("RMSLE")
plt.show()

## Hyperparameter Tuning with Optuna (LightGBM)

### Key Principles

 - Tune only LightGBM
 - Use time-based validation
 - Optimize RMSLE
 - Keep trials controlled

In [None]:
!pip install optuna
import optuna

In [None]:
def objective(trial):
  """Optuna objective: train LightGBM with sampled settings, return validation RMSLE.

  Args:
    trial: Optuna trial used to sample the hyperparameters.

  Returns:
    RMSLE of the trained booster on the validation split (lower is better).
  """
  # Hyperparameters sampled for this trial
  sampled = {
    "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
    "num_leaves": trial.suggest_int("num_leaves", 31, 256),
    "max_depth": trial.suggest_int("max_depth", 5, 15),
    "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
    "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
    "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
    "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
  }
  # Fixed settings wrapped around the sampled ones
  params = {
    "objective": "regression",
    "metric": "rmse",
    **sampled,
    "verbosity": -1,
    "seed": 42,
  }

  # Early stopping with a patience of 50 rounds; period=0 disables logging
  quiet_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=0),
  ]
  booster = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_valid],
    callbacks=quiet_callbacks,
  )

  # Score the validation split with the booster's best iteration
  valid_preds = booster.predict(X_valid, num_iteration=booster.best_iteration)
  return rmsle(y_valid, valid_preds)

In [None]:
# Run Optuna Study
# Seed the TPE sampler so the sequence of sampled hyperparameters — and with
# it the selected "best" configuration — is reproducible across runs.
study = optuna.create_study(
  direction="minimize",
  sampler=optuna.samplers.TPESampler(seed=42)
)
study.optimize(
  objective,
  n_trials=50,
  show_progress_bar=True
)

In [None]:
# Report the best trial's score and its hyperparameters
print('Best RMSLE', study.best_value)
print('Best Parameters:')
for name, setting in study.best_params.items():
  print(f'{name}: {setting}')

In [None]:
# Retrain Fine Tuned LightGBM
# Best sampled hyperparameters plus the fixed settings
best_params = {
    **study.best_params,
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "seed": 42,
}

# Up to 2000 rounds, with an early-stopping patience of 50 and a log period of 100
final_lgb_model = lgb.train(
    best_params,
    lgb_train,
    num_boost_round=2000,
    valid_sets=[lgb_valid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

In [None]:
# Final Evaluation
# Score the tuned booster on the held-out period using its best iteration
tuned_best_round = final_lgb_model.best_iteration
final_preds = final_lgb_model.predict(X_valid, num_iteration=tuned_best_round)
final_rmsle = rmsle(y_true=y_valid, y_pred=final_preds)

print(f"Final Tuned LightGBM RMSLE: {final_rmsle:.4f}")

In [None]:
# Save LightGBM
# Stores the tuned booster as a text file under models/, next to the baseline one.
final_lgb_model.save_model("models/tuned_lightgbm.txt")

## Compare all Models

In [None]:
# Compare all Models
# (model name, validation RMSLE) pairs in the order the table is built
scored_models = [
    ("Random Forest", rf_rmsle),
    ("XGBoost", xgb_rmsle),
    ("CatBoost", cat_rmsle),
    ("LightGBM (Baseline)", lgb_rmsle),
    ("LightGBM (Tuned)", final_rmsle),
]

# Leaderboard sorted so the lowest error comes first
final_results = pd.DataFrame(
    scored_models, columns=["Model", "RMSLE"]
).sort_values("RMSLE")

final_results


# Save Final Report

In [None]:
# Final Models Report
report_path = Path("data/reports/ml_model_comparison_reports.csv")

# to_csv does not create missing folders, so make sure the report directory exists
report_path.parent.mkdir(parents=True, exist_ok=True)

final_results.to_csv(
    report_path,
    index=False
)
