# Ensembling & Optimization
### Goal: Maximize performance
## Methods
 - Weighted averaging
 - Stacking (ML + DL)
 - Bias correction
 - Post-processing (zero-sales handling)
### 📌 Output: Final optimized model

In [1]:
# Clone GitHub Repository
# Later cells chdir into the cloned folder and load models/ and data/features
# from paths relative to it.
# NOTE(review): re-running this cell after a successful clone should only make
# git report that the destination already exists — confirm that is acceptable.
!git clone https://github.com/sabin74/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform.git

Cloning into 'Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform'...
remote: Enumerating objects: 329, done.[K
remote: Counting objects: 100% (113/113), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 329 (delta 60), reused 37 (delta 8), pack-reused 216 (from 1)[K
Receiving objects: 100% (329/329), 41.98 MiB | 22.15 MiB/s, done.
Resolving deltas: 100% (172/172), done.
Updating files: 100% (58/58), done.
Filtering content: 100% (22/22), 348.58 MiB | 44.03 MiB/s, done.


In [2]:
!pip install -q catboost
!pip install category_encoders


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m85.9/85.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.9.0


In [3]:

# Environment Setup - Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

import os
from pathlib import Path
import gc

In [4]:
# Set Project Root
# Work from inside the cloned repository so the relative models/ and data/
# paths used by later cells resolve.
PROJECT_ROOT = "/content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform"
os.chdir(PROJECT_ROOT)
print("Current Directory: ", os.getcwd())

Current Directory:  /content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform


In [5]:
import joblib
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

# Load the four pre-trained base models (plus the encoder used by two of them).
# All paths are relative to the current working directory.

# Random Forest
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# artifacts that come from a trusted source.
rf_model = joblib.load("models/random_forest/random_forest.pkl")

# Target Encoder
# Its transform() output is what the Random Forest and XGBoost predictions consume.
tt = joblib.load("models/random_forest/target_encoder.pkl")

# CatBoost
# An empty regressor is created first, then populated from the saved .cbm file.
cat_model = CatBoostRegressor()
cat_model.load_model("models/catboost/catboost.cbm")

# LightGBM
# Restored as a raw Booster from its text dump.
lgb_model = lgb.Booster(model_file="models/lightgbm/baseline_lightgbm.txt")

# XGBoost
# Restored as a raw Booster, so prediction later needs an xgb.DMatrix input.
xgb_model = xgb.Booster()
xgb_model.load_model("models/xgboost/xgboost.json")

In [6]:
# Load Feature-Engineered Data
DATA_DIR = Path("data/features")
TRAIN_FEATURES_PATH = DATA_DIR / "train_features.parquet"

# Read the feature-engineered training frame produced by an earlier pipeline stage
train = pd.read_parquet(TRAIN_FEATURES_PATH)

In [7]:
# Memory Optimization (reduce memory usage)
def reduce_mem_usage(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit to reduce memory.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int32``
    only when every value fits in the int32 range; otherwise the column is left
    untouched so that large values are never corrupted by the cast.
    Columns of any other dtype are not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (no copy is made, which is the
        point of a memory optimisation) and also returned.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    int32_limits = np.iinfo(np.int32)
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == "float64":
            df[col] = df[col].astype("float32")
        elif col_dtype == "int64":
            values = df[col]
            # An empty column has no min/max to check and is always safe to cast.
            fits_int32 = values.empty or (
                values.min() >= int32_limits.min
                and values.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = values.astype("int32")
    return df

# reduce_mem_usage rewrites the columns of the frame it is given and returns it;
# rebinding `train` keeps the name pointing at the downcast frame.
train = reduce_mem_usage(train)
# Ask the garbage collector to run now that the 64-bit columns have been replaced.
# As the cell's last expression, its integer return value is what the cell displays.
gc.collect()


0

In [8]:
# Data Handling and Sorting
# Parse the date column, then order rows by store, family and date with a
# fresh 0..n-1 index.
train['date'] = pd.to_datetime(train['date'])
SORT_KEYS = ['store_nbr', 'family', 'date']
train = train.sort_values(by=SORT_KEYS, ignore_index=True)


In [9]:
# Drop rows with a missing value in any lag/roll feature
LAG_TOKENS = ("lag", "roll")
lag_cols = [
    name for name in train.columns
    if any(token in name for token in LAG_TOKENS)
]
train = train.dropna(subset=lag_cols).reset_index(drop=True)

In [10]:
# Define Target and Features
# The metric helpers defined later apply np.expm1 to this column before scoring —
# presumably it holds log1p(sales); TODO confirm against the feature-engineering step.
TARGET = 'sales_log'
# Ground-truth vector that every model's predictions are scored against.
y_true = train[TARGET]

In [11]:
# Drop unnecessary columns: the row id, the date, and both forms of the target
Drop_cols = ['id', 'date', 'sales', 'sales_log']

# Everything that is not explicitly excluded is a model feature (column order preserved)
FEATURES = train.columns[~train.columns.isin(Drop_cols)].tolist()

X_true = train[FEATURES]

In [12]:
# Categorical Features
# Names of all columns in `train` whose dtype is pandas 'category'.
# NOTE(review): CAT_COLS is not referenced again in the cells visible here.
CAT_COLS = train.select_dtypes(include='category').columns.tolist()

# Handling Categorical Features
# Transform the feature frame with the encoder loaded from
# models/random_forest/target_encoder.pkl. The encoded frame feeds the
# Random Forest and XGBoost predictions; LightGBM and CatBoost receive X_true as-is.
X_true_te = tt.transform(X_true)


# RMSLE Evaluation Function
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error after undoing the log transform.

    Both arguments are first passed through ``np.expm1``; the transformed
    predictions are then floored at zero before the squared-log error is
    computed and square-rooted.
    """
    actual = np.expm1(y_true)
    predicted = np.maximum(np.expm1(y_pred), 0)
    return np.sqrt(mean_squared_log_error(actual, predicted))

# MAE Evaluation Function
def mae(y_true, y_pred):
    """Mean absolute error after undoing the log transform.

    Both arguments are first passed through ``np.expm1``; the transformed
    predictions are then floored at zero before the absolute error is averaged.
    """
    actual = np.expm1(y_true)
    predicted = np.maximum(np.expm1(y_pred), 0)
    return mean_absolute_error(actual, predicted)


## Validation Prediction

In [13]:
# Predictions from every base model on the full feature frame.
# NOTE(review): no hold-out split is visible in this notebook; if the models were
# fit on these same rows, the scores computed from these predictions are
# in-sample — confirm.

# Models that consume the encoded frame (XGBoost's raw Booster needs a DMatrix)
rf_valid_pred = rf_model.predict(X_true_te)
xgb_valid_pred = xgb_model.predict(xgb.DMatrix(X_true_te))

# Models that consume the un-encoded feature frame
cat_valid_pred = cat_model.predict(X_true)
lgb_valid_pred = lgb_model.predict(X_true, num_iteration=lgb_model.best_iteration)

##  Evaluation Metrics

In [14]:
# Score every base model with the same pair of metrics.
valid_preds = {
    "RandomForest": rf_valid_pred,
    "XGBoost": xgb_valid_pred,
    "LightGBM": lgb_valid_pred,
    "CatBoost": cat_valid_pred,
}

metrics = {
    model_name: {
        "RMSLE": rmsle(y_true, model_pred),
        "MAE": mae(y_true, model_pred),
    }
    for model_name, model_pred in valid_preds.items()
}

# One row per model, one column per metric
metrics_df = pd.DataFrame(metrics).T
metrics_df

Unnamed: 0,RMSLE,MAE
RandomForest,0.420985,59.492132
XGBoost,0.358644,41.652336
LightGBM,0.346664,40.643089
CatBoost,0.393336,52.850364


## WEIGHTED AVERAGING
<pre>
Combine 4 models: RandomForest, XGBoost, LightGBM, CatBoost into one stronger predictor by learning optimal weights.
</pre>

In [21]:
# Prepare prediction matrix: the target plus one column per base model
ensemble_df = y_true.to_frame(name="y_true").assign(
    RandomForest=rf_valid_pred,
    XGBoost=xgb_valid_pred,
    LightGBM=lgb_valid_pred,
    CatBoost=cat_valid_pred,
)

In [22]:
# Simple baseline: unweighted mean of the four base-model predictions
ensemble_df['avg_pred'] = (
    ensemble_df['RandomForest']
    .add(ensemble_df['XGBoost'])
    .add(ensemble_df['LightGBM'])
    .add(ensemble_df['CatBoost'])
    .div(4)
)


In [23]:
# Evaluate the equal-weight baseline with both metrics
target_values = ensemble_df['y_true'].to_numpy()
avg_values = ensemble_df['avg_pred'].to_numpy()

baseline_rmsle = rmsle(target_values, avg_values)
baseline_mae = mae(target_values, avg_values)
baseline_rmsle, baseline_mae

(np.float64(0.36859132750955187), 45.615535908194616)

## Optimize Weights
<pre>
Equal weights assume all models are equally good — false in practice.

Method:
  final_pred = w1*RF + w2*XGB + w3*LGB + w4*CAT

Subject to:
  w_i ≥ 0
  w1 + w2 + w3 + w4 = 1

🎯 Objective: Minimize RMSLE
</pre>

In [24]:
# Define Optimization Objective
from scipy.optimize import minimize

def weighted_rmsle(weights, df):
    """Objective for the weight search: RMSLE of the weighted blend.

    ``weights`` is indexed 0..3 in the order RandomForest, XGBoost, LightGBM,
    CatBoost; ``df`` must contain those four columns plus ``y_true``.
    Returns whatever ``rmsle`` returns for the blended prediction.
    """
    model_cols = ('RandomForest', 'XGBoost', 'LightGBM', 'CatBoost')
    # Accumulate left to right so the floating-point summation order is fixed.
    blended = weights[0] * df[model_cols[0]].values
    for idx in range(1, len(model_cols)):
        blended = blended + weights[idx] * df[model_cols[idx]].values
    return rmsle(df['y_true'].values, blended)

In [26]:
# Starting point for the optimizer: equal weight for each of the four base models
initial_weights = [0.25] * 4

# Equality constraint: the weights must sum to 1 (the function must evaluate to 0)
constraints = {
    'type': 'eq',
    'fun': lambda w: np.sum(w) - 1
}

# Each weight is bounded to [0, 1]
bounds = [(0, 1)] * 4

In [27]:
# Run Optimization
import warnings

# NOTE(review): the weights are fitted and scored on the same rows. A vertex
# solution (all weight on one model) means the blend adds nothing on this data;
# consider fitting the weights on a hold-out fold.
opt_result = minimize(
    weighted_rmsle,
    initial_weights,
    args=(ensemble_df,),  # one-element tuple; `(ensemble_df)` is just a parenthesised name
    method='SLSQP',
    bounds=bounds,
    constraints=constraints
)

# Surface solver failures instead of silently using whatever iterate it stopped at
if not opt_result.success:
    warnings.warn(f"Weight optimization did not converge: {opt_result.message}")

# Optimized weights, in the order RandomForest, XGBoost, LightGBM, CatBoost
opt_weight = opt_result.x
opt_weight

array([2.18079593e-17, 0.00000000e+00, 1.00000000e+00, 1.11022302e-16])

In [28]:
# Final weighted-ensemble prediction using the optimized weights
ensemble_df['opt_pred'] = (
    ensemble_df['RandomForest'].mul(opt_weight[0])
    .add(ensemble_df['XGBoost'].mul(opt_weight[1]))
    .add(ensemble_df['LightGBM'].mul(opt_weight[2]))
    .add(ensemble_df['CatBoost'].mul(opt_weight[3]))
)

In [29]:
# Evaluate the optimized ensemble with the same two metrics as the baseline
target_values = ensemble_df['y_true'].to_numpy()
opt_values = ensemble_df['opt_pred'].to_numpy()

final_rmsle = rmsle(target_values, opt_values)
final_mae = mae(target_values, opt_values)

final_rmsle, final_mae

(np.float64(0.3466643303966908), 40.64308918042002)

In [30]:
# Save Optimized Model
import json

# Map each short model key to its optimized weight, converted to a plain Python
# float so that json can serialise it.
weight_keys = ("rf", "xgb", "lgb", "cat")
weights_dict = {
    key: float(opt_weight[idx])
    for idx, key in enumerate(weight_keys)
}

# Persist the weights next to the saved base models
with open("models/ensemble_weights.json", "w") as weights_file:
    json.dump(weights_dict, weights_file, indent=4)