# Model Evaluation
## Best ML vs Best DL
### Evaluation & Visualization
### Models
 - Best ML: LightGBM
 - Best DL: Seq2Seq

In [1]:
# Clone the project repository from GitHub. Later cells change into this
# checkout and read its data/ and models/ folders through relative paths.
!git clone https://github.com/sabin74/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform.git


Cloning into 'Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform'...
remote: Enumerating objects: 301, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 301 (delta 39), reused 36 (delta 6), pack-reused 216 (from 1)[K
Receiving objects: 100% (301/301), 41.69 MiB | 27.07 MiB/s, done.
Resolving deltas: 100% (151/151), done.
Updating files: 100% (57/57), done.
Filtering content: 100% (22/22), 348.58 MiB | 44.52 MiB/s, done.


In [51]:
# Import Libraries
import os
import json
import joblib
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

In [10]:
# Set Project Root
# Make the cloned repository the working directory so that the relative paths
# used by later cells (models/, data/features/, data/reports/) resolve against it.
os.chdir('/content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform')
print("Current Directory: ", os.getcwd())

Current Directory:  /content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform


In [11]:
# Artefact locations, relative to the project root
MODEL_PATH, REPORT_PATH = Path('models'), Path('data/reports')

# Create the report folder together with any missing parents; an existing
# folder is left as it is.
os.makedirs(REPORT_PATH, exist_ok=True)

In [52]:
# Load the feature tables for the train and test splits (Parquet files under
# data/features, read relative to the current working directory).
# NOTE(review): `test` is not referenced again in the cells shown here.
train = pd.read_parquet('data/features/train_features.parquet')
test = pd.read_parquet('data/features/test_features.parquet')

In [53]:
# Memory Optimization (reduce memory usage)
def reduce_mem_usage(df, ):
  """Downcast 64-bit numeric columns of ``df`` to 32-bit to save memory.

  ``float64`` columns become ``float32``. ``int64`` columns become ``int32``
  only when every value fits in the int32 range; otherwise the column is left
  as ``int64``, because a plain ``astype("int32")`` can silently wrap
  out-of-range values. Columns of any other dtype are not touched.

  Args:
    df: DataFrame to shrink. Its columns are replaced in place.

  Returns:
    The same DataFrame object that was passed in.
  """
  int32_limits = np.iinfo(np.int32)
  for col in df.columns:
    if df[col].dtype == "float64":
      df[col] = df[col].astype("float32")
    elif df[col].dtype == "int64":
      values = df[col]
      # An empty column has no min/max to compare and nothing that could overflow.
      fits_int32 = values.empty or (
          values.min() >= int32_limits.min and values.max() <= int32_limits.max
      )
      if fits_int32:
        df[col] = values.astype("int32")
  return df

# Downcast `train` (the function replaces the columns of the frame it is given
# and returns that same frame), then force a garbage-collection pass.
# gc.collect() is the last expression, so its return value is what the cell displays.
train = reduce_mem_usage(train)
gc.collect()

5883

In [54]:
# Parse the date column, then order the rows by store, product family and date
# and renumber the index from zero.
train['date'] = pd.to_datetime(train['date'])
train = train.sort_values(by=['store_nbr', 'family', 'date'], ignore_index=True)

In [55]:
# Remove every row that has a missing value in a lag or rolling-window feature
# (columns whose name contains "lag" or "roll").
lag_cols = [
    name
    for name in train.columns
    if any(token in name for token in ("lag", "roll"))
]
train = train.dropna(subset=lag_cols)

In [60]:
# Load ML and DL feature definitions

# Deep-learning side: feature map (JSON) and the saved scaler
with MODEL_PATH.joinpath('dl_feature&Scaler', 'dl_feature_map.json').open('r') as f:
    dl_feature_map = json.load(f)

DL_FEATURES = dl_feature_map['dl_features_order']
DL_NUMERICAL_FEATURES = dl_feature_map['numeric_features']
DL_CATEGORICAL_FEATURES = dl_feature_map['categorical_features']

scaler = joblib.load(MODEL_PATH.joinpath('seq2seq_model', 'scaler.pkl'))

# LightGBM side: feature names are taken from the feature-importance report
ml_feature_importances = pd.read_csv(
    MODEL_PATH.joinpath('lightgbm', 'lgb_feature_importance.csv')
)
ML_FEATURES = ml_feature_importances['feature'].values

In [19]:
# Build rmsle_tf function for seq2seq model
def rmsle_tf(y_true, y_pred):
    """Root mean squared logarithmic error for targets stored as log1p(sales).

    Passed to Keras under the name ``rmsle_tf`` when the saved seq2seq model
    is loaded.

    Args:
        y_true: Tensor of true targets in log1p space.
        y_pred: Tensor of predicted targets in the same space.

    Returns:
        Scalar tensor ``sqrt(mean((log1p(pred) - log1p(true)) ** 2))`` computed
        on the back-transformed values.
    """
    # The notebook treats targets as log1p(sales) and inverts them with expm1
    # elsewhere, so expm1 is the matching inverse here. Using exp instead would
    # add 1 to every value before the log1p below.
    y_true = tf.math.expm1(y_true)
    y_pred = tf.math.expm1(y_pred)
    return tf.sqrt(
        tf.reduce_mean(
            tf.square(tf.math.log1p(y_pred) - tf.math.log1p(y_true))
        )
    )

In [21]:
# Restore the two trained models from the models/ folder
lgbm_model = lgb.Booster(
    model_file=MODEL_PATH.joinpath('lightgbm', 'baseline_lightgbm.txt')
)

# rmsle_tf is handed over via custom_objects so Keras can resolve that name
# while deserialising the saved model.
seq2seq_model = tf.keras.models.load_model(
    MODEL_PATH.joinpath('seq2seq_model', 'seq2seq_model.keras'),
    custom_objects=dict(rmsle_tf=rmsle_tf),
)

## Prediction on Validation Set
 - X_val_ml → 2D (samples, features)
 - X_val_dl → 3D (samples, window, features)
 - y_val → log1p(sales)

In [57]:
# Prepare ML validation data
# LightGBM matches NumPy input to features by column position. ML_FEATURES comes
# from the feature-importance CSV, whose row order is not guaranteed to be the
# training order, so take the order from the booster whenever it names exactly
# the same features. Otherwise keep ML_FEATURES as given.
booster_feature_order = lgbm_model.feature_name()
if (
    len(booster_feature_order) == len(ML_FEATURES)
    and set(booster_feature_order) == set(ML_FEATURES)
):
    ml_feature_order = list(booster_feature_order)
else:
    ml_feature_order = list(ML_FEATURES)

# NOTE(review): this uses every row left in `train`; no hold-out split is
# visible in these cells — confirm this is the intended validation set.
X_val_ml = train[ml_feature_order].values
y_val = train["sales_log"].values


In [None]:

# Encode categoricals as integer IDs (for embeddings)
# Each categorical DL feature gets a {string value -> ID} lookup. IDs start at 1
# and follow the order in which values first appear in `train`.
# NOTE(review): the IDs depend on the row order of `train` at this point —
# confirm they match the mapping used when the seq2seq model was trained.
category_maps = {}

for col in DL_CATEGORICAL_FEATURES:
  distinct_values = train[col].astype(str).unique()
  category_maps[col] = dict(
      zip(distinct_values, range(1, len(distinct_values) + 1))
  )

# Apply Encoding
def encode_categories(df, maps):
  """Return a copy of ``df`` with the mapped columns replaced by integer IDs.

  Args:
    df: Source DataFrame; it is not modified.
    maps: ``{column name: {string value: integer ID}}``.

  Returns:
    A new DataFrame in which every column named in ``maps`` holds int32 IDs.
    Values are compared by their string form, and values missing from the
    lookup become 0.
  """
  encoded = df.copy()
  for name in maps:
    lookup = maps[name]
    as_text = encoded[name].astype(str)
    ids = as_text.map(lookup)
    encoded[name] = ids.fillna(0).astype("int32")
  return encoded

# Integer-encode the categorical columns of `train`; encode_categories works on
# a copy, so `train` itself keeps its original values.
# NOTE(review): despite the `ml_` prefix this frame is built from the DL feature
# lists — presumably it is the seq2seq input; confirm.
ml_train_df = encode_categories(train, category_maps)

# Scale the numeric DL features with the scaler loaded from the seq2seq model folder.
ml_train_df[DL_NUMERICAL_FEATURES] = scaler.transform(ml_train_df[DL_NUMERICAL_FEATURES])

In [22]:
# Predictions (both models predict in log1p space; the seq2seq output is flattened to 1-D)
# NOTE(review): X_val_dl is not built in any cell visible here — confirm where
# the 3-D DL input is created, and run this cell only after X_val_ml and
# X_val_dl exist.
y_pred_ml = lgbm_model.predict(X_val_ml)
y_pred_dl = seq2seq_model.predict(X_val_dl).ravel()

# Convert targets and predictions back to the original scale
y_true, y_pred_ml, y_pred_dl = (
    np.expm1(values) for values in (y_val, y_pred_ml, y_pred_dl)
)

NameError: name 'X_val_ml' is not defined