# Model Evaluation
## Best ML vs Best DL
### Evaluation & Visualization
### Models
 - Best ML: LightGBM
 - Best DL: Seq2Seq

In [None]:
# Clone GitHub Repository
# The cells below read feature tables and saved models through paths relative to this checkout.
# NOTE(review): presumably re-running this cell after a successful clone only prints git's
# "destination path already exists" error — confirm it is safe to re-run.
!git clone https://github.com/sabin74/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform.git


In [None]:
# Import Libraries
import os
import json
import joblib
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
import tensorflow as tf
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

In [None]:
# Set Project Root
# Every relative path used below (data/..., models/...) resolves against this directory.
PROJECT_ROOT = '/content/Enterprise-Intelligent-Demand-Forecasting-Decision-Optimization-Platform'
os.chdir(PROJECT_ROOT)
print("Current Directory: ", os.getcwd())

In [None]:
# Saved-model directory and report directory, both relative to the project root
MODEL_PATH, REPORT_PATH = Path('models'), Path('data/reports')
# Create the report directory (and any missing parents) if it is not there yet
REPORT_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Import Original Data
# Read the train/test feature tables; the paths are relative to the working directory
# selected with os.chdir earlier in the notebook.
train = pd.read_parquet('data/features/train_features.parquet')
test = pd.read_parquet('data/features/test_features.parquet')

In [None]:
# Memory Optimization (reduce memory usage)
def reduce_mem_usage(df):
  """Downcast 64-bit numeric columns of ``df`` to 32-bit to save memory.

  float64 columns become float32. int64 columns become int32 only when every
  value fits the int32 range; otherwise the column is left as int64, because
  a blind cast would silently wrap large values. Columns of any other dtype
  are left untouched.

  Args:
    df: pandas DataFrame. It is modified in place (no copy is made, which is
      the point of a memory optimisation).

  Returns:
    The same DataFrame object, for convenient ``df = reduce_mem_usage(df)``.
  """
  int32_limits = np.iinfo(np.int32)
  for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == "float64":
      df[col] = df[col].astype("float32")
    elif col_dtype == "int64":
      series = df[col]
      # Only narrow when it is lossless; an empty column trivially fits.
      fits_int32 = series.empty or (
          series.min() >= int32_limits.min and series.max() <= int32_limits.max
      )
      if fits_int32:
        df[col] = series.astype("int32")
  return df

# Downcast the training frame, then ask the garbage collector to release what was freed.
# NOTE(review): `test` is not passed through reduce_mem_usage — confirm that is intentional.
train = reduce_mem_usage(train)
gc.collect()

In [None]:
# Data Handling and Sorting
# Parse the dates first so the sort below orders each (store, family) series chronologically.
train['date'] = pd.to_datetime(train['date'])
series_sort_keys = ['store_nbr', 'family', 'date']
train = train.sort_values(series_sort_keys).reset_index(drop=True)

In [None]:
# Drop NaN values in Lag/Roll Features
# Any column whose name contains "lag" or "roll" is treated as a lag/rolling feature;
# rows with a missing value in any of them are removed.
lag_cols = [
    col for col in train.columns
    if any(tag in col for tag in ("lag", "roll"))
]
train = train.dropna(subset=lag_cols)

In [None]:
# Load ML and DL Features
# DL side: feature layout used by the seq2seq model
dl_feature_map_file = MODEL_PATH /'dl_feature&Scaler' /'dl_feature_map.json'
with open(dl_feature_map_file, 'r') as f:
    dl_feature_map = json.load(f)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load scaler artifacts that come from a trusted source.
scaler = joblib.load(MODEL_PATH / 'seq2seq_model'/ 'scaler.pkl')

# ML side: the feature list is taken from the saved LightGBM importance table
ml_feature_importances = pd.read_csv(MODEL_PATH /'lightgbm' /'lgb_feature_importance.csv')

DL_FEATURES = dl_feature_map['dl_features_order']
DL_NUMERICAL_FEATURES = dl_feature_map['numeric_features']
DL_CATEGORICAL_FEATURES = dl_feature_map['categorical_features']

ML_FEATURES = ml_feature_importances['feature'].values

In [None]:
# Build rmsle_tf function for seq2seq model
def rmsle_tf(y_true, y_pred):
    """RMSLE of sales, computed from tensors that are already on the log1p scale.

    The model target in this notebook is ``sales_log = log1p(sales)`` (predictions
    are mapped back with ``np.expm1``). With ``sales = expm1(y)`` the RMSLE term
    ``log1p(sales_pred) - log1p(sales_true)`` is exactly ``y_pred - y_true``.
    The previous implementation inverted with ``exp`` instead of ``expm1``, so it
    measured the error on ``log(2 + sales)`` and could overflow for large values.

    Args:
        y_true: true targets on the log1p scale.
        y_pred: predicted targets on the log1p scale.

    Returns:
        Scalar tensor: sqrt(mean((y_pred - y_true) ** 2)).
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Keras may hand over labels in a different dtype than the model output.
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

In [None]:
# Load best models
# LightGBM booster saved as a text model file
lgbm_model_file = MODEL_PATH / 'lightgbm' / 'baseline_lightgbm.txt'
lgbm_model = lgb.Booster(model_file=lgbm_model_file)

# The Keras model references rmsle_tf by name, so it must be supplied when deserialising
seq2seq_model_file = MODEL_PATH / 'seq2seq_model' / 'seq2seq_model.keras'
seq2seq_model = tf.keras.models.load_model(
    seq2seq_model_file,
    custom_objects={'rmsle_tf': rmsle_tf},
)

## Prediction on Validation Set
 - X_val_ml → 2D (samples, features)
 - X_val_dl → 3D (samples, window, features)
 - y_val → log1p(sales)

In [None]:
# Prepare ML validation Data
# NOTE(review): `.values` turns the selection into a plain NumPy array, so pandas dtype
# information (e.g. category columns) is not carried along, and the column order is the
# row order of lgb_feature_importance.csv — confirm both match how the model was trained.
X_val_ml = train[ML_FEATURES].values
y_val = train["sales_log"].values

# Categorical Features
# NOTE(review): this collects every category-dtype column of `train`, not only the ones
# listed in ML_FEATURES — verify the names line up with the columns of X_val_ml.
ML_CAT_COLS = train.select_dtypes(include='category').columns.tolist()

# Wrap features + labels in a LightGBM Dataset; free_raw_data=False keeps the raw array attached.
lgb_valid = lgb.Dataset(
    X_val_ml,
    label=y_val,
    categorical_feature=ML_CAT_COLS,
    free_raw_data=False
)


In [None]:

# Encode Categoricals (Integer IDs for Embeddings)
# Build one {category string -> integer id} lookup per DL categorical feature.
# Ids start at 1, in order of first appearance in `train`.
# NOTE(review): the ids are rebuilt from this notebook's `train` frame — confirm they
# match the ids the seq2seq model saw during training.
category_maps = {
    col: {
        value: idx
        for idx, value in enumerate(train[col].astype(str).unique(), start=1)
    }
    for col in DL_CATEGORICAL_FEATURES
}

# Apply Encoding
def encode_categories(df, maps):
  """Return a copy of ``df`` with the mapped columns replaced by int32 ids.

  Each column named in ``maps`` is cast to string and looked up in its
  ``{string: id}`` dict; values without an entry become 0. The input frame
  is not modified.
  """
  encoded = df.copy()
  for col in maps:
    ids = encoded[col].astype(str).map(maps[col])
    encoded[col] = ids.fillna(0).astype("int32")
  return encoded

# Integer-encode the categorical columns on a copy of `train` ...
ml_train_df = encode_categories(train, category_maps)

# ... then replace the numeric DL features with their scaled values from the loaded scaler.
scaled_numeric = scaler.transform(ml_train_df[DL_NUMERICAL_FEATURES])
ml_train_df[DL_NUMERICAL_FEATURES] = scaled_numeric

In [None]:
# Sliding Window Function
def sliding_window_generator(
    df,
    window_size,
    horizon,
    feature_cols,
    target_col,
    batch_size=256,
):
    """Yield batches of sliding windows built per (store_nbr, family) series.

    Each series is sorted by ``date``. A sample is ``window_size`` consecutive
    rows of ``feature_cols``; its label is the ``target_col`` value ``horizon``
    rows after the last row of the window. Series shorter than
    ``window_size + horizon`` contribute nothing.

    Yields:
        (X, y) NumPy arrays holding ``batch_size`` samples each; batches may
        span several series, and a final smaller batch is yielded if samples
        are left over.
    """
    windows, targets = [], []

    for _, series_df in df.groupby(["store_nbr", "family"]):
        series_df = series_df.sort_values("date")

        features = series_df[feature_cols].values.astype("float32")
        labels = series_df[target_col].values.astype("float32")

        n_windows = len(series_df) - window_size - horizon + 1
        if n_windows <= 0:
            continue

        for start in range(n_windows):
            stop = start + window_size
            windows.append(features[start:stop])
            targets.append(labels[stop + horizon - 1])

            # Emit a batch as soon as it is full and start a fresh one.
            if len(windows) == batch_size:
                yield np.array(windows), np.array(targets)
                windows, targets = [], []

    # Flush whatever is left after the last series.
    if windows:
        yield np.array(windows), np.array(targets)




## tf.data Dataset Wrapper
# Number of consecutive rows per input window
WINDOW_SIZE = 28
# The label is taken HORIZON rows after the last row of the window
HORIZON = 1
# Column used as the label
TARGET = "sales_log"
# Number of windows per generated batch
BATCH_SIZE = 256

def make_dataset(df, shuffle=False):
  """Wrap ``sliding_window_generator`` over ``df`` in a prefetching tf.data.Dataset.

  Each dataset element is one already-batched ``(X, y)`` pair produced by the
  generator with the notebook-level WINDOW_SIZE / HORIZON / DL_FEATURES /
  TARGET / BATCH_SIZE settings. With ``shuffle=True`` the elements (i.e. whole
  batches) are shuffled with a buffer of 1024.
  """
  def _windows():
    return sliding_window_generator(
      df, WINDOW_SIZE, HORIZON, DL_FEATURES, TARGET, BATCH_SIZE
    )

  n_features = len(DL_FEATURES)
  signature = (
    tf.TensorSpec(shape=(None, WINDOW_SIZE, n_features), dtype=tf.float32),
    tf.TensorSpec(shape=(None,), dtype=tf.float32),
  )

  ds = tf.data.Dataset.from_generator(_windows, output_signature=signature)
  if shuffle:
    ds = ds.shuffle(1024)
  return ds.prefetch(tf.data.AUTOTUNE)

# Validation datasets
# Built from the encoded + scaled frame; shuffle stays at its default (False), so
# batches come out in the generator's order.
valid_dl = make_dataset(ml_train_df)

In [None]:
# Predict (log scale)
# Booster.predict() needs the raw feature data; handing it the lgb.Dataset wrapper raises
# "TypeError: Cannot use Dataset instance for prediction, please use raw data instead".
# Passing the DataFrame slice (rather than a bare array) keeps pandas dtypes available
# to LightGBM.
# NOTE(review): confirm the column order of ML_FEATURES matches the training feature order.
y_pred_ml = lgbm_model.predict(train[ML_FEATURES])
y_pred_dl = seq2seq_model.predict(valid_dl).ravel()

# Back to original scale
y_true = np.expm1(y_val)
y_pred_ml = np.expm1(y_pred_ml)
# NOTE(review): y_pred_dl holds one value per sliding window, so it has fewer entries than
# y_true / y_pred_ml and is not row-aligned with them — score it against the window targets.
y_pred_dl = np.expm1(y_pred_dl)


In [None]:
# Display the column index of the test feature table.
test.columns

In [None]:
def build_sliding_windows(df, window, horizon, features, target):
    """Materialise all sliding windows of ``df`` as two in-memory arrays.

    Rows are grouped by (store_nbr, family) and sorted by ``date``. Each sample
    is ``window`` consecutive rows of ``features``; its label is the ``target``
    value ``horizon`` rows after the last row of the window. Groups shorter
    than ``window + horizon`` are skipped.

    Args:
        df: DataFrame with ``store_nbr``, ``family``, ``date``, the feature
            columns and the target column.
        window: number of rows per sample.
        horizon: offset (in rows) of the label after the window end.
        features: list of feature column names.
        target: name of the label column.

    Returns:
        ``(X, y)`` float32 arrays of shape ``(n, window, len(features))`` and
        ``(n,)``. When no group is long enough, correctly shaped empty arrays
        are returned (``n == 0``) instead of two 1-D empty arrays.
    """
    min_len = window + horizon
    X, y = [], []

    for _, gdf in df.groupby(["store_nbr", "family"]):
        # Skip short series before paying for the sort and array conversion.
        if len(gdf) < min_len:
            continue

        gdf = gdf.sort_values("date")
        values = gdf[features].values.astype("float32")
        labels = gdf[target].values.astype("float32")

        for start in range(len(gdf) - min_len + 1):
            stop = start + window
            X.append(values[start:stop])
            y.append(labels[stop + horizon - 1])

    if not X:
        # Keep the rank stable so downstream code can rely on a 3-D X.
        return (
            np.empty((0, window, len(features)), dtype="float32"),
            np.empty((0,), dtype="float32"),
        )

    return np.stack(X), np.array(y, dtype="float32")

# Materialise every validation window (and its label) from the encoded + scaled frame.
X_val_dl, y_val_dl = build_sliding_windows(
    df=ml_train_df,
    window=WINDOW_SIZE,
    horizon=HORIZON,
    features=DL_FEATURES,
    target=TARGET,
)

