In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [2]:
# Read the two daily extracts. Both share the same columns; in the `hoo`
# variant the metrics are NaN on some dates (see the previews below).
daily, hoo = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/call-center-data-v2-daily.csv',
        '../data/call-center-data-v2-daily-hoo.csv',
    )
)

In [3]:
# Preview the raw daily table as loaded (time columns are still 'H:MM:SS' strings here).
daily

Unnamed: 0,Date,Incoming Calls,Answered Calls,Abandoned Calls,Answer Speed (AVG),Talk Duration (AVG),Waiting Time (AVG)
0,2022-01-01,157,145,12,0:00:15,0:02:29,0:03:12
1,2022-01-02,37,37,0,0:00:03,0:02:06,0:00:35
2,2022-01-03,317,304,13,0:00:18,0:01:35,0:02:37
3,2022-01-04,253,244,9,0:00:13,0:01:50,0:02:02
4,2022-01-05,214,205,9,0:00:10,0:02:10,0:03:22
...,...,...,...,...,...,...,...
1242,2025-05-27,203,195,8,0:00:11,0:02:47,0:01:52
1243,2025-05-28,192,184,8,0:00:07,0:02:50,0:01:56
1244,2025-05-29,212,209,3,0:00:10,0:02:51,0:01:45
1245,2025-05-30,211,203,8,0:00:12,0:03:22,0:03:52


In [4]:
# Preview the raw `hoo` table; note the rows whose metrics are all NaN.
hoo

Unnamed: 0,Date,Incoming Calls,Answered Calls,Abandoned Calls,Answer Speed (AVG),Talk Duration (AVG),Waiting Time (AVG)
0,2022-01-01,,,,,,
1,2022-01-02,,,,,,
2,2022-01-03,317.0,304.0,13.0,0:00:18,0:01:35,0:02:37
3,2022-01-04,253.0,244.0,9.0,0:00:13,0:01:50,0:02:02
4,2022-01-05,214.0,205.0,9.0,0:00:10,0:02:10,0:03:22
...,...,...,...,...,...,...,...
1242,2025-05-27,203.0,195.0,8.0,0:00:11,0:02:47,0:01:52
1243,2025-05-28,192.0,184.0,8.0,0:00:07,0:02:50,0:01:56
1244,2025-05-29,212.0,209.0,3.0,0:00:10,0:02:51,0:01:45
1245,2025-05-30,211.0,203.0,8.0,0:00:12,0:03:22,0:03:52


## Data Cleaning

In [5]:
# Columns holding 'H:MM:SS' strings that are converted to float seconds.
TIME_COLUMNS = ['Answer Speed (AVG)', 'Talk Duration (AVG)', 'Waiting Time (AVG)']

# Apply the same cleaning steps to both frames (in place).
for frame in (daily, hoo):
    # Convert 'Date' to datetime
    frame['Date'] = pd.to_datetime(frame['Date'])

    # Convert time-string columns to total seconds.
    # Guard: if the cell is re-run, the columns are already numeric seconds;
    # pd.to_timedelta would then read those numbers as nanoseconds and
    # silently corrupt the values, so already-numeric columns are skipped.
    for col in TIME_COLUMNS:
        if not pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = pd.to_timedelta(frame[col]).dt.total_seconds()

    # Extract the day of week: Monday=0, Sunday=6
    frame['Day Of Week'] = frame['Date'].dt.dayofweek

In [6]:
# Flag the dates on which the `hoo` dataset has no 'Incoming Calls' value.
# .isna() returns True/False; .astype(int) converts True to 1 and False to 0.
hoo['Is Non Operational'] = hoo['Incoming Calls'].isna().astype(int)

# Copy the 'Is Non Operational' flag from `hoo` into `daily`, matching rows on
# the 'Date' column. A left merge keeps every row of `daily`; a date that is
# absent from `hoo` ends up with a NaN flag.
# Any flag column left over from a previous run of this cell is dropped first,
# otherwise the merge would produce '..._x' / '..._y' duplicates on re-run.
daily = (
    daily
    .drop(columns='Is Non Operational', errors='ignore')
    .merge(
        # Only the join key and the new flag are needed from hoo
        hoo[['Date', 'Is Non Operational']],
        on='Date',
        how='left',
    )
)

In [7]:
# Abandonment rate = abandoned calls as a percentage of incoming calls.
# Days without any incoming calls get a rate of 0 instead of a division result.
has_calls = daily['Incoming Calls'] > 0
daily['Abandonment Rate (%)'] = (
    (daily['Abandoned Calls'] / daily['Incoming Calls']) * 100
).where(has_calls, 0)

In [8]:
# Quartile cut-offs of daily incoming-call volume
q75 = daily['Incoming Calls'].quantile(0.75)
q25 = daily['Incoming Calls'].quantile(0.25)

# Label each day by volume: at or below the lower quartile -> 'Low',
# at or above the upper quartile -> 'High', everything else -> 'Medium'.
# 'Low' is listed first so that it wins if a day satisfies both conditions.
daily['Volume Segment'] = np.select(
    [daily['Incoming Calls'] <= q25, daily['Incoming Calls'] >= q75],
    ['Low', 'High'],
    default='Medium',
)

In [9]:
# Order the rows chronologically and use 'Date' as the index from here on.
daily = daily.sort_values('Date').set_index('Date')

In [10]:
# Preview the cleaned, date-indexed table with the derived columns added above.
daily

Unnamed: 0_level_0,Incoming Calls,Answered Calls,Abandoned Calls,Answer Speed (AVG),Talk Duration (AVG),Waiting Time (AVG),Day Of Week,Is Non Operational,Abandonment Rate (%),Volume Segment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-01,157,145,12,15.0,149.0,192.0,5,1,7.643312,Medium
2022-01-02,37,37,0,3.0,126.0,35.0,6,1,0.000000,Low
2022-01-03,317,304,13,18.0,95.0,157.0,0,0,4.100946,High
2022-01-04,253,244,9,13.0,110.0,122.0,1,0,3.557312,High
2022-01-05,214,205,9,10.0,130.0,202.0,2,0,4.205607,Medium
...,...,...,...,...,...,...,...,...,...,...
2025-05-27,203,195,8,11.0,167.0,112.0,1,0,3.940887,Medium
2025-05-28,192,184,8,7.0,170.0,116.0,2,0,4.166667,Medium
2025-05-29,212,209,3,10.0,171.0,105.0,3,0,1.415094,Medium
2025-05-30,211,203,8,12.0,202.0,232.0,4,0,3.791469,Medium


## Modeling

In [11]:
# Data manipulation
import pandas as pd
import numpy as np

# Time series / stats models
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pmdarima as pm
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Machine learning models
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Model evaluation
from sklearn.metrics import mean_absolute_error

# Optional / utility
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Chronological split: everything before the cut-off date is training data,
# the cut-off date and everything after it is the hold-out test period.
split_date = '2025-03-01'
train = daily.loc[daily.index < split_date]
test = daily.loc[daily.index >= split_date]

for label, part in (("train", train), ("test", test)):
    print(f"{label} range:", part.index.min(), "->", part.index.max(), "n=", len(part))

train range: 2022-01-01 00:00:00 -> 2025-02-28 00:00:00 n= 1155
test range: 2025-03-01 00:00:00 -> 2025-05-31 00:00:00 n= 92


In [13]:
# Preview the training partition (rows dated before the split date).
train

Unnamed: 0_level_0,Incoming Calls,Answered Calls,Abandoned Calls,Answer Speed (AVG),Talk Duration (AVG),Waiting Time (AVG),Day Of Week,Is Non Operational,Abandonment Rate (%),Volume Segment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-01,157,145,12,15.0,149.0,192.0,5,1,7.643312,Medium
2022-01-02,37,37,0,3.0,126.0,35.0,6,1,0.000000,Low
2022-01-03,317,304,13,18.0,95.0,157.0,0,0,4.100946,High
2022-01-04,253,244,9,13.0,110.0,122.0,1,0,3.557312,High
2022-01-05,214,205,9,10.0,130.0,202.0,2,0,4.205607,Medium
...,...,...,...,...,...,...,...,...,...,...
2025-02-24,393,363,30,19.0,186.0,174.0,0,0,7.633588,High
2025-02-25,283,280,3,6.0,185.0,109.0,1,0,1.060071,High
2025-02-26,241,238,3,5.0,175.0,196.0,2,0,1.244813,High
2025-02-27,281,273,8,9.0,166.0,218.0,3,0,2.846975,High


In [16]:
# Target variable (daily incoming calls) for each partition
y_train, y_test = train['Incoming Calls'], test['Incoming Calls']

for series_name, series in (("y_train", y_train), ("y_test", y_test)):
    print(f"{series_name} contains {len(series)} records.")

y_train contains 1155 records.
y_test contains 92 records.


## Naive baseline: Forecast using prior year demand

In [17]:
# --- Naive baseline: reuse the demand observed on the same calendar date one year earlier ---
# NOTE(review): the same calendar date one year earlier usually falls on a
# different weekday; a 364-day lag would keep weekdays aligned -- confirm
# which baseline is intended before changing it, since every MASE depends on it.
prior_year_dates = test.index - pd.DateOffset(years=1)
naive_forecast_values = train['Incoming Calls'].loc[prior_year_dates].to_numpy()
y_pred_naive = pd.Series(naive_forecast_values, index=y_test.index)

# The baseline's MAE is the denominator used to scale every other model's MAE,
# so the baseline's own scaled error is 1 by construction.
MAE_NAIVE_DENOMINATOR = mean_absolute_error(y_test, y_pred_naive)
naive_mase = 1.0
naive_wmape = 100 * (y_test - y_pred_naive).abs().sum() / y_test.abs().sum()

print("--- Naive Baseline (Prior Year) Metrics ---")
for line in (
    f"Baseline MAE (MAE_naive): {MAE_NAIVE_DENOMINATOR:.3f}",
    f"Baseline MASE: {naive_mase:.3f}",
    f"Baseline WMAPE: {naive_wmape:.3f}%",
):
    print(line)

--- Naive Baseline (Prior Year) Metrics ---
Baseline MAE (MAE_naive): 333.772
Baseline MASE: 1.000
Baseline WMAPE: 186.737%


In [19]:
def forecast_metrics(y_true, y_pred, mae_naive_denom=None):
    """Compute MAE, MASE and WMAPE for one forecast.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Forecast values, in the same order as ``y_true``. The two inputs are
        compared position by position; any pandas index is ignored so that
        differently-labelled Series cannot be silently misaligned.
    mae_naive_denom : float, optional
        MAE of the naive baseline, used to scale the model MAE. When omitted,
        the current value of the global ``MAE_NAIVE_DENOMINATOR`` is looked up
        at call time (so re-running the baseline cell is picked up).

    Returns
    -------
    dict
        ``{"MAE": ..., "MASE": ..., "WMAPE (%)": ...}`` where
        MASE = MAE / mae_naive_denom and
        WMAPE = 100 * sum(|y_true - y_pred|) / sum(|y_true|).
        A ratio whose denominator is zero is returned as NaN.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or contain NaN/inf.
    """
    if mae_naive_denom is None:
        # Resolved here rather than in the signature: a default argument would
        # be frozen at definition time and go stale if the baseline changes.
        mae_naive_denom = MAE_NAIVE_DENOMINATOR

    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {actual.size} and {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinite values")

    abs_errors = np.abs(actual - predicted)
    total_actual = float(np.abs(actual).sum())

    mae_model = float(abs_errors.mean())
    mase_model = mae_model / mae_naive_denom if mae_naive_denom else float("nan")
    wmape_model = (
        100 * float(abs_errors.sum()) / total_actual if total_actual else float("nan")
    )

    return {
        "MAE": mae_model,
        "MASE": mase_model,
        "WMAPE (%)": wmape_model
    }

## Model 1: Simple Moving Average
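Predict each day as the average of the previous *k* days (equivalent to `shift(1).rolling(k).mean()`), evaluated walk-forward for k = 1, 7 and 30. An alternative would be to use the value from 7 days earlier (`lag_7`); it is not evaluated here.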

In [24]:
# --- Model 1: Simple Moving Average (1, 7, and 30-day) ---

# Window lengths (in days) to compare
window_sizes = [1, 7, 30]

# One metrics dictionary per window length is collected here
sma_results_list = []

# Training observations followed by test observations. At test step i the
# forecaster may look at everything before position n_train + i, i.e. the
# full training history plus the test days that have already "happened".
observed = pd.concat([y_train, y_test])
n_train = len(y_train)

print("Running SMA walk-forward validation...")

for window in window_sizes:
    print(f"--- Testing SMA with {window}-day window ---")

    # Walk-forward forecast: mean of the last `window` values known at each step
    sma_forecast = [
        observed.iloc[max(0, n_train + step - window):n_train + step].mean()
        for step in range(len(y_test))
    ]
    sma_forecast_series = pd.Series(sma_forecast, index=y_test.index)

    # Score against the actuals with the shared helper and tag with the model name
    metrics_dict = forecast_metrics(y_test, sma_forecast_series)
    metrics_dict['Model'] = f'SMA ({window}-day)'
    sma_results_list.append(metrics_dict)
    print("Done.")

print("\n--- All SMA Models Complete ---")

# One row per window length, with the model name as the first column
sma_summary_df = pd.DataFrame(sma_results_list)[['Model', 'MAE', 'MASE', 'WMAPE (%)']]

display(sma_summary_df)

Running SMA walk-forward validation...
--- Testing SMA with 1-day window ---
Done.
--- Testing SMA with 7-day window ---
Done.
--- Testing SMA with 30-day window ---
Done.

--- All SMA Models Complete ---


Unnamed: 0,Model,MAE,MASE,WMAPE (%)
0,SMA (1-day),87.826087,0.263132,49.136463
1,SMA (7-day),76.669255,0.229706,42.894499
2,SMA (30-day),83.569928,0.25038,46.75525


## Model 2: SARIMA

In [25]:
# 1. Stationarity check: Augmented Dickey-Fuller test on the training target.
#    The first two entries of the result are the test statistic and its p-value.
result = adfuller(train['Incoming Calls'])
adf_statistic, adf_p_value = result[0], result[1]
print('ADF Statistic:', adf_statistic)
print('p-value:', adf_p_value)

ADF Statistic: -4.917338760935737
p-value: 3.226315822581374e-05


In [27]:
# 2. Fit auto_arima on the training series
print("Running auto_arima... This may take a minute.")
sarima_model = pm.auto_arima(
    y_train,
    seasonal=True,
    m=7,                      # Weekly seasonality
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True
)

print("\n--- Best SARIMA Model Summary ---")
print(sarima_model.summary())

# 3. Forecast the whole test period in one shot.
#    Note: the model only sees y_train, so this is a len(y_test)-step-ahead
#    forecast, whereas the SMA / tree models are fed actual test-period lags.
#    Keep that in mind when comparing the scores.
n_periods = len(y_test)

# predict() may return a pandas Series carrying its own index. Wrapping a
# Series in pd.Series(..., index=...) *reindexes* it, which yields NaN wherever
# the labels differ from y_test.index. Converting to a plain array first makes
# the assignment purely positional.
sarima_forecast = pd.Series(
    np.asarray(sarima_model.predict(n_periods=n_periods)),
    index=y_test.index,
)

# 4. Evaluate with the shared metrics helper
sarima_metrics = forecast_metrics(
    y_true=y_test,          # actuals
    y_pred=sarima_forecast  # predictions
)

# Unpack the dictionary into the per-model variables used by the summary
sarima_mae = sarima_metrics["MAE"]
sarima_mase = sarima_metrics["MASE"]
sarima_wmape = sarima_metrics["WMAPE (%)"]

print("\n--- SARIMA Forecast Metrics ---")
print(f"MAE: {sarima_mae:.2f}")
print(f"MASE: {sarima_mase:.3f}")
print(f"WMAPE: {sarima_wmape:.2f}%")

Running auto_arima... This may take a minute.
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(1,0,1)[7] intercept   : AIC=inf, Time=3.89 sec
 ARIMA(0,1,0)(0,0,0)[7] intercept   : AIC=14815.719, Time=0.02 sec
 ARIMA(1,1,0)(1,0,0)[7] intercept   : AIC=14366.380, Time=0.57 sec
 ARIMA(0,1,1)(0,0,1)[7] intercept   : AIC=14387.010, Time=0.55 sec
 ARIMA(0,1,0)(0,0,0)[7]             : AIC=14813.720, Time=0.02 sec
 ARIMA(1,1,0)(0,0,0)[7] intercept   : AIC=14648.937, Time=0.04 sec
 ARIMA(1,1,0)(2,0,0)[7] intercept   : AIC=14275.114, Time=1.08 sec
 ARIMA(1,1,0)(2,0,1)[7] intercept   : AIC=inf, Time=4.73 sec
 ARIMA(1,1,0)(1,0,1)[7] intercept   : AIC=14084.206, Time=2.10 sec
 ARIMA(1,1,0)(0,0,1)[7] intercept   : AIC=14474.861, Time=0.65 sec
 ARIMA(1,1,0)(1,0,2)[7] intercept   : AIC=inf, Time=4.45 sec
 ARIMA(1,1,0)(0,0,2)[7] intercept   : AIC=14385.461, Time=1.07 sec
 ARIMA(1,1,0)(2,0,2)[7] intercept   : AIC=14088.180, Time=5.78 sec
 ARIMA(0,1,0)(1,0,1)[7] intercept   : AIC=14202.074, Time=

## Model 3: Random Forest Regressor

In [30]:
# --- 1. Feature Engineering ---

# Needed for time-series-aware hyperparameter tuning
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit


def _add_lag_features(frame, target='Incoming Calls', n_lags=7):
    """Return a copy of `frame` with lag, rolling-mean and calendar features.

    Every demand-based feature is built from values shifted by at least one
    day, so a row never contains information from its own date or later.
    Expects a DatetimeIndex (uses ``frame.index.dayofweek``).
    """
    out = frame.copy()

    # Lag features: demand 1..n_lags days earlier
    for lag in range(1, n_lags + 1):
        out[f'lag_{lag}'] = out[target].shift(lag)

    # Mean demand over the 7 days preceding the row's date
    out['rolling_7'] = out[target].shift(1).rolling(7).mean()

    # Day of week (Monday=0) and weekend indicator
    out['day_of_week'] = out.index.dayofweek
    out['is_weekend'] = out['day_of_week'].isin([5, 6]).astype(int)
    return out


print("Preparing features for RF...")

# Features and target
features = [f'lag_{i}' for i in range(1, 8)] + ['rolling_7', 'day_of_week', 'is_weekend']

# Build the features once on train followed by test, so the first test rows
# get their lags from the tail of the training period, then split back.
full = pd.concat([train, test])
full_features = _add_lag_features(full)

# Training rows: drop only the rows whose *model features* are missing (the
# first 7 days). A bare dropna() would also discard rows with a NaN in any
# unrelated column, silently shrinking the training set.
df = full_features.iloc[:len(train)].dropna(subset=features)
test_df = full_features.iloc[len(train):]

# Feature-engineered design matrices and aligned targets
X_train_rf = df[features]
y_train_rf = df['Incoming Calls']

X_test_rf = test_df[features]
y_test_rf = test_df['Incoming Calls']


# --- 2. Hyperparameter Tuning with TimeSeriesSplit ---

# Define the parameter grid to search
# Note: This is a small grid to run quickly. A real project might test more.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# Time-series cross-validator: folds respect chronological order (no shuffling)
tscv = TimeSeriesSplit(n_splits=5)

# Set up the base model
base_rf = RandomForestRegressor(random_state=42)

# Set up the Grid Search
print("Starting RF tuning (GridSearchCV)... This may take a few minutes.")
grid_search = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    cv=tscv, # Use our time-series splitter
    scoring='neg_mean_absolute_error', # We minimize MAE, so we maximize negative MAE
    n_jobs=-1, # Use all available CPU cores
    verbose=1
)

# Fit the grid search on the training data
grid_search.fit(X_train_rf, y_train_rf)

# Get the best model found by the search
rf_model = grid_search.best_estimator_

print("\n--- Tuning Complete ---")
print(f"Best parameters found: {grid_search.best_params_}")


# --- 3. Forecast & Evaluate ---

# Forecast using the best tuned model
rf_forecast = rf_model.predict(X_test_rf)

# Evaluate with the shared helper, using the target aligned with X_test_rf
rf_metrics = forecast_metrics(y_true=y_test_rf, y_pred=rf_forecast)

rf_mae = rf_metrics["MAE"]
rf_mase = rf_metrics["MASE"]
rf_wmape = rf_metrics["WMAPE (%)"]

print("\n--- Random Forest Forecast Metrics (Tuned) ---")
print(f"MAE: {rf_mae:.2f}")
print(f"MASE: {rf_mase:.3f}")
print(f"WMAPE: {rf_wmape:.2f}%")

Preparing features for RF...
Starting RF tuning (GridSearchCV)... This may take a few minutes.
Fitting 5 folds for each of 27 candidates, totalling 135 fits

--- Tuning Complete ---
Best parameters found: {'max_depth': 10, 'min_samples_leaf': 5, 'n_estimators': 200}

--- Random Forest Forecast Metrics (Tuned) ---
MAE: 51.40
MASE: 0.154
WMAPE: 28.76%


## Model 4: LightGBM

In [32]:
# --- 1. Set up Tuning ---
# Re-uses the feature-engineered matrices built in the Random Forest cell:
# X_train_rf / y_train_rf / X_test_rf / y_test_rf.
# (The plain y_train / y_test series still contain the first 7 days, which have
# no lag features, and no X_train / X_test is defined in this notebook.)

# Define the parameter grid to search
# Note: This is a small grid to run quickly.
param_grid_lgb = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 40]
}

# Time-series cross-validator: folds respect chronological order during tuning
tscv = TimeSeriesSplit(n_splits=5)

# Set up the base model
base_lgb = LGBMRegressor(random_state=42)

# Set up the Grid Search
print("Starting LGBM tuning (GridSearchCV)... This may take a few minutes.")
grid_search_lgb = GridSearchCV(
    estimator=base_lgb,
    param_grid=param_grid_lgb,
    cv=tscv, # Use our time-series splitter
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# --- 2. Train Model ---
# Fit the grid search on the feature-engineered training data
grid_search_lgb.fit(X_train_rf, y_train_rf)

# Get the best model found by the search
lgb_model = grid_search_lgb.best_estimator_

print("\n--- Tuning Complete ---")
print(f"Best parameters found: {grid_search_lgb.best_params_}")


# --- 3. Forecast ---
# Forecast using the best tuned model
lgb_forecast = lgb_model.predict(X_test_rf)


# --- 4. Evaluate ---
# Evaluate with the shared helper, using the target aligned with X_test_rf
lgb_metrics = forecast_metrics(y_true=y_test_rf, y_pred=lgb_forecast)

lgb_mae = lgb_metrics["MAE"]
lgb_mase = lgb_metrics["MASE"]
lgb_wmape = lgb_metrics["WMAPE (%)"]

print("\n--- LightGBM Forecast Metrics (Tuned) ---")
print(f"MAE: {lgb_mae:.2f}")
print(f"MASE: {lgb_mase:.3f}")
print(f"WMAPE: {lgb_wmape:.2f}%")

Starting LGBM tuning (GridSearchCV)... This may take a few minutes.
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2049
[LightGBM] [Info] Number of data points in the train set: 1148, number of used features: 10
[LightGBM] [Info] Start training from score 204.414634

--- Tuning Complete ---
Best parameters found: {'learning_rate': 0.01, 'n_estimators': 500, 'num_leaves': 20}

--- LightGBM Forecast Metrics (Tuned) ---
MAE: 52.50
MASE: 0.157
WMAPE: 29.37%


## Model 5: XGBoost

In [33]:
# --- 1. Set up Tuning ---
# Re-uses the feature-engineered matrices built in the Random Forest cell:
# X_train_rf / y_train_rf / X_test_rf / y_test_rf.
# (The plain y_train / y_test series still contain the first 7 days, which have
# no lag features, and no X_train / X_test is defined in this notebook.)

# Define the parameter grid to search
# Note: This is a small grid to run quickly.
param_grid_xgb = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Time-series cross-validator: folds respect chronological order during tuning
tscv = TimeSeriesSplit(n_splits=5)

# Set up the base model
base_xgb = XGBRegressor(random_state=42)

# Set up the Grid Search
print("Starting XGBoost tuning (GridSearchCV)... This may take a few minutes.")
grid_search_xgb = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_grid_xgb,
    cv=tscv, # Use our time-series splitter
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# --- 2. Train Model ---
# Fit the grid search on the feature-engineered training data
grid_search_xgb.fit(X_train_rf, y_train_rf)

# Get the best model found by the search
xgb_model = grid_search_xgb.best_estimator_

print("\n--- Tuning Complete ---")
print(f"Best parameters found: {grid_search_xgb.best_params_}")


# --- 3. Forecast ---
# Forecast using the best tuned model
xgb_forecast = xgb_model.predict(X_test_rf)


# --- 4. Evaluate ---
# Evaluate with the shared helper, using the target aligned with X_test_rf
xgb_metrics = forecast_metrics(y_true=y_test_rf, y_pred=xgb_forecast)

xgb_mae = xgb_metrics["MAE"]
xgb_mase = xgb_metrics["MASE"]
xgb_wmape = xgb_metrics["WMAPE (%)"]

print("\n--- XGBoost Forecast Metrics (Tuned) ---")
print(f"MAE: {xgb_mae:.2f}")
print(f"MASE: {xgb_mase:.3f}")
print(f"WMAPE: {xgb_wmape:.2f}%")

Starting XGBoost tuning (GridSearchCV)... This may take a few minutes.
Fitting 5 folds for each of 27 candidates, totalling 135 fits

--- Tuning Complete ---
Best parameters found: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}

--- XGBoost Forecast Metrics (Tuned) ---
MAE: 49.19
MASE: 0.147
WMAPE: 27.52%


## Summary

In [34]:
# The SMA cell only stores its results in `sma_summary_df` (one row per window
# length), so pull the 7-day row out of it here instead of relying on
# sma_mae / sma_mase / sma_wmape variables that no cell defines.
sma_7day = sma_summary_df.loc[sma_summary_df["Model"] == "SMA (7-day)"].iloc[0]
sma_mae = sma_7day["MAE"]
sma_mase = sma_7day["MASE"]
sma_wmape = sma_7day["WMAPE (%)"]

# Create a summary dictionary: one entry per model, in the same order in every list
summary_dict = {
    "Model": ["SMA (7-day)", "SARIMA", "Random Forest", "LightGBM", "XGBoost"],
    "MAE": [sma_mae, sarima_mae, rf_mae, lgb_mae, xgb_mae],
    "MASE": [sma_mase, sarima_mase, rf_mase, lgb_mase, xgb_mase],
    "WMAPE (%)": [sma_wmape, sarima_wmape, rf_wmape, lgb_wmape, xgb_wmape]
}

# Convert to DataFrame
summary_df = pd.DataFrame(summary_dict)

# Sort by MAE (best performing first)
summary_df = summary_df.sort_values("MAE").reset_index(drop=True)

# Display
print("Forecast Model Comparison Summary:")
display(summary_df)

Forecast Model Comparison Summary:


Unnamed: 0,Model,MAE,MASE,WMAPE (%)
0,XGBoost,49.186532,0.147366,27.518614
1,Random Forest,51.399599,0.153996,28.756769
2,LightGBM,52.496826,0.157284,29.37064
3,SMA (7-day),76.669255,0.229706,42.894499
4,SARIMA,102.599768,0.307395,57.401962
