In [24]:
import pandas as pd

# Load the engineered dataset and order it by product, then by date, so that
# every positional shift / window below walks forward in time within a product.
df = pd.read_csv('../data/engineered_retail_data.csv', parse_dates=['Date'])
df = df.sort_values(['product_id', 'Date']).reset_index(drop=True)

# Grouped view of the target, shared by every feature below.
purchases_by_product = df.groupby('product_id')['Total_Purchases']

# ---- Add extra lag features ----
df['lag_2'] = purchases_by_product.shift(2)
df['lag_3'] = purchases_by_product.shift(3)

# NOTE: `groupby(...).shift(1)` returns an ordinary Series, so chaining
# `.rolling(...)` / `.ewm(...)` directly onto it would run the window over the
# whole column and blend the tail of one product into the head of the next.
# Doing shift + window inside `transform` keeps each product isolated; the
# shift(1) still excludes the current row's own target from its features.

# ---- Rolling median ----
df['roll_7_median'] = purchases_by_product.transform(
    lambda s: s.shift(1).rolling(7).median()
)
df['roll_30_median'] = purchases_by_product.transform(
    lambda s: s.shift(1).rolling(30).median()
)

# ---- Rolling min/max ----
df['roll_7_min'] = purchases_by_product.transform(
    lambda s: s.shift(1).rolling(7).min()
)
df['roll_7_max'] = purchases_by_product.transform(
    lambda s: s.shift(1).rolling(7).max()
)

# ---- EWMA ----
df['ewm_7'] = purchases_by_product.transform(
    lambda s: s.shift(1).ewm(span=7, adjust=False).mean()
)
df['ewm_30'] = purchases_by_product.transform(
    lambda s: s.shift(1).ewm(span=30, adjust=False).mean()
)

# Fill missing values created by lags/rolls.
# This fills every NaN in the frame (not only the new columns) with 0.
df = df.fillna(0)

# Persist the feature table consumed by the CatBoost training cell.
df.to_csv('../data/engineered_retail_data_catboost_final.csv', index=False)

df.head()


Unnamed: 0,State,Country,Date,Total_Purchases,Product_Category,Product_Brand,Product_Type,products,product_id,Year,...,roll_30_mean,roll_30_std,lag_2,lag_3,roll_7_median,roll_30_median,roll_7_min,roll_7_max,ewm_7,ewm_30
0,england,united kingdom,2023-03-11,3.0,electronics,sony,television,4k_tv,0,2023,...,6.033333,2.772815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,new south wales,australia,2023-03-11,4.0,electronics,sony,television,4k_tv,0,2023,...,5.966667,2.797577,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0
2,georgia,usa,2023-03-11,9.0,electronics,sony,television,4k_tv,0,2023,...,6.166667,2.792642,3.0,0.0,0.0,0.0,0.0,0.0,3.25,3.064516
3,berlin,germany,2023-03-11,4.0,electronics,samsung,television,4k_tv,0,2023,...,6.233333,2.712466,4.0,3.0,0.0,0.0,0.0,0.0,4.6875,3.447451
4,new south wales,australia,2023-03-11,6.0,electronics,samsung,television,4k_tv,0,2023,...,6.2,2.708862,9.0,4.0,0.0,0.0,0.0,0.0,4.515625,3.483099


In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Reload the feature table written by the feature-engineering cell.
df = pd.read_csv('../data/engineered_retail_data_catboost_final.csv')

# Time-based split: everything before 2024-01-01 trains, the rest is held out.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')

# .copy() makes each split an independent frame, so adding columns to it later
# (e.g. a prediction column) does not raise SettingWithCopyWarning.
train_df = df[df['Date'] < '2024-01-01'].copy()
test_df  = df[df['Date'] >= '2024-01-01'].copy()

target = 'Total_Purchases'

# CatBoost can take raw categorical columns
cat_features = ['State','Country','Product_Category','Product_Brand','Product_Type','products']

# Every column except the target and the raw timestamp is used as a feature.
features = [c for c in df.columns if c not in ['Total_Purchases','Date']]

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

# Train the model in this cell so that `model` exists after a fresh
# Restart & Run All; later cells call `model.predict(...)`.
model = CatBoostRegressor(
    iterations = 1500,
    depth = 10,
    learning_rate = 0.03,
    loss_function = 'RMSE',
    eval_metric = 'RMSE',
    random_seed = 42,
    task_type = 'CPU',  # switch to 'GPU' when a GPU is available
    early_stopping_rounds = 80
)

# NOTE(review): eval_set is the held-out test split, so early stopping picks
# the iteration count using test data and the reported test metrics will be
# optimistic. Consider carving a validation window off the end of the
# training period instead.
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_features,
    verbose=200
)

from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error, 
    r2_score, 
    median_absolute_error
)
import numpy as np

# ---- Existing Metrics ----
# mae = mean_absolute_error(y_test, pred)
# rmse = np.sqrt(mean_squared_error(y_test, pred))

# # ---- Additional Metrics ----
# # MAPE (avoid divide by zero)
# mape = np.mean(np.abs((y_test - pred) / (y_test + 1e-8))) * 100

# # SMAPE
# smape = 100 * np.mean(
#     2 * np.abs(y_test - pred) / (np.abs(y_test) + np.abs(pred) + 1e-8)
# )
# y_test_clipped  = np.maximum(y_test, 0)
# pred_clipped    = np.maximum(pred, 0)
    

# # R2 Score
# r2 = r2_score(y_test, pred)
# # Median Absolute Error
# medae = median_absolute_error(y_test, pred)

# RMSLE
# rmsle = np.sqrt(mean_squared_error(
#     np.log1p(y_test_clipped),
#     np.log1p(pred_clipped)
# ))



# # Print all metrics
# print("CatBoost Test MAE:", mae)
# print("CatBoost Test RMSE:", rmse)
# print("CatBoost Test MAPE:", mape)
# print("CatBoost Test SMAPE:", smape)
# print("CatBoost Test R²:", r2)
# print("CatBoost Test Median AE:", medae)
# print("CatBoost Test RMSLE:", rmsle)



In [None]:
# Score the held-out feature matrix. Requires `model` (an already fitted
# regressor) and `X_test` to be defined in the kernel before this cell runs.
pred = model.predict(X_test)


In [None]:
# Attach the predictions as a new column. `assign` builds a fresh frame rather
# than writing into `test_df` in place: `test_df` was produced by filtering
# `df`, so an in-place column write triggers SettingWithCopyWarning. Rebinding
# also keeps this cell idempotent on re-run.
test_df = test_df.assign(Predicted=pred)

# Sum actual and predicted purchases per calendar date for plotting.
by_day = test_df.groupby('Date')
daily_actual = by_day['Total_Purchases'].sum()
daily_pred = by_day['Predicted'].sum()


In [None]:
import matplotlib.pyplot as plt

# Overlay the daily totals of actual and predicted purchases on one time axis.
fig, ax = plt.subplots(figsize=(14, 6))
for series_label, daily_series in (('Actual', daily_actual), ('Predicted', daily_pred)):
    ax.plot(daily_series.index, daily_series.values, label=series_label, linewidth=2)
ax.set_title("Actual vs Predicted Total Purchases (Daily Sum)")
ax.set_xlabel("Date")
ax.set_ylabel("Purchases")
ax.legend()
ax.grid(True)
plt.show()
