In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

# 1 Load data

In [2]:
df = pd.read_csv("../data/raw/df.csv", index_col=0, parse_dates=['date'])
df = df.sort_values('date')

# 2 Feature engineering

In [3]:
# 2.1 Categorical
df['dayofweek'] = df['date'].dt.dayofweek.astype('category')
cat_features = ['sku_id', 'category_id', 'dayofweek']
if 'is_holiday_or_adjacent' in df.columns:
    df['is_holiday_or_adjacent'] = df['is_holiday_or_adjacent'].astype('category')
    cat_features.append('is_holiday_or_adjacent')

# 2.2 Numeric
df['sales_price'] = df['sales_price'].astype(float)

# 2.3 Lags & rolling stats (per SKU)
df = df.sort_values(['sku_id','date'])
grp = df.groupby('sku_id')['sales_quantity']
df['lag_1']      = grp.shift(1).fillna(0)
df['lag_7']      = grp.shift(7).fillna(0)
df['roll7_mean'] = grp.shift(1).rolling(7, min_periods=1).mean().fillna(0)

# 3 Train/Test split

In [4]:
cut_date = df['date'].quantile(0.8)
train = df[df['date'] <= cut_date].copy()
test  = df[df['date'] >  cut_date].copy()

# 4 Baseline: mean

In [5]:
sku_mean = train.groupby('sku_id')['sales_quantity'] \
                .mean() \
                .rename('sku_mean')

df = df.merge(sku_mean, on='sku_id', how='left')
df['sku_mean'] = df['sku_mean'].fillna(0)

train = train.merge(sku_mean, on='sku_id', how='left')
test  = test .merge(sku_mean, on='sku_id', how='left')

train['sku_mean'] = train['sku_mean'].fillna(0)
test ['sku_mean'] = test ['sku_mean'].fillna(0)

y_test = test['sales_quantity']
y_pred_baseline = test['sku_mean']
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
print(f"Baseline MAE (per‑SKU mean): {mae_baseline:.3f}\n")

Baseline MAE (per‑SKU mean): 2.766



### 2 Dummy Catboost

In [6]:
features_dummy = cat_features
X_train_d, y_train_d = train[features_dummy], train['sales_quantity']
X_test_d,  y_test_d  = test [features_dummy], test ['sales_quantity']

train_pool_d = Pool(X_train_d, y_train_d, cat_features=cat_features)
test_pool_d  = Pool(X_test_d,  y_test_d,  cat_features=cat_features)

model_dummy = CatBoostRegressor(
    iterations=300,
    learning_rate=0.01,
    depth=5,
    one_hot_max_size=len(df['sku_id'].unique()), 
    random_seed=42,
    early_stopping_rounds=50,
    verbose=50
)
model_dummy.fit(train_pool_d, eval_set=test_pool_d, use_best_model=True)

y_pred_dummy = model_dummy.predict(X_test_d)
mae_dummy  = mean_absolute_error(y_test_d, y_pred_dummy)
print(f"Dummy CatBoost MAE: {mae_dummy:.3f}\n")

0:	learn: 203.9717897	test: 16.3483745	best: 16.3483745 (0)	total: 59.5ms	remaining: 17.8s
50:	learn: 136.2841746	test: 11.8420518	best: 11.8420518 (50)	total: 195ms	remaining: 951ms
100:	learn: 100.3182220	test: 9.7127731	best: 9.7127731 (100)	total: 334ms	remaining: 659ms
150:	learn: 83.1300644	test: 8.8335611	best: 8.8335611 (150)	total: 473ms	remaining: 467ms
200:	learn: 75.7406680	test: 8.4990313	best: 8.4990313 (200)	total: 626ms	remaining: 308ms
250:	learn: 72.7806303	test: 8.3783140	best: 8.3783140 (250)	total: 795ms	remaining: 155ms
299:	learn: 71.6455571	test: 8.3284052	best: 8.3284052 (299)	total: 980ms	remaining: 0us

bestTest = 8.328405246
bestIteration = 299

Dummy CatBoost MAE: 4.057



### 3 Catboost with numeric features

In [7]:
num_features = ['sku_mean','sales_price','lag_1','lag_7','roll7_mean']
features_all = cat_features + num_features

X_train, y_train = train[features_all], train['sales_quantity']
X_test,  y_test  = test [features_all], test ['sales_quantity']

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_features)

model_full = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    early_stopping_rounds=50,
    verbose=50
)
model_full.fit(train_pool, eval_set=test_pool, use_best_model=True)

y_pred_full = model_full.predict(X_test)
mae_full   = mean_absolute_error(y_test, y_pred_full)
print(f"Full CatBoost MAE with extra features: {mae_full:.3f}\n")

0:	learn: 196.5532413	test: 15.8696157	best: 15.8696157 (0)	total: 24.2ms	remaining: 12.1s
50:	learn: 48.0238659	test: 7.4426307	best: 7.4426307 (50)	total: 1.01s	remaining: 8.9s
100:	learn: 42.0370508	test: 5.8443629	best: 5.8443629 (100)	total: 2.11s	remaining: 8.35s
150:	learn: 40.1841298	test: 5.1756651	best: 5.1756214 (149)	total: 3s	remaining: 6.93s
200:	learn: 38.7202594	test: 5.0048621	best: 5.0048621 (200)	total: 4.01s	remaining: 5.97s
250:	learn: 37.2224326	test: 4.9566585	best: 4.9561121 (248)	total: 5s	remaining: 4.96s
300:	learn: 35.9350690	test: 4.9541178	best: 4.9497781 (267)	total: 5.91s	remaining: 3.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 4.949778061
bestIteration = 267

Shrink model to first 268 iterations.
Full CatBoost MAE with extra features: 2.316

