In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

from forecast_sales.data_utils import create_full_dataset
from forecast_sales.features import mark_holiday_adjacent, add_date_categorical_features

# 1 Load and Fill data

In [12]:
df = pd.read_csv("../data/raw/df.csv", index_col=0, parse_dates=['date'])
df = df.sort_values('date').reset_index(drop=True)

display(df.head())
df.shape

Unnamed: 0,date,category_id,sku_id,sales_price,sales_quantity
0,2016-11-18,17,415510,23.205,1.0
1,2016-11-18,17,420009,22.49,2.0
2,2016-11-18,17,567734,23.205,1.0
3,2016-11-18,17,556333,32.76,2.0
4,2016-11-18,7,566241,283.4,1.0


(226486, 5)

In [13]:
# df_future = pd.read_csv("../data/raw/df_future.csv", index_col=0, parse_dates=['date'])
# df_future
# df_future.nunique()

In [14]:
# Add zero quantity rows, fill with previos prices
df = create_full_dataset(df)

display(df.head())
df.shape

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added
0,415510,2016-11-18,17.0,23.205,1.0,False
1,680322,2016-11-18,17.0,34.71,4.0,False
2,735757,2016-11-18,7.0,432.9,0.0,True
3,700392,2016-11-18,17.0,22.75,0.0,True
4,742356,2016-11-18,7.0,700.7,0.0,True


(1529196, 6)

# 2 Feature engineering

In [15]:
# Add holdays flag (+-1 day)
df = mark_holiday_adjacent(df)

display(df.head())
df["is_holiday_or_adjacent"].value_counts(normalize=True)

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added,is_holiday_or_adjacent
0,415510,2016-11-18,17.0,23.205,1.0,False,False
1,680322,2016-11-18,17.0,34.71,4.0,False,False
2,735757,2016-11-18,7.0,432.9,0.0,True,False
3,700392,2016-11-18,17.0,22.75,0.0,True,False
4,742356,2016-11-18,7.0,700.7,0.0,True,False


is_holiday_or_adjacent
False    0.889197
True     0.110803
Name: proportion, dtype: float64

In [16]:
# Add categorical_features
df = add_date_categorical_features(df)

display(df.head())

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added,is_holiday_or_adjacent,dayofweek,month,quarter
0,415510,2016-11-18,17.0,23.205,1.0,False,False,4,11,4
1,680322,2016-11-18,17.0,34.71,4.0,False,False,4,11,4
2,735757,2016-11-18,7.0,432.9,0.0,True,False,4,11,4
3,700392,2016-11-18,17.0,22.75,0.0,True,False,4,11,4
4,742356,2016-11-18,7.0,700.7,0.0,True,False,4,11,4


In [24]:
cat_features = ["is_holiday_or_adjacent", "dayofweek", "month", "quarter"]

In [25]:
# Check Numeric
df['sales_price'] = df['sales_price'].astype(float)

In [26]:
# Lags & rolling stats (per SKU): will be moved to function
df = df.sort_values(['sku_id','date'])
grp = df.groupby('sku_id')['sales_quantity']
df['lag_1']      = grp.shift(14).fillna(0)
df['lag_7']      = grp.shift(28).fillna(0)
df['roll7_mean'] = grp.shift(1).rolling(14, min_periods=1).mean().fillna(0)

# 3 Train/Test split

In [27]:
cutoff = df['date'].quantile(0.8)
train = df[df['date'] <= cutoff].copy()
test  = df[df['date'] > cutoff].copy()

# 4 Baseline: historical mean

In [28]:
sku_mean = train.groupby('sku_id')['sales_quantity'] \
                .mean() \
                .rename('sku_mean')
                
df = df.merge(sku_mean, on='sku_id', how='left') # For next steps
df['sku_mean'] = df['sku_mean'].fillna(0)

train = train.merge(sku_mean, on='sku_id', how='left') # For next steps
train['sku_mean'] = train['sku_mean'].fillna(0)

test = test.merge(sku_mean, on='sku_id', how='left')
test['sku_mean'] = test['sku_mean'].fillna(0)

KeyError: 'sku_mean'

In [29]:
y_test = test['sales_quantity']
y_pred_baseline = test['sku_mean']
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
print(f"Baseline MAE (per‑SKU mean): {mae_baseline:.3f}")

Baseline MAE (per‑SKU mean): 2.962


# 2 Dummy Catboost: only categorical features

In [6]:
features_dummy = cat_features
X_train_d, y_train_d = train[features_dummy], train['sales_quantity']
X_test_d,  y_test_d  = test [features_dummy], test ['sales_quantity']

train_pool_d = Pool(X_train_d, y_train_d, cat_features=cat_features)
test_pool_d  = Pool(X_test_d,  y_test_d,  cat_features=cat_features)

model_dummy = CatBoostRegressor(
    iterations=300,
    learning_rate=0.01,
    depth=5,
    # one_hot_max_size=len(df['sku_id'].unique()), 
    random_seed=42,
    early_stopping_rounds=50,
    verbose=50
)
model_dummy.fit(train_pool_d, eval_set=test_pool_d, use_best_model=True)

y_pred_dummy = model_dummy.predict(X_test_d)
mae_dummy  = mean_absolute_error(y_test_d, y_pred_dummy)
print(f"Dummy CatBoost MAE: {mae_dummy:.3f}")

0:	learn: 204.0614719	test: 15.9623055	best: 15.9623055 (0)	total: 72.1ms	remaining: 21.6s
50:	learn: 136.9210838	test: 11.3783145	best: 11.3783145 (50)	total: 456ms	remaining: 2.23s
100:	learn: 101.2349673	test: 9.5586103	best: 9.5586103 (100)	total: 813ms	remaining: 1.6s
150:	learn: 84.1340903	test: 9.1264650	best: 9.1264650 (150)	total: 1.23s	remaining: 1.21s
200:	learn: 76.6818679	test: 9.1739435	best: 9.1049467 (167)	total: 1.59s	remaining: 785ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 9.104946659
bestIteration = 167

Shrink model to first 168 iterations.
Dummy CatBoost MAE: 4.923


### 3 Catboost with numeric features

In [None]:
num_features = ['sku_mean','sales_price','lag_1','lag_7','roll7_mean']
features_all = cat_features + num_features

X_train, y_train = train[features_all], train['sales_quantity']
X_test, y_test = test [features_all], test ['sales_quantity']

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

model_full = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    early_stopping_rounds=50,
    verbose=50
)
model_full.fit(train_pool, eval_set=test_pool, use_best_model=True)

y_pred_full = model_full.predict(X_test)
mae_full   = mean_absolute_error(y_test, y_pred_full)
print(f"Full CatBoost MAE with extra features: {mae_full:.3f}\n")

0:	learn: 196.5268951	test: 15.4027708	best: 15.4027708 (0)	total: 21.5ms	remaining: 10.7s
50:	learn: 55.9760347	test: 7.6966445	best: 7.6934106 (49)	total: 892ms	remaining: 7.85s
100:	learn: 50.0556349	test: 7.1338949	best: 7.1338949 (100)	total: 1.87s	remaining: 7.38s
150:	learn: 47.6353468	test: 6.7440433	best: 6.7376364 (141)	total: 2.85s	remaining: 6.58s
200:	learn: 45.5044069	test: 6.5826828	best: 6.5824326 (198)	total: 3.89s	remaining: 5.78s
250:	learn: 43.6914806	test: 6.5364573	best: 6.5354063 (237)	total: 4.84s	remaining: 4.8s
300:	learn: 42.5765194	test: 6.4593000	best: 6.4572052 (285)	total: 5.77s	remaining: 3.81s
350:	learn: 41.4892256	test: 6.4641912	best: 6.4469409 (348)	total: 6.67s	remaining: 2.83s
400:	learn: 40.5001375	test: 6.4022068	best: 6.4020295 (390)	total: 7.66s	remaining: 1.89s
450:	learn: 39.6119025	test: 6.3584105	best: 6.3583923 (448)	total: 8.59s	remaining: 933ms
499:	learn: 38.8037863	test: 6.2973773	best: 6.2973773 (499)	total: 9.48s	remaining: 0us

bes