In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error

from forecast_sales.data_utils import create_full_dataset
from forecast_sales.features import mark_holiday_adjacent, add_date_categorical_features, add_rolling_mean_features

# 1 Load and Fill data

In [2]:
df = pd.read_csv("../data/raw/df.csv", index_col=0, parse_dates=['date'])
df = df.sort_values('date').reset_index(drop=True)

display(df.head())
df.shape

Unnamed: 0,date,category_id,sku_id,sales_price,sales_quantity
0,2016-11-18,17,415510,23.205,1.0
1,2016-11-18,17,420009,22.49,2.0
2,2016-11-18,17,567734,23.205,1.0
3,2016-11-18,17,556333,32.76,2.0
4,2016-11-18,7,566241,283.4,1.0


(226486, 5)

In [3]:
# df_future = pd.read_csv("../data/raw/df_future.csv", index_col=0, parse_dates=['date'])
# df_future
# df_future.nunique()

In [4]:
# Add zero quantity rows, fill with previos prices
df = create_full_dataset(df)

display(df.head())
df.shape

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added
0,415510,2016-11-18,17.0,23.205,1.0,False
1,680322,2016-11-18,17.0,34.71,4.0,False
2,735757,2016-11-18,7.0,432.9,0.0,True
3,700392,2016-11-18,17.0,22.75,0.0,True
4,742356,2016-11-18,7.0,700.7,0.0,True


(1529196, 6)

# 2 Feature engineering

## 2.1 Categorical

In [5]:
# Add holdays flag (+-1 day)
df = mark_holiday_adjacent(df)

display(df.head())
df["is_holiday_or_adjacent"].value_counts(normalize=True)

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added,is_holiday_or_adjacent
0,415510,2016-11-18,17.0,23.205,1.0,False,False
1,680322,2016-11-18,17.0,34.71,4.0,False,False
2,735757,2016-11-18,7.0,432.9,0.0,True,False
3,700392,2016-11-18,17.0,22.75,0.0,True,False
4,742356,2016-11-18,7.0,700.7,0.0,True,False


is_holiday_or_adjacent
False    0.889197
True     0.110803
Name: proportion, dtype: float64

In [6]:
# Add categorical_features
df, date_feature_names = add_date_categorical_features(df)

display(df.head())

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added,is_holiday_or_adjacent,dayofweek,month,quarter
0,415510,2016-11-18,17.0,23.205,1.0,False,False,4,11,4
1,680322,2016-11-18,17.0,34.71,4.0,False,False,4,11,4
2,735757,2016-11-18,7.0,432.9,0.0,True,False,4,11,4
3,700392,2016-11-18,17.0,22.75,0.0,True,False,4,11,4
4,742356,2016-11-18,7.0,700.7,0.0,True,False,4,11,4


In [7]:
cat_features = ["is_holiday_or_adjacent"] + date_feature_names
cat_features

['is_holiday_or_adjacent', 'dayofweek', 'month', 'quarter']

## 2.2 Numeric

In [8]:
# Check Numeric
df['sales_price'] = df['sales_price'].astype(float)

In [None]:
# Additional Numeric Features
df, rolling_feature_names = add_rolling_mean_features(df, columns=['sales_quantity', 'sales_price'])

display(df.head())

Unnamed: 0,sku_id,date,category_id,sales_price,sales_quantity,added,is_holiday_or_adjacent,dayofweek,month,quarter,...,sales_quantity_roll112_mean,sales_quantity_roll500_mean,sales_quantity_roll1000_mean,sales_price_roll7_mean,sales_price_roll14_mean,sales_price_roll28_mean,sales_price_roll56_mean,sales_price_roll112_mean,sales_price_roll500_mean,sales_price_roll1000_mean
631,1045,2016-11-18,7.0,253.5,0.0,True,False,4,11,4,...,,,,,,,,,,
648,8620,2016-11-18,17.0,32.5,2.0,False,False,4,11,4,...,,,,,,,,,,
411,642939,2016-11-18,7.0,648.7,0.0,True,False,4,11,4,...,,,,,,,,,,
244,544804,2016-11-18,17.0,32.76,2.0,False,False,4,11,4,...,,,,,,,,,,
454,685105,2016-11-18,17.0,27.066,0.0,True,False,4,11,4,...,,,,,,,,,,


In [10]:
num_features = ['sku_mean', 'sales_price'] + rolling_feature_names
num_features

['sku_mean',
 'sales_price',
 'sales_quantity_roll7_mean',
 'sales_quantity_roll14_mean',
 'sales_quantity_roll28_mean',
 'sales_quantity_roll56_mean',
 'sales_quantity_roll112_mean',
 'sales_quantity_roll500_mean',
 'sales_quantity_roll1000_mean',
 'sales_price_roll7_mean',
 'sales_price_roll14_mean',
 'sales_price_roll28_mean',
 'sales_price_roll56_mean',
 'sales_price_roll112_mean',
 'sales_price_roll500_mean',
 'sales_price_roll1000_mean']

# 3 Train/Test split

In [11]:
cutoff = df['date'].quantile(0.8)
train = df[df['date'] <= cutoff].copy()
test  = df[df['date'] > cutoff].copy()

# 4 Simple Models

## 4.1 Baseline: historical mean

In [12]:
sku_mean = train.groupby('sku_id')['sales_quantity'] \
                .mean() \
                .rename('sku_mean')
                
df = df.merge(sku_mean, on='sku_id', how='left') # For next steps
df['sku_mean'] = df['sku_mean'].fillna(0)

train = train.merge(sku_mean, on='sku_id', how='left') # For next steps
train['sku_mean'] = train['sku_mean'].fillna(0)

test = test.merge(sku_mean, on='sku_id', how='left')
test['sku_mean'] = test['sku_mean'].fillna(0)

In [13]:
y_test = test['sales_quantity']
y_pred_baseline = test['sku_mean']
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
print(f"Baseline MAE (per‑SKU mean): {mae_baseline:.3f}")

Baseline MAE (per‑SKU mean): 2.962


# 5 Dummy Catboost: only categorical features

In [14]:
features_dummy = cat_features
X_train_d, y_train_d = train[features_dummy], train['sales_quantity']
X_test_d,  y_test_d  = test [features_dummy], test ['sales_quantity']

train_pool_d = Pool(X_train_d, y_train_d, cat_features=cat_features)
test_pool_d  = Pool(X_test_d,  y_test_d,  cat_features=cat_features)

model_dummy = CatBoostRegressor(
    iterations=300,
    learning_rate=0.01,
    depth=5,
    # one_hot_max_size=len(df['sku_id'].unique()), 
    random_seed=42,
    early_stopping_rounds=50,
    verbose=50
)
model_dummy.fit(train_pool_d, eval_set=test_pool_d, use_best_model=True)

y_pred_dummy = model_dummy.predict(X_test_d)
mae_dummy  = mean_absolute_error(y_test_d, y_pred_dummy)
print(f"Dummy CatBoost MAE: {mae_dummy:.3f}")

0:	learn: 79.4749925	test: 4.5281657	best: 4.5281657 (0)	total: 159ms	remaining: 47.7s
50:	learn: 79.4746978	test: 4.5352542	best: 4.5281657 (0)	total: 2.6s	remaining: 12.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 4.528165714
bestIteration = 0

Shrink model to first 1 iterations.
Dummy CatBoost MAE: 2.904


# 3 Catboost with numeric features

In [None]:
features_all = cat_features + num_features

X_train, y_train = train[features_all], train['sales_quantity']
X_test, y_test = test [features_all], test ['sales_quantity']

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

model_full = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    early_stopping_rounds=50,
    verbose=50
)
model_full.fit(train_pool, eval_set=test_pool, use_best_model=True)

y_pred_full = model_full.predict(X_test)
mae_full   = mean_absolute_error(y_test, y_pred_full)
print(f"Full CatBoost MAE with extra features: {mae_full:.3f}\n")

0:	learn: 75.9261631	test: 4.4856803	best: 4.4856803 (0)	total: 175ms	remaining: 1m 27s
50:	learn: 21.5858050	test: 3.0366909	best: 3.0365456 (47)	total: 6.86s	remaining: 1m
100:	learn: 19.3447258	test: 2.5760191	best: 2.5760191 (100)	total: 12.6s	remaining: 49.9s
150:	learn: 18.7051170	test: 2.2806020	best: 2.2805896 (148)	total: 18.8s	remaining: 43.5s
200:	learn: 18.1040685	test: 2.2456773	best: 2.2421937 (199)	total: 24.1s	remaining: 35.9s
250:	learn: 17.6647315	test: 2.2215985	best: 2.2215703 (242)	total: 29.9s	remaining: 29.7s
300:	learn: 17.1332910	test: 2.2145436	best: 2.2145409 (295)	total: 35.1s	remaining: 23.2s
350:	learn: 16.4988965	test: 2.2115105	best: 2.2114913 (323)	total: 40.5s	remaining: 17.2s
