### Environment Settings

In [18]:
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

In [2]:
def get_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate temporal feature metrics for each item/store in a sales DataFrame.

    Every feature on a row is built exclusively from *earlier* rows of the same
    (item_id, store_id) series, so neither the row's own target nor any value
    from another series leaks into it:
    - lag_1: previous observation of 'y'.
    - sum_3 / sum_7: sum of the previous 3 / 7 observations (NaN until that
      many prior observations exist).
    - rolling_mean_7 / rolling_mean_30: mean of up to the previous 7 / 30
      observations (NaN on the first row of a series).
    - rolling_std_7 / rolling_std_30: sample standard deviation of up to the
      previous 7 / 30 observations (NaN with fewer than two of them).

    Windows are counted in rows, not calendar days, so each series is expected
    to hold one row per day without gaps.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame containing sales data, with the following columns:
        - 'item_id': Item identifier.
        - 'store_id': Store identifier.
        - 'date': Sales date.
        - 'y': Sales value (target variable).
        The index must be unique because the features are aligned back on it.

    Returns:
    --------
    pd.DataFrame
        A copy of the input sorted by item_id, store_id and date, with the
        feature columns added. The input frame itself is not modified.
    """
    keys = ['item_id', 'store_id']

    # sort_values returns a new frame, so the caller's DataFrame stays untouched.
    df = df.sort_values(keys + ['date'])
    y_by_series = df.groupby(keys, observed=False)['y']

    # Walk the lags once, keeping a running total, instead of recomputing the
    # same groupby shifts separately for lag_1, sum_3 and sum_7.
    running_sum = None
    for lag in range(1, 8):
        shifted = y_by_series.shift(lag)
        running_sum = shifted if running_sum is None else running_sum + shifted
        if lag == 1:
            df['lag_1'] = shifted
        elif lag == 3:
            df['sum_3'] = running_sum
        elif lag == 7:
            df['sum_7'] = running_sum

    # Roll over the already-lagged target, still grouped per series.
    # Shifting the *output* of groupby().rolling() would operate on one
    # concatenated Series and push the last value of a series into the first
    # row of the next one; lagging inside the group first avoids that.
    lagged_by_series = df.groupby(keys, observed=False)['lag_1']
    for window in (7, 30):
        rolling = lagged_by_series.rolling(window=window, min_periods=1)
        # Drop the two group levels so the result aligns on the original index.
        df[f'rolling_mean_{window}'] = rolling.mean().reset_index(level=[0, 1], drop=True)
        df[f'rolling_std_{window}'] = rolling.std().reset_index(level=[0, 1], drop=True)

    return df

def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive calendar features from the 'date' column and store them as categoricals.

    The columns are written onto the DataFrame that is passed in (it is
    modified in place) and that same object is returned.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame with a datetime-typed 'date' column.

    Returns:
    --------
    pd.DataFrame
        The same DataFrame, now carrying:
        - 'wday': day of the week, 1 = Monday ... 7 = Sunday (category).
        - 'month': month of the year, 1-12 (category).
    """
    calendar = df['date'].dt

    # pandas counts weekdays from 0 (Monday); shift to the 1-7 convention.
    df['wday'] = (calendar.dayofweek + 1).astype('category')
    df['month'] = calendar.month.astype('category')

    return df


def drop_unnecessary_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    """
    Return a copy of the dataset without the given columns.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame to remove columns from; it is left unchanged.
    columns_to_drop : list
        Names of the columns to remove. Every name must exist in `df`,
        otherwise pandas raises a KeyError.

    Returns:
    --------
    pd.DataFrame
        New DataFrame containing the remaining columns.
    """
    # `columns=` already selects the column axis, so no `axis` argument is needed.
    trimmed = df.drop(columns=columns_to_drop)
    return trimmed


### Data Loading and Preparation

In [3]:
# Load the training set from a parquet file; the path is relative to the
# notebook's working directory.
train = pd.read_parquet('../data/train.snap.parquet')

### Feature Engineering

In [4]:
# Apply functions to train and test datasets
# Add the calendar features (wday, month) to the training set
train = add_date_features(train)

# Collapse the three per-state SNAP flags into a single column: a row gets
# snap = 1 only when the flag of the state its store belongs to is set.
# The state is taken from the first two characters of store_id.
store_state = train['store_id'].str[:2]
snap_active = (
    ((store_state == 'CA') & (train['snap_CA'] == 1))
    | ((store_state == 'TX') & (train['snap_TX'] == 1))
    | ((store_state == 'WI') & (train['snap_WI'] == 1))
)
train['snap'] = snap_active.astype('int64').astype('category')

# Remove columns that are not needed downstream (including the raw SNAP
# flags that were just merged into 'snap')
columns_to_drop = [
    'id', 'dept_id', 'cat_id', 'state_id', 'wm_yr_wk', 'event_type_1',
    'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
]
train = drop_unnecessary_columns(train, columns_to_drop)

train.head()

Unnamed: 0,item_id,store_id,d,y,date,event_name_1,sell_price,wday,month,snap
0,FOODS_1_001,CA_1,d_1542,1.0,2015-04-19,,2.240234,7,4,0
1,FOODS_1_001,CA_1,d_1543,0.0,2015-04-20,,2.240234,1,4,0
2,FOODS_1_001,CA_1,d_1544,0.0,2015-04-21,,2.240234,2,4,0
3,FOODS_1_001,CA_1,d_1545,0.0,2015-04-22,,2.240234,3,4,0
4,FOODS_1_001,CA_1,d_1546,1.0,2015-04-23,,2.240234,4,4,0


In [5]:
# Add the lag / rolling-window features per (item_id, store_id) series
train = get_metrics(train)

# Discard the start of the history: rows dated within 30 days of the earliest
# date cannot have a fully populated 30-row rolling window behind them.
cutoff_date = train['date'].min() + pd.Timedelta(days=30)
train = train.loc[train['date'].gt(cutoff_date)]

In [6]:
# Preview the first rows, now including the lag and rolling feature columns.
train.head()

Unnamed: 0,item_id,store_id,d,y,date,event_name_1,sell_price,wday,month,snap,lag_1,sum_3,sum_7,rolling_mean_7,rolling_std_7,rolling_mean_30,rolling_std_30
5718316,HOBBIES_1_001,CA_1,d_1573,0.0,2015-05-20,,8.257812,3,5,0,0.0,0.0,0.0,0.0,0.0,0.366667,0.668675
5718317,HOBBIES_1_001,CA_1,d_1574,0.0,2015-05-21,,8.257812,4,5,0,0.0,0.0,0.0,0.0,0.0,0.333333,0.660895
5718318,HOBBIES_1_001,CA_1,d_1575,0.0,2015-05-22,,8.257812,5,5,0,0.0,0.0,0.0,0.0,0.0,0.333333,0.660895
5718319,HOBBIES_1_001,CA_1,d_1576,0.0,2015-05-23,,8.257812,6,5,0,0.0,0.0,0.0,0.0,0.0,0.266667,0.583292
5718320,HOBBIES_1_001,CA_1,d_1577,0.0,2015-05-24,,8.257812,7,5,0,0.0,0.0,0.0,0.0,0.0,0.233333,0.568321


In [7]:
# Preview the last rows as well, to eyeball the features at the end of the frame.
train.tail()

Unnamed: 0,item_id,store_id,d,y,date,event_name_1,sell_price,wday,month,snap,lag_1,sum_3,sum_7,rolling_mean_7,rolling_std_7,rolling_mean_30,rolling_std_30
5718280,FOODS_3_827,WI_3,d_1937,0.0,2016-05-18,,1.0,3,5,0,4.0,9.0,12.0,1.714286,2.058663,0.833333,1.288767
5718281,FOODS_3_827,WI_3,d_1938,2.0,2016-05-19,,1.0,4,5,0,0.0,9.0,12.0,1.714286,2.058663,0.833333,1.288767
5718282,FOODS_3_827,WI_3,d_1939,2.0,2016-05-20,,1.0,5,5,0,2.0,6.0,14.0,2.0,1.914854,0.9,1.295882
5718283,FOODS_3_827,WI_3,d_1940,5.0,2016-05-21,,1.0,6,5,0,2.0,4.0,15.0,2.142857,1.864454,0.966667,1.299425
5718284,FOODS_3_827,WI_3,d_1941,1.0,2016-05-22,,1.0,7,5,0,5.0,9.0,18.0,2.571429,2.149197,1.133333,1.47936


In [8]:
# Check column dtypes and the memory footprint of the engineered frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11226877 entries, 5718316 to 5718284
Data columns (total 17 columns):
 #   Column           Dtype         
---  ------           -----         
 0   item_id          category      
 1   store_id         category      
 2   d                category      
 3   y                float32       
 4   date             datetime64[ns]
 5   event_name_1     category      
 6   sell_price       float32       
 7   wday             category      
 8   month            category      
 9   snap             category      
 10  lag_1            float32       
 11  sum_3            float32       
 12  sum_7            float32       
 13  rolling_mean_7   float64       
 14  rolling_std_7    float64       
 15  rolling_mean_30  float64       
 16  rolling_std_30   float64       
dtypes: category(7), datetime64[ns](1), float32(5), float64(4)
memory usage: 824.5 MB


### Modeling

#### Fitting

In [9]:
# Columns fed to the model; 'd', 'date' and the target 'y' are left out.
features = [
    'item_id', 'store_id', 'wday', 'month', 'event_name_1', 'snap', 'sell_price',
    'lag_1', 'sum_3', 'sum_7', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30'
]

train_val_split_date = '2016-04-25' # Validation starts here

# Time-based split. Boolean indexing already returns a new frame, so the
# validation slice is taken directly instead of deep-copying the whole
# training frame first. `val` must be built before `train` is overwritten.
val = train[train['date'] >= train_val_split_date].reset_index(drop=True)
train = train[train['date'] < train_val_split_date].reset_index(drop=True)

print(val.shape, train.shape)

(853720, 17) (10373157, 17)


In [11]:
# Sanity check: the last training date should fall right before the first
# validation date, i.e. the two partitions do not overlap in time.
print(train['date'].max(), val['date'].min())

2016-04-24 00:00:00 2016-04-25 00:00:00


In [12]:
# Separate each partition into its feature matrix and its target vector
X_train = train[features]
y_train = train['y']

X_val = val[features]
y_val = val['y']

# Row counts of X and y must match within each partition
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(10373157, 14) (10373157,)
(853720, 14) (853720,)


In [16]:
# Gradient-boosted regressor:
# - enable_categorical: accept the pandas `category` feature columns as they are
# - early_stopping_rounds: stop boosting once the metric on the eval set given
#   to fit() has not improved for 10 consecutive rounds
# - random_state: fixed seed for reproducibility
model = xgb.XGBRegressor(
    enable_categorical=True,
    early_stopping_rounds=10,
    random_state=42,
)

In [17]:
# Train on the training partition; the validation partition is passed as the
# eval set, so it drives early stopping and its metric is logged each round.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-rmse:2.94372
[1]	validation_0-rmse:2.51943
[2]	validation_0-rmse:2.27817
[3]	validation_0-rmse:2.14457
[4]	validation_0-rmse:2.07038
[5]	validation_0-rmse:2.02942
[6]	validation_0-rmse:2.00652
[7]	validation_0-rmse:1.99155
[8]	validation_0-rmse:1.98265
[9]	validation_0-rmse:1.97750
[10]	validation_0-rmse:1.97492
[11]	validation_0-rmse:1.97137
[12]	validation_0-rmse:1.96685
[13]	validation_0-rmse:1.96611
[14]	validation_0-rmse:1.96431
[15]	validation_0-rmse:1.96337
[16]	validation_0-rmse:1.96308
[17]	validation_0-rmse:1.96275
[18]	validation_0-rmse:1.96174
[19]	validation_0-rmse:1.96195
[20]	validation_0-rmse:1.96166
[21]	validation_0-rmse:1.96165
[22]	validation_0-rmse:1.96112
[23]	validation_0-rmse:1.96029
[24]	validation_0-rmse:1.95896
[25]	validation_0-rmse:1.95752
[26]	validation_0-rmse:1.95723
[27]	validation_0-rmse:1.95735
[28]	validation_0-rmse:1.95655
[29]	validation_0-rmse:1.95668
[30]	validation_0-rmse:1.95729
[31]	validation_0-rmse:1.95761
[32]	validation_0-

In [19]:
# Save pkl for future usage
# Persist the fitted estimator; the '../models' directory must already exist.
# NOTE(review): a joblib/pickle dump is tied to the Python and library versions
# that wrote it. XGBoost documents `save_model` as the portable format —
# consider it if this artifact has to outlive the current environment.
joblib.dump(model, '../models/xgb_model.pkl')

['../models/xgb_model.pkl']

----

In [21]:
# In-sample error: predictions on the data the model was fitted on.
y_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)

print(f'MAE in train set: {mae:.2f}')

MAE in train set: 0.91


In [22]:
# Error on the validation partition. NOTE(review): this same partition was the
# eval set used for early stopping during fit, so the figure is likely somewhat
# optimistic; a separate hold-out set would give an unbiased estimate.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)

print(f'MAE in validation set: {mae:.2f}')

MAE in validation set: 0.99
