In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
import pickle

In [2]:
# Load the serialized model used for inference below (presumably an XGBoost
# model, judging by the file name — confirm against the training notebook).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model artifacts that come from a trusted source.
with open('../models/xgb_model.pkl', 'rb') as file:
    model = pickle.load(file)

In [3]:
def get_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate temporal feature metrics for each item/store in a sales DataFrame.

    Every feature on a row is built only from *earlier* rows of the same
    (item_id, store_id) series, so a row's own 'y' never enters its features
    and values never leak from one series into the next:
    - 'lag_1': previous value of 'y'.
    - 'sum_3', 'sum_7': sum of the previous 3 / 7 values of 'y'
      (NaN until that many previous rows exist).
    - 'rolling_mean_7', 'rolling_mean_30': mean of up to the previous 7 / 30 values.
    - 'rolling_std_7', 'rolling_std_30': sample standard deviation of up to the
      previous 7 / 30 values (NaN when fewer than two previous values exist).

    "Previous N values" equals "previous N days" only when each series has
    exactly one row per day.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame containing sales data, with the following columns:
        - 'item_id': Item identifier.
        - 'store_id': Store identifier.
        - 'date': Sales date.
        - 'y': Sales value (target variable).

    Returns:
    --------
    pd.DataFrame
        A new DataFrame, sorted by item_id, store_id and date, with the feature
        columns added. The input DataFrame is not modified.
    """
    keys = ['item_id', 'store_id']

    # sort_values returns a new frame, so the caller's frame is left untouched
    df = df.sort_values(keys + ['date'])

    # Shift 'y' by 1..7 rows inside each series once and reuse the results
    target = df.groupby(keys, observed=False)['y']
    shifted = [target.shift(step) for step in range(1, 8)]

    df['lag_1'] = shifted[0]

    # Sums of the previous 3 and 7 values (not true lags)
    df['sum_3'] = sum(shifted[:3])
    df['sum_7'] = sum(shifted)

    # Roll over the per-series lag_1 values. Shifting *before* rolling keeps the
    # shift inside each series; shifting the concatenated rolling result instead
    # would hand the first row of a series the statistics of the previous series.
    previous = df.groupby(keys, observed=False)['lag_1']
    for window in (7, 30):
        rolling = previous.rolling(window=window, min_periods=1)
        # Drop the two group-key index levels so the result aligns on df's index
        df[f'rolling_mean_{window}'] = rolling.mean().reset_index(level=[0, 1], drop=True)
        df[f'rolling_std_{window}'] = rolling.std().reset_index(level=[0, 1], drop=True)

    return df


def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive calendar features from the 'date' column and store them as categoricals.

    The columns are written onto the DataFrame that is passed in, and that same
    object is returned.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame with a datetime-typed 'date' column.

    Returns:
    --------
    pd.DataFrame
        The same DataFrame, now carrying:
        - 'wday': day of the week, 1 (Monday) through 7 (Sunday), as a category.
        - 'month': month of the year, as a category.
    """
    calendar = df['date'].dt

    # dayofweek is 0-based with Monday = 0, so add 1 to get Monday = 1 ... Sunday = 7
    df['wday'] = (calendar.dayofweek + 1).astype('category')
    df['month'] = calendar.month.astype('category')

    return df


def drop_unnecessary_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    """
    Return a copy of the dataset without the given columns.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame to remove columns from. It is not modified.
    columns_to_drop : list
        Names of the columns to remove. Every name must exist in `df`;
        an unknown name makes pandas raise a KeyError.

    Returns:
    --------
    pd.DataFrame
        New DataFrame containing the remaining columns.
    """
    remaining = df.drop(labels=columns_to_drop, axis='columns')
    return remaining

In [4]:
# Read the train and test splits from their parquet snapshots
train = pd.read_parquet('../data/train.snap.parquet')
test = pd.read_parquet('../data/test.snap.parquet')

In [5]:
# Stack the training rows dated 2016-03-15 or later on top of the test rows,
# renumbering the index from 0 (presumably the training tail is kept as history
# for the lag/rolling features — confirm against the training notebook)
df = pd.concat(
    [train.loc[train['date'] >= '2016-03-15'], test],
    ignore_index=True,
)
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1873,0.0,2016-03-15,11607,,,,,0,1,1,2.240234
1,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1874,0.0,2016-03-16,11607,,,,,0,0,0,2.240234
2,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1875,2.0,2016-03-17,11607,StPatricksDay,Cultural,,,0,0,0,2.240234
3,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1876,1.0,2016-03-18,11607,,,,,0,0,0,2.240234
4,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1877,2.0,2016-03-19,11608,,,,,0,0,0,2.240234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2957525,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,0.0,2016-06-19,11621,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,2.980469
2957526,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,0.0,2016-06-19,11621,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,2.480469
2957527,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,0.0,2016-06-19,11621,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,3.980469
2957528,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,5.0,2016-06-19,11621,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0,1.280273


In [6]:
# Add the calendar features (wday, month) to the combined train/test frame
df = add_date_features(df)

# Collapse the three per-state SNAP flags into a single flag to reduce dimensionality:
# a row is flagged when the SNAP column of its own state (first two characters of
# store_id) equals 1
state_code = df['store_id'].str[:2]
snap_active = (
    ((state_code == 'CA') & (df['snap_CA'] == 1))
    | ((state_code == 'TX') & (df['snap_TX'] == 1))
    | ((state_code == 'WI') & (df['snap_WI'] == 1))
)
df['snap'] = snap_active.astype('int64').astype('category')

# Remove the columns that are no longer needed
columns_to_drop = [
    'id', 'dept_id', 'cat_id', 'state_id', 'wm_yr_wk', 'event_type_1',
    'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI',
]
df = drop_unnecessary_columns(df, columns_to_drop)

df.head()

Unnamed: 0,item_id,store_id,d,y,date,event_name_1,sell_price,wday,month,snap
0,FOODS_1_001,CA_1,d_1873,0.0,2016-03-15,,2.240234,2,3,0
1,FOODS_1_001,CA_1,d_1874,0.0,2016-03-16,,2.240234,3,3,0
2,FOODS_1_001,CA_1,d_1875,2.0,2016-03-17,StPatricksDay,2.240234,4,3,0
3,FOODS_1_001,CA_1,d_1876,1.0,2016-03-18,,2.240234,5,3,0
4,FOODS_1_001,CA_1,d_1877,2.0,2016-03-19,,2.240234,6,3,0


In [7]:
# Print the column dtypes and memory usage of the prepared frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2957530 entries, 0 to 2957529
Data columns (total 10 columns):
 #   Column        Dtype         
---  ------        -----         
 0   item_id       category      
 1   store_id      category      
 2   d             category      
 3   y             float32       
 4   date          datetime64[ns]
 5   event_name_1  category      
 6   sell_price    float32       
 7   wday          category      
 8   month         category      
 9   snap          category      
dtypes: category(7), datetime64[ns](1), float32(2)
memory usage: 70.7 MB


In [8]:
# Columns passed to the model (presumably in the order used at training time —
# confirm against the training notebook)
features = [
    'item_id', 'store_id', 'wday', 'month', 'event_name_1', 'snap', 'sell_price',
    'lag_1', 'sum_3', 'sum_7', 'rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30'
]

# Recursive day-by-day forecast over d_1942 .. d_1969.
# For each day the lag/rolling features are rebuilt from the trailing 40 days of
# `data`, the model predicts that day, and the prediction overwrites 'y' so that
# later days derive their features from the earlier predictions.
data = df.copy()
days = [f'd_{day_number}' for day_number in range(1942, 1970)]
history = pd.Timedelta(days=40)
for d in days:
    target_date = data.loc[data['d'] == d, 'date'].iloc[0]

    # Window is (target_date - 40 days, target_date], i.e. 40 calendar days
    in_window = (data['date'] > target_date - history) & (data['date'] <= target_date)
    batch = get_metrics(data[in_window])

    # Predict only the rows of the target day and write them back by index label
    X_test = batch.loc[batch['d'] == d, features]
    y_pred = model.predict(X_test)
    data.loc[X_test.index, 'y'] = y_pred
   

In [9]:
# Keep only the identifier columns and the observed sales needed for evaluation
test = test.loc[:, ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'y']]
test

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,d,y
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2
...,...,...,...,...,...,...,...
853715,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,0
853716,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,0
853717,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,0
853718,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,5


In [12]:
# Reduce the forecast frame to its join keys and relabel 'y' as the prediction column
data = data[['item_id', 'store_id', 'd', 'y']].rename(columns={"y": "y_pred"})
data

Unnamed: 0,item_id,store_id,d,y_pred
0,FOODS_1_001,CA_1,d_1873,0.000000
1,FOODS_1_001,CA_1,d_1874,0.000000
2,FOODS_1_001,CA_1,d_1875,2.000000
3,FOODS_1_001,CA_1,d_1876,1.000000
4,FOODS_1_001,CA_1,d_1877,2.000000
...,...,...,...,...
2957525,FOODS_3_823,WI_3,d_1969,0.474385
2957526,FOODS_3_824,WI_3,d_1969,0.105818
2957527,FOODS_3_825,WI_3,d_1969,0.540723
2957528,FOODS_3_826,WI_3,d_1969,1.264359


In [13]:
# Attach the prediction for each (item, store, day) to the observed test rows;
# the left join keeps every test row
df_merged = test.merge(
    data[['item_id', 'store_id', 'd', 'y_pred']],
    on=['item_id', 'store_id', 'd'],
    how='left',
).reset_index(drop=True)
df_merged

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,d,y,y_pred
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2,1.024071
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,0.247850
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,0.659320
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,0,1.851732
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2,1.280504
...,...,...,...,...,...,...,...,...
853715,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,0,0.474385
853716,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,0,0.105818
853717,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,0,0.540723
853718,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,5,1.264359


In [14]:
# Overall MAE across every row of the merged evaluation frame
mae = mean_absolute_error(y_true=df_merged['y'], y_pred=df_merged['y_pred'])
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 1.090191125869751


In [17]:
# MAE for each forecast day.
# Boolean indexing already returns a new frame, so df_merged is filtered directly
# instead of being copied in full on every iteration. The per-day value is kept
# in its own name so the overall `mae` computed earlier is not overwritten.
for d in days:
    scope = df_merged[df_merged['d'] == d]
    day_mae = mean_absolute_error(scope['y'], scope['y_pred'])
    print(f'Mean Absolute Error (MAE) for {d}: {day_mae}')
    
    

Mean Absolute Error (MAE) for d_1942: 0.9369040727615356
Mean Absolute Error (MAE) for d_1943: 0.9038417339324951
Mean Absolute Error (MAE) for d_1944: 0.892241358757019
Mean Absolute Error (MAE) for d_1945: 0.9095984101295471
Mean Absolute Error (MAE) for d_1946: 0.9751017093658447
Mean Absolute Error (MAE) for d_1947: 1.1209381818771362
Mean Absolute Error (MAE) for d_1948: 1.0942572355270386
Mean Absolute Error (MAE) for d_1949: 1.0421104431152344
Mean Absolute Error (MAE) for d_1950: 0.9311428666114807
Mean Absolute Error (MAE) for d_1951: 1.014474868774414
Mean Absolute Error (MAE) for d_1952: 1.0457723140716553
Mean Absolute Error (MAE) for d_1953: 1.1870696544647217
Mean Absolute Error (MAE) for d_1954: 1.2649259567260742
Mean Absolute Error (MAE) for d_1955: 1.2965446710586548
Mean Absolute Error (MAE) for d_1956: 1.1439213752746582
Mean Absolute Error (MAE) for d_1957: 1.0649548768997192
Mean Absolute Error (MAE) for d_1958: 1.0462684631347656
Mean Absolute Error (MAE) for d_1