In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv


In [6]:
df_submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
print(len(df_submission)/2, len(df_submission.columns))
df_submission.head()

30490.0 29


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_price = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
df_calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
df_price_calendar = df_price.merge(df_calendar[['d','wm_yr_wk']], on=['wm_yr_wk'], how='inner')
df_price_calendar = df_price_calendar.rename(columns={'d':'period'})

In [8]:
df_train = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
df_test = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv')

print(len(df_train.columns), len(df_test.columns))
print(len(df_train), len(df_test))

1919 1947
30490 30490


# Generate Naive Forecaster
- Naive Forecaster: $ \hat{Y}_t = Y_{t-1} $
- Purely for use to test WRMSSE

In [9]:
LAST_TIME_STAMP = 1913
HORIZON = 28
TIME_COLS = [f'd_{i+1}' for i in range(LAST_TIME_STAMP)]
PRED_COLS = [f'd_{LAST_TIME_STAMP+1+i}' for i in range(28)]

In [10]:
# Perfect Forecast
df_pred = df_train.copy()
pred = np.tile(df_pred[f'd_{LAST_TIME_STAMP}'], (HORIZON, 1)).T
df_pred[PRED_COLS] = pred
df_pred.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,4,4,4,4,4,4,4,4,4,4


In [57]:
class WRMSSE():
    
    def __init__(self, horizon, last_time_stamp, df_pred, df_actuals, df_sales):
        self.horizon = horizon
        self.last_time_stamp = last_time_stamp
        self.train_cols = [f'd_{i+1}' for i in range(last_time_stamp)]
        self.pred_cols = [f'd_{last_time_stamp+1+i}' for i in range(horizon)]
        self.df_pred = df_pred
        self.df_actuals = df_actuals
        self.df_sales = df_sales
        
        self.rmsse_numer = self.calculate_rmsse_numer()
        self.rmsse_denom = self.calculate_rmsse_denom()
        self.weights = self.calculate_weights()
        self.wrmsse = self.calculate_wrmsse()
        
    def calculate_rmsse_denom(self):
        print("Calculating RMSSE Denominator...")
        train_period = self.df_pred[self.train_cols].copy().to_numpy()
        first_sales = (train_period!=0).argmax(axis=1)
        nonzero_periods = self.last_time_stamp - first_sales - 1
        
        prev_y = np.roll(train_period, 1)[:,1:]
        current_y = train_period[:,1:]
        abs_err_y = np.square(current_y - prev_y)
        denominator = np.sum(abs_err_y, axis=1) / nonzero_periods
        return denominator
    
    def calculate_rmsse_numer(self):
        print("Calculating RMSSE Numerator...")
        abs_err_yhat = np.square(self.df_actuals[self.pred_cols] - self.df_pred[self.pred_cols])
        numerator = np.sum(abs_err_yhat, axis=1) / self.horizon
        return numerator
    
    def calculate_weights(self):
        print("Calculating Weights...")
        # Calculate Weights
        weights = self.df_actuals[['id','item_id','store_id']+PRED_COLS].copy()
        weights = pd.wide_to_long(weights, stubnames='d_', i=['id', 'item_id', 'store_id'], j='period')
        weights = weights.reset_index()
        weights = weights.rename(columns={'d_':'d'})
        weights['period'] = 'd_' + weights['period'].astype(str)
        weights = weights.merge(self.df_sales, on=['item_id','store_id','period'], how='left')
        weights = weights.fillna(0)
        weights['sales'] = weights['d'] * weights['sell_price']
        weights = weights.groupby(['id'])[['sales']].sum().reset_index()
        return weights
    
    def calculate_wrmsse(self):
        print("Calculating WRMSSE")
        rmsse = self.rmsse_numer / self.rmsse_denom
        wrmsse = self.weights['sales'] * rmsse / len(rmsse)
        return wrmsse

In [58]:
naive_wrmsse = WRMSSE(
    horizon=28, 
    last_time_stamp=1913, 
    df_pred=df_pred, 
    df_actuals=df_test, 
    df_sales=df_price_calendar
)

Calculating RMSSE Numerator...
Calculating RMSSE Denominator...
Calculating Weights...
Calculating WRMSSE


In [60]:
print(np.sum(naive_wrmsse.wrmsse))

176.34000285937316
