In [1]:
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Extra directories to put on the import path.
# NOTE(review): presumably this is where the `m5_forecasting` package imported
# in a later cell lives — confirm against the attached Kaggle dataset layout.
package_paths = [
    '/kaggle/input/m5-forecasting',
]

# Only register a directory once, so re-running this cell does not keep
# growing sys.path with duplicate entries.
for pth in package_paths:
    if pth not in sys.path:
        sys.path.append(pth)

# List every file shipped with the competition data set.
for dirname, _, filenames in os.walk('/kaggle/input/m5-forecasting-accuracy'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv


In [3]:
from m5_forecasting.src.forecasters.naive_forecaster import NaiveForecaster
from m5_forecasting.src.evaluation import RMSSE, WRMSSE
from m5_forecasting.src.aggregation import calculate_sales, calculate_weights

# 1️⃣ Load Data

In [4]:
# Read the sample submission to inspect the layout a submission has to follow.
df_submission = pd.read_csv(
    '/kaggle/input/m5-forecasting-accuracy/sample_submission.csv'
)

# Half the row count, then the column count.
# NOTE(review): the halving presumably reflects validation and evaluation rows
# sharing one file — confirm against the competition description.
print(df_submission.shape[0]/2, df_submission.shape[1])
df_submission.head()

30490.0 29


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Raw inputs: the day-level calendar and the weekly sell prices.
df_calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
df_price = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')

# Join each price row to the calendar days sharing its 'wm_yr_wk' week key and
# expose the day label ('d') under the name 'period'.
df_price_calendar = pd.merge(
    df_price,
    df_calendar[['d', 'wm_yr_wk']],
    on=['wm_yr_wk'],
    how='inner',
).rename(columns={'d': 'period'})

In [6]:
# The validation file is used as the training history; the evaluation file
# is used as the reference ("test") frame in the scoring cells below.
df_train = pd.read_csv(
    '/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv'
)
df_test = pd.read_csv(
    '/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv'
)

# First line: column counts (axis 1); second line: row counts (axis 0).
for axis in (1, 0):
    print(df_train.shape[axis], df_test.shape[axis])

1919 1947
30490 30490


In [7]:
# Preview the merged price/calendar table (first five rows).
df_price_calendar.head(n=5)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,period
0,CA_1,HOBBIES_1_001,11325,9.58,d_897
1,CA_1,HOBBIES_1_001,11325,9.58,d_898
2,CA_1,HOBBIES_1_001,11325,9.58,d_899
3,CA_1,HOBBIES_1_001,11325,9.58,d_900
4,CA_1,HOBBIES_1_001,11325,9.58,d_901


# 2️⃣ Generate Naive Forecaster
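- Naive Forecaster: $\hat{Y}_t = Y_{t-1}$, i.e. each forecast repeats the most recent observation; over the whole forecast horizon this gives $\hat{Y}_{T+h} = Y_T$ for $h = 1, \dots, H$.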

In [8]:
# Preview the training sales table (first five rows).
df_train.head(n=5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [9]:
# Forecast settings shared by the forecasting and scoring cells.
HORIZON = 28            # number of future days to predict
LAST_TIME_STAMP = 1913  # last day present in df_train (column d_1913)

# NaiveForecaster is a project class; this notebook only reads its
# `df_pred` attribute (here) and `pred_cols` attribute (in the weights cell).
forecaster = NaiveForecaster(df_train, HORIZON, LAST_TIME_STAMP)
df_pred = forecaster.df_pred
df_pred.head()

1947
1919


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,4,4,4,4,4,4,4,4,4,4


# 3️⃣ RMSSE

In [10]:
# Score the naive predictions against the evaluation frame with the project's
# RMSSE class, then summarise its `rmsse` attribute by mean and by sum.
rmsse = RMSSE(df_pred, df_test, HORIZON, LAST_TIME_STAMP)
rmsse_scores = rmsse.rmsse
print(f"RMSSE (Average over all time series): {np.mean(rmsse_scores)}")
print(f"RMSSE (Sum over all time series): {np.sum(rmsse_scores)}")

RMSSE (Average over all time series): 1.0004941807673557
RMSSE (Sum over all time series): 30505.067571596675


# 4️⃣ WRMSSE

In [16]:
from m5_forecasting.src.aggregation import calculate_weights
from m5_forecasting.definitions import level_dict

In [19]:
# Grouping columns come from entry 12 of the project's level_dict.
# NOTE(review): presumably the item-by-store level of the M5 hierarchy —
# confirm against m5_forecasting.definitions.level_dict.
groupby_cols = level_dict[12]

# Columns the forecaster reports as its prediction columns.
pred_cols = forecaster.pred_cols

df_weights = calculate_weights(df_test, df_price_calendar, groupby_cols, pred_cols)

In [22]:
# Same scoring call as for RMSSE, with the weights passed in as well; the
# `wrmsse` attribute is then summarised by mean and by sum.
wrmsse = WRMSSE(df_pred, df_test, HORIZON, LAST_TIME_STAMP, df_weights)
wrmsse_scores = wrmsse.wrmsse
print(f"WRMSSE (Average over all time series): {np.mean(wrmsse_scores)}")
print(f"WRMSSE (Sum over all time series): {np.sum(wrmsse_scores)}")

WRMSSE (Average over all time series): 3.315859482420686e-05
WRMSSE (Sum over all time series): 1.0110055561900673
