### Problem Statement
Predict the exact number of units sold of each item for each of the next 28 days, based on historical sales data for these items.

In [46]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm import tqdm

1. Calculate weight for the level 12 series
2. Use the naive logic to make forecasts for each of the level 12 series
3. Infer forecast, ground truth values, and weights for all the higher level series by aggregating
4. Calculate RMSSE for all series using the equation
5. Multiply weight by respective RMSSE and add all these products

In [47]:
# Daily unit sales: one row per item/store series, one `d_<n>` column per day.
df = pd.read_csv('dataset/sales_train_validation.csv')
# Weekly sell price per (store_id, item_id, wm_yr_wk).
price_df = pd.read_csv("dataset/sell_prices.csv")


In [48]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [49]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [50]:
# Calendar: one row per day, mapping the day label `d` to its date and week id (`wm_yr_wk`).
cal_df = pd.read_csv("dataset/calendar.csv")

In [51]:
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [52]:
# Turn calendar day labels such as "d_1" into the integer 1 so they can be compared
# with plain day numbers. Going through `str` first keeps this cell idempotent:
# re-running it on the already-converted integer column no longer raises.
cal_df["d"] = (
    cal_df["d"].astype(str).str.replace("d_", "", regex=False).astype("int64")
)

# Build the same series id the sales frame uses (<item_id>_<store_id>_validation)
# so prices can be joined to sales on "id".
price_df["id"] = price_df["item_id"] + "_" + price_df["store_id"] + "_validation"

In [53]:
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [54]:
cal_df.tail()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1964,2016-06-15,11620,Wednesday,5,6,2016,1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,1968,,,,,0,0,0
1968,2016-06-19,11621,Sunday,2,6,2016,1969,NBAFinalsEnd,Sporting,Father's day,Cultural,0,0,0


In [55]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


**In the sales_train_validation data (days 1–1913), the last 28 days (1886–1913) are held out as test data, and the 28 days immediately before them (1858–1885) are used as the validation data.**

### 1. Calculate weight for the level 12 series

In [56]:
# Dollar sales per series for each of the 28 days that precede the forecast
# horizon (d_1858 .. d_1885). Despite the "unit_sales_" prefix, each new column
# holds price * units for that day.
for day in tqdm(range(1858, 1886)):
    # Prices are weekly, so look up the week this day belongs to.
    wk_id = cal_df.loc[cal_df["d"] == day, "wm_yr_wk"].iloc[0]
    wk_price_df = price_df.loc[price_df["wm_yr_wk"] == wk_id, ["sell_price", "id"]]
    # A left join keeps every series: an inner join silently drops any series
    # with no listed price that week (it then gets NaN dollar sales for the day
    # instead). `validate` fails loudly if a series has more than one price row.
    df = df.merge(wk_price_df, on="id", how="left", validate="many_to_one")
    df["unit_sales_" + str(day)] = df["sell_price"] * df["d_" + str(day)]
    df = df.drop(columns=["sell_price"])

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:29<00:00,  1.07s/it]


In [57]:
# Total dollar sales per series over the weighting window: row-wise sum of the
# per-day "unit_sales_*" columns.
daily_dollar_cols = [col for col in df.columns if col.startswith("unit_sales")]
df["dollar_sales"] = df[daily_dollar_cols].sum(axis=1)
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,unit_sales_1877,unit_sales_1878,unit_sales_1879,unit_sales_1880,unit_sales_1881,unit_sales_1882,unit_sales_1883,unit_sales_1884,unit_sales_1885,dollar_sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,8.26,24.78,8.26,16.52,16.52,0.0,8.26,8.26,8.26,231.28
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,3.97,3.97,3.97,3.97,35.73
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.97,2.97,0.0,44.55
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,9.28,4.64,18.56,4.64,13.92,23.2,0.0,27.84,27.84,245.92
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5.76,5.76,5.76,8.64,2.88,0.0,0.0,0.0,0.0,89.28


In [58]:
# The per-day dollar columns are not needed once the total has been computed.
per_day_cols = df.columns[df.columns.str.startswith("unit_sales")]
df.drop(columns=per_day_cols, inplace=True)
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,231.28
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,35.73
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,44.55
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,245.92
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,89.28


In [59]:
# Level-12 weight: each series' share of the total dollar sales.
total_dollar_sales = df["dollar_sales"].sum()
df["weight"] = df["dollar_sales"].div(total_dollar_sales)
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,1,1,3,0,1,1,231.28,6.3e-05
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,0,0,0,0,35.73,1e-05
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,1,0,1,1,1,44.55,1.2e-05
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5,4,1,0,1,3,7,2,245.92,6.7e-05
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,1,1,2,2,2,4,89.28,2.4e-05


In [60]:
# Only the normalized weight is needed from here on.
del df["dollar_sales"]
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,6.3e-05
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1e-05
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,1.2e-05
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,6.7e-05
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,2.4e-05


### 2. Use the naive logic to make forecasts for each of the level 12 series
* All 0s
* Average through all history
* Same as previous 28 days
* Mean of previous 10, 20, 30, 40, 50, 60 days
* Average of same day for all previous weeks

In [61]:
# Naive baseline: forecast zero sales for every day of the 28-day horizon
# (one F_<day> column per day, 1886 .. 1913).
horizon_days = range(1886, 1914)
df = df.assign(**{"F_" + str(day): 0 for day in horizon_days})

In [62]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3. Infer forecast, ground truth values, and weights for all the higher level series by aggregating

In [63]:
# Level 1: a single series holding the grand total of every daily sales (d_*)
# and forecast (F_*) column.
series_cols = [col for col in df.columns if col.startswith(("d_", "F_"))]
agg_df = df[series_cols].sum().to_frame().T
agg_df["level"] = 1
agg_df["weight"] = 1/12  # the single level-1 series carries a full 1/12 share
column_order = agg_df.columns  # column layout every appended level must follow

In [64]:
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,0,0,0,0,0,0,0,0,1,0.083333


In [65]:
# Columns to group the bottom-level series by, keyed by aggregation level (2-11).
# Level 1 (the grand total) is built separately into agg_df; the un-aggregated
# item/store series in df are treated as level 12.
level_groupings = {2: ["state_id"], 
                   3: ["store_id"], 
                   4: ["cat_id"], 
                   5: ["dept_id"], 
                   6: ["state_id", "cat_id"], 
                   7: ["state_id", "dept_id"], 
                   8: ["store_id", "cat_id"], 
                   9: ["store_id", "dept_id"],
                   10: ["item_id"], 
                   11: ["item_id", "state_id"]}

In [66]:
df.groupby(by=level_groupings[11]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
item_id,state_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FOODS_1_001,CA,6,3,2,3,7,5,8,3,5,2,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_001,TX,0,1,2,2,0,4,0,0,4,3,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_001,WI,0,2,0,1,0,9,2,1,2,5,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_002,CA,3,3,4,4,3,3,0,2,1,1,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_002,TX,0,0,2,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HOUSEHOLD_2_515,TX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOUSEHOLD_2_515,WI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOUSEHOLD_2_516,CA,0,2,0,1,1,1,0,2,1,2,...,0,0,0,0,0,0,0,0,0,0
HOUSEHOLD_2_516,TX,2,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# Build levels 2-11 by summing the bottom-level series within each grouping,
# then stack them under the level-1 row already held in agg_df.
level_frames = [agg_df]
for level in tqdm(level_groupings):
    # numeric_only=True restricts the sum to the numeric d_*/F_*/weight columns;
    # without it pandas >= 2.0 also "sums" (concatenates) the string id columns.
    temp_df = (
        df.groupby(by=level_groupings[level])
        .sum(numeric_only=True)
        .reset_index(drop=True)
    )
    temp_df["level"] = level
    temp_df["weight"] /= 12  # each level receives 1/12 of the total weight
    level_frames.append(temp_df[column_order])

# DataFrame.append was removed in pandas 2.0; one concat at the end also avoids
# re-copying the growing frame on every iteration. Per-level indices are kept,
# exactly as append did.
agg_df = pd.concat(level_frames)

del temp_df, level_frames

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:23<00:00,  2.34s/it]


In [68]:
agg_df.head()

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,0,0,0,0,0,0,0,0,1,0.083333
0,14195,13805,10108,11047,9925,11322,12251,16610,14696,11822,...,0,0,0,0,0,0,0,0,2,0.036146
1,9438,9630,6778,7381,5912,9006,6226,9440,9376,7319,...,0,0,0,0,0,0,0,0,2,0.023281
2,8998,8314,6897,6984,3309,8883,9533,11882,8664,6431,...,0,0,0,0,0,0,0,0,2,0.023907
0,4337,4155,2816,3051,2630,3276,3450,5437,4340,3157,...,0,0,0,0,0,0,0,0,3,0.009092


In [69]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
# Scale the level-12 weights to 1/12 of the total, like the aggregated levels.
# NOTE: this mutates df in place, so running the cell a second time divides by 12 again.
df["weight"] /= 12

In [71]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# Row counts: bottom-level series, aggregated series, and their total.
n_bottom, n_aggregated = len(df), len(agg_df)
print('df', n_bottom)
print('agg_df', n_aggregated)
print('sum', n_bottom + n_aggregated)

df 30490
agg_df 12350
sum 42840


In [73]:
agg_df["weight"].sum() + df["weight"].sum()

0.9999999999999996

### 4. Calculate RMSSE for all series using the equation

In [74]:
1913 - 28

1885

In [75]:
h = 28    # forecast horizon in days (kept for cells that reference it)
n = 1885  # number of training days, 1913 - 28 (kept for cells that reference it)

def rmsse(ground_truth, forecast, train_series, axis = 1):
    """Root Mean Squared Scaled Error of one or many series.

    The squared forecast error, averaged over the horizon, is divided by the
    mean squared one-step difference of the training series, and the square
    root of that ratio is returned.

    The horizon length and the number of training observations are taken from
    the shapes of the arrays passed in rather than from the module-level
    ``h`` / ``n``, so the result is correct for any horizon or history length.

    Parameters
    ----------
    ground_truth, forecast : np.ndarray
        Actual and predicted values over the horizon; same shape.
    train_series : np.ndarray
        Historical values used to scale the error.
    axis : {0, 1}
        Axis that runs along time. ``axis=1`` expects 2-D arrays with one
        series per row; ``axis=0`` handles a single 1-D series (or 2-D arrays
        with one series per column).

    Returns
    -------
    np.ndarray or numpy scalar
        One RMSSE per series. A series whose training history never changes
        has a zero scale, so NumPy yields ``inf``/``nan`` for it.

    Raises
    ------
    AssertionError
        If ``axis`` is not 0 or 1, an input is not a plain ``np.ndarray``,
        ``axis=1`` is used with non-2-D or single-column inputs, or fewer than
        two training observations are given.
    """
    assert axis == 0 or axis == 1
    assert type(ground_truth) is np.ndarray and type(forecast) is np.ndarray and type(train_series) is np.ndarray

    if axis == 1:
        # Row-wise scoring needs real matrices; check the rank first so a 1-D
        # input fails this assertion instead of raising IndexError on shape[1].
        assert ground_truth.ndim == 2 and forecast.ndim == 2 and train_series.ndim == 2
        assert ground_truth.shape[1] > 1 and forecast.shape[1] > 1 and train_series.shape[1] > 1

    horizon = ground_truth.shape[axis]
    n_train = train_series.shape[axis]
    assert n_train > 1, "need at least two training observations to scale the error"

    numerator = ((ground_truth - forecast)**2).sum(axis=axis)
    # Mean squared day-over-day change of the training history.
    denominator = 1/(n_train-1) * (np.diff(train_series, axis=axis) ** 2).sum(axis=axis)
    return (1/horizon * numerator/denominator) ** 0.5
        

In [76]:
# Split the day columns: everything except the last 28 days is training history,
# and the last 28 days are the ground truth that the F_* forecasts are scored against.
day_cols = [col for col in df.columns if col.startswith("d_")]
train_series_cols = day_cols[:-28]
ground_truth_cols = day_cols[-28:]
forecast_cols = [col for col in df.columns if col.startswith("F_")]

In [77]:
df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'F_1904', 'F_1905', 'F_1906', 'F_1907', 'F_1908', 'F_1909', 'F_1910',
       'F_1911', 'F_1912', 'F_1913'],
      dtype='object', length=1948)

In [78]:
agg_df.columns

Index(['d_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10',
       ...
       'F_1906', 'F_1907', 'F_1908', 'F_1909', 'F_1910', 'F_1911', 'F_1912',
       'F_1913', 'level', 'weight'],
      dtype='object', length=1943)

In [79]:
# Score every series at once: one RMSSE per row, for both the bottom-level
# frame and the aggregated frame.
for frame in (df, agg_df):
    frame["rmsse"] = rmsse(
        frame[ground_truth_cols].to_numpy(),
        frame[forecast_cols].to_numpy(),
        frame[train_series_cols].to_numpy(),
    )

In [80]:
# Calculating score series by series to validate scores of matrix agree row by row 
# for row_idx in range(len(df)):
#     row_df = pd.DataFrame(df.iloc[row_idx]).transpose()
#     train_series = np.array(row_df[train_series_cols].transpose()[row_idx])
#     ground_truth_series = np.array(row_df[ground_truth_cols].transpose()[row_idx])
#     forecast_series = np.array(row_df[forecast_cols].transpose()[row_idx])
#     print(rmsse(ground_truth_series, forecast_series, train_series, axis=0))

In [81]:
# Weighted contribution of each series to the final score: weight x RMSSE.
for frame in (df, agg_df):
    frame["wrmsse"] = frame["weight"].mul(frame["rmsse"])

In [82]:
df["wrmsse"].sum() + agg_df["wrmsse"].sum()

5.35979273095658

# Naive Forecasts

1. Calculate weight for the level 12 series
2. Use the naive logic to make forecasts for each of the level 12 series
3. Infer forecast, ground truth values, and weights for all the higher level series by aggregating
4. Calculate RMSSE for all series using the equation
5. Multiply weight by respective RMSSE and add all these products

In [83]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm import tqdm

In [84]:
# Reload the three raw inputs so this section starts from unmodified frames.
# Daily unit sales: one row per item/store series, one `d_<n>` column per day.
df = pd.read_csv('dataset/sales_train_validation.csv')
# Weekly sell price per (store_id, item_id, wm_yr_wk).
price_df = pd.read_csv("dataset/sell_prices.csv")
# Calendar: one row per day, mapping the day label `d` to its date and week id.
cal_df = pd.read_csv("dataset/calendar.csv")

In [85]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [86]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [90]:
# Strip the "d_" prefix from the calendar day labels and store them as integers,
# so a day can be looked up by its plain number.
# Going through `str` first keeps the cell idempotent: re-running it on the
# already-converted integer column no longer raises.
cal_df["d"] = (
    cal_df["d"].astype(str).str.replace("d_", "", regex=False).astype("int64")
)

In [92]:
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [91]:
# Join key shared with the sales frame (sales_train_validation):
# <item_id>_<store_id>_validation, e.g. HOBBIES_1_001_CA_1_validation.
price_df["id"] = price_df["item_id"].str.cat(price_df["store_id"], sep="_") + "_validation"

In [93]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


In [96]:
1913 - 28 # test horizon: 1885- 1913

1885

In [98]:
1886 + 28

1914

In [97]:
1885 - 28 # validation set

1857

In [94]:
# Inputs for the level-12 weights: dollar sales per series for each of the
# 28 days that precede the forecast horizon (d_1858 .. d_1885). Despite the
# "unit_sales_" prefix, each new column holds price * units for that day.
for day in tqdm(range(1858, 1886)):
    # Prices are weekly, so look up the week this day belongs to.
    wk_id = cal_df.loc[cal_df["d"] == day, "wm_yr_wk"].iloc[0]
    wk_price_df = price_df.loc[price_df["wm_yr_wk"] == wk_id, ["sell_price", "id"]]
    # A left join keeps every series: an inner join silently drops any series
    # with no listed price that week (it then gets NaN dollar sales for the day
    # instead). `validate` fails loudly if a series has more than one price row.
    df = df.merge(wk_price_df, on="id", how="left", validate="many_to_one")
    df["unit_sales_" + str(day)] = df["sell_price"] * df["d_" + str(day)]
    df = df.drop(columns=["sell_price"])

100%|██████████████████████████████████████████████████████████████████████████████████| 28/28 [00:29<00:00,  1.05s/it]


In [95]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,unit_sales_1876,unit_sales_1877,unit_sales_1878,unit_sales_1879,unit_sales_1880,unit_sales_1881,unit_sales_1882,unit_sales_1883,unit_sales_1884,unit_sales_1885
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,24.78,8.26,24.78,8.26,16.52,16.52,0.0,8.26,8.26,8.26
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.97,3.97,3.97,3.97
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2.97,0.0,0.0,0.0,0.0,0.0,0.0,2.97,2.97,0.0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,18.56,9.28,4.64,18.56,4.64,13.92,23.2,0.0,27.84,27.84
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,8.64,5.76,5.76,5.76,8.64,2.88,0.0,0.0,0.0,0.0


In [99]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,unit_sales_1876,unit_sales_1877,unit_sales_1878,unit_sales_1879,unit_sales_1880,unit_sales_1881,unit_sales_1882,unit_sales_1883,unit_sales_1884,unit_sales_1885
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,24.78,8.26,24.78,8.26,16.52,16.52,0.00,8.26,8.26,8.26
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,3.97,3.97,3.97,3.97
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2.97,0.00,0.00,0.00,0.00,0.00,0.00,2.97,2.97,0.00
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,18.56,9.28,4.64,18.56,4.64,13.92,23.20,0.00,27.84,27.84
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,8.64,5.76,5.76,5.76,8.64,2.88,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,7.96,0.00,0.00,7.96,0.00,0.00,0.00,7.96,0.00,3.98
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,1.28,1.28,1.28,1.28,0.00,2.56,1.28,1.28,2.56,5.12
