In [None]:
import math
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from pandas import date_range
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error

In [None]:
#path to the dataset in Kaggle's notebook
#path = '../input/store-sales-time-series-forecasting/'

In [None]:
# Daily oil prices, indexed by date (the `dcoilwtico` column is used below).
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where `parse_dates` already infers one consistent format on its own.
df_oil = pd.read_csv(
    '/Users/liukuo/Desktop/store-sales-time-series-forecasting/oil.csv',
    parse_dates=['date'],
    index_col='date',
)

In [None]:
# Empty frame with one row per day; feature columns are merged onto it later.
calendar_days = pd.date_range(start='2013-01-01', end='2017-08-31')
calendar = pd.DataFrame(index=calendar_days)

In [None]:
# Rolling mean over 7 consecutive rows of the oil price; the first rows and any
# window containing a missing price evaluate to NaN.
df_oil['ma_oil'] = df_oil['dcoilwtico'].rolling(7).mean()
# Left-merge onto the daily calendar so dates absent from the oil file are kept.
calendar = calendar.merge(df_oil, how='left', left_index=True, right_index=True)
# Carry the last known average forward over the gaps. The result is assigned
# back instead of calling `fillna(method='ffill', inplace=True)` on a column
# selection: `method=` is deprecated, and a chained in-place update stops
# modifying `calendar` once pandas Copy-on-Write is active.
calendar['ma_oil'] = calendar['ma_oil'].ffill()

In [None]:
# Show the calendar with the oil price and its forward-filled rolling average.
calendar

Unnamed: 0,dcoilwtico,ma_oil
2013-01-01,,
2013-01-02,93.14,
2013-01-03,92.97,
2013-01-04,93.12,
2013-01-05,,
...,...,...
2017-08-27,,47.720000
2017-08-28,46.40,47.624286
2017-08-29,46.46,47.320000
2017-08-30,45.96,47.115714


In [None]:
# Day of week of every calendar date, stored as the `dow` feature.
day_of_week = calendar.index.dayofweek
calendar['dow'] = day_of_week

In [None]:
# Holiday/event table. `infer_datetime_format` is omitted because it is
# deprecated since pandas 2.0 and no longer affects how `parse_dates` works.
holidays = pd.read_csv(
    '/Users/liukuo/Desktop/store-sales-time-series-forecasting/holidays_events.csv',
    parse_dates=['date'],
)

# Index by date, in chronological order.
holidays = holidays.set_index('date').sort_index()

holidays = holidays[holidays.locale == 'National'] # National level only for simplicity

#holidays = holidays.groupby(holidays.index).first() # Keep one event only

In [None]:
# Show the national-level holiday/event rows, indexed by date.
holidays

Unnamed: 0_level_0,type,locale,locale_name,description,transferred
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,False
2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
2012-11-02,Holiday,National,Ecuador,Dia de Difuntos,False
2012-11-03,Holiday,National,Ecuador,Independencia de Cuenca,False
...,...,...,...,...,...
2017-12-22,Additional,National,Ecuador,Navidad-3,False
2017-12-23,Additional,National,Ecuador,Navidad-2,False
2017-12-24,Additional,National,Ecuador,Navidad-1,False
2017-12-25,Holiday,National,Ecuador,Navidad,False


In [None]:
# Working-day flag: start from True and clear it where `dow` > 4.
calendar['wd'] = True
calendar.loc[calendar.dow > 4, 'wd'] = False

# Keep at most one holiday row per date before merging. A date that carries
# several national events would otherwise be duplicated in `calendar`, while the
# later positional `.values` assignments into the feature matrices assume
# exactly one row per day.
holidays_by_day = holidays[~holidays.index.duplicated(keep='first')]
calendar = calendar.merge(holidays_by_day, how='left', left_index=True, right_index=True)

# Override the weekday default according to the holiday type.
is_holiday = calendar.type == 'Holiday'
calendar.loc[calendar.type == 'Bridge', 'wd'] = False
calendar.loc[calendar.type == 'Work Day', 'wd'] = True
calendar.loc[calendar.type == 'Transfer', 'wd'] = False
# `transferred` is NaN on dates without a holiday row, so compare against the
# boolean explicitly rather than relying on truthiness.
calendar.loc[is_holiday & calendar.transferred.eq(False), 'wd'] = False
calendar.loc[is_holiday & calendar.transferred.eq(True), 'wd'] = True

In [None]:
# Show the calendar after adding `dow`, `wd` and the merged holiday columns.
calendar

Unnamed: 0,dcoilwtico,ma_oil,dow,wd,type,locale,locale_name,description,transferred
2013-01-01,,,1,False,Holiday,National,Ecuador,Primer dia del ano,False
2013-01-02,93.14,,2,True,,,,,
2013-01-03,92.97,,3,True,,,,,
2013-01-04,93.12,,4,True,,,,,
2013-01-05,,,5,True,Work Day,National,Ecuador,Recupero puente Navidad,False
...,...,...,...,...,...,...,...,...,...
2017-08-27,,47.720000,6,False,,,,,
2017-08-28,46.40,47.624286,0,True,,,,,
2017-08-29,46.46,47.320000,1,True,,,,,
2017-08-30,45.96,47.115714,2,True,,,,,


In [None]:
# Training sales. `infer_datetime_format` is not passed: it is deprecated since
# pandas 2.0, where `parse_dates` already infers one consistent format.
df_train = pd.read_csv(
    '/Users/liukuo/Desktop/store-sales-time-series-forecasting/train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
    parse_dates=['date'],
)

# Convert dates to daily periods, then build a sorted
# (store_nbr, family, date) MultiIndex.
df_train['date'] = df_train['date'].dt.to_period('D')
df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()

In [None]:
# First and last date of the training window (used as label-slice bounds).
sdate, edate = '2017-04-01', '2017-08-15'

In [None]:
# Target matrix: one row per date in the training window, one column per
# (store_nbr, family) pair.
y = df_train.unstack(['store_nbr', 'family']).loc[sdate:edate]

# The former `CalendarFourier` experiment was removed from this cell: the name
# is never imported (NameError on a fresh kernel), and the `dp` / `X` it built
# were overwritten by the next cell before being used. The trend feature is
# built in that next cell.

In [None]:
# Linear time trend (order 1, no constant column) over the training dates.
# NOTE: `index` is not used by `dp`, which is built on `y.index`.
index = pd.date_range(start=sdate, end=edate)
dp = DeterministicProcess(
    index=y.index,
    constant=False,
    order=1,
)
X = dp.in_sample()

In [None]:
# Extensions: calendar features for the training window, assigned positionally.
calendar_train = calendar.loc[sdate:edate]
X['oil'] = calendar_train['ma_oil'].values
X['dow'] = calendar_train['dow'].values
X['wd'] = calendar_train['wd'].values
X['type'] = calendar_train['type'].values

In [None]:
# Show the training feature matrix before one-hot encoding.
X

Unnamed: 0_level_0,trend,oil,dow,wd,type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-04-01,1.0,48.570000,5,False,
2017-04-02,2.0,48.570000,6,False,
2017-04-03,3.0,49.034286,0,True,
2017-04-04,4.0,49.561429,1,True,
2017-04-05,5.0,50.150000,2,True,
...,...,...,...,...,...
2017-08-11,133.0,49.140000,4,False,Transfer
2017-08-12,134.0,49.140000,5,False,
2017-08-13,135.0,49.140000,6,False,
2017-08-14,136.0,48.934286,0,True,


In [None]:
# One-hot encode the categorical features: `dow` drops its first level,
# `type` keeps every level that occurs.
dow_encoded = pd.get_dummies(X, columns=['dow'], drop_first=True)
X = pd.get_dummies(dow_encoded, columns=['type'], drop_first=False)

In [None]:
# Show the training feature matrix after one-hot encoding `dow` and `type`.
X

Unnamed: 0_level_0,trend,oil,wd,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6,type_Additional,type_Event,type_Holiday,type_Transfer
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-04-01,1.0,48.570000,False,0,0,0,0,1,0,0,0,0,0
2017-04-02,2.0,48.570000,False,0,0,0,0,0,1,0,0,0,0
2017-04-03,3.0,49.034286,True,0,0,0,0,0,0,0,0,0,0
2017-04-04,4.0,49.561429,True,1,0,0,0,0,0,0,0,0,0
2017-04-05,5.0,50.150000,True,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,133.0,49.140000,False,0,0,0,1,0,0,0,0,0,1
2017-08-12,134.0,49.140000,False,0,0,0,0,1,0,0,0,0,0
2017-08-13,135.0,49.140000,False,0,0,0,0,0,1,0,0,0,0
2017-08-14,136.0,48.934286,True,0,0,0,0,0,0,0,0,0,0


# Baseline: linear regression fitted on all target columns at once, then
# evaluated on the same rows it was fitted on.
model = LinearRegression()
model.fit(X, y)
linear_fit = model.predict(X)
y_pred = pd.DataFrame(linear_fit, index=X.index, columns=y.columns)

In [None]:
# Random forest with a fixed seed, fitted on the same features and targets.
model_rf = RandomForestRegressor(
    n_estimators=250,
    random_state=2022,
    verbose=0,
)

# Kept as the cell's final expression so the fitted estimator is displayed.
model_rf.fit(X, y)

RandomForestRegressor(n_estimators=250, random_state=2022)

In [None]:
# In-sample random-forest predictions, labelled with the index of `X` and the
# columns of `y`.
rf_fit = model_rf.predict(X)
y_pred = pd.DataFrame(rf_fit, index=X.index, columns=y.columns)

In [None]:
# Long format: move the store_nbr/family column levels into rows.
y_pred = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()
y_target['sales_pred'] = y_pred['sales'].clip(0.) # Sales should be >= 0


def _family_msle(group):
    """Mean squared log error between actual and predicted sales of one group."""
    return mean_squared_log_error(group['sales'], group['sales_pred'])


# Final expression of the cell: error per product family on the training window.
y_target.groupby('family').apply(_family_msle)

family
AUTOMOTIVE                    0.064153
BABY CARE                     0.013870
BEAUTY                        0.064379
BEVERAGES                     0.107072
BOOKS                         0.005167
BREAD/BAKERY                  0.057778
CELEBRATION                   0.076444
CLEANING                      0.089012
DAIRY                         0.066586
DELI                          0.042435
EGGS                          0.043250
FROZEN FOODS                  0.047577
GROCERY I                     0.118427
GROCERY II                    0.090856
HARDWARE                      0.066286
HOME AND KITCHEN I            0.066104
HOME AND KITCHEN II           0.054888
HOME APPLIANCES               0.033231
HOME CARE                     0.048760
LADIESWEAR                    0.061717
LAWN AND GARDEN               0.058037
LINGERIE                      0.105974
LIQUOR,WINE,BEER              0.212318
MAGAZINES                     0.063891
MEATS                         0.048188
PERSONAL CARE     

In [None]:
# Test rows (no sales column). `infer_datetime_format` is not passed: it is
# deprecated since pandas 2.0, where `parse_dates` already infers one
# consistent format.
df_test = pd.read_csv(
    '/Users/liukuo/Desktop/store-sales-time-series-forecasting/test.csv',
    usecols=['store_nbr', 'family', 'date'],
    dtype={'store_nbr': 'category', 'family': 'category'},
    parse_dates=['date'],
)

# Same indexing as the training frame: daily periods and a sorted
# (store_nbr, family, date) MultiIndex.
df_test['date'] = df_test['date'].dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

In [None]:
# First and last date of the forecast window.
stest = '2017-08-16'
etest = '2017-08-31'
# Derive the horizon from the window instead of hard-coding 16, so the number
# of trend rows keeps matching the `stest:etest` calendar slice used for the
# other test features when the dates change.
n_test_days = len(pd.date_range(stest, etest))
X_test = dp.out_of_sample(steps=n_test_days)

In [None]:
# Extensions: same calendar features as for training, over the forecast window.
calendar_test = calendar.loc[stest:etest]
X_test['oil'] = calendar_test['ma_oil'].values
X_test['dow'] = calendar_test['dow'].values
X_test['wd'] = calendar_test['wd'].values
X_test = pd.get_dummies(X_test, columns=['dow'], drop_first=True)

# No national level events in this period, so every `type_*` indicator is 0.
# Reindexing on the training columns adds them (and any other training column
# missing here) as zeros and enforces the column order the model was fitted
# with, instead of hard-coding the list of holiday types seen in training.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

sales_pred = pd.DataFrame(model_rf.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])
sales_pred = sales_pred.clip(lower=0.) # Sales should be >= 0

In [None]:
# Fill the sample submission with the forecasts and write it to disk.
# NOTE(review): the assignment is positional — it assumes the rows of
# `sales_pred` are in the same order as the ids in sample_submission.csv; confirm.
submission_template = '/Users/liukuo/Desktop/store-sales-time-series-forecasting/sample_submission.csv'
df_sub = pd.read_csv(submission_template, index_col='id')
df_sub['sales'] = sales_pred.values
df_sub.to_csv('submission.csv', index=True)