In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython
import sklearn
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import DeterministicProcess
from sklearn.linear_model import ElasticNet
from statsmodels.tsa.deterministic import CalendarFourier


import os

In [2]:
# path to the dataset in Kaggle's notebook
path ='../input/store-sales-time-series-forecasting/'

# LR

### 1. Compute Moving Average of Oil Prices

In [3]:
# read oil price
data_oil = pd.read_csv(path + 'oil.csv', parse_dates=['date'], infer_datetime_format=True, index_col='date')

########################################################################################################################
# TODO: compute data_oil['ma_oil'] as the moving average of data_oil['dcoilwtico'] with window size 7
# Hint: check the documentation of .rolling() method of pandas.DataFrame
########################################################################################################################
data_oil['ma_oil'] = None # change 'None' to your answer
data_oil['ma_oil'] = data_oil['dcoilwtico'].rolling(7).mean() ### answer



# Create continguous moving average of oil prices
calendar_LR = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

########################################################################################################################
# TODO 1: merge two DataFrame instances (data_oil and calendar_LR) such that the merged instances has the same indexes
# as calendar_LR.
# TODO 2: replace each NaN in data_oil['ma_oil'] by the first non-null value before it.
# Hint: check the documentation of .merge() and .fillna() methods of pandas.DataFrame
########################################################################################################################
calendar_LR = calendar_LR.merge(data_oil, how='left', left_index=True, right_index=True) ### answer
calendar_LR['ma_oil'].fillna(method='ffill', inplace=True) ### answer

calendar_LR.head(15) # display some entries of calendar_LR

Unnamed: 0,dcoilwtico,ma_oil
2013-01-01,,
2013-01-02,93.14,
2013-01-03,92.97,
2013-01-04,93.12,
2013-01-05,,
2013-01-06,,
2013-01-07,93.2,
2013-01-08,93.21,
2013-01-09,93.08,
2013-01-10,93.81,93.218571


### 2. Create Workday Feature

In [4]:
########################################################################################################################
# TODO: create a True/False feature calendar_LR['wd'] to indicate whether each date is a workday (Monday-Friday) or not.
# Hint: check documentation of pandas.DatetimeIndex.dayofweek
########################################################################################################################
calendar_LR['wd'] = None # change 'None' to your answer
calendar_LR['dofw'] = calendar_LR.index.dayofweek ### answer
calendar_LR['wd'] = True ### answer
calendar_LR.loc[calendar_LR.dofw > 4, 'wd'] = False ### answer

calendar_LR.head(15) # display some entries of calendar_LR

Unnamed: 0,dcoilwtico,ma_oil,wd,dofw
2013-01-01,,,True,1
2013-01-02,93.14,,True,2
2013-01-03,92.97,,True,3
2013-01-04,93.12,,True,4
2013-01-05,,,False,5
2013-01-06,,,False,6
2013-01-07,93.2,,True,0
2013-01-08,93.21,,True,1
2013-01-09,93.08,,True,2
2013-01-10,93.81,93.218571,True,3


### 3. Read Train and Test Data

In [5]:
df_train = pd.read_csv(path + 'train.csv',
                       usecols=['store_nbr', 'family', 'date', 'sales'],
                       dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                       parse_dates=['date'], infer_datetime_format=True)

df_train.date = df_train.date.dt.to_period('D')
df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()

df_train.head(15) # display some entries of the training data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.0
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0
1,AUTOMOTIVE,2013-01-06,2.0
1,AUTOMOTIVE,2013-01-07,0.0
1,AUTOMOTIVE,2013-01-08,2.0
1,AUTOMOTIVE,2013-01-09,2.0
1,AUTOMOTIVE,2013-01-10,2.0


In [6]:
df_test = pd.read_csv(path + 'test.csv',
                      usecols=['store_nbr', 'family', 'date'],
                      dtype={'store_nbr': 'category', 'family': 'category'},
                      parse_dates=['date'], infer_datetime_format=True)

df_test.date = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

df_test.head(15) # display some entries of the testing data

store_nbr,family,date
1,AUTOMOTIVE,2017-08-16
1,AUTOMOTIVE,2017-08-17
1,AUTOMOTIVE,2017-08-18
1,AUTOMOTIVE,2017-08-19
1,AUTOMOTIVE,2017-08-20
1,AUTOMOTIVE,2017-08-21
1,AUTOMOTIVE,2017-08-22
1,AUTOMOTIVE,2017-08-23
1,AUTOMOTIVE,2017-08-24
1,AUTOMOTIVE,2017-08-25


In [7]:
# set the range of data used in training
sdate = '2017-04-01'
edate = '2017-08-15'

# we will train a model that takes feature of a date as input and predicts the sales for each store and family of goods on that date.
y = df_train.unstack(['store_nbr', 'family']).loc[sdate:edate]


########################################################################################################################
# TODO: create the trend feature X_LR: the value for sdate is 1, the value for the next day of sdate is 2, etc.
# Hint: check the documentation of DeterministicProcess, or this tutorial: https://www.kaggle.com/code/ryanholbrook/trend.
########################################################################################################################
X_LR = None # change 'None' to your answer
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(index=y.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[fourier],
                          drop=True)
X_LR = dp.in_sample() ### answer

# Extentions
X_LR['oil']  = calendar_LR.loc[sdate:edate]['ma_oil'].values
X_LR['wd']   = calendar_LR.loc[sdate:edate]['wd'].values

X_LR.head(15)

Unnamed: 0_level_0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","sin(3,freq=W-SUN)","cos(3,freq=W-SUN)",oil,wd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-04-01,1.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,48.57,False
2017-04-02,2.0,-0.781831,0.62349,-0.974928,-0.222521,-0.433884,-0.900969,48.57,False
2017-04-03,3.0,0.0,1.0,0.0,1.0,0.0,1.0,49.034286,True
2017-04-04,4.0,0.781831,0.62349,0.974928,-0.222521,0.433884,-0.900969,49.561429,True
2017-04-05,5.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.62349,50.15,True
2017-04-06,6.0,0.433884,-0.900969,-0.781831,0.62349,0.974928,-0.222521,50.625714,True
2017-04-07,7.0,-0.433884,-0.900969,0.781831,0.62349,-0.974928,-0.222521,51.022857,True
2017-04-08,8.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,51.022857,False
2017-04-09,9.0,-0.781831,0.62349,-0.974928,-0.222521,-0.433884,-0.900969,51.022857,False
2017-04-10,10.0,0.0,1.0,0.0,1.0,0.0,1.0,51.417143,True


### 4. Train Model!

In [8]:
model = Ridge(fit_intercept=True, solver='svd', alpha=0.5, normalize=True)
model.fit(X_LR, y)
y_pred = pd.DataFrame(model.predict(X_LR), index=X_LR.index, columns=y.columns)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [9]:
# Results on the training set

y_pred   = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()

y_target['sales_pred'] = y_pred['sales'].clip(0.) # Sales should be >= 0

########################################################################################################################
# TODO: show the training loss for each type of product.
# Hint: check the documentation of DataFrame.groupby() and GroupBy.apply().
########################################################################################################################
LR_result = y_target.groupby('family').apply(lambda r: mean_squared_log_error(r['sales'], r['sales_pred'])) ### answer

# RF

## Feature Engineering

In [10]:
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

## Oil

In [11]:
df_oil = pd.read_csv(path + 'oil.csv', parse_dates=['date'], infer_datetime_format=True, index_col='date')
df_oil['ma_oil'] = df_oil['dcoilwtico'].rolling(7).mean()
calendar = calendar.merge(df_oil, how='left', left_index=True, right_index=True)
calendar['ma_oil'].fillna(method='bfill', inplace=True)
calendar.head()

Unnamed: 0,dcoilwtico,ma_oil
2013-01-01,,93.218571
2013-01-02,93.14,93.218571
2013-01-03,92.97,93.218571
2013-01-04,93.12,93.218571
2013-01-05,,93.218571


## Day of week

In [12]:
calendar['dow'] = calendar.index.dayofweek
calendar.head()

Unnamed: 0,dcoilwtico,ma_oil,dow
2013-01-01,,93.218571,1
2013-01-02,93.14,93.218571,2
2013-01-03,92.97,93.218571,3
2013-01-04,93.12,93.218571,4
2013-01-05,,93.218571,5


## Hoilday

In [13]:
holidays = pd.read_csv(path + 'holidays_events.csv', parse_dates=['date'], infer_datetime_format=True)
holidays = holidays.set_index('date').sort_index()
holidays = holidays[holidays.locale == 'National'] # National level only for simplicity
#holidays = holidays.groupby(holidays.index).first() # Keep one event only
holidays.head()

Unnamed: 0_level_0,type,locale,locale_name,description,transferred
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,False
2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
2012-10-12,Transfer,National,Ecuador,Traslado Independencia de Guayaquil,False
2012-11-02,Holiday,National,Ecuador,Dia de Difuntos,False
2012-11-03,Holiday,National,Ecuador,Independencia de Cuenca,False


## Work Day

In [14]:
calendar['wd'] = True
calendar.loc[calendar.dow > 4, 'wd'] = False
calendar = calendar.merge(holidays, how='left', left_index=True, right_index=True)
calendar.loc[calendar.type == 'Bridge'  , 'wd'] = False
calendar.loc[calendar.type == 'Work Day', 'wd'] = True
calendar.loc[calendar.type == 'Transfer', 'wd'] = False
calendar.loc[(calendar.type == 'Holiday') & (calendar.transferred == False), 'wd'] = False
calendar.loc[(calendar.type == 'Holiday') & (calendar.transferred == True ), 'wd'] = True
calendar.head()

Unnamed: 0,dcoilwtico,ma_oil,dow,wd,type,locale,locale_name,description,transferred
2013-01-01,,93.218571,1,False,Holiday,National,Ecuador,Primer dia del ano,False
2013-01-02,93.14,93.218571,2,True,,,,,
2013-01-03,92.97,93.218571,3,True,,,,,
2013-01-04,93.12,93.218571,4,True,,,,,
2013-01-05,,93.218571,5,True,Work Day,National,Ecuador,Recupero puente Navidad,False


## Drop 12/25 which training set exclude

In [15]:
calendar = calendar.reset_index()
calendar.drop_duplicates(subset=['index'], keep='last', inplace=True, ignore_index=True)
calendar = calendar.set_index('index')
calendar = calendar.drop(index=['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25'])
print(calendar.shape)

(1700, 9)


## Training

In [16]:
df_train = pd.read_csv(path + 'train.csv',
                       usecols=['store_nbr', 'family', 'date', 'sales'],
                       dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                       parse_dates=['date'], infer_datetime_format=True)

df_train.date = df_train.date.dt.to_period('D')
df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.0
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0


### Label (y)

In [17]:
sdate = '2017-04-01'
edate = '2017-08-15'
y = df_train.unstack(['store_nbr', 'family']).loc[sdate:edate]
print(y.shape)
y.head()

(137, 1782)


Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,9,9,9,9,9,9,9,9,9,9
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2017-04-01,9.0,0.0,1.0,3229.0,0.0,526.249023,14.0,858.0,1151.0,243.272003,...,3.0,671.481018,1524.0,20.0,24.0,851.388,311.147003,2625.523926,1.0,40.446999
2017-04-02,4.0,0.0,1.0,1210.0,0.0,180.339005,3.0,281.0,446.0,86.641998,...,4.0,513.000977,842.0,19.0,29.0,621.062012,229.181,2303.476074,3.0,15.0
2017-04-03,11.0,0.0,2.0,2097.0,0.0,444.856995,11.0,801.0,794.0,166.119995,...,3.0,481.912018,742.0,7.0,14.0,532.640991,132.583008,1697.151001,2.0,16.0
2017-04-04,3.0,0.0,4.0,2249.0,1.0,403.819,19.0,673.0,725.0,149.078003,...,0.0,290.71701,594.0,4.0,15.0,451.968994,127.585999,2506.139893,2.0,15.0
2017-04-05,5.0,0.0,1.0,2687.0,2.0,499.38501,18.0,1057.0,1074.0,216.742996,...,1.0,410.497009,633.0,2.0,20.0,395.341003,84.173996,1349.001953,1.0,13.373


### Data point (X)

In [18]:
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(index=y.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[fourier],
                          drop=True)
X = dp.in_sample()

# Extentions
X['oil']  = calendar.loc[sdate:edate]['ma_oil'].values
X['dow'] = calendar.loc[sdate:edate]['dow'].values
X['wd']   = calendar.loc[sdate:edate]['wd'].values
X['type'] = calendar.loc[sdate:edate]['type'].values
X = pd.get_dummies(X, columns=['dow'], drop_first=True)
X = pd.get_dummies(X, columns=['type'], drop_first=False)
print(X.shape)
X.head()


(137, 19)


Unnamed: 0_level_0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","sin(3,freq=W-SUN)","cos(3,freq=W-SUN)",oil,wd,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6,type_Additional,type_Event,type_Holiday,type_Transfer
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-04-01,1.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.62349,49.034286,False,0,0,0,0,1,0,0,0,0,0
2017-04-02,2.0,-0.781831,0.62349,-0.974928,-0.222521,-0.433884,-0.900969,49.034286,False,0,0,0,0,0,1,0,0,0,0
2017-04-03,3.0,0.0,1.0,0.0,1.0,0.0,1.0,49.034286,True,0,0,0,0,0,0,0,0,0,0
2017-04-04,4.0,0.781831,0.62349,0.974928,-0.222521,0.433884,-0.900969,49.561429,True,1,0,0,0,0,0,0,0,0,0
2017-04-05,5.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.62349,50.15,True,0,1,0,0,0,0,0,0,0,0


### Train

In [19]:
# model = LinearRegression()
# model.fit(X,y)
model_rf = RandomForestRegressor(n_estimators=300,random_state=2022, verbose=0)
model_rf.fit(X, y)

RandomForestRegressor(n_estimators=300, random_state=2022)

In [20]:
y_pred = pd.DataFrame(model_rf.predict(X), index=X.index, columns=y.columns)
y_pred=y_pred.stack(['store_nbr', 'family']).reset_index()
y_pred.head()

Unnamed: 0,date,store_nbr,family,sales
0,2017-04-01,1,AUTOMOTIVE,7.273333
1,2017-04-01,1,BABY CARE,0.0
2,2017-04-01,1,BEAUTY,1.383333
3,2017-04-01,1,BEVERAGES,2762.533333
4,2017-04-01,1,BOOKS,0.023333


In [21]:
y_target=y.stack(['store_nbr', 'family']).reset_index().copy()
y_target['sales_pred'] = y_pred['sales'].clip(0.) # Sales should be >= 0
RF_result = y_target.groupby('family').apply(lambda x: np.sqrt(mean_squared_log_error(x['sales'], x['sales_pred'])))

# Retrain

## Comapre two model

In [22]:
LR_list = []
RF_list = []
for i in LR_result.index:
    if LR_result.loc[i] <= RF_result.loc[i]:
        LR_list.append(i)
    else:
        RF_list.append(i)
print(LR_list)
print(RF_list)

['BABY CARE', 'BEVERAGES', 'BOOKS', 'BREAD/BAKERY', 'CLEANING', 'DAIRY', 'DELI', 'EGGS', 'FROZEN FOODS', 'GROCERY I', 'HOME AND KITCHEN II', 'HOME APPLIANCES', 'HOME CARE', 'MEATS', 'PERSONAL CARE', 'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY', 'PREPARED FOODS', 'PRODUCE']
['AUTOMOTIVE', 'BEAUTY', 'CELEBRATION', 'GROCERY II', 'HARDWARE', 'HOME AND KITCHEN I', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE', 'LIQUOR,WINE,BEER', 'MAGAZINES', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD']


## Data set

In [23]:
df_train = pd.read_csv(path + 'train.csv',
                       usecols=['store_nbr', 'family', 'date', 'sales'],
                       dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                       parse_dates=['date'], infer_datetime_format=True)

df_train.date = df_train.date.dt.to_period('D')

In [24]:
LR_train = pd.DataFrame()
for item in LR_list:
    df = df_train[df_train['family'] == item]
    LR_train = pd.concat([LR_train, df])
LR_train = LR_train.set_index(['store_nbr', 'family', 'date']).sort_index()

RF_train = pd.DataFrame()
for item in RF_list:
    df = df_train[df_train['family'] == item]
    RF_train = pd.concat([RF_train, df])
RF_train = RF_train.set_index(['store_nbr', 'family', 'date']).sort_index()

In [25]:
sdate = '2017-04-01'
edate = '2017-08-15'
y_LR = LR_train.unstack(['store_nbr', 'family']).loc[sdate:edate]
y_rf = RF_train.unstack(['store_nbr', 'family']).loc[sdate:edate]

## Train Model

In [26]:
model_rf = RandomForestRegressor(n_estimators=300,random_state=2022, verbose=0)
model_rf.fit(X, y_rf)

RandomForestRegressor(n_estimators=300, random_state=2022)

In [27]:
model = Ridge(fit_intercept=True, solver='svd', alpha=0.5, normalize=True)
model.fit(X_LR, y_LR)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


Ridge(alpha=0.5, normalize=True, solver='svd')

# Test

In [28]:
stest = '2017-08-16'
etest = '2017-08-31'

X_test_rf = dp.out_of_sample(steps=16)

# Extentions
X_test_rf['oil']  = calendar.loc[stest:etest]['ma_oil'].values
X_test_rf['dow'] = calendar.loc[stest:etest]['dow'].values
X_test_rf['wd']   = calendar.loc[stest:etest]['wd'].values
X_test_rf['type'] = calendar.loc[stest:etest]['type'].values
X_test_rf = pd.get_dummies(X_test_rf, columns=['dow'], drop_first=True)
X_test_rf = pd.get_dummies(X_test_rf, columns=['type'], drop_first=False)

# No national level events in this period
X_test_rf[['type_Additional', 'type_Event', 'type_Holiday', 'type_Transfer']] = 0
X_test_rf.index.names = ['date']

sales_pred_rf = pd.DataFrame(model_rf.predict(X_test_rf), index=X_test_rf.index, columns=y_rf.columns)
# sales_pred_rf = sales_pred_rf.stack(['store_nbr', 'family']).reset_index()
sales_pred_rf[sales_pred_rf < 0] = 0 # Sales should be >= 0

In [29]:
# Test predictions

stest = '2017-08-16'
etest = '2017-08-31'

########################################################################################################################
# TODO: create the feature matrix of test data.
# Hint: check the documentation of DeterministicProcess.
########################################################################################################################
X_test_LR = None # change 'None' to your answer
X_test_LR = dp.out_of_sample(steps=16) ### answer

# Extentions

X_test_LR['oil']  = calendar_LR.loc[stest:etest]['ma_oil'].values ### answer
X_test_LR['wd']   = calendar_LR.loc[stest:etest]['wd'].values ### answer

sales_pred_LR = pd.DataFrame(model.predict(X_test_LR), index=X_test_LR.index, columns=y_LR.columns)
sales_pred_LR[sales_pred_LR < 0] = 0 

In [30]:
pred_final = sales_pred_rf.merge(sales_pred_LR, left_index=True, right_index=True)
pred_final = pred_final.stack(['store_nbr', 'family']).reset_index()

In [31]:
# Create submission
df_sub = pd.read_csv(path + 'sample_submission.csv', index_col='id')
df_sub.sales = pred_final['sales'].values
df_sub.to_csv('submission.csv', index=True)