In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import DeterministicProcess

In [4]:
# path to the dataset in Kaggle's notebook
path = 'input/store-sales-time-series-forecasting/'

### 1. Compute Moving Average of Oil Prices

In [5]:
# read oil price
data_oil = pd.read_csv(path + 'oil.csv', parse_dates=['date'], infer_datetime_format=True, index_col='date')

########################################################################################################################
# TODO: compute data_oil['ma_oil'] as the moving average of data_oil['dcoilwtico'] with window size 7
# Hint: check the documentation of .rolling() method of pandas.DataFrame
########################################################################################################################
data_oil['ma_oil'] = None # change 'None' to your answer
data_oil['ma_oil'] = data_oil['dcoilwtico'].rolling(7).mean() ### answer



# Create continguous moving average of oil prices
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))

########################################################################################################################
# TODO 1: merge two DataFrame instances (data_oil and calendar) such that the merged instances has the same indexes
# as calendar.
# TODO 2: replace each NaN in data_oil['ma_oil'] by the first non-null value before it.
# Hint: check the documentation of .merge() and .fillna() methods of pandas.DataFrame
########################################################################################################################
calendar = calendar.merge(data_oil, how='left', left_index=True, right_index=True) ### answer
calendar['ma_oil'].fillna(method='ffill', inplace=True) ### answer

calendar.head(15) # display some entries of calendar

Unnamed: 0,dcoilwtico,ma_oil
2013-01-01,,
2013-01-02,93.14,
2013-01-03,92.97,
2013-01-04,93.12,
2013-01-05,,
2013-01-06,,
2013-01-07,93.2,
2013-01-08,93.21,
2013-01-09,93.08,
2013-01-10,93.81,93.218571


### 2. Create Workday Feature

In [6]:
########################################################################################################################
# TODO: create a True/False feature calendar['wd'] to indicate whether each date is a workday (Monday-Friday) or not.
# Hint: check documentation of pandas.DatetimeIndex.dayofweek
########################################################################################################################
calendar['wd'] = None # change 'None' to your answer
calendar['dofw'] = calendar.index.dayofweek ### answer
calendar['wd'] = True ### answer
calendar.loc[calendar.dofw > 4, 'wd'] = False ### answer

calendar.head(15) # display some entries of calendar

Unnamed: 0,dcoilwtico,ma_oil,wd,dofw
2013-01-01,,,True,1
2013-01-02,93.14,,True,2
2013-01-03,92.97,,True,3
2013-01-04,93.12,,True,4
2013-01-05,,,False,5
2013-01-06,,,False,6
2013-01-07,93.2,,True,0
2013-01-08,93.21,,True,1
2013-01-09,93.08,,True,2
2013-01-10,93.81,93.218571,True,3


### 3. Read Train and Test Data

In [7]:
df_train = pd.read_csv(path + 'train.csv',
                       usecols=['store_nbr', 'family', 'date', 'sales'],
                       dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
                       parse_dates=['date'], infer_datetime_format=True)

df_train.date = df_train.date.dt.to_period('D')
df_train = df_train.set_index(['store_nbr', 'family', 'date']).sort_index()

df_train.head(15) # display some entries of the training data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.0
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0
1,AUTOMOTIVE,2013-01-06,2.0
1,AUTOMOTIVE,2013-01-07,0.0
1,AUTOMOTIVE,2013-01-08,2.0
1,AUTOMOTIVE,2013-01-09,2.0
1,AUTOMOTIVE,2013-01-10,2.0


In [8]:
df_test = pd.read_csv(path + 'test.csv',
                      usecols=['store_nbr', 'family', 'date'],
                      dtype={'store_nbr': 'category', 'family': 'category'},
                      parse_dates=['date'], infer_datetime_format=True)

df_test.date = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()

df_test.head(15) # display some entries of the testing data

store_nbr,family,date
1,AUTOMOTIVE,2017-08-16
1,AUTOMOTIVE,2017-08-17
1,AUTOMOTIVE,2017-08-18
1,AUTOMOTIVE,2017-08-19
1,AUTOMOTIVE,2017-08-20
1,AUTOMOTIVE,2017-08-21
1,AUTOMOTIVE,2017-08-22
1,AUTOMOTIVE,2017-08-23
1,AUTOMOTIVE,2017-08-24
1,AUTOMOTIVE,2017-08-25


In [9]:
# set the range of data used in training
sdate = '2017-04-01'
edate = '2017-08-15'

# we will train a model that takes feature of a date as input and predicts the sales for each store and family of goods on that date.
y = df_train.unstack(['store_nbr', 'family']).loc[sdate:edate]


########################################################################################################################
# TODO: create the trend feature X: the value for sdate is 1, the value for the next day of sdate is 2, etc.
# Hint: check the documentation of DeterministicProcess, or this tutorial: https://www.kaggle.com/code/ryanholbrook/trend.
########################################################################################################################
X = None # change 'None' to your answer
dp = DeterministicProcess(index=y.index, ### answer
                          constant=False,
                          order=1)
X = dp.in_sample() ### answer

# Extentions
X['oil']  = calendar.loc[sdate:edate]['ma_oil'].values
X['wd']   = calendar.loc[sdate:edate]['wd'].values

X.head(15)

Unnamed: 0_level_0,trend,oil,wd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-04-01,1.0,48.57,False
2017-04-02,2.0,48.57,False
2017-04-03,3.0,49.034286,True
2017-04-04,4.0,49.561429,True
2017-04-05,5.0,50.15,True
2017-04-06,6.0,50.625714,True
2017-04-07,7.0,51.022857,True
2017-04-08,8.0,51.022857,False
2017-04-09,9.0,51.022857,False
2017-04-10,10.0,51.417143,True


### 4. Train Model!

In [10]:
model = Ridge(fit_intercept=True, solver='auto', alpha=0.5, normalize=True)
model.fit(X, y)
y_pred = pd.DataFrame(model.predict(X), index=X.index, columns=y.columns)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [11]:
# Results on the training set

y_pred   = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()

y_target['sales_pred'] = y_pred['sales'].clip(0.) # Sales should be >= 0

########################################################################################################################
# TODO: show the training loss for each type of product.
# Hint: check the documentation of DataFrame.groupby() and GroupBy.apply().
########################################################################################################################
y_target.groupby('family').apply(lambda r: mean_squared_log_error(r['sales'], r['sales_pred'])) ### answer

family
AUTOMOTIVE                    0.283976
BABY CARE                     0.070799
BEAUTY                        0.298199
BEVERAGES                     0.215508
BOOKS                         0.028585
BREAD/BAKERY                  0.143917
CELEBRATION                   0.349750
CLEANING                      0.220625
DAIRY                         0.156052
DELI                          0.127979
EGGS                          0.210579
FROZEN FOODS                  0.191793
GROCERY I                     0.223829
GROCERY II                    0.386132
HARDWARE                      0.294485
HOME AND KITCHEN I            0.286941
HOME AND KITCHEN II           0.238523
HOME APPLIANCES               0.166385
HOME CARE                     0.140648
LADIESWEAR                    0.293791
LAWN AND GARDEN               0.298802
LINGERIE                      0.430799
LIQUOR,WINE,BEER              0.796780
MAGAZINES                     0.281421
MEATS                         0.182158
PERSONAL CARE     

In [12]:
# Test predictions

stest = '2017-08-16'
etest = '2017-08-31'

########################################################################################################################
# TODO: create the feature matrix of test data.
# Hint: check the documentation of DeterministicProcess.
########################################################################################################################
X_test = None # change 'None' to your answer
X_test = dp.out_of_sample(steps=16) ### answer

# Extentions

X_test['oil']  = calendar.loc[stest:etest]['ma_oil'].values ### answer
X_test['wd']   = calendar.loc[stest:etest]['wd'].values ### answer


sales_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])

sales_pred[sales_pred < 0] = 0. # Sales should be >= 0

In [13]:
# Create submission

df_sub = pd.read_csv(path + 'sample_submission.csv', index_col='id')
df_sub.sales = sales_pred.values
df_sub.to_csv('submission.csv', index=True)