In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess, Fourier

# Time Related Features
def create_date_features(df):
    df['month'] = df.index.month.astype("int8")
    df['day_of_month'] = df.index.day.astype("int8")
    df['day_of_year'] = df.index.dayofyear.astype("int16")
    df['week_of_year'] = (df.index.isocalendar().week).astype("int8")
    df['day_of_week'] = (df.index.dayofweek + 1).astype("int8")
    df['year'] = df.index.year.astype("int32")
    df["is_wknd"] = (df.index.weekday // 4).astype("int8")
    df["quarter"] = df.index.quarter.astype("int8")
    df['is_month_start'] = df.index.is_month_start.astype("int8")
    df['is_month_end'] = df.index.is_month_end.astype("int8")
    df['is_quarter_start'] = df.index.is_quarter_start.astype("int8")
    df['is_quarter_end'] = df.index.is_quarter_end.astype("int8")
    df['is_year_start'] = df.index.is_year_start.astype("int8")
    df['is_year_end'] = df.index.is_year_end.astype("int8")
    # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    df["season"] = np.where(df.month.isin([12,1,2]), 0, 1)
    df["season"] = np.where(df.month.isin([6,7,8]), 2, df["season"])
    df["season"] = np.where(df.month.isin([9, 10, 11]), 3, df["season"]).astype("int8")
    return df



In [2]:
train = pd.read_csv("data/train.csv", parse_dates=['date'], infer_datetime_format=True)
test = pd.read_csv("data/test.csv", parse_dates=['date'], infer_datetime_format=True)
transactions = pd.read_csv("data/transactions.csv", parse_dates=['date'], infer_datetime_format=True)
stores = pd.read_csv("data/stores.csv",index_col='store_nbr')
oil = pd.read_csv("data/oil.csv", parse_dates=['date'], infer_datetime_format=True, index_col='date')
holidays_events = pd.read_csv("data/holidays_events.csv", parse_dates=['date'], infer_datetime_format=True)

#foutje uit ander notebook meenemen
holidays_events['date'] = holidays_events['date'].replace({'2013-04-29' : 
                                         pd.to_datetime('2013-03-29')})




In [3]:
#Dataframe maken waarin je events in tijd kwijt kan
#let op dit is de train periode EN de test periode
calendar = pd.DataFrame(index=pd.date_range('2013-01-01','2017-08-31'))
#olieprijs toevoegen
calendar = calendar.join(oil, how='left')
calendar = calendar.rename(columns={"dcoilwtico": "oilprice"})
#lege waarden vullen
calendar['oilprice'].fillna(method='ffill', inplace=True)
#eerste waarde vullen
calendar['oilprice'].fillna(method='bfill', inplace=True)

#tijdsgebonden features
calendar = create_date_features(calendar)



In [4]:
calendar.shape

(1704, 16)

In [5]:
special_event = holidays_events['date'][holidays_events['type']=='Event']
additional_day = holidays_events['date'][(holidays_events['type']=='Additional') & (holidays_events['locale'] == 'National')]

national_transferred= holidays_events['date'][(holidays_events['type'] == 'Transfer') & (holidays_events['locale'] == 'National')]
national_bridged = holidays_events['date'][(holidays_events['type']=='Bridge')]
national_Workday = holidays_events['date'][(holidays_events['type']=='Work Day')]

national_holiday = holidays_events['date'][(holidays_events['type'] == 'Holiday') & (holidays_events['locale'] == 'National') & (holidays_events['transferred'] == False)]



local_transferred = holidays_events[(holidays_events['type'] == 'Transfer') & (holidays_events['locale'] == 'Local')]
local_transferred['combined'] = local_transferred[['locale', 'locale_name']].agg('-'.join, axis=1)
local_transferred.index = local_transferred.date
local_transferred = local_transferred[['combined']]

local_holiday = holidays_events[(holidays_events['type'] == 'Holiday') & (holidays_events['locale'] == 'Local') & (holidays_events['transferred'] == False)]
local_holiday['combined'] = local_holiday[['locale', 'locale_name']].agg('-'.join, axis=1)
local_holiday.index = local_holiday.date
local_holiday = local_holiday[['combined']]

regional_holiday = holidays_events[(holidays_events['type'] == 'Holiday') & (holidays_events['locale'] == 'Regional') & (holidays_events['transferred'] == False)]
regional_holiday['combined'] = regional_holiday[['locale', 'locale_name']].agg('-'.join, axis=1)
regional_holiday.index = regional_holiday.date
regional_holiday = regional_holiday[['combined']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  local_transferred['combined'] = local_transferred[['locale', 'locale_name']].agg('-'.join, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  local_holiday['combined'] = local_holiday[['locale', 'locale_name']].agg('-'.join, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regional_holida

In [6]:
sp = pd.DataFrame(calendar.index.isin(special_event.tolist()))
sp.index = calendar.index
calendar['specialevent'] = sp

ad = pd.DataFrame(calendar.index.isin(additional_day.tolist()))
ad.index = calendar.index
calendar['additionalday'] = ad

nt = pd.DataFrame(calendar.index.isin(national_transferred.tolist()))
nt.index = calendar.index
calendar['nationaltransferred'] = nt

nb = pd.DataFrame(calendar.index.isin(national_bridged.tolist()))
nb.index = calendar.index
calendar['nationalbridged'] = nb

wd = pd.DataFrame(calendar.index.isin(national_Workday.tolist()))
wd.index = calendar.index
calendar['nationalworkday'] = wd

nh = pd.DataFrame(calendar.index.isin(national_holiday.tolist()))
nh.index = calendar.index
calendar['nationalholiday'] = nh

In [7]:
tmp = pd.get_dummies(local_holiday,prefix='local_holiday').groupby('date').sum()

tmp

Unnamed: 0_level_0,local_holiday_Local-Ambato,local_holiday_Local-Cayambe,local_holiday_Local-Cuenca,local_holiday_Local-El Carmen,local_holiday_Local-Esmeraldas,local_holiday_Local-Guaranda,local_holiday_Local-Guayaquil,local_holiday_Local-Ibarra,local_holiday_Local-Latacunga,local_holiday_Local-Libertad,local_holiday_Local-Loja,local_holiday_Local-Machala,local_holiday_Local-Manta,local_holiday_Local-Puyo,local_holiday_Local-Quevedo,local_holiday_Local-Quito,local_holiday_Local-Riobamba,local_holiday_Local-Salinas,local_holiday_Local-Santo Domingo
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2012-03-02,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2012-04-12,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2012-04-14,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2012-04-21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2012-05-12,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-11-10,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-11-11,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2017-11-12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-12-08,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [8]:
#onehotting the local ones
calendar = calendar.join(
    pd.get_dummies(local_transferred,prefix='local_transferred')
    , how='left'
    )

calendar = calendar.join(
    pd.get_dummies(local_holiday,prefix='local_holiday').groupby('date').sum()
    , how='left'
    )

calendar = calendar.join(
    pd.get_dummies(regional_holiday,prefix='regional_holiday')
    , how='left'
    )
calendar = calendar.fillna(0)


In [13]:

#promotion avgs
calendar['promotion'] = train[['onpromotion','date']].groupby('date').sum()

In [22]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [27]:
X_time = train.copy()
X_time.date = X_time.date.dt.to_period('D')
X_time = X_time.set_index(['store_nbr', 'family', 'date']).sort_index()
X_time = X_time.drop(columns = ['onpromotion','id'])



In [28]:
X_time

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
store_nbr,family,date,Unnamed: 3_level_1
1,AUTOMOTIVE,2013-01-01,0.0
1,AUTOMOTIVE,2013-01-02,2.0
1,AUTOMOTIVE,2013-01-03,3.0
1,AUTOMOTIVE,2013-01-04,3.0
1,AUTOMOTIVE,2013-01-05,5.0
...,...,...,...
54,SEAFOOD,2017-08-11,0.0
54,SEAFOOD,2017-08-12,1.0
54,SEAFOOD,2017-08-13,2.0
54,SEAFOOD,2017-08-14,0.0


In [None]:
calendar

In [None]:
train['oil']  = calendar.loc[start_date:end_date]['oilprice'].values
train['type'] = calendar.loc[start_date:end_date]['type'].values


In [38]:
end_date='2017-08-15'
start_date='2017-04-01'

In [104]:
y = X_time.unstack(['store_nbr', 'family'])
#.loc[start_date:end_date]

In [105]:
y

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,54,54,54,54,54,54,54,54,54,54
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2013-01-01,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000,0.0,0.0
2013-01-02,2.0,0.0,2.0,1091.0,0.0,470.65200,0.0,1060.0,579.0,164.069,...,0.0,73.771000,228.0,0.0,0.0,15.514000,61.0,0.000,0.0,3.0
2013-01-03,3.0,0.0,0.0,919.0,0.0,310.65500,0.0,836.0,453.0,151.582,...,0.0,50.257000,156.0,0.0,0.0,4.313000,1.0,0.000,0.0,2.0
2013-01-04,3.0,0.0,3.0,953.0,0.0,198.36600,0.0,827.0,460.0,131.411,...,0.0,40.223000,146.0,0.0,0.0,26.743000,38.0,0.000,0.0,2.0
2013-01-05,5.0,0.0,3.0,1160.0,0.0,301.05700,0.0,811.0,464.0,118.613,...,0.0,43.431000,205.0,0.0,0.0,31.118000,32.0,0.000,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1.0,0.0,1.0,1006.0,0.0,145.60700,4.0,341.0,343.0,64.302,...,0.0,50.756000,155.0,0.0,0.0,80.759000,54.0,546.250,0.0,0.0
2017-08-12,6.0,0.0,3.0,1659.0,0.0,243.22000,3.0,351.0,526.0,99.488,...,1.0,53.079002,169.0,0.0,4.0,91.671000,81.0,696.920,0.0,1.0
2017-08-13,1.0,0.0,1.0,803.0,0.0,136.67900,1.0,169.0,266.0,47.770,...,3.0,67.435000,244.0,0.0,2.0,79.062996,91.0,877.304,0.0,2.0
2017-08-14,1.0,0.0,6.0,2201.0,0.0,346.03800,4.0,571.0,699.0,154.578,...,1.0,64.224000,200.0,0.0,1.0,56.155000,147.0,585.615,0.0,0.0


In [106]:
fourier = CalendarFourier(freq='W', order=4)
dp = DeterministicProcess(index=y.index,
                          constant=False,
                          order=1,
                          seasonal=False,
                          additional_terms=[fourier],
                          drop=True)
X = dp.in_sample()

In [107]:
X

Unnamed: 0_level_0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","sin(3,freq=W-SUN)","cos(3,freq=W-SUN)"
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,1.0,0.781831,0.623490,0.974928,-0.222521,0.433884,-0.900969
2013-01-02,2.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.623490
2013-01-03,3.0,0.433884,-0.900969,-0.781831,0.623490,0.974928,-0.222521
2013-01-04,4.0,-0.433884,-0.900969,0.781831,0.623490,-0.974928,-0.222521
2013-01-05,5.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.623490
...,...,...,...,...,...,...,...
2017-08-11,1680.0,-0.433884,-0.900969,0.781831,0.623490,-0.974928,-0.222521
2017-08-12,1681.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.623490
2017-08-13,1682.0,-0.781831,0.623490,-0.974928,-0.222521,-0.433884,-0.900969
2017-08-14,1683.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000


In [108]:
calendar.index = calendar.index.to_period('D')

AttributeError: 'PeriodIndex' object has no attribute 'to_period'

In [109]:
calendar['promotion'] = calendar['promotion'].fillna(0)

In [110]:
Xfinal = X.join(calendar)

In [111]:
Xfinal

Unnamed: 0_level_0,trend,"sin(1,freq=W-SUN)","cos(1,freq=W-SUN)","sin(2,freq=W-SUN)","cos(2,freq=W-SUN)","sin(3,freq=W-SUN)","cos(3,freq=W-SUN)",oilprice,month,day_of_month,...,local_holiday_Local-Quevedo,local_holiday_Local-Quito,local_holiday_Local-Riobamba,local_holiday_Local-Salinas,local_holiday_Local-Santo Domingo,regional_holiday_Regional-Cotopaxi,regional_holiday_Regional-Imbabura,regional_holiday_Regional-Santa Elena,regional_holiday_Regional-Santo Domingo de los Tsachilas,promotion
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,1.0,0.781831,0.623490,0.974928,-0.222521,0.433884,-0.900969,93.14,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-02,2.0,0.974928,-0.222521,-0.433884,-0.900969,-0.781831,0.623490,93.14,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-03,3.0,0.433884,-0.900969,-0.781831,0.623490,0.974928,-0.222521,92.97,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-04,4.0,-0.433884,-0.900969,0.781831,0.623490,-0.974928,-0.222521,93.12,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-05,5.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.623490,93.12,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,1680.0,-0.433884,-0.900969,0.781831,0.623490,-0.974928,-0.222521,48.81,8,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14179.0
2017-08-12,1681.0,-0.974928,-0.222521,0.433884,-0.900969,0.781831,0.623490,48.81,8,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8312.0
2017-08-13,1682.0,-0.781831,0.623490,-0.974928,-0.222521,-0.433884,-0.900969,48.81,8,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9283.0
2017-08-14,1683.0,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,47.59,8,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8043.0


In [112]:
model = Ridge(fit_intercept=True, solver='auto', alpha=0.5, normalize=True) # try alpha,0.1 ,0.3 ,0.6 and 0.9
model.fit(Xfinal, y)
y_pred = pd.DataFrame(model.predict(Xfinal), index=Xfinal.index, columns=y.columns)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [113]:
y_pred

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,1,1,1,1,1,1,1,1,1,...,54,54,54,54,54,54,54,54,54,54
family,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2013-01-01,0.264284,0.0,0.371997,-130.012893,-0.120011,31.742421,-5.349564,153.744896,31.571712,16.001807,...,-0.315671,9.708748,18.527546,-0.050729,-0.896903,6.675033,2.071352,-104.674056,-0.095596,0.807393
2013-01-02,2.660715,0.0,1.791966,1113.256350,-0.086738,349.892531,-0.852602,802.956899,607.786032,124.601684,...,-0.263974,38.557608,125.894319,-0.015913,-0.487926,31.265099,35.120587,-69.161106,-0.035862,2.152670
2013-01-03,2.373424,0.0,1.822212,972.181385,-0.077299,312.808418,-0.184395,695.876816,498.653175,109.664656,...,-0.213710,47.690649,118.878635,-0.020286,-0.321650,33.678624,37.917107,-52.541628,-0.020040,2.303980
2013-01-04,2.821179,0.0,1.597087,1028.964312,-0.092771,308.781020,0.712456,702.778715,527.414100,131.015481,...,-0.139347,43.701069,142.041187,-0.036368,-0.115489,43.459148,36.887691,-40.270509,-0.043706,1.728338
2013-01-05,2.462718,0.0,1.907371,1231.194621,-0.107639,320.308896,-2.184847,664.778625,550.967237,119.556494,...,-0.235728,49.339288,181.660182,-0.041242,0.919066,45.352156,38.675858,-1.656110,-0.075927,1.702107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-08-11,2.835058,0.0,2.108606,1716.122676,0.229261,274.852670,12.705581,471.984502,557.066527,100.399633,...,1.420785,53.465506,185.267661,0.208562,4.847699,75.992730,72.576754,655.206525,0.948913,2.275882
2017-08-12,4.433681,0.0,3.288787,2308.892630,0.328741,387.725153,16.142866,621.015016,792.572929,138.267503,...,1.136967,61.288209,238.923272,0.231831,4.044452,67.301385,74.998690,695.326515,1.277086,1.729563
2017-08-13,2.871517,0.0,2.337701,1631.960173,0.307112,251.158767,12.273521,381.109399,536.762130,88.229226,...,1.087787,70.283376,298.035790,0.240294,4.329339,75.509330,92.970556,745.371715,1.473177,2.150998
2017-08-14,4.298568,0.0,3.362152,2308.552021,0.341180,427.336191,20.749703,722.711801,805.832712,148.551069,...,0.881021,55.345972,209.223001,0.214109,3.620661,55.609730,77.294062,645.149869,1.222848,1.768850


In [114]:
y_pred   = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()
y_target['sales_pred'] = y_pred['sales'].clip(0.) 
y_target.groupby('family').apply(lambda r: mean_squared_log_error(r['sales'], r['sales_pred']))

family
AUTOMOTIVE                     0.356634
BABY CARE                      0.048603
BEAUTY                         0.292654
BEVERAGES                      2.211816
BOOKS                          0.033347
BREAD/BAKERY                   1.262289
CELEBRATION                    1.225508
CLEANING                       1.691219
DAIRY                          1.428325
DELI                           0.988227
EGGS                           0.806652
FROZEN FOODS                   0.881626
GROCERY I                      2.530846
GROCERY II                     0.502886
HARDWARE                       0.286573
HOME AND KITCHEN I             1.731620
HOME AND KITCHEN II            1.169959
HOME APPLIANCES                0.153052
HOME CARE                      6.862903
LADIESWEAR                     1.070523
LAWN AND GARDEN                0.446476
LINGERIE                       0.522901
LIQUOR,WINE,BEER               2.218664
MAGAZINES                      0.411709
MEATS                          0.

In [115]:
from joblib import Parallel, delayed
import warnings

from sklearn.linear_model import Ridge
from sklearn.ensemble     import RandomForestRegressor

class CustomRegressor():
    
    def __init__(self, n_jobs=-1, verbose=0):
        
        self.n_jobs = n_jobs
        self.verbose = verbose
        
        self.estimators_ = None
        
    def _estimator_(self, X, y):
    
        warnings.simplefilter(action='ignore', category=FutureWarning)
        
        if y.name[2] == 'SCHOOL AND OFFICE SUPPLIES':
            
            model = RandomForestRegressor(n_estimators = 300, n_jobs=-1, random_state=1)
            
        else:
            
            model = Ridge(fit_intercept=True, solver='auto', alpha=0.5, normalize=True)
            
        model.fit(X, y)

        return model

    def fit(self, X, y):

        self.estimators_ = Parallel(n_jobs=self.n_jobs, 
                              verbose=self.verbose,
                              )(delayed(self._estimator_)(X, y.iloc[:, i]) for i in range(y.shape[1]))
        
        return
    
    def predict(self, X):
        
        y_pred = Parallel(n_jobs=self.n_jobs, 
                          verbose=self.verbose)(delayed(e.predict)(X) for e in self.estimators_)
        
        return np.stack(y_pred, axis=1)

In [116]:
model = CustomRegressor(n_jobs=-1, verbose=0)
model.fit(Xfinal, y)
y_pred = pd.DataFrame(model.predict(Xfinal), index=Xfinal.index, columns=y.columns)

In [117]:
y_pred   = y_pred.stack(['store_nbr', 'family']).reset_index()
y_target = y.stack(['store_nbr', 'family']).reset_index().copy()
y_target['sales_pred'] = y_pred['sales'].clip(0.) 
y_target.groupby('family').apply(lambda r: mean_squared_log_error(r['sales'], r['sales_pred']))

family
AUTOMOTIVE                     0.356634
BABY CARE                      0.048603
BEAUTY                         0.292654
BEVERAGES                      2.211816
BOOKS                          0.033347
BREAD/BAKERY                   1.262289
CELEBRATION                    1.225508
CLEANING                       1.691219
DAIRY                          1.428325
DELI                           0.988227
EGGS                           0.806652
FROZEN FOODS                   0.881626
GROCERY I                      2.530846
GROCERY II                     0.502886
HARDWARE                       0.286573
HOME AND KITCHEN I             1.731620
HOME AND KITCHEN II            1.169959
HOME APPLIANCES                0.153052
HOME CARE                      6.862903
LADIESWEAR                     1.070523
LAWN AND GARDEN                0.446476
LINGERIE                       0.522901
LIQUOR,WINE,BEER               2.218664
MAGAZINES                      0.411709
MEATS                          0.

In [118]:
end_test='2017-08-31'
start_test='2017-08-16'
X_test = dp.out_of_sample(steps=16)


X_test = X_test.join(calendar)


sales_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
sales_pred = sales_pred.stack(['store_nbr', 'family'])
sales_pred[sales_pred < 0] = 0. 

In [119]:
sales_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales
Unnamed: 0_level_1,store_nbr,family,Unnamed: 3_level_1
2017-08-16,1,AUTOMOTIVE,4.140577
2017-08-16,1,BABY CARE,0.000000
2017-08-16,1,BEAUTY,2.919929
2017-08-16,1,BEVERAGES,2155.784718
2017-08-16,1,BOOKS,0.131584
...,...,...,...
2017-08-31,54,POULTRY,53.833367
2017-08-31,54,PREPARED FOODS,76.766223
2017-08-31,54,PRODUCE,543.606592
2017-08-31,54,SCHOOL AND OFFICE SUPPLIES,0.000000


In [120]:
My_submission = pd.read_csv('data/sample_submission.csv', index_col='id')
My_submission.sales = sales_pred.values
My_submission.to_csv('submission.csv', index=True)