In [1]:
import datetime 

import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import IterativeImputer

In [2]:
features_df = pd.read_csv('data/features.csv')
sales_df = pd.read_csv('data/sales.csv')
stores_df = pd.read_csv('data/stores.csv')

In [3]:
features_df.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.24217,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False
3,1,26/02/2010,46.63,2.561,,,,,,211.319643,8.106,False
4,1,05/03/2010,46.5,2.625,,,,,,211.350143,8.106,False


In [4]:
sales_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,05/02/2010,24924.5,False
1,1,1,12/02/2010,46039.49,True
2,1,1,19/02/2010,41595.55,False
3,1,1,26/02/2010,19403.54,False
4,1,1,05/03/2010,21827.9,False


In [5]:
stores_df.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [6]:
one_hot_encoding = pd.get_dummies(stores_df['Type'])
stores_df = stores_df.join(one_hot_encoding)
stores_df = stores_df.drop(columns=['Type'])
stores_df.head()

Unnamed: 0,Store,Size,A,B,C
0,1,151315,1,0,0
1,2,202307,1,0,0
2,3,37392,0,1,0
3,4,205863,1,0,0
4,5,34875,0,1,0


In [7]:
week_sales_df = sales_df.groupby(['Date', 'Store', 'Dept']).agg({'Weekly_Sales': 'sum'}).sort_index()
week_sales_df.reset_index(inplace=True)
training_df = pd.merge(features_df, week_sales_df, how='left', on=["Date", "Store"])
training_df = pd.merge(training_df, stores_df, how='left', on=["Store"])

In [8]:
training_df.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Size,A,B,C
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,1.0,24924.5,151315,1,0,0
1,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,2.0,50605.27,151315,1,0,0
2,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,3.0,13740.12,151315,1,0,0
3,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,4.0,39954.04,151315,1,0,0
4,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,5.0,32229.38,151315,1,0,0


In [9]:
training_df.isna().sum()

Store                0
Date                 0
Temperature          0
Fuel_Price           0
MarkDown1       270892
MarkDown2       310793
MarkDown3       284667
MarkDown4       286859
MarkDown5       270138
CPI                585
Unemployment       585
IsHoliday            0
Dept              1755
Weekly_Sales      1755
Size                 0
A                    0
B                    0
C                    0
dtype: int64

In [10]:
def gen_datetime(date_str):
    return datetime.datetime.strptime(date_str, '%d/%m/%Y')

training_df['DateTime'] = training_df['Date'].map(gen_datetime)

In [11]:
training_df = training_df.set_index('DateTime').sort_index()

In [12]:
training_df.head()

Unnamed: 0_level_0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Size,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-02-05,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False,1.0,24924.5,151315,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,,,,,,204.247194,8.187,False,74.0,26726.1,203742,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,,,,,,204.247194,8.187,False,72.0,173075.66,203742,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,,,,,,204.247194,8.187,False,71.0,20512.14,203742,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,,,,,,204.247194,8.187,False,67.0,19251.67,203742,1,0,0


In [13]:
training_df['CPI'] = training_df['CPI'].interpolate(method='index')
training_df['Unemployment'] = training_df['Unemployment'].interpolate(method='index')
training_df['Weekly_Sales'] = training_df['Weekly_Sales'].interpolate(method='index')

In [14]:
training_df.isna().sum()

Store                0
Date                 0
Temperature          0
Fuel_Price           0
MarkDown1       270892
MarkDown2       310793
MarkDown3       284667
MarkDown4       286859
MarkDown5       270138
CPI                  0
Unemployment         0
IsHoliday            0
Dept              1755
Weekly_Sales         0
Size                 0
A                    0
B                    0
C                    0
dtype: int64

In [15]:
training_df['MarkDown1'] = IterativeImputer(random_state=0).fit_transform(training_df['MarkDown1'].values.reshape(-1, 1))
training_df['MarkDown2'] = IterativeImputer(random_state=0).fit_transform(training_df['MarkDown2'].values.reshape(-1, 1))
training_df['MarkDown3'] = IterativeImputer(random_state=0).fit_transform(training_df['MarkDown3'].values.reshape(-1, 1))
training_df['MarkDown4'] = IterativeImputer(random_state=0).fit_transform(training_df['MarkDown4'].values.reshape(-1, 1))
training_df['MarkDown5'] = IterativeImputer(random_state=0).fit_transform(training_df['MarkDown5'].values.reshape(-1, 1))

In [16]:
training_df['Dept'] = training_df['Dept'].fillna(1)

In [17]:
training_df.isna().sum()

Store           0
Date            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
IsHoliday       0
Dept            0
Weekly_Sales    0
Size            0
A               0
B               0
C               0
dtype: int64

In [18]:
training_df.head()

Unnamed: 0_level_0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Size,A,B,C
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-02-05,1,05/02/2010,42.31,2.572,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,211.096358,8.106,False,1.0,24924.5,151315,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,74.0,26726.1,203742,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,72.0,173075.66,203742,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,71.0,20512.14,203742,1,0,0
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,67.0,19251.67,203742,1,0,0


In [19]:
training_df['Year'] = training_df['Date'].map(lambda x: int(x[-4:]))

In [20]:
training_df.head()

Unnamed: 0_level_0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Dept,Weekly_Sales,Size,A,B,C,Year
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2010-02-05,1,05/02/2010,42.31,2.572,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,211.096358,8.106,False,1.0,24924.5,151315,1,0,0,2010
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,74.0,26726.1,203742,1,0,0,2010
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,72.0,173075.66,203742,1,0,0,2010
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,71.0,20512.14,203742,1,0,0,2010
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,False,67.0,19251.67,203742,1,0,0,2010


In [21]:
def day_of_year(date_str):
    date = datetime.datetime.strptime(date_str, '%d/%m/%Y')
    return date.timetuple().tm_yday

training_df['DayOfYear'] = training_df['Date'].map(day_of_year)
training_df['DayOfYearCos'] = np.cos(training_df['DayOfYear'])
training_df['DayOfYearSin'] = np.sin(training_df['DayOfYear'])

In [22]:
training_df.head()

Unnamed: 0_level_0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,1,05/02/2010,42.31,2.572,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,211.096358,...,1.0,24924.5,151315,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,...,74.0,26726.1,203742,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,...,72.0,173075.66,203742,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,...,71.0,20512.14,203742,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,05/02/2010,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,...,67.0,19251.67,203742,1,0,0,2010,36,-0.127964,-0.991779


In [23]:
training_df = training_df.drop(columns=['Date'])

In [24]:
training_df.head()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,1,42.31,2.572,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,211.096358,8.106,...,1.0,24924.5,151315,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,...,74.0,26726.1,203742,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,...,72.0,173075.66,203742,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,...,71.0,20512.14,203742,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,20,25.92,2.784,7246.604247,3337.597289,1449.098932,3382.019834,4618.743783,204.247194,8.187,...,67.0,19251.67,203742,1,0,0,2010,36,-0.127964,-0.991779


In [25]:
training_df['Store'] = MinMaxScaler().fit_transform(training_df['Store'].values.reshape((-1, 1)))
training_df['Temperature'] = MinMaxScaler().fit_transform(training_df['Temperature'].values.reshape((-1, 1)))
training_df['Fuel_Price'] = MinMaxScaler().fit_transform(training_df['Fuel_Price'].values.reshape((-1, 1)))
training_df['MarkDown1'] = MinMaxScaler().fit_transform(training_df['MarkDown1'].values.reshape((-1, 1)))
training_df['MarkDown2'] = MinMaxScaler().fit_transform(training_df['MarkDown2'].values.reshape((-1, 1)))
training_df['MarkDown3'] = MinMaxScaler().fit_transform(training_df['MarkDown3'].values.reshape((-1, 1)))
training_df['MarkDown4'] = MinMaxScaler().fit_transform(training_df['MarkDown4'].values.reshape((-1, 1)))
training_df['MarkDown5'] = MinMaxScaler().fit_transform(training_df['MarkDown5'].values.reshape((-1, 1)))
training_df['CPI'] = MinMaxScaler().fit_transform(training_df['CPI'].values.reshape((-1, 1)))
training_df['Unemployment'] = MinMaxScaler().fit_transform(training_df['Unemployment'].values.reshape((-1, 1)))
training_df['IsHoliday'] = training_df['IsHoliday'].astype(int)
training_df['Dept'] = MinMaxScaler().fit_transform(training_df['Dept'].values.reshape((-1, 1)))
training_df['Weekly_Sales'] = MinMaxScaler().fit_transform(training_df['Weekly_Sales'].values.reshape((-1, 1)))
training_df['Size'] = MinMaxScaler().fit_transform(training_df['Size'].values.reshape((-1, 1)))

In [26]:
training_df.head()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,0.0,0.454046,0.0501,0.094634,0.034388,0.01088,0.05012,0.006226,0.826259,0.416032,...,0.0,0.042851,0.630267,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.744898,0.045431,0.914045,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.72449,0.255075,0.914045,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.714286,0.03653,0.914045,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.673469,0.034724,0.914045,1,0,0,2010,36,-0.127964,-0.991779


In [27]:
training_df.tail()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-07-26,0.568182,0.644086,0.70491,0.031333,0.01039,0.0012,0.001264,0.001647,0.064638,0.582369,...,0.0,0.037638,0.636752,1,0,0,2013,207,0.941037,-0.338305
2013-07-26,0.545455,0.695807,0.667335,0.028541,0.011044,0.00121,0.000697,0.001704,0.064638,0.582369,...,0.0,0.037638,0.504647,0,1,0,2013,207,0.941037,-0.338305
2013-07-26,0.75,0.779751,0.55511,0.028597,0.009486,0.001219,0.000588,0.002189,0.064638,0.582369,...,0.0,0.037638,0.667069,1,0,0,2013,207,0.941037,-0.338305
2013-07-26,0.227273,0.850696,0.57515,0.048776,0.009381,0.001911,0.016074,0.004471,0.064638,0.582369,...,0.0,0.037638,0.934381,1,0,0,2013,207,0.941037,-0.338305
2013-07-26,1.0,0.762999,0.667335,0.028249,0.010665,0.001212,0.000158,0.002656,0.064638,0.582369,...,0.0,0.037638,0.451136,0,1,0,2013,207,0.941037,-0.338305


Split training set from 2010 to 2011 and test set from 2012 to 2013

In [28]:
training_set_df = training_df[:'2012-01-01']

In [29]:
training_set_df.head()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,0.0,0.454046,0.0501,0.094634,0.034388,0.01088,0.05012,0.006226,0.826259,0.416032,...,0.0,0.042851,0.630267,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.744898,0.045431,0.914045,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.72449,0.255075,0.914045,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.714286,0.03653,0.914045,1,0,0,2010,36,-0.127964,-0.991779
2010-02-05,0.431818,0.30401,0.156313,0.094634,0.034388,0.01088,0.05012,0.006226,0.759706,0.423652,...,0.673469,0.034724,0.914045,1,0,0,2010,36,-0.127964,-0.991779


In [30]:
training_set_df.tail()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-12-30,0.931818,0.514555,0.478958,0.028536,0.016348,0.001222,0.002723,0.000835,0.038936,0.394205,...,0.918367,0.047118,0.026063,0,0,1,2011,364,0.911143,-0.412091
2011-12-30,0.931818,0.514555,0.478958,0.028536,0.016348,0.001222,0.002723,0.000835,0.038936,0.394205,...,0.908163,0.072088,0.026063,0,0,1,2011,364,0.911143,-0.412091
2011-12-30,0.931818,0.514555,0.478958,0.028536,0.016348,0.001222,0.002723,0.000835,0.038936,0.394205,...,0.877551,0.008676,0.026063,0,0,1,2011,364,0.911143,-0.412091
2011-12-30,0.931818,0.514555,0.478958,0.028536,0.016348,0.001222,0.002723,0.000835,0.038936,0.394205,...,0.857143,0.007245,0.026063,0,0,1,2011,364,0.911143,-0.412091
2011-12-30,0.931818,0.514555,0.478958,0.028536,0.016348,0.001222,0.002723,0.000835,0.038936,0.394205,...,0.938776,0.043392,0.026063,0,0,1,2011,364,0.911143,-0.412091


In [31]:
test_set_df = training_df['2012-01-01':]

In [32]:
test_set_df.head()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-06,0.386364,0.352984,0.484469,0.084727,0.419879,0.002056,0.024497,0.011291,0.103332,0.413115,...,0.346939,0.010419,0.4643,0,1,0,2012,6,0.96017,-0.279415
2012-01-06,0.386364,0.352984,0.484469,0.084727,0.419879,0.002056,0.024497,0.011291,0.103332,0.413115,...,0.336735,0.027423,0.4643,0,1,0,2012,6,0.96017,-0.279415
2012-01-06,0.386364,0.352984,0.484469,0.084727,0.419879,0.002056,0.024497,0.011291,0.103332,0.413115,...,0.326531,0.014814,0.4643,0,1,0,2012,6,0.96017,-0.279415
2012-01-06,0.386364,0.352984,0.484469,0.084727,0.419879,0.002056,0.024497,0.011291,0.103332,0.413115,...,0.316327,0.013884,0.4643,0,1,0,2012,6,0.96017,-0.279415
2012-01-06,0.386364,0.352984,0.484469,0.084727,0.419879,0.002056,0.024497,0.011291,0.103332,0.413115,...,0.377551,0.099799,0.4643,0,1,0,2012,6,0.96017,-0.279415


In [33]:
test_set_df.tail()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Dept,Weekly_Sales,Size,A,B,C,Year,DayOfYear,DayOfYearCos,DayOfYearSin
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-07-26,0.568182,0.644086,0.70491,0.031333,0.01039,0.0012,0.001264,0.001647,0.064638,0.582369,...,0.0,0.037638,0.636752,1,0,0,2013,207,0.941037,-0.338305
2013-07-26,0.545455,0.695807,0.667335,0.028541,0.011044,0.00121,0.000697,0.001704,0.064638,0.582369,...,0.0,0.037638,0.504647,0,1,0,2013,207,0.941037,-0.338305
2013-07-26,0.75,0.779751,0.55511,0.028597,0.009486,0.001219,0.000588,0.002189,0.064638,0.582369,...,0.0,0.037638,0.667069,1,0,0,2013,207,0.941037,-0.338305
2013-07-26,0.227273,0.850696,0.57515,0.048776,0.009381,0.001911,0.016074,0.004471,0.064638,0.582369,...,0.0,0.037638,0.934381,1,0,0,2013,207,0.941037,-0.338305
2013-07-26,1.0,0.762999,0.667335,0.028249,0.010665,0.001212,0.000158,0.002656,0.064638,0.582369,...,0.0,0.037638,0.451136,0,1,0,2013,207,0.941037,-0.338305


In [34]:
X_training = training_set_df.drop(columns=['Weekly_Sales', 'Year', 'DayOfYear']).values
y_training = training_set_df['Weekly_Sales'].values
X_prediction = test_set_df.drop(columns=['Weekly_Sales', 'Year', 'DayOfYear']).values
y_prediction = test_set_df['Weekly_Sales'].values

In [35]:
X_training.shape, y_training.shape, X_prediction.shape, y_prediction.shape

((294132, 18), (294132,), (129193, 18), (129193,))

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_training, y_training, test_size=0.33, random_state=0)

In [37]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((197068, 18), (197068,), (97064, 18), (97064,))

In [38]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

dtr_regressor = DecisionTreeRegressor(max_depth=50, min_samples_leaf=200, random_state=0)

In [39]:
# cv_scores = cross_val_score(dtr_regressor, X_train, y_train, cv=10)
# max(cv_scores), min(cv_scores)

In [40]:
dtr_regressor = dtr_regressor.fit(X_train, y_train)
dtr_regressor.score(X_test, y_test)

0.8274291825709694

In [41]:
future_pred = dtr_regressor.predict(X_prediction)
dtr_regressor.score(X_prediction, y_prediction)

0.8385161351390665

In [67]:
from sklearn.ensemble import RandomForestRegressor

rfr_regressor = RandomForestRegressor(n_estimators=20, max_depth=100, min_samples_leaf=150, random_state=0)

In [68]:
rfr_regressor = rfr_regressor.fit(X_train, y_train)
rfr_regressor.score(X_test, y_test)

0.8323838928627819

In [69]:
future_pred = rfr_regressor.predict(X_prediction)
rfr_regressor.score(X_prediction, y_prediction)

0.8561267374550674

In [80]:
from sklearn.ensemble import GradientBoostingRegressor

gbr_regressor = GradientBoostingRegressor(n_estimators=20, max_depth=150, min_samples_leaf=150, random_state=0)

In [81]:
gbr_regressor = gbr_regressor.fit(X_train, y_train)
gbr_regressor.score(X_test, y_test)

0.8768555612876743

In [82]:
future_pred = gbr_regressor.predict(X_prediction)
gbr_regressor.score(X_prediction, y_prediction)

0.8662145894883486