In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import xgboost

from pandas.api.types import CategoricalDtype

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from statsmodels.graphics.tsaplots import plot_pacf
from xgboost import XGBRegressor
#import lightgbm

In [4]:
all_csv = pd.read_csv('/Users/spartan/Downloads/store-sales-forecasting-main/cleaned.csv',
                 dtype = {
                     'store_nbr' : 'category',
                     'family' : 'category',
                     'sales': 'float',
                     'city': 'category',
                     'state': 'category',
                     'type': 'category',
                     'holiday_type': 'category',
                     'holiday_transferred': 'category'
                 },
                  parse_dates=['date'])
all_csv['date'] = pd.to_datetime(all_csv['date']).dt.to_period('D')

In [5]:
all = all_csv.copy()  # we can start experimenting from here without reloading the csv file

In [6]:
# this is for experimentation

filter_by_stores = None  # note: please use string here (unlike Mine.ipynb)
filter_by_family = None
filter_by_dates = None

#filter_by_stores = ['1', '2']  # note: please use string here (unlike Mine.ipynb)
#filter_by_family = ['DAIRY', 'PRODUCE']
#filter_by_family = ['']
#filter_by_dates = '2014-06-05'

In [7]:
if filter_by_dates == None:
    train_start_date = '2013-01-01'
else:
    train_start_date = filter_by_dates
train_end_date = '2017-08-15'
test_start_date = '2017-08-16'
test_end_date = '2017-08-31'

In [8]:
if filter_by_family != None:
    all = all[all['family'].isin(filter_by_family)]
if filter_by_stores != None:
    all = all[all['store_nbr'].isin(filter_by_stores)]
if filter_by_dates != None:
    all = all[all['date'] >= filter_by_dates]

In [9]:
all.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036528 entries, 0 to 3036527
Data columns (total 38 columns):
 #   Column               Dtype    
---  ------               -----    
 0   date                 period[D]
 1   store_nbr            category 
 2   family               category 
 3   sales                float64  
 4   onpromotion          int64    
 5   sales_lag_01         float64  
 6   sales_lag_02         float64  
 7   sales_lag_03         float64  
 8   sales_lag_04         float64  
 9   sales_lag_05         float64  
 10  sales_lag_06         float64  
 11  sales_lag_07         float64  
 12  sales_lag_08         float64  
 13  sales_lag_09         float64  
 14  sales_lag_10         float64  
 15  sales_lag_11         float64  
 16  sales_lag_12         float64  
 17  sales_lag_13         float64  
 18  sales_lag_14         float64  
 19  sales_lag_15         float64  
 20  sales_lag_16         float64  
 21  sales_lag_17         float64  
 22  sales_lag_18      

## One Hot Encoding

In [10]:
def one_hot_encode(df):
    return pd.get_dummies(data=df, columns=['store_nbr', 'family', 'city', 'state', 'type',
                                     'cluster', 'holiday_type', 'holiday_transferred', 'weekday'])    

In [11]:
all_ohe = one_hot_encode(all)
all_ohe = all_ohe.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))  # remove bad char in column names

X = all_ohe[all_ohe['date'] <= train_end_date]
X = X.drop(['sales'], axis=1)
y = all_ohe[['date', 'sales']][all_ohe['date'] <= train_end_date]
y.set_index('date', inplace=True)

X_test = all_ohe[all_ohe['date'] >= test_start_date]
X_test = X_test.drop(['sales'], axis=1)

X.drop('date', axis=1, inplace=True)
X_test.drop('date', axis=1, inplace=True)
y.set_index(X.index, inplace=True)
X.columns.size

190

In [24]:
X_test.columns.size

190

In [12]:
y.columns

Index(['sales'], dtype='object')

## PCA

In [42]:
from sklearn.decomposition import PCA
pca = PCA(n_components=150)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents)
principalDf['day_of_month'] = X['day_of_month']

X_train, X_val, y_train, y_val = train_test_split(principalDf, y, random_state=1)

X_train.columns.size

151

In [123]:
X_train.columns.size

151

In [113]:
X_train['day_of_month'].value_counts()

10    75168
9     75086
1     75066
4     74974
2     74918
12    74911
15    74897
14    74871
5     74800
6     74797
11    74751
13    74742
7     74741
8     74692
3     74568
28    73704
23    73683
27    73622
18    73562
21    73537
25    73511
20    73484
16    73483
17    73480
24    73416
19    73375
22    73344
26    73330
29    68216
30    66654
31    42629
Name: day_of_month, dtype: int64

In [116]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,day_of_month
1198478,135.592885,-10.015295,9.476951,-12.823888,0.368836,-2.006899,0.562564,-0.628737,0.44003,0.235081,...,2.228957e-15,1.95189e-15,-2.742138e-16,-2.492552e-15,-2.823452e-15,-9.586653e-16,1.97933e-15,2.563934e-15,3.444848e-16,4
1649869,22.473936,-10.872151,11.456803,-0.950868,0.576632,-1.009578,-0.212883,-0.417954,-0.087567,0.109136,...,-2.592875e-15,-1.753002e-16,2.749658e-15,-2.194168e-15,-1.735113e-15,-1.730487e-15,-4.728794e-16,-1.937153e-16,-2.614478e-15,15
615038,173.993823,-10.728431,10.330419,-5.115483,0.563089,-0.080648,1.486487,-0.632651,0.230683,-0.164606,...,3.850906e-16,1.311421e-15,-1.75678e-15,3.039522e-15,-4.633274e-15,-2.168064e-15,-2.377412e-15,-1.970937e-15,5.396037e-15,12
413595,59.866926,-5.603041,3.334956,4.827683,0.759999,-1.136873,1.68788,-0.567452,-0.646616,0.557999,...,4.52914e-15,1.168374e-16,-2.746171e-16,-2.022751e-15,-3.501123e-16,-9.687222e-16,3.6538490000000005e-17,2.310413e-15,4.911112e-15,21
1398041,-119.955189,-3.461478,1.191805,9.243761,0.662329,-2.068293,0.012483,-1.034461,-0.14423,-0.706314,...,3.281796e-15,3.159497e-16,4.916253e-16,-1.413712e-15,-5.601481000000001e-17,-1.276532e-15,-1.228001e-15,1.035787e-15,1.365768e-15,24


In [28]:
day_of_month = X_test['day_of_month']

In [29]:
day_of_month.head()

3008016    16
3008017    16
3008018    16
3008019    16
3008020    16
Name: day_of_month, dtype: int64

In [31]:
X_test.isna().sum()

onpromotion         0
sales_lag_01    26730
sales_lag_02    24948
sales_lag_03    23166
sales_lag_04    21384
                ...  
weekday_2           0
weekday_3           0
weekday_4           0
weekday_5           0
weekday_6           0
Length: 190, dtype: int64

In [32]:
X_test.size

5417280

In [34]:
X_test = X_test.fillna(0)

In [35]:
X_test.isna().sum()

onpromotion     0
sales_lag_01    0
sales_lag_02    0
sales_lag_03    0
sales_lag_04    0
               ..
weekday_2       0
weekday_3       0
weekday_4       0
weekday_5       0
weekday_6       0
Length: 190, dtype: int64

In [15]:
sum(pca.explained_variance_ratio_)

1.0000000000000095

In [36]:
X_test.head()

Unnamed: 0,onpromotion,sales_lag_01,sales_lag_02,sales_lag_03,sales_lag_04,sales_lag_05,sales_lag_06,sales_lag_07,sales_lag_08,sales_lag_09,...,holiday_type_Bridge,holiday_transferred_False,holiday_transferred_True,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
3008016,0,1.609438,0.693147,0.693147,1.94591,0.693147,2.302585,2.079442,1.609438,2.079442,...,0,0,0,0,0,1,0,0,0,0
3008017,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3008018,2,1.609438,1.94591,0.693147,1.386294,0.693147,2.397895,1.609438,1.098612,1.791759,...,0,0,0,0,0,1,0,0,0,0
3008019,20,7.571988,7.697121,6.689599,7.414573,6.914731,7.774015,7.745868,7.79111,7.643483,...,0,0,0,0,0,1,0,0,0,0
3008020,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [37]:
X_test = pca.transform(X_test)

In [39]:
X_test = pd.DataFrame(X_test)

In [40]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,54.735016,-6.82142,5.321452,-0.121539,0.187416,-0.897372,-2.291834,1.178392,-0.646932,-0.39629,...,-1.179612e-15,7.060325e-16,-8.595555e-16,-7.627362e-16,-1.020017e-15,7.615436e-16,-1.099381e-15,1.465841e-15,4.753142e-16,5.2041700000000004e-18
1,54.738457,-10.901443,11.417594,-0.174549,0.200042,-1.004832,-2.272362,1.291243,-0.184776,0.090314,...,-2.359224e-16,1.025222e-15,-8.317999e-16,-1.104802e-16,-5.620504e-16,1.066855e-15,-4.466913e-17,7.441964e-16,2.255141e-16,3.382711e-16
2,54.741884,-5.762823,7.334577,-0.143222,0.198285,-0.977237,-2.248057,1.234142,0.22747,-0.067357,...,-6.800116e-16,1.344411e-15,-8.040443e-16,9.768662000000001e-17,-7.424616e-16,6.782769e-16,9.410875e-17,8.829742e-16,7.251144e-16,3.660267e-16
3,54.783501,24.485242,-5.499398,-0.015502,0.232918,-0.890922,-1.898497,0.905668,-0.373819,-0.73831,...,-8.187895e-16,1.191755e-15,-9.705778e-16,-2.631359e-16,4.8572260000000006e-17,1.344411e-15,-3.77736e-16,4.38885e-16,1.307982e-15,-3.139849e-16
4,54.738325,-10.815425,11.28867,-0.172892,0.199916,-1.038345,-2.272757,1.286362,-0.228206,-0.121589,...,-3.747003e-16,9.558326e-16,-3.04444e-16,-1.24358e-16,-2.151057e-16,9.003215e-16,-4.054916e-16,9.384854e-16,1.144917e-16,5.2041700000000004e-18


In [41]:
X_test['day_of_month'] = day_of_month

In [67]:
day_of_month.isna().sum()

0

In [61]:
X_test['day_of_month'].isna().sum()

28512

In [43]:
X_test.columns.size

151

In [71]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,day_of_month
0,54.735016,-6.82142,5.321452,-0.121539,0.187416,-0.897372,-2.291834,1.178392,-0.646932,-0.39629,...,7.060325e-16,-8.595555e-16,-7.627362e-16,-1.020017e-15,7.615436e-16,-1.099381e-15,1.465841e-15,4.753142e-16,5.2041700000000004e-18,
1,54.738457,-10.901443,11.417594,-0.174549,0.200042,-1.004832,-2.272362,1.291243,-0.184776,0.090314,...,1.025222e-15,-8.317999e-16,-1.104802e-16,-5.620504e-16,1.066855e-15,-4.466913e-17,7.441964e-16,2.255141e-16,3.382711e-16,
2,54.741884,-5.762823,7.334577,-0.143222,0.198285,-0.977237,-2.248057,1.234142,0.22747,-0.067357,...,1.344411e-15,-8.040443e-16,9.768662000000001e-17,-7.424616e-16,6.782769e-16,9.410875e-17,8.829742e-16,7.251144e-16,3.660267e-16,
3,54.783501,24.485242,-5.499398,-0.015502,0.232918,-0.890922,-1.898497,0.905668,-0.373819,-0.73831,...,1.191755e-15,-9.705778e-16,-2.631359e-16,4.8572260000000006e-17,1.344411e-15,-3.77736e-16,4.38885e-16,1.307982e-15,-3.139849e-16,
4,54.738325,-10.815425,11.28867,-0.172892,0.199916,-1.038345,-2.272757,1.286362,-0.228206,-0.121589,...,9.558326e-16,-3.04444e-16,-1.24358e-16,-2.151057e-16,9.003215e-16,-4.054916e-16,9.384854e-16,1.144917e-16,5.2041700000000004e-18,


In [117]:
X_test.drop(["day_of_month"],axis=1,inplace=True)

In [118]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,54.735016,-6.82142,5.321452,-0.121539,0.187416,-0.897372,-2.291834,1.178392,-0.646932,-0.39629,...,-1.179612e-15,7.060325e-16,-8.595555e-16,-7.627362e-16,-1.020017e-15,7.615436e-16,-1.099381e-15,1.465841e-15,4.753142e-16,5.2041700000000004e-18
1,54.738457,-10.901443,11.417594,-0.174549,0.200042,-1.004832,-2.272362,1.291243,-0.184776,0.090314,...,-2.359224e-16,1.025222e-15,-8.317999e-16,-1.104802e-16,-5.620504e-16,1.066855e-15,-4.466913e-17,7.441964e-16,2.255141e-16,3.382711e-16
2,54.741884,-5.762823,7.334577,-0.143222,0.198285,-0.977237,-2.248057,1.234142,0.22747,-0.067357,...,-6.800116e-16,1.344411e-15,-8.040443e-16,9.768662000000001e-17,-7.424616e-16,6.782769e-16,9.410875e-17,8.829742e-16,7.251144e-16,3.660267e-16
3,54.783501,24.485242,-5.499398,-0.015502,0.232918,-0.890922,-1.898497,0.905668,-0.373819,-0.73831,...,-8.187895e-16,1.191755e-15,-9.705778e-16,-2.631359e-16,4.8572260000000006e-17,1.344411e-15,-3.77736e-16,4.38885e-16,1.307982e-15,-3.139849e-16
4,54.738325,-10.815425,11.28867,-0.172892,0.199916,-1.038345,-2.272757,1.286362,-0.228206,-0.121589,...,-3.747003e-16,9.558326e-16,-3.04444e-16,-1.24358e-16,-2.151057e-16,9.003215e-16,-4.054916e-16,9.384854e-16,1.144917e-16,5.2041700000000004e-18


In [85]:
day_of_month = day_of_month.to_frame()

In [95]:
day_of_month.head()

Unnamed: 0,day_of_month
3008016,16
3008017,16
3008018,16
3008019,16
3008020,16


In [96]:
day_of_month.isna().sum()

day_of_month    0
dtype: int64

In [98]:
day_of_month.shape

(28512, 1)

In [105]:
X_test.shape

(28512, 150)

In [100]:
# X_test['day_of_month'] = day_of_month

In [119]:
X_test.insert(150, "day_of_month", day_of_month, True)

In [120]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,day_of_month
0,54.735016,-6.82142,5.321452,-0.121539,0.187416,-0.897372,-2.291834,1.178392,-0.646932,-0.39629,...,7.060325e-16,-8.595555e-16,-7.627362e-16,-1.020017e-15,7.615436e-16,-1.099381e-15,1.465841e-15,4.753142e-16,5.2041700000000004e-18,16
1,54.738457,-10.901443,11.417594,-0.174549,0.200042,-1.004832,-2.272362,1.291243,-0.184776,0.090314,...,1.025222e-15,-8.317999e-16,-1.104802e-16,-5.620504e-16,1.066855e-15,-4.466913e-17,7.441964e-16,2.255141e-16,3.382711e-16,16
2,54.741884,-5.762823,7.334577,-0.143222,0.198285,-0.977237,-2.248057,1.234142,0.22747,-0.067357,...,1.344411e-15,-8.040443e-16,9.768662000000001e-17,-7.424616e-16,6.782769e-16,9.410875e-17,8.829742e-16,7.251144e-16,3.660267e-16,16
3,54.783501,24.485242,-5.499398,-0.015502,0.232918,-0.890922,-1.898497,0.905668,-0.373819,-0.73831,...,1.191755e-15,-9.705778e-16,-2.631359e-16,4.8572260000000006e-17,1.344411e-15,-3.77736e-16,4.38885e-16,1.307982e-15,-3.139849e-16,16
4,54.738325,-10.815425,11.28867,-0.172892,0.199916,-1.038345,-2.272757,1.286362,-0.228206,-0.121589,...,9.558326e-16,-3.04444e-16,-1.24358e-16,-2.151057e-16,9.003215e-16,-4.054916e-16,9.384854e-16,1.144917e-16,5.2041700000000004e-18,16


In [111]:
X_test['day_of_month'].value_counts()

16    1782
17    1782
18    1782
19    1782
20    1782
21    1782
22    1782
23    1782
24    1782
25    1782
26    1782
27    1782
28    1782
29    1782
30    1782
31    1782
Name: day_of_month, dtype: int64

## Experiment I -- Linear Regression

In [44]:
run_experiment_I = True
if run_experiment_I:
    lr = LinearRegression()
    lr.fit(X_train, y_train)



In [45]:
if run_experiment_I:
    y_pred_train = lr.predict(X_train)
    y_pred_train[y_pred_train < 0] = 0
    y_pred_val = lr.predict(X_val)
    y_pred_val[y_pred_val < 0] = 0

    print("RMS log-error train: ", np.sqrt(mean_squared_log_error(y_train, y_pred_train)))
    print("RMS log-error val: ", np.sqrt(mean_squared_log_error(y_val, y_pred_val)))
    print("RMS log-error train (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(y_pred_train))))
    print("RMS log-error val (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_val))))



RMS log-error train:  0.16114941007025096
RMS log-error val:  0.16183531040518712
RMS log-error train (actual):  0.4264715466621936
RMS log-error val (actual):  0.42689531109800605


## Experiment II -- Lightgbm

In [12]:
lgb_params = {
    'boosting_type' : 'gbdt',  # gradient boosting decision tree
    'early_stopping_rounds': 200,
    'force_col_wise': True,
    'learning_rate': 0.1,
    'max_depth': 10,
    'metric': 'mse',  # mean square error
    'num_iterations': 5000,
    'num_leaves': 10,
    'random_state': 1,
    'verbose': 0
}

run_experiment_II = True  # should be true
if run_experiment_II:
    X_train_lgb = lightgbm.Dataset(data=X_train, label=y_train, feature_name='auto')
    X_val_lgb = lightgbm.Dataset(data=X_val, label=y_val, reference=X_train_lgb, feature_name='auto')

In [13]:
if run_experiment_II:
    lgb = lightgbm.train(
        params=lgb_params, 
        train_set=X_train_lgb,
        valid_sets=[X_train_lgb, X_val_lgb],
    )

[1]	training's l2: 3.08025	valid_1's l2: 3.32536
Training until validation scores don't improve for 200 rounds
[2]	training's l2: 2.64088	valid_1's l2: 2.8625
[3]	training's l2: 2.28494	valid_1's l2: 2.49129
[4]	training's l2: 1.99155	valid_1's l2: 2.18464
[5]	training's l2: 1.75269	valid_1's l2: 1.93936
[6]	training's l2: 1.55293	valid_1's l2: 1.73581
[7]	training's l2: 1.38871	valid_1's l2: 1.56638
[8]	training's l2: 1.2528	valid_1's l2: 1.43213
[9]	training's l2: 1.14397	valid_1's l2: 1.32266
[10]	training's l2: 1.05567	valid_1's l2: 1.23178
[11]	training's l2: 0.982369	valid_1's l2: 1.16004
[12]	training's l2: 0.919926	valid_1's l2: 1.10163
[13]	training's l2: 0.869092	valid_1's l2: 1.05099
[14]	training's l2: 0.826757	valid_1's l2: 1.01009
[15]	training's l2: 0.788786	valid_1's l2: 0.980336
[16]	training's l2: 0.753583	valid_1's l2: 0.945958
[17]	training's l2: 0.724031	valid_1's l2: 0.917791
[18]	training's l2: 0.69783	valid_1's l2: 0.900712
[19]	training's l2: 0.674961	valid_1's



[171]	training's l2: 0.199387	valid_1's l2: 0.764011
[172]	training's l2: 0.197458	valid_1's l2: 0.763854
[173]	training's l2: 0.1966	valid_1's l2: 0.761488
[174]	training's l2: 0.195455	valid_1's l2: 0.761336
[175]	training's l2: 0.194324	valid_1's l2: 0.763025
[176]	training's l2: 0.193571	valid_1's l2: 0.763779
[177]	training's l2: 0.192727	valid_1's l2: 0.763965
[178]	training's l2: 0.191959	valid_1's l2: 0.764188
[179]	training's l2: 0.191085	valid_1's l2: 0.76299
[180]	training's l2: 0.190247	valid_1's l2: 0.761166
[181]	training's l2: 0.189809	valid_1's l2: 0.761296
[182]	training's l2: 0.188997	valid_1's l2: 0.761071
[183]	training's l2: 0.18798	valid_1's l2: 0.760581
[184]	training's l2: 0.186727	valid_1's l2: 0.760571
[185]	training's l2: 0.185686	valid_1's l2: 0.760414
[186]	training's l2: 0.185	valid_1's l2: 0.760588
[187]	training's l2: 0.183842	valid_1's l2: 0.76059
[188]	training's l2: 0.182729	valid_1's l2: 0.760463
[189]	training's l2: 0.181973	valid_1's l2: 0.761721
[

[374]	training's l2: 0.0847432	valid_1's l2: 0.771207
[375]	training's l2: 0.0843536	valid_1's l2: 0.77109
[376]	training's l2: 0.0840959	valid_1's l2: 0.770744
[377]	training's l2: 0.0837883	valid_1's l2: 0.771273
[378]	training's l2: 0.0834566	valid_1's l2: 0.771106
[379]	training's l2: 0.0833779	valid_1's l2: 0.771085
[380]	training's l2: 0.0832138	valid_1's l2: 0.770778
[381]	training's l2: 0.082857	valid_1's l2: 0.770971
[382]	training's l2: 0.0827012	valid_1's l2: 0.771202
[383]	training's l2: 0.0824927	valid_1's l2: 0.771702
[384]	training's l2: 0.0821722	valid_1's l2: 0.771245
[385]	training's l2: 0.0817519	valid_1's l2: 0.770176
[386]	training's l2: 0.0814709	valid_1's l2: 0.770151
[387]	training's l2: 0.0810742	valid_1's l2: 0.769944
[388]	training's l2: 0.0809308	valid_1's l2: 0.769869
[389]	training's l2: 0.0807303	valid_1's l2: 0.769868
[390]	training's l2: 0.0804766	valid_1's l2: 0.769616
[391]	training's l2: 0.0801384	valid_1's l2: 0.768602
[392]	training's l2: 0.0797511

In [14]:
if run_experiment_II:
    y_pred_train = lgb.predict(X_train, num_iteration=lgb.best_iteration)
    y_pred_train[y_pred_train < 0] = 0
    y_pred_val = lgb.predict(X_val, num_iteration=lgb.best_iteration)
    y_pred_val[y_pred_val < 0] = 0

    print("RMS log-error train: ", np.sqrt(mean_squared_log_error(y_train, y_pred_train)))
    print("RMS log-error val: ", np.sqrt(mean_squared_log_error(y_val, y_pred_val)))
    print("RMS log-error train (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(y_pred_train))))
    print("RMS log-error val (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_val))))

RMS log-error train:  0.09706488341239258
RMS log-error val:  0.2018445806471314
RMS log-error train (actual):  0.3953365923801808
RMS log-error val (actual):  0.8708139540731853


## Experiment III -- Random Forest (Too Slow)

In [11]:
run_experiment_III = True
if run_experiment_III:
    random_forest = RandomForestRegressor(random_state=1)
    random_forest.fit(X_train, y_train.values.ravel())

KeyboardInterrupt: 

In [16]:
if run_experiment_III:
    y_pred_train = random_forest.predict(X_train)
    y_pred_train[y_pred_train < 0] = 0
    y_pred_val = random_forest.predict(X_val)
    y_pred_val[y_pred_val < 0] = 0

    print("RMS log-error train: ", np.sqrt(mean_squared_log_error(y_train, y_pred_train)))
    print("RMS log-error val: ", np.sqrt(mean_squared_log_error(y_val, y_pred_val)))
    print("RMS log-error train (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(y_pred_train))))
    print("RMS log-error val (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_val))))

RMS log-error train:  0.07991230493098243
RMS log-error val:  0.198402429075905
RMS log-error train (actual):  0.29627903576647857
RMS log-error val (actual):  0.8535536261784764


## Experiment IV -- XGBoost

In [10]:
param = {
      "max_depth": [6,10],
      "n_estimators": [80,100],
#        "early_stopping_rounds"=[50,55,60],
      "learning_rate": [0.01,0.1],
      #'colsample_bylevel': [0.1,0.5,1]
      
  }

In [14]:
run_experiment_IV = True
if run_experiment_IV:
    xgb = xgboost.XGBRegressor()
# xgb.fit(X_train, y_train,
#         eval_set=[(X_train, y_train), (X_val, y_val)],
#         verbose=True)

In [17]:
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param,
    scoring = 'neg_mean_squared_error',
    n_jobs = -1,
    cv = 2,
    verbose=True
)

In [None]:
grid_search.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=True)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[0]	validation_0-rmse:3.81863	validation_1-rmse:3.82278
[0]	validation_0-rmse:3.81863	validation_1-rmse:3.82278
[0]	validation_0-rmse:3.81863	validation_1-rmse:3.82278
[0]	validation_0-rmse:3.81863	validation_1-rmse:3.82278
[0]	validation_0-rmse:3.81854	validation_1-rmse:3.82270
[0]	validation_0-rmse:3.81855	validation_1-rmse:3.82270
[0]	validation_0-rmse:3.81855	validation_1-rmse:3.82270
[0]	validation_0-rmse:3.81854	validation_1-rmse:3.82270
[1]	validation_0-rmse:3.78101	validation_1-rmse:3.78513
[1]	validation_0-rmse:3.78101	validation_1-rmse:3.78513
[1]	validation_0-rmse:3.78101	validation_1-rmse:3.78513
[1]	validation_0-rmse:3.78101	validation_1-rmse:3.78513
[1]	validation_0-rmse:3.78083	validation_1-rmse:3.78497
[1]	validation_0-rmse:3.78085	validation_1-rmse:3.78498
[1]	validation_0-rmse:3.78085	validation_1-rmse:3.78498
[1]	validation_0-rmse:3.78083	validation_1-rmse:3.78497
[2]	validation_0-rmse:3.74378	validation_1-r

In [46]:
xgb = xgboost.XGBRegressor(n_estimators = 110, early_stopping_rounds=60, learning_rate=0.4)
xgb.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)], verbose = True)

[0]	validation_0-rmse:2.35245	validation_1-rmse:2.35540
[1]	validation_0-rmse:1.46719	validation_1-rmse:1.46957
[2]	validation_0-rmse:0.96064	validation_1-rmse:0.96253
[3]	validation_0-rmse:0.68641	validation_1-rmse:0.68819
[4]	validation_0-rmse:0.54877	validation_1-rmse:0.55048
[5]	validation_0-rmse:0.48273	validation_1-rmse:0.48479
[6]	validation_0-rmse:0.45276	validation_1-rmse:0.45509
[7]	validation_0-rmse:0.43745	validation_1-rmse:0.43973
[8]	validation_0-rmse:0.42934	validation_1-rmse:0.43185
[9]	validation_0-rmse:0.42333	validation_1-rmse:0.42601
[10]	validation_0-rmse:0.42010	validation_1-rmse:0.42289
[11]	validation_0-rmse:0.41676	validation_1-rmse:0.41988
[12]	validation_0-rmse:0.41425	validation_1-rmse:0.41774
[13]	validation_0-rmse:0.41218	validation_1-rmse:0.41607
[14]	validation_0-rmse:0.41042	validation_1-rmse:0.41482
[15]	validation_0-rmse:0.40899	validation_1-rmse:0.41352
[16]	validation_0-rmse:0.40782	validation_1-rmse:0.41252
[17]	validation_0-rmse:0.40704	validation

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=60, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.4, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=110, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [47]:
#if run_experiment_IV:
y_pred_train = xgb.predict(X_train)
y_pred_train[y_pred_train < 0] = 0
y_pred_val = xgb.predict(X_val)
y_pred_val[y_pred_val < 0] = 0

print("RMS log-error train: ", np.sqrt(mean_squared_log_error(y_train, y_pred_train)))
print("RMS log-error val: ", np.sqrt(mean_squared_log_error(y_val, y_pred_val)))
print("RMS log-error train (actual): ",
        np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(y_pred_train))))
print("RMS log-error val (actual): ",
        np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_val))))

RMS log-error train:  0.14556416036555803
RMS log-error val:  0.14916261370334957
RMS log-error train (actual):  0.37001666967804664
RMS log-error val (actual):  0.3866786406166397


## Experiment V -- SVR

In [25]:
from sklearn.svm import SVR
run_experiment_V = False  # slow
if run_experiment_V:
    # svr = SVR(C=1.0, epsilon=0.2)
    svr = SVR(kernel='rbf')
    svr.fit(X_train, y_train.values.ravel())

In [26]:
if run_experiment_V:
    y_pred_train = svr.predict(X_train)
    y_pred_train[y_pred_train < 0] = 0
    y_pred_val = svr.predict(X_val)
    y_pred_val[y_pred_val < 0] = 0

    print("RMS log-error train: ", np.sqrt(mean_squared_log_error(y_train, y_pred_train)))
    print("RMS log-error val: ", np.sqrt(mean_squared_log_error(y_val, y_pred_val)))
    print("RMS log-error train (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_train), np.expm1(y_pred_train))))
    print("RMS log-error val (actual): ",
          np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred_val))))

## Test (Moment of Truth)

In [114]:
X_test['day_of_month'].head()

0    16
1    16
2    16
3    16
4    16
Name: day_of_month, dtype: int64

In [128]:
X_test.shape

(28512, 151)

In [129]:
X_test.head()
X_test_mod = X_test.copy()
# pred = xgb.predict(X_test_mod[X_test_mod['day_of_month'] == 2])

In [141]:
def main_predict(model, X_test):
    X_test_mod = X_test.copy()
    output = np.array([])
    start_day, end_day = X_test['day_of_month'].min(), X_test['day_of_month'].max()
        # we lost the dates, but we still have day_of_month, which is good enough for our experiment
        
    for day in range(start_day, end_day + 1):
        pred = model.predict(X_test_mod[X_test_mod['day_of_month'] == day])
        pred[pred < 0] = 0
        print(pred)
        print(pred.shape)
        output = np.concatenate([output, pred], axis=0)
        for future in range(day + 1, end_day + 1):
            X_test_mod.loc[X_test_mod[X_test_mod['day_of_month'] == future].index,f'sales_lag_{(future - day):02d}'] = pred
            # fill out future values now that this sales figure is available
            
    return output

In [142]:
X_test_mod[X_test_mod['day_of_month'] == 16].index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1772, 1773, 1774, 1775, 1776, 1777, 1778, 1779, 1780, 1781],
           dtype='int64', length=1782)

In [143]:
y_pred_test = main_predict(xgb, X_test)

[1.6163301  0.05990369 1.7578124  ... 7.614691   5.3166647  2.7478967 ]
(1782,)


ValueError: Feature shape mismatch, expected: 151, got 166

In [24]:
delta_index = 3008016 - 3000888  # we inserted 4 Christmas days, 4 x 54 x 33 = 7128, which is the difference
#submission = pd.DataFrame({'id': X_test.index - delta_index, 'sales': np.expm1(y_pred_test)})
#submission = pd.DataFrame({'id': X_test.index - delta_index, 'sales': max(y_pred_test, 0)})
submission = pd.DataFrame({'id': X_test.index - delta_index, 'sales': np.expm1(y_pred_test)})
submission.to_csv('submissionxgboostpca.csv', index=False)