In [99]:
# importing necessary libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [100]:
import xgboost as xg
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [101]:
# importing dataset
train = pd.read_csv('TRAIN.csv')
test = pd.read_csv('TEST_FINAL.csv')
train.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [102]:
test.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount
0,T1188341,171,S4,L2,R3,2019-06-01,0,No
1,T1188342,172,S1,L1,R1,2019-06-01,0,No
2,T1188343,173,S4,L2,R1,2019-06-01,0,No
3,T1188344,174,S1,L1,R4,2019-06-01,0,No
4,T1188345,170,S1,L1,R2,2019-06-01,0,No


In [103]:
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             188340 non-null  object 
 1   Store_id       188340 non-null  int64  
 2   Store_Type     188340 non-null  object 
 3   Location_Type  188340 non-null  object 
 4   Region_Code    188340 non-null  object 
 5   Date           188340 non-null  object 
 6   Holiday        188340 non-null  int64  
 7   Discount       188340 non-null  object 
 8   #Order         188340 non-null  int64  
 9   Sales          188340 non-null  float64
dtypes: float64(1), int64(3), object(6)
memory usage: 14.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22265 entries, 0 to 22264
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             22265 non-null  object
 1   Store_id       22265 non-null  int64 
 

In [104]:
train['Year'] = [i[0] for i in train.Date.str.split('-').values]
test['Year'] = [i[0] for i in test.Date.str.split('-').values]

train['Month'] = [i[1] for i in train.Date.str.split('-').values]
test['Month'] = [i[1] for i in test.Date.str.split('-').values]

train['Day'] = [i[1] for i in train.Date.str.split('-').values]
test['Day'] = [i[1] for i in test.Date.str.split('-').values]

In [105]:
# train['Year'] = train['Year'].astype('int64')
# test['Year']=test['Year'].astype('int64')

# train['Month'] = train['Month'].astype('int64')
# test['Month']=test['Month'].astype('int64')

# train['Day'] = train['Day'].astype('int64')
# test['Day']=test['Day'].astype('int64')

In [106]:
train = train.drop(['ID','#Order', 'Date'], axis=1)
test = test.drop(['ID','Date'], axis=1)

In [107]:
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Store_id       188340 non-null  int64  
 1   Store_Type     188340 non-null  object 
 2   Location_Type  188340 non-null  object 
 3   Region_Code    188340 non-null  object 
 4   Holiday        188340 non-null  int64  
 5   Discount       188340 non-null  object 
 6   Sales          188340 non-null  float64
 7   Year           188340 non-null  object 
 8   Month          188340 non-null  object 
 9   Day            188340 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 14.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22265 entries, 0 to 22264
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Store_id       22265 non-null  int64 
 1   Store_Type     22265 non-null  object
 

In [108]:
train.head()

Unnamed: 0,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,Sales,Year,Month,Day
0,1,S1,L3,R1,1,Yes,7011.84,2018,1,1
1,253,S4,L2,R1,1,Yes,51789.12,2018,1,1
2,252,S3,L2,R1,1,Yes,36868.2,2018,1,1
3,251,S2,L3,R1,1,Yes,19715.16,2018,1,1
4,250,S2,L3,R4,1,Yes,45614.52,2018,1,1


In [109]:
obj_col = train.loc[:, train.dtypes == object]
dummy = pd.get_dummies(obj_col, drop_first=True)
dummy

Unnamed: 0,Store_Type_S2,Store_Type_S3,Store_Type_S4,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,Region_Code_R2,Region_Code_R3,Region_Code_R4,...,Day_03,Day_04,Day_05,Day_06,Day_07,Day_08,Day_09,Day_10,Day_11,Day_12
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188335,1,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
188336,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
188337,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
188338,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [110]:
date_col = train.loc[:, ['Year', 'Month', 'Day']]
date_col

Unnamed: 0,Year,Month,Day
0,2018,01,01
1,2018,01,01
2,2018,01,01
3,2018,01,01
4,2018,01,01
...,...,...,...
188335,2019,05,05
188336,2019,05,05
188337,2019,05,05
188338,2019,05,05


In [115]:
date_col['Day'].value_counts()

05    22630
03    22630
01    22630
04    21900
02    20440
07    11315
08    11315
12    11315
10    11315
11    10950
09    10950
06    10950
Name: Day, dtype: int64

In [118]:
date_dummy = pd.get_dummies(date_col, drop_first=True)
date_dummy.head()

Unnamed: 0,Year_2019,Month_02,Month_03,Month_04,Month_05,Month_06,Month_07,Month_08,Month_09,Month_10,...,Day_03,Day_04,Day_05,Day_06,Day_07,Day_08,Day_09,Day_10,Day_11,Day_12
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
date_dummy.columns =['Year', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'M10', 'M11', 'D1', 'D2',
                     'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11']
date_dummy.head()

Unnamed: 0,Year,M1,M2,M3,M4,M5,M6,M7,M8,M9,...,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
train = pd.concat([train, dummy, date_dummy], axis=1)
train.head()

Unnamed: 0,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,Sales,Year,Month,Day,...,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11
0,1,S1,L3,R1,1,Yes,7011.84,2018,1,1,...,0,0,0,0,0,0,0,0,0,0
1,253,S4,L2,R1,1,Yes,51789.12,2018,1,1,...,0,0,0,0,0,0,0,0,0,0
2,252,S3,L2,R1,1,Yes,36868.2,2018,1,1,...,0,0,0,0,0,0,0,0,0,0
3,251,S2,L3,R1,1,Yes,19715.16,2018,1,1,...,0,0,0,0,0,0,0,0,0,0
4,250,S2,L3,R4,1,Yes,45614.52,2018,1,1,...,0,0,0,0,0,0,0,0,0,0


In [121]:
train = train.drop(list(obj_col.columns), axis=1)
train.head()

Unnamed: 0,Store_id,Holiday,Sales,Store_Type_S2,Store_Type_S3,Store_Type_S4,Location_Type_L2,Location_Type_L3,Location_Type_L4,Location_Type_L5,...,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11
0,1,1,7011.84,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,253,1,51789.12,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,252,1,36868.2,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,251,1,19715.16,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,250,1,45614.52,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
X = train.drop('Sales', axis=1)
y = train.Sales

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [124]:
X_train.shape, y_train.shape

((131838, 58), (131838,))

In [125]:
X_train.columns

Index(['Store_id', 'Holiday', 'Store_Type_S2', 'Store_Type_S3',
       'Store_Type_S4', 'Location_Type_L2', 'Location_Type_L3',
       'Location_Type_L4', 'Location_Type_L5', 'Region_Code_R2',
       'Region_Code_R3', 'Region_Code_R4', 'Discount_Yes', 'Year_2019',
       'Month_02', 'Month_03', 'Month_04', 'Month_05', 'Month_06', 'Month_07',
       'Month_08', 'Month_09', 'Month_10', 'Month_11', 'Month_12', 'Day_02',
       'Day_03', 'Day_04', 'Day_05', 'Day_06', 'Day_07', 'Day_08', 'Day_09',
       'Day_10', 'Day_11', 'Day_12', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7',
       'M8', 'M9', 'M10', 'M11', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7',
       'D8', 'D9', 'D10', 'D11'],
      dtype='object')

In [136]:
help(boost)

Help on XGBRegressor in module xgboost.sklearn object:

class XGBRegressor(XGBModel, sklearn.base.RegressorMixin)
 |  XGBRegressor(*, objective='reg:squarederror', **kwargs)
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  
 |  
 |  Parameters
 |  ----------
 |  
 |      n_estimators : int
 |          Number of gradient boosted trees.  Equivalent to number of boosting
 |          rounds.
 |  
 |      max_depth : int
 |          Maximum tree depth for base learners.
 |      learning_rate : float
 |          Boosting learning rate (xgb's "eta")
 |      verbosity : int
 |          The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 |      objective : string or callable
 |          Specify the learning task and the corresponding learning objective or
 |          a custom objective function to be used (see note below).
 |      booster: string
 |          Specify which booster to use: gbtree, gblinear or dart.
 |      tree_method: string
 |          

In [137]:
params = {
    "n_estimators" : [100, 200, 300], 
    "max_depth" : [3, 5, 7, 8],
    "learning_rate" : [0.05, 0.10, 0.15, 0.20],
    "gamma" : [0.0, 0.1, 0.2, 0.3, 0.4], 
    "min_child_weight": [1,3,5,7]
}

In [138]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [126]:
# boost = xg.XGBRegressor(n_estimators=320, max_depth=7, learning_rate=0.1, gamma=1,
#                         booster='gbtree', tree_method='exact', reg_alpha=40, reg_lambda=50)
# boost.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=320, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=40, reg_lambda=50, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [139]:
classifier = xg.XGBRegressor()

In [149]:
random_search = RandomizedSearchCV(classifier, param_distributions=params, n_iter=5, n_jobs=-1, cv=2, verbose=3)

In [150]:
random_search.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV 2/2] END gamma=0.1, learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=300; total time=11.0min
[CV 1/2] END gamma=0.1, learning_rate=0.2, max_depth=5, min_child_weight=5, n_estimators=300; total time=11.1min


RandomizedSearchCV(cv=2,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                          num_parallel_tree=None,
                                          random_state=None, reg_alpha=None,
                                          reg_l

[CV 2/2] END gamma=0.0, learning_rate=0.15, max_depth=7, min_child_weight=7, n_estimators=200; total time=13.0min
[CV 2/2] END gamma=0.4, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=200; total time=13.2min
[CV 1/2] END gamma=0.4, learning_rate=0.2, max_depth=7, min_child_weight=5, n_estimators=200; total time=13.2min
[CV 1/2] END gamma=0.0, learning_rate=0.15, max_depth=7, min_child_weight=7, n_estimators=200; total time=13.2min


In [151]:
random_search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=5,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [153]:
random_search.best_params_

{'n_estimators': 300,
 'min_child_weight': 5,
 'max_depth': 5,
 'learning_rate': 0.2,
 'gamma': 0.1}

In [155]:
# Randomized parameter
reg = xg.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=5,
             min_child_weight=5,
             n_estimators=300, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)
reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=5,
             min_child_weight=5, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [156]:
rmse = np.sqrt(mean_squared_error(y_train, reg.predict(X_train)))
rmse

9917.129022276213

In [157]:
rmse = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))
rmse

10228.369097129123

[CV 2/2] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=200; total time= 4.1min
[CV 2/2] END gamma=0.4, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=200; total time=13.3min
[CV 1/2] END gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=200; total time= 4.1min
[CV 1/2] END gamma=0.4, learning_rate=0.05, max_depth=8, min_child_weight=5, n_estimators=200; total time=13.3min


In [129]:
X_train_sm = sm.add_constant(X_train)
LR = sm.OLS(y_train, X_train_sm)
LR_model = LR.fit()

In [130]:
LR_model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.569
Model:,OLS,Adj. R-squared:,0.569
Method:,Least Squares,F-statistic:,6961.0
Date:,"Sat, 18 Sep 2021",Prob (F-statistic):,0.0
Time:,15:53:55,Log-Likelihood:,-1426500.0
No. Observations:,131838,AIC:,2853000.0
Df Residuals:,131812,BIC:,2853000.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.688e+04,148.862,247.774,0.000,3.66e+04,3.72e+04
Store_id,-0.1568,0.324,-0.484,0.629,-0.792,0.479
Holiday,-7796.6941,100.481,-77.594,0.000,-7993.635,-7599.753
Store_Type_S2,-4811.1755,116.843,-41.176,0.000,-5040.186,-4582.165
Store_Type_S3,4747.4964,111.461,42.593,0.000,4529.035,4965.958
Store_Type_S4,1.354e+04,113.908,118.882,0.000,1.33e+04,1.38e+04
Location_Type_L2,1.048e+04,99.424,105.449,0.000,1.03e+04,1.07e+04
Location_Type_L3,-4527.3583,104.778,-43.209,0.000,-4732.722,-4321.995
Location_Type_L4,-7798.1089,157.747,-49.434,0.000,-8107.290,-7488.928

0,1,2,3
Omnibus:,41506.645,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,332909.806
Skew:,1.293,Prob(JB):,0.0
Kurtosis:,10.343,Cond. No.,1.15e+16


In [131]:
# Checking VIF score so that if any feature score is > 5, we can drop it to improve our model
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif.sort_values(by='VIF',ascending=False)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Features,VIF
29,Day_06,inf
30,Day_07,inf
32,Day_09,inf
33,Day_10,inf
34,Day_11,inf
35,Day_12,inf
36,M1,inf
37,M2,inf
38,M3,inf
39,M4,inf


In [132]:
X_train_sm = X_train_sm.drop('Store_id', axis=1)

In [133]:
LR = sm.OLS(y_train, X_train_sm)
LR_model = LR.fit()

In [134]:
LR_model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.569
Model:,OLS,Adj. R-squared:,0.569
Method:,Least Squares,F-statistic:,7251.0
Date:,"Sat, 18 Sep 2021",Prob (F-statistic):,0.0
Time:,15:57:03,Log-Likelihood:,-1426500.0
No. Observations:,131838,AIC:,2853000.0
Df Residuals:,131813,BIC:,2853000.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.686e+04,136.228,270.540,0.000,3.66e+04,3.71e+04
Holiday,-7796.7840,100.481,-77.595,0.000,-7993.724,-7599.844
Store_Type_S2,-4816.4570,116.331,-41.403,0.000,-5044.464,-4588.450
Store_Type_S3,4747.5403,111.461,42.594,0.000,4529.080,4966.001
Store_Type_S4,1.354e+04,113.486,119.283,0.000,1.33e+04,1.38e+04
Location_Type_L2,1.049e+04,99.383,105.506,0.000,1.03e+04,1.07e+04
Location_Type_L3,-4529.8126,104.655,-43.283,0.000,-4734.934,-4324.691
Location_Type_L4,-7803.9727,157.280,-49.618,0.000,-8112.238,-7495.707
Location_Type_L5,-1.019e+04,158.769,-64.157,0.000,-1.05e+04,-9874.964

0,1,2,3
Omnibus:,41506.669,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,332931.024
Skew:,1.293,Prob(JB):,0.0
Kurtosis:,10.343,Cond. No.,3.07e+16


In [52]:
# Checking VIF score so that if any feature score is > 5, we can drop it to improve our model
vif = pd.DataFrame()
vif['Features'] = X_train_sm.columns
vif['VIF'] = [variance_inflation_factor(X_train_sm.values, i) for i in range(X_train_sm.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif.sort_values(by='VIF',ascending=False)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Features,VIF
3,Month,inf
4,Day,inf
0,const,25271661.49
7,Store_Type_S4,2.14
8,Location_Type_L2,1.7
5,Store_Type_S2,1.59
11,Location_Type_L5,1.55
12,Region_Code_R2,1.5
13,Region_Code_R3,1.46
14,Region_Code_R4,1.41


In [53]:
#X_train_sm = X_train_sm.drop(['Month', 'Day'], axis=1)

In [71]:
X_test_sm = sm.add_constant(X_test)
#X_test_sm = X_test_sm.drop(['Month', 'Day'], axis=1)

In [59]:
X_test_sm = X_test_sm.drop('Store_id', axis=1)

In [54]:
LR = sm.OLS(y_train, X_train_sm)
LR_model = LR.fit()

In [55]:
LR_model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.549
Model:,OLS,Adj. R-squared:,0.549
Method:,Least Squares,F-statistic:,12320.0
Date:,"Sat, 18 Sep 2021",Prob (F-statistic):,0.0
Time:,14:59:31,Log-Likelihood:,-1429500.0
No. Observations:,131838,AIC:,2859000.0
Df Residuals:,131824,BIC:,2859000.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.411e+06,1.51e+05,-15.928,0.000,-2.71e+06,-2.11e+06
Holiday,-8742.3106,101.003,-86.555,0.000,-8940.275,-8544.347
Year,1211.9547,74.995,16.160,0.000,1064.966,1358.944
Store_Type_S2,-4821.2869,119.046,-40.499,0.000,-5054.614,-4587.960
Store_Type_S3,4755.5065,114.061,41.693,0.000,4531.948,4979.065
Store_Type_S4,1.354e+04,116.135,116.606,0.000,1.33e+04,1.38e+04
Location_Type_L2,1.051e+04,101.702,103.334,0.000,1.03e+04,1.07e+04
Location_Type_L3,-4505.4347,107.097,-42.069,0.000,-4715.342,-4295.527
Location_Type_L4,-7775.0593,160.949,-48.307,0.000,-8090.517,-7459.601

0,1,2,3
Omnibus:,42079.385,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,322290.012
Skew:,1.329,Prob(JB):,0.0
Kurtosis:,10.184,Cond. No.,8960000.0


In [56]:
# Checking VIF score so that if any feature score is > 5, we can drop it to improve our model
vif = pd.DataFrame()
vif['Features'] = X_train_sm.columns
vif['VIF'] = [variance_inflation_factor(X_train_sm.values, i) for i in range(X_train_sm.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif.sort_values(by='VIF',ascending=False)

Unnamed: 0,Features,VIF
0,const,19700094.2
5,Store_Type_S4,2.14
6,Location_Type_L2,1.7
3,Store_Type_S2,1.59
9,Location_Type_L5,1.55
10,Region_Code_R2,1.5
11,Region_Code_R3,1.46
12,Region_Code_R4,1.41
7,Location_Type_L3,1.32
4,Store_Type_S3,1.28


In [68]:
boost = xg.XGBRegressor(n_estimators=320, max_depth=7, learning_rate=0.1, gamma=1,
                        booster='gbtree', tree_method='exact', reg_alpha=40, reg_lambda=50)
boost.fit(X_train_sm, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=320, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=40, reg_lambda=50, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
rmse = np.sqrt(mean_squared_error(y_train, boost.predict(X_train_sm)))
rmse

9835.867208432637

In [72]:
rmse = np.sqrt(mean_squared_error(y_test, boost.predict(X_test_sm)))
rmse

10156.085873616557