In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# Load the raw weekly sales data (the path is relative to the working directory).
sales_csv_path = 'Walmart_Store_sales.csv'
df = pd.read_csv(sales_csv_path)

In [3]:
# Preview the first five rows to check the columns and value formats.
df.head(n=5)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,5/2/2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12/2/2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,5/3/2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(6435, 8)

In [5]:
# Pairwise Pearson correlations between the numeric columns.
# `Date` is read as a string column; pandas >= 2.0 no longer drops non-numeric
# columns silently in corr() and raises instead, so restrict to numeric dtypes
# explicitly (this form also works on older pandas versions).
df.select_dtypes(include='number').corr()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
Store,1.0,-0.335332,6.250842e-20,-0.022659,0.060023,-0.209492,0.223531
Weekly_Sales,-0.335332,1.0,0.03689097,-0.06381,0.009464,-0.072634,-0.106176
Holiday_Flag,6.250842e-20,0.036891,1.0,-0.155091,-0.078347,-0.002162,0.01096
Temperature,-0.02265908,-0.06381,-0.1550913,1.0,0.144982,0.176888,0.101158
Fuel_Price,0.06002295,0.009464,-0.07834652,0.144982,1.0,-0.170642,-0.034684
CPI,-0.2094919,-0.072634,-0.002162091,0.176888,-0.170642,1.0,-0.30202
Unemployment,0.2235313,-0.106176,0.01096028,0.101158,-0.034684,-0.30202,1.0


In [6]:
# Candidate predictors and the regression target.
predictor_names = ['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
X = df[predictor_names]
y = df.loc[:, 'Weekly_Sales']

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [8]:
# Feature selection: rank the candidate predictors by random-forest importance, then keep the top three.

In [9]:
# Fit a random forest on the training split only and tabulate its feature
# importances. The seed is fixed so the ranking used for feature selection is
# the same on every Restart & Run All; an unseeded forest yields slightly
# different importances on each run.
rf = RandomForestRegressor(random_state=1234).fit(X_train, y_train)
feature_importances = pd.DataFrame({'features': X_train.columns, 'importances': rf.feature_importances_})

In [10]:
# Rank the features from most to least important.
feature_importances.sort_values('importances', ascending=False)

Unnamed: 0,features,importances
4,Unemployment,0.321241
3,CPI,0.264524
1,Temperature,0.257384
2,Fuel_Price,0.146043
0,Holiday_Flag,0.010809


In [11]:
# Keep only the three highest-ranked features, in the same order, in both splits.
selected_features = ['CPI', 'Unemployment', 'Temperature']
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [12]:
# Standardize the selected features. The scaler is fitted on the training split
# only and then reused, unchanged, on the test split.
scalerx = StandardScaler()
columns = X_train.columns

# fit_transform/transform return plain arrays, so re-wrap them with the column labels.
X_train_scaled = pd.DataFrame(scalerx.fit_transform(X_train), columns=columns)
X_test_scaled = pd.DataFrame(scalerx.transform(X_test), columns=columns)

In [13]:
# Standardize the target. StandardScaler expects 2-D input, so the targets are
# first turned into single-column arrays.
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# Fit on the training target only, then apply the same scaling to the test target.
scalery = StandardScaler()
y_train_scaled = scalery.fit_transform(y_train).ravel()
# Flatten the test target too, so both scaled targets are 1-D like the output of
# model.predict(). A leftover (n, 1) column would silently broadcast to (n, n)
# in any manual arithmetic against 1-D predictions.
y_test_scaled = scalery.transform(y_test).ravel()

In [14]:
# Hyper-parameter grid: 9 values of n_estimators x 3 values of max_depth = 27 candidates.
params = {'n_estimators':[100, 150, 200, 250, 300, 350, 400, 450, 500], 'max_depth':[3,4,5]}

# Seed the estimator so the search result is reproducible across runs and uses
# the same seed as the final model trained from the selected parameters.
reg = GradientBoostingRegressor(random_state = 1234)

# 5-fold cross-validated grid search scored by R^2; train scores are kept so
# over-fitting can be inspected in model_cv.cv_results_.
folds = 5
model_cv = GridSearchCV(estimator = reg,
                        param_grid = params,
                        scoring= 'r2',
                        cv = folds,
                        return_train_score=True,
                        verbose = 3)
model_cv.fit(X_train_scaled, y_train_scaled)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END ..................max_depth=3, n_estimators=100; total time=   0.3s
[CV 2/5] END ..................max_depth=3, n_estimators=100; total time=   0.3s
[CV 3/5] END ..................max_depth=3, n_estimators=100; total time=   0.3s
[CV 4/5] END ..................max_depth=3, n_estimators=100; total time=   0.3s
[CV 5/5] END ..................max_depth=3, n_estimators=100; total time=   0.3s
[CV 1/5] END ..................max_depth=3, n_estimators=150; total time=   0.5s
[CV 2/5] END ..................max_depth=3, n_estimators=150; total time=   0.5s
[CV 3/5] END ..................max_depth=3, n_estimators=150; total time=   0.5s
[CV 4/5] END ..................max_depth=3, n_estimators=150; total time=   0.5s
[CV 5/5] END ..................max_depth=3, n_estimators=150; total time=   0.5s
[CV 1/5] END ..................max_depth=3, n_estimators=200; total time=   0.7s
[CV 2/5] END ..................max_depth=3, n_e

[CV 2/5] END ..................max_depth=5, n_estimators=200; total time=   1.2s
[CV 3/5] END ..................max_depth=5, n_estimators=200; total time=   1.2s
[CV 4/5] END ..................max_depth=5, n_estimators=200; total time=   1.2s
[CV 5/5] END ..................max_depth=5, n_estimators=200; total time=   1.3s
[CV 1/5] END ..................max_depth=5, n_estimators=250; total time=   1.5s
[CV 2/5] END ..................max_depth=5, n_estimators=250; total time=   1.5s
[CV 3/5] END ..................max_depth=5, n_estimators=250; total time=   1.6s
[CV 4/5] END ..................max_depth=5, n_estimators=250; total time=   1.7s
[CV 5/5] END ..................max_depth=5, n_estimators=250; total time=   1.5s
[CV 1/5] END ..................max_depth=5, n_estimators=300; total time=   2.4s
[CV 2/5] END ..................max_depth=5, n_estimators=300; total time=   2.2s
[CV 3/5] END ..................max_depth=5, n_estimators=300; total time=   2.2s
[CV 4/5] END ...............

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(),
             param_grid={'max_depth': [3, 4, 5],
                         'n_estimators': [100, 150, 200, 250, 300, 350, 400,
                                          450, 500]},
             return_train_score=True, scoring='r2', verbose=3)

In [15]:
# Parameter combination with the best mean cross-validated R^2 in the grid search.
model_cv.best_params_

{'max_depth': 3, 'n_estimators': 450}

In [16]:
# Train the final model on the full training split with the hyper-parameters
# chosen by the grid search. They are read from model_cv.best_params_ instead of
# being copied by hand, so this cell cannot go stale when the data, the grid or
# the search result changes.
model = GradientBoostingRegressor(**model_cv.best_params_,
                                  random_state = 1234).fit(X_train_scaled, y_train_scaled)

In [17]:
# Predict once per split and reuse the result for every metric (the model was
# previously asked for the same predictions eight times).
train_pred = model.predict(X_train_scaled)
test_pred = model.predict(X_test_scaled)

# NOTE: the targets are standardized, so MAE / MSE / RMSE below are expressed in
# standard deviations of Weekly_Sales, not in sales units. R^2 is scale-free.
maet = metrics.mean_absolute_error(y_train_scaled, train_pred)
mae = metrics.mean_absolute_error(y_test_scaled, test_pred)
mset = metrics.mean_squared_error(y_train_scaled, train_pred)
mse = metrics.mean_squared_error(y_test_scaled, test_pred)
rmset=np.sqrt(mset)
rmse=np.sqrt(mse)
r2_score_train = metrics.r2_score(y_train_scaled, train_pred)
r2_score_test = metrics.r2_score(y_test_scaled, test_pred)

print('Training R2 Score: ', r2_score_train)
print('Test R2 Score: ', r2_score_test)
print('\n')
print('Training MAE: ', maet)
print('Test MAE: ', mae)
print('\n')
print('Training MSE: ', mset)
print('Test MSE: ', mse)
print('\n')
print('Training RMSE: ', rmset)
print('Test RMSE: ', rmse)

Training R2 Score:  0.4967802101560683
Test R2 Score:  0.32693644374471564


Training MAE:  0.5615894364854833
Test MAE:  0.6331344894887558


Training MSE:  0.5032197898439318
Test MSE:  0.6248185922621741


Training RMSE:  0.7093798628689229
Test RMSE:  0.7904546743882119


In [18]:
# Show the fitted estimator's repr to confirm the hyper-parameters in use.
model

GradientBoostingRegressor(n_estimators=450, random_state=1234)

In [19]:
# Show the feature scaler that must be applied to any new input before predicting.
scalerx

StandardScaler()

In [20]:
# Single hand-entered observation; these values equal the CPI, Unemployment and
# Temperature of the first row shown by df.head().
CPI = 211.096358
unemployment = 8.106
temperature = 42.31

# Build labelled one-row frames rather than bare arrays. The scaler and the
# model were both fitted on DataFrames with these column names, so passing
# unlabelled arrays makes scikit-learn >= 1.0 warn about missing feature names
# and leaves the feature order implicit. Taking the labels from X_train.columns
# keeps the order identical to the one used during fitting.
test_data = pd.DataFrame([[CPI, unemployment, temperature]], columns=X_train.columns)
test_data_scaled = pd.DataFrame(scalerx.transform(test_data), columns=test_data.columns)

predicted_sales = model.predict(test_data_scaled)

In [21]:
# Raw model output: the model was trained on the standardized target, so this
# value is in standardized units, not in sales units.
predicted_sales

array([0.61567914])

In [22]:
# Map the standardized prediction back to the original Weekly_Sales scale.
# model.predict() returns a 1-D array, but StandardScaler.inverse_transform
# requires 2-D input (it raises on 1-D arrays in scikit-learn >= 1.0), so
# reshape to a single column first and flatten the result back to 1-D.
predicted_sales_unscaled = scalery.inverse_transform(predicted_sales.reshape(-1, 1)).ravel()

In [23]:
# Prediction expressed on the original Weekly_Sales scale.
predicted_sales_unscaled

array([1400215.78324457])

In [None]:
# Scratch comparison left in an unexecuted cell. 1643690.90 equals the actual
# Weekly_Sales of the first row shown by df.head(), and 1400215.7832 equals the
# unscaled prediction displayed above. The middle value is presumably a
# prediction from an earlier run or a different model -- TODO confirm, or
# delete this cell.
1643690.90, 1567605.06700487, 1400215.7832