In [None]:
#Importing Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import linear_rainbow
import scipy.stats as stats
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from statsmodels.tools.eval_measures import rmse
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from statsmodels.tsa.stattools import adfuller
#from pandas.plotting import autocorrelation_plo
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import statsmodels
from pandas.tseries.offsets import DateOffset

In [None]:
# Default (width, height) for every matplotlib figure created after this cell.
plt.rcParams["figure.figsize"] = (18,8) # Setting the size of the images used here.

In [None]:
# Load the three input CSVs; the 'Date' column is parsed to datetime for the two files that are read with parse_dates.
train = pd.read_csv('../input/walmart-sales-prediction/train.csv', parse_dates=['Date'])  # Training data
features = pd.read_csv('../input/walmart-sales-prediction/features.csv', parse_dates=['Date']) # Features
stores = pd.read_csv('../input/walmart-sales-prediction/stores.csv')  # Store Details

In [None]:
# Attach the store details to every training row (left join keeps all training rows).
train_stores = train.merge(stores, on='Store', how='left')

In [None]:
# Add the weekly features; 'IsHoliday' is part of the join key, so it is matched rather than duplicated.
df = train_stores.merge(features, on=['Store', 'Date', 'IsHoliday'], how='left')
df.head()

In [None]:
# Report the size of the merged dataset.
print(f'Total number of rows --> {df.shape[0]}')
print(f'Total number of columns --> {df.shape[1]}')

In [None]:
# Column names, dtypes and non-null counts of the merged frame.
df.info()  # Information about the columns

In [None]:
# Summary statistics, transposed so that each row is one column of df.
df.describe().T  # Five Point Summary

In [None]:
# List the column labels of the merged frame.
df.columns   # Columns present in the dataset

In [None]:
# Heatmap of the NaN mask: every cell is coloured by whether that value is missing.
missing_ax = sns.heatmap(df.isna(), cmap='Wistia', cbar=False, yticklabels=False)
missing_ax.set_title('Missing Values')
plt.show()

In [None]:
# NaN count per column; show the five columns with the most missing values.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head()

In [None]:
# Same top-five columns, expressed as a percentage of all rows.
top_missing = df.isna().sum().sort_values(ascending=False).head()
top_missing / df.shape[0] * 100

In [None]:
# Replace every NaN with 0; for the MarkDown columns a missing value is read as "no discount".
# NOTE(review): this fills NaNs in all columns, not only MarkDown1-5 — confirm no other column has gaps.
df = df.fillna(0)

In [None]:
# Re-draw the NaN mask after imputation; the plot should now show no missing cells.
filled_ax = sns.heatmap(df.isna(), cmap='Wistia', cbar=False, yticklabels=False)
filled_ax.set_title('Missing Values')
plt.show()

In [None]:
# Working frame with the five MarkDown columns and the target.
markdown_features = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
df_markdown = df[markdown_features + ['Weekly_Sales']].copy()

# Is_MarkDown = 1 when at least one MarkDown column is non-zero for the row.
# Testing each column (instead of testing whether the row sum equals 0) avoids
# mislabelling a row whose markdowns of opposite sign cancel out.
df_markdown['Is_MarkDown'] = (
    df_markdown[markdown_features].fillna(0).ne(0).any(axis=1).astype(int)
)

In [None]:
# Split the weekly sales into the rows with and without any markdown.
with_markdown = df_markdown['Is_MarkDown'] == 1
without_markdown = df_markdown['Is_MarkDown'] == 0
df_is_markdown = df_markdown.loc[with_markdown, 'Weekly_Sales']
df_no_markdown = df_markdown.loc[without_markdown, 'Weekly_Sales']

In [None]:
df_is_markdown.shape, df_no_markdown.shape  # Number of records with and without MarkDown

In [None]:
# Compare the mean weekly sales of the two groups (two decimals).
print(f'Average Weekly Sales with Markdown --> {df_is_markdown.mean():.2f}\nAverage Weekly Sales without Markdown --> {df_no_markdown.mean():.2f}')

In [None]:
# NOTE(review): called with scipy's defaults; consider equal_var=False (Welch) if the two groups' variances differ — confirm.
ttest_ind(df_is_markdown, df_no_markdown)  # Two-sample independent t-test on weekly sales with MarkDown and weekly sales without MarkDown

**As we can see from the ttest, there is a significant difference between the Weekly Sales with Markdown and the Weekly Sales without Markdown.**

<h2 style='font-family:rockwell; color:#06917e'> Exploratory Data Analysis</h2>

In [None]:
# Correlation matrix of the numeric/boolean columns only. Calling df.corr() on the
# whole frame fails on recent pandas because 'Type' (string) and 'Date' (datetime)
# cannot be correlated; selecting the dtypes explicitly works on old and new versions.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Hide the upper triangle (including the diagonal): it mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(data=corr_matrix, annot=True, cmap='afmhot_r', mask=mask)  # Heatmap for correlation
plt.title('Correlation Matrix')
plt.show()

In [None]:
# One box plot per numerical feature on a 4x3 grid to eyeball outliers.
outlier_features = ['Weekly_Sales', 'Fuel_Price', 'Size', 'CPI', 'Dept', 'Temperature',
                    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
                    'Unemployment']
cols_outlier = df[outlier_features]

fig, axes = plt.subplots(4, 3, figsize=(18, 12))
fig.suptitle('Outliers in the numerical features', fontsize=18, color='#06917e', x=0.5, y=1.05)

# axes.flat walks the grid row by row, so features fill it left-to-right, top-to-bottom.
for panel, col in zip(axes.flat, cols_outlier.columns):
    sns.boxplot(x=col, ax=panel, data=df, palette='afmhot_r')
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the target variable 'Weekly_Sales'.
# sns.distplot is deprecated; histplot with stat='density' and kde=True draws the same
# kind of figure (density-normalised histogram with a KDE curve on top).
sns.histplot(df['Weekly_Sales'], bins=30, kde=True, stat='density')
plt.title('Distribution of Weekly Sales')
plt.show()

In [None]:
# Weekly sales spread for holiday vs. non-holiday weeks (horizontal boxes).
holiday_box_kwargs = dict(x='Weekly_Sales', y='IsHoliday', orient='h', palette='afmhot_r')
sns.boxplot(data=df, **holiday_box_kwargs)
plt.show()

In [None]:
# Scatter of Weekly_Sales against six candidate drivers, one panel each on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(18, 12))
fig.suptitle('Effect of various factors on Weekly Sales', fontsize=18, color='#06917e', y=1.05)

factors = ['Unemployment', 'IsHoliday', 'Size', 'CPI', 'Temperature', 'Fuel_Price']
for panel, factor in zip(axes.flat, factors):
    sns.scatterplot(x=factor, y='Weekly_Sales', data=df, ax=panel, palette='afmhot_r')
plt.tight_layout()

In [None]:
# Store size distribution for each store type.
size_ax = sns.boxplot(data=df, x='Size', y='Type', palette='afmhot_r')
size_ax.set_title('Size of the Store with respect to Type')
plt.show()

In [None]:
# Median store size per store type (groups 'A', 'B' and 'C' of the 'Type' column).
TypewiseSize = df.groupby(by='Type')['Size']
print("Median Size for Type A Stores --> ",TypewiseSize.get_group('A').median())
print("Median Size for Type B Stores --> ",TypewiseSize.get_group('B').median())
print("Median Size for Type C Stores --> ",TypewiseSize.get_group('C').median())

In [None]:
# Weekly sales distribution for each store type (horizontal boxes).
type_sales_ax = sns.boxplot(data=df, x='Weekly_Sales', y='Type', orient='h', palette='afmhot_r')
type_sales_ax.set_title('Type wise Weekly Sales')
plt.show()

In [None]:
# Median weekly sales per store type (groups 'A', 'B' and 'C' of the 'Type' column).
TypewiseSales = df.groupby(by='Type')['Weekly_Sales']
print("Median Weekly Sales for Type A Stores --> ",TypewiseSales.get_group('A').median())
print("Median Weekly Sales for Type B Stores --> ",TypewiseSales.get_group('B').median())
print("Median Weekly Sales for Type C Stores --> ",TypewiseSales.get_group('C').median())

**Average sales per store and per department**

In [None]:
# Mean weekly sales of each store, drawn as one bar per store.
avg_sales_per_store = df.groupby(by='Store')['Weekly_Sales'].mean()
store_avg_ax = sns.barplot(x=avg_sales_per_store.index, y=avg_sales_per_store)
store_avg_ax.set_title('Average Sales per Store')
plt.show()

In [None]:
avg_sales_per_store.sort_values(ascending = False).head()  # Top 5 stores by average weekly sales.

In [None]:
avg_sales_per_store.sort_values(ascending = False).tail()  # Bottom 5 stores by average weekly sales.

In [None]:
# Mean weekly sales of each department, drawn as one bar per department.
avg_sales_per_dept = df.groupby(by='Dept')['Weekly_Sales'].mean()
dept_avg_ax = sns.barplot(x=avg_sales_per_dept.index, y=avg_sales_per_dept)
dept_avg_ax.set_title('Average Sales per Department')
plt.xticks(rotation=90)  # many departments: rotate the tick labels so they do not overlap
plt.show()

In [None]:
avg_sales_per_dept.sort_values().head(6)  # The 6 departments with the lowest average weekly sales.

**As we can see from the above graph, a few of the departments have almost no sales.**

In [None]:
# Sum of weekly sales per store over the whole period, one bar per store.
total_sales_per_store = df.groupby(by='Store')['Weekly_Sales'].sum()
store_total_ax = sns.barplot(x=total_sales_per_store.index, y=total_sales_per_store)
store_total_ax.set_title('Total Sales per Store')
plt.show()

In [None]:
# Total sales per calendar year, split by store type (one bar group per year).
sale_year = df['Date'].dt.year
total_sales_per_year = df.groupby(by=[sale_year, 'Type'])['Weekly_Sales'].sum()
g = total_sales_per_year.unstack().plot(kind='bar')

# Write each bar's value just above it.
for p in g.patches:
    bar_height = p.get_height()
    g.annotate('{:.0f}'.format(bar_height), (p.get_x()+0.085, bar_height),
               ha='center', va='bottom', color= 'black')
plt.title('Total Sales per Year - Type Wise')
plt.xticks(rotation=0)
plt.show()

In [None]:
# One bar per ISO week of the year. No estimator is passed, so sns.barplot aggregates
# each week with its default (the mean); the title therefore says "Average", not "Total".
sns.barplot(x=df['Date'].dt.isocalendar().week, y="Weekly_Sales", data=df, ci=None)
plt.title('Week wise Average Sales')
plt.show()

In [None]:
# Total sales per date, separately for each year.
df_2010 = df.loc[ (df['Date'].dt.year==2010), ['Date', 'Weekly_Sales'] ].groupby(by='Date').sum()
df_2011 = df.loc[ (df['Date'].dt.year==2011), ['Date', 'Weekly_Sales'] ].groupby(by='Date').sum()
df_2012 = df.loc[ (df['Date'].dt.year==2012), ['Date', 'Weekly_Sales'] ].groupby(by='Date').sum()

# Re-key each year by ISO week number so the years can be overlaid on one axis.
a10 = pd.DataFrame(data = {'Week_num':df_2010.index.isocalendar().week , 'Sales_2010':df_2010['Weekly_Sales']})
a11 = pd.DataFrame(data = {'Week_num':df_2011.index.isocalendar().week , 'Sales_2011':df_2011['Weekly_Sales']})
a12 = pd.DataFrame(data = {'Week_num':df_2012.index.isocalendar().week , 'Sales_2012':df_2012['Weekly_Sales']})

# Outer joins keep weeks that are missing in some years.
x = pd.merge(a11, a10, how='outer', on='Week_num')
y = pd.merge(a12, x, how='outer', on='Week_num')

# The axis is labelled in millions, so the values and the ticks are expressed in
# millions as well (previously raw dollars were plotted under a "millions" label).
for i in y.columns[1:]:
    plt.plot(y['Week_num'], y[i] / 1e6, label=i)
plt.ylabel("Sales in millions dollars")
plt.xlabel("Week of the Year")
plt.xticks(np.arange(1,53))
plt.yticks(np.arange(20, 85, 5))
plt.title('Weekly Sales over the Years')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Share of rows belonging to each store type, as an exploded pie chart.
type_share = df['Type'].value_counts(normalize=True)
type_share.plot(kind='pie', autopct='%.2f', explode=[0.05,0.05,0.05])
plt.legend(type_share.index, loc = 'upper right')
plt.title('Distribution of Store Types')
plt.show()

In [None]:
# Mean weekly sales per calendar month, with the value written above each bar.
# The hard-coded labels assume all twelve months occur in the data.
month_labels = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
month_wise_avg_sales = df.groupby(df['Date'].dt.month)['Weekly_Sales'].mean()

plt.title('Month wise Average Sales')
g = sns.barplot(x=month_labels, y=month_wise_avg_sales)
for p in g.patches:
    bar_height = p.get_height()
    g.annotate('{:.0f}'.format(bar_height), (p.get_x()+0.3, bar_height),
               ha='center', va='bottom', color= 'black')

<h2 style='font-family:rockwell; color:#06917e'> Linear Model:</h2>

In [None]:
# Modelling frame: the raw 'Date' column is left out of the feature set.
df1 = df.drop('Date', axis=1)

In [None]:
# Encode the holiday flag as 1 (value equals True) / 0 (anything else).
df1['IsHoliday'] = (df1['IsHoliday'] == True).astype(int)

In [None]:
# One-hot encode the remaining non-numeric columns; drop_first=True drops the first level of each.
df1 = pd.get_dummies(df1, drop_first=True)  # get_dummies for 'Type' column
df1.head()

In [None]:
# Split the modelling frame into predictors and target.
# `columns=` replaces the positional axis argument (`drop('Weekly_Sales', 1)`),
# which newer pandas versions no longer accept.
inp = df1.drop(columns='Weekly_Sales')  # Independent Features
out = df1['Weekly_Sales']  # Dependent Feature

In [None]:
# Standardise every predictor except the first two columns, which are left unscaled
# (presumably the Store and Dept identifiers — confirm against inp.columns).
unscaled_part = inp.iloc[:, 0:2]
to_scale = inp.iloc[:, 2:]

# Fit the scaler on the training rows only, so the test rows do not leak into the
# mean/std used for scaling. The split depends only on the number of rows,
# test_size and random_state, so using the same test_size=0.3 / random_state=40 as the
# train_test_split cell below selects exactly the rows that end up in xtrain.
train_positions, _ = train_test_split(np.arange(len(inp)), test_size=0.3, random_state=40)

sc = StandardScaler()
sc.fit(to_scale.iloc[train_positions])

# Transform all rows with the train-fitted scaler and re-attach the unscaled columns.
inp_sc = pd.DataFrame(sc.transform(to_scale), columns=to_scale.columns, index=inp.index)
inp_sc = pd.concat((unscaled_part, inp_sc), axis=1)
inp_sc.head(2)

In [None]:
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
xtrain,xtest,ytrain,ytest = train_test_split(inp_sc,out,test_size = 0.3, random_state = 40)  # Splitting data into Train and Test

In [None]:
# Baseline OLS on the full scaled data; add_constant supplies the intercept column.
inpc = sm.add_constant(inp_sc)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

In [None]:
# Baseline scikit-learn linear regression: fit on the training split, score on the test split.
lr = LinearRegression()
lr.fit(X=xtrain, y=ytrain)
ypred = lr.predict(xtest)

test_mse = mean_squared_error(ytest, ypred)
rmse = np.sqrt(test_mse)
print('R-Square Value:',r2_score(ytest,ypred))
print('RMSE:',rmse)

In [None]:
# Assumption check 1 - multicollinearity: variance inflation factor of every predictor.
design_matrix = inp_sc.values
vif_values = [variance_inflation_factor(design_matrix, col_pos)
              for col_pos in range(inp_sc.shape[1])]
vif = pd.DataFrame({'VIF': vif_values, 'Features': inp_sc.columns})
vif.sort_values('VIF', ascending= False)

**As all VIF values are less than 5, there is no significant multicollinearity present in the data.**

In [None]:
# Assumption check 2 - autocorrelation: refit OLS on the training split and read the
# Durbin-Watson statistic from the summary table.
inpc = sm.add_constant(xtrain)
ols = sm.OLS(endog=ytrain, exog=inpc)
ols_mod = ols.fit()
ols_mod.summary()

**As the value of the Durbin-Watson statistic is near 2, we can say there is no autocorrelation present.**

In [None]:
# Assumption check 3 - linearity: rainbow test on an OLS fit of the full scaled data.
inpc = sm.add_constant(inp_sc)
ols = sm.OLS(endog=out, exog=inpc)
ols_mod = ols.fit()
rainbow_result = linear_rainbow(res=ols_mod, frac=0.5)
stat, p_value = rainbow_result
stat,p_value

**As the p-value is greater than 0.05, the linearity assumption holds.**

In [None]:
# Assumption check 4 - homoscedasticity: residuals against the in-sample predictions.
# x/y are passed as keywords because recent seaborn versions no longer accept the
# data vectors as positional arguments.
sns.scatterplot(x=ols_mod.predict(), y=ols_mod.resid)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()

**From the graph, we can say that the model is homoscedastic.**

In [None]:
#Feature Selection
# Backward elimination with a linear model, keeping the subset with the best R^2.
# The selector is fitted on the training split only: the features it picks are scored
# on xtest in a later cell, so the test rows must not take part in choosing them.
lr=LinearRegression()
backward=sfs(estimator=lr,k_features='best',forward=False,scoring='r2')
sfs_backward=backward.fit(xtrain,ytrain)
feat_back=sfs_backward.k_feature_names_
print('Best Features using Backward Elimination:\n',feat_back)

In [None]:
# Forward selection with a linear model, keeping the subset with the best R^2.
# Fitted on the training split only: the selected features are scored on xtest in a
# later cell, so the test rows must not take part in choosing them.
lr=LinearRegression()
forward=sfs(estimator=lr,k_features='best',forward=True,scoring='r2')
sfs_forward=forward.fit(xtrain,ytrain)
feat_forw=sfs_forward.k_feature_names_
print('Best Features using Forward Selection:\n',feat_forw)

In [None]:
# Recursive feature elimination with cross-validation around a linear model.
# Fitted on the training split only: the selected features are scored on xtest in a
# later cell, so the test rows must not take part in choosing them.
lr=LinearRegression()
rfe=RFECV(estimator=lr)
rfe_mod=rfe.fit(xtrain,ytrain)
rfe_mod.ranking_

In [None]:
# Pair every feature with its RFE rank; the features with rank 1 form the selected set.
rank = pd.DataFrame({'Features': xtrain.columns, 'RANK': rfe_mod.ranking_})
feat_rfe = rank[rank['RANK']==1]['Features']

# The sorted table used to be computed and then thrown away (only the last expression
# of a cell is displayed); print it so the full ranking is actually shown.
print(rank.sort_values(by='RANK'))
print('Best Features using Recursive Feature Elimination:\n',feat_rfe)

In [None]:
# Convert the three selections to plain lists so they can be used as column selectors.
feat_back, feat_forw, feat_rfe = (list(selected) for selected in (feat_back, feat_forw, feat_rfe))

In [None]:
# Linear regression restricted to the backward-elimination features;
# res_back holds [R^2, RMSE] on the test split.
lr = LinearRegression()
lr.fit(X=xtrain[feat_back], y=ytrain)
ypred = lr.predict(xtest[feat_back])

r2 = r2_score(ytest, ypred)
test_mse = mean_squared_error(ytest, ypred)
rmse = np.sqrt(test_mse)

res_back = [r2, rmse]
res_back

In [None]:
# Linear regression restricted to the forward-selection features;
# res_forw holds [R^2, RMSE] on the test split.
lr = LinearRegression()
lr.fit(X=xtrain[feat_forw], y=ytrain)
ypred = lr.predict(xtest[feat_forw])

r2 = r2_score(ytest, ypred)
test_mse = mean_squared_error(ytest, ypred)
rmse = np.sqrt(test_mse)

res_forw = [r2, rmse]
res_forw

In [None]:
# Linear regression restricted to the RFE features;
# res_rfe holds [R^2, RMSE] on the test split.
lr = LinearRegression()
lr.fit(X=xtrain[feat_rfe], y=ytrain)
ypred = lr.predict(xtest[feat_rfe])

r2 = r2_score(ytest, ypred)
test_mse = mean_squared_error(ytest, ypred)
rmse = np.sqrt(test_mse)

res_rfe = [r2, rmse]
res_rfe

In [None]:
# Side-by-side comparison of the three feature-selection strategies (one column each).
score_card = pd.DataFrame(
    {'Backward_Elmination': res_back,
     'Forward_Selection': res_forw,
     'RFE': res_rfe},
    index=['Rsquare','RMSE'],
)
score_card

**RFE is giving a comparatively better result.**

In [None]:
# 3-fold cross-validated RMSE of plain linear regression on the RFE features.
# res_lr = [mean RMSE, std of RMSE, std/mean].
lr = LinearRegression()
res = cross_val_score(lr, inp_sc[feat_rfe], out, cv=3, scoring='neg_mean_squared_error')
rmse = np.sqrt(abs(res))  # the scorer returns negated MSE, hence abs()
be, ve = np.mean(rmse), np.std(rmse)
cvv = ve / be
res_lr = [be, ve, cvv]
res_lr

In [None]:
# Keep only the RFE-selected columns in the scaled feature matrix for the cells that follow.
# NOTE(review): xtrain/xtest were split from inp_sc before this line and are not re-derived here.
inp_sc=inp_sc[feat_rfe]

In [None]:
#Regularization
# Ridge: 3-fold grid search over alpha, scored by (negated) MSE.
alpha_candidates = [0.0001,0.001,0.005,0.01,0.5,1,2,3,4,5,6,7,8,9,10,
                    20,30,40,50,60,70,80,90,100]
ridge = Ridge()
param = {'alpha': alpha_candidates}
grid = GridSearchCV(estimator=ridge, param_grid=param, scoring='neg_mean_squared_error', cv=3)
mod_hyp = grid.fit(inp_sc, out)
print(mod_hyp.best_params_)
print(abs(mod_hyp.best_score_))

In [None]:
# 3-fold cross-validated RMSE of Ridge.
# NOTE(review): alpha=100 is hard-coded, presumably copied from the grid search above — confirm.
ridge = Ridge(alpha = 100)
res = cross_val_score(ridge,inp_sc,out,cv = 3,scoring = 'neg_mean_squared_error')
rmse = np.sqrt(abs(res))  # the scorer returns negated MSE, hence abs()
be = np.mean(rmse) #bias error
ve = np.std(rmse) #variance error
# Coefficient of variation = std / mean. It was computed as mean/std here, which is the
# inverse of what the Linear Regression cell stores in the same score-card row.
cve = ve/be
res_rid = [be,ve,cve]
res_rid

In [None]:
# Lasso: 3-fold grid search over alpha, scored by (negated) MSE.
alpha_candidates = [0.0001,0.001,0.005,0.01,0.5,1,2,3,4,5,6,7,8,9,10,
                    20,30,40,50,60,70,80,90,100]
lasso = Lasso()
param = {'alpha': alpha_candidates}
grid = GridSearchCV(estimator=lasso, param_grid=param, scoring='neg_mean_squared_error', cv=3)
mod_hyp = grid.fit(inp_sc, out)
print(mod_hyp.best_params_)
print(abs(mod_hyp.best_score_))

In [None]:
# 3-fold cross-validated RMSE of Lasso.
# NOTE(review): alpha=100 is hard-coded, presumably copied from the grid search above — confirm.
lasso = Lasso(alpha = 100)
res = cross_val_score(lasso,inp_sc,out,cv = 3,scoring = 'neg_mean_squared_error')
rmse = np.sqrt(abs(res))  # the scorer returns negated MSE, hence abs()
be = np.mean(rmse) #bias error
ve = np.std(rmse) #variance error
# Coefficient of variation = std / mean. It was computed as mean/std here, which is the
# inverse of what the Linear Regression cell stores in the same score-card row.
cve = ve/be
res_las = [be,ve,cve]
res_las

In [None]:
# ElasticNet: 3-fold grid search over alpha, scored by (negated) MSE.
alpha_candidates = [0.0001,0.001,0.005,0.01,0.5,1,2,3,4,5,6,7,8,9,10,
                    20,30,40,50,60,70,80,90,100]
enet = ElasticNet()
param = {'alpha': alpha_candidates}
grid = GridSearchCV(estimator=enet, param_grid=param, scoring='neg_mean_squared_error', cv=3)
mod_hyp = grid.fit(inp_sc, out)
print(mod_hyp.best_params_)
print(abs(mod_hyp.best_score_))

In [None]:
# 3-fold cross-validated RMSE of ElasticNet.
# NOTE(review): alpha=0.5 is hard-coded, presumably copied from the grid search above — confirm.
enet = ElasticNet(alpha = 0.5)
res = cross_val_score(enet,inp_sc,out,cv = 3,scoring = 'neg_mean_squared_error')
rmse = np.sqrt(abs(res))  # the scorer returns negated MSE, hence abs()
be = np.mean(rmse) #bias error
ve = np.std(rmse) #variance error
# Coefficient of variation = std / mean. It was computed as mean/std here, which is the
# inverse of what the Linear Regression cell stores in the same score-card row.
cve = ve/be
res_enet = [be,ve,cve]
res_enet

In [None]:
# Cross-validation summary of the four linear models (one column each).
score_card = pd.DataFrame(
    {'LR': res_lr,
     'Ridge': res_rid,
     'Lasso': res_las,
     'ElasticNet': res_enet},
    index=['Bias Error','Variance Error', 'Coefficient of Variance'],
)
score_card

**From the models, we can see that the Linear Regression model gives a comparatively better result, but the R-squared value is very low.**

**So we will try Non-Linear Models.**

<h2 style='font-family:rockwell; color:#06917e'> Non-Linear Models:</h2>

### Decision Tree Regressor
#### Base Model

In [None]:
# Aliases for the (RFE-restricted) scaled feature matrix and the target.
x, y = inp_sc, out

In [None]:
# Re-import the rmse function: earlier cells assign a number to the name `rmse`
# (e.g. rmse=np.sqrt(mean_squared_error(...))), which shadows the imported function.
from statsmodels.tools.eval_measures import rmse

In [None]:
# Baseline decision tree with default hyper-parameters.
dtree = DecisionTreeRegressor()
dtree.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (dtree.predict(split) for split in (xtrain, xtest))

print('RMSE score of train data: ', rmse(ytrain, ytrain_pred))
print('R^2 score of train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE score of test data: ', rmse(ytest, ytest_pred))
print('R^2 score of test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Empty score cards: one row per split; each model later adds one column.
split_labels = ['Training', 'Testing']
result_rmse_score, result_r2_score = (pd.DataFrame(index=split_labels) for _ in range(2))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['DT Base Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['DT Base Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

#### Tuning with RandomizedCV

In [None]:
# Randomized hyper-parameter search for the decision tree (4-fold CV).
dtree = DecisionTreeRegressor() # estimator

# 'criterion' is left at the estimator default (squared error): the literal 'mse' was
# renamed in newer scikit-learn releases and is rejected there, while the default
# selects the same criterion on every version.
param_dist = {'max_depth' : np.arange(5,20),
              'min_samples_leaf':[15,17,20,25,30,35,40],
              'min_samples_split':[2,5,8,10,12,15]}

# random_state makes the sampled candidates repeatable, like the other searches in this notebook.
rsearch  = RandomizedSearchCV(dtree, param_distributions = param_dist, cv=4, random_state=4)

# Search on the training split: the tuned model is fitted on xtrain and scored on xtest,
# so the search must see the same columns and must not see the test rows.
rsearch.fit(xtrain,ytrain)
rsearch.best_params_

#### Creating tuned model with best params of RandomizedCV object

In [None]:
# Decision tree rebuilt with the hyper-parameters chosen by the randomized search.
dtree_rand_tuned = DecisionTreeRegressor(**rsearch.best_params_)
dtree_rand_tuned.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (dtree_rand_tuned.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['DT Tuned Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['DT Tuned Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# RMSE score card so far (rows: Training / Testing, one column per model).
result_rmse_score

In [None]:
# R^2 score card so far (rows: Training / Testing, one column per model).
result_r2_score

## Ensemble Technique:
### Bagging:

### Random Forest
#### Base Random Forest Model

In [None]:
# Baseline random forest with default hyper-parameters.
rf1 = RandomForestRegressor()
rf1.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (rf1.predict(split) for split in (xtrain, xtest))

print('RMSE of Train Data: ', rmse(ytrain, ytrain_pred))
print('R^2 score of Train Data: ', r2_score(ytrain, ytrain_pred))

print('RMSE of Test Data: ', rmse(ytest, ytest_pred))
print('R^2 score of Test Data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['RF Base Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['RF Base Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# RMSE score card so far (rows: Training / Testing, one column per model).
result_rmse_score

In [None]:
# R^2 score card so far (rows: Training / Testing, one column per model).
result_r2_score

#### Tuning Random forest model with RandomizedCV

In [None]:
# Randomized hyper-parameter search for the random forest (4-fold CV, fixed seed).
rf2 = RandomForestRegressor()

# Integer ranges sampled by the search (sp_randint upper bounds are exclusive).
param_dist = { 'n_estimators':sp_randint(50,100),
              'max_features': sp_randint(1,14),
              'max_depth' : sp_randint(5,20),
              'min_samples_leaf':sp_randint(10,50),
              'min_samples_split':sp_randint(2,50)}

rsearch_rf  = RandomizedSearchCV(estimator=rf2, param_distributions = param_dist, cv=4, random_state=4)

# Search on the training split: the tuned model is fitted on xtrain and scored on xtest,
# so the search must see the same columns (x holds only the RFE subset, for which large
# max_features values may be invalid) and must not see the test rows.
rsearch_rf.fit(xtrain,ytrain)
rsearch_rf.best_params_

In [None]:
# Mean cross-validated score of the best random-forest candidate found by the search.
rsearch_rf.best_score_

#### Creating tuned model with best params of RandomizedCV object

In [None]:
# Random forest rebuilt with the hyper-parameters chosen by the randomized search.
rf_tuned = RandomForestRegressor(**rsearch_rf.best_params_)
rf_tuned.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (rf_tuned.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['RF Tuned Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['RF Tuned Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# RMSE score card so far (rows: Training / Testing, one column per model).
result_rmse_score

In [None]:
# R^2 score card so far (rows: Training / Testing, one column per model).
result_r2_score

In [None]:
# RMSE score card again (same table as two cells above; nothing was added in between).
result_rmse_score

In [None]:
# Feature importances of the tuned random forest, most important first.
# rf_tuned was already fitted when it was evaluated above; it is not refitted here because
# the forest has no fixed random_state, so a second fit would train a different model
# from the one whose scores were recorded.
pd.DataFrame(index=xtrain.columns, data=rf_tuned.feature_importances_, columns=['feat']).sort_values(by='feat',ascending=False)

### Extra Tree Regressor
#### Base Model

In [None]:
# Baseline extra-trees regressor with default hyper-parameters.
et = ExtraTreesRegressor()
et.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (et.predict(split) for split in (xtrain, xtest))

print('RMSE of Train Data: ', rmse(ytrain, ytrain_pred))
print('R^2 score of Train Data: ', r2_score(ytrain, ytrain_pred))

print('RMSE of Test Data: ', rmse(ytest, ytest_pred))
print('R^2 score of Test Data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['ETR Base Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['ETR Base Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# RMSE score card so far (rows: Training / Testing, one column per model).
result_rmse_score

In [None]:
# R^2 score card so far (rows: Training / Testing, one column per model).
result_r2_score

#### Tuning the Extra Tree Regressor

In [None]:
# Randomized hyper-parameter search for the extra-trees regressor (4-fold CV, fixed seed).
etr2 = ExtraTreesRegressor()

# Integer ranges sampled by the search (sp_randint upper bounds are exclusive).
param_dist = { 'n_estimators':sp_randint(50,100),
              'max_features': sp_randint(1,14),
              'max_depth' : sp_randint(5,20),
              'min_samples_leaf':sp_randint(10,50),
              'min_samples_split':sp_randint(2,50)}

rsearch_etr  = RandomizedSearchCV(estimator=etr2, param_distributions = param_dist, cv=4, random_state=4)

# Search on the training split: the tuned model is fitted on xtrain and scored on xtest,
# so the search must see the same columns (x holds only the RFE subset, for which large
# max_features values may be invalid) and must not see the test rows.
rsearch_etr.fit(xtrain,ytrain)
rsearch_etr.best_params_

#### Creating tuned model with best params of RandomizedCV object

In [None]:
# Extra-trees regressor rebuilt with the hyper-parameters chosen by the randomized search.
etr_tuned = ExtraTreesRegressor(**rsearch_etr.best_params_)
etr_tuned.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (etr_tuned.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['ETR Tuned Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['ETR Tuned Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# RMSE score card so far (rows: Training / Testing, one column per model).
result_rmse_score

In [None]:
# R^2 score card so far (rows: Training / Testing, one column per model).
result_r2_score

In [None]:
# Raw feature importances of the tuned extra-trees model.
# etr_tuned was already fitted when it was evaluated above; it is not refitted here because
# the model has no fixed random_state, so a second fit would train a different model
# from the one whose scores were recorded.
etr_tuned.feature_importances_

In [None]:
# Extra-trees feature importances labelled by column name, most important first.
etr_importance_table = pd.DataFrame({'feat': etr_tuned.feature_importances_}, index=xtrain.columns)
etr_importance_table.sort_values(by='feat', ascending=False)

### Boosting:
#### AdaBoost:

In [None]:
# AdaBoost regressor with default hyper-parameters and a fixed seed.
ada = AdaBoostRegressor(random_state=48)
ada.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (ada.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['AdaBoost'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['AdaBoost'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

#### Gradient Boost:

In [None]:
# Gradient boosting regressor with default hyper-parameters and a fixed seed.
grad = GradientBoostingRegressor(random_state=48)
grad.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (grad.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['Gradient Boost'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['Gradient Boost'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

#### LightGBM:

In [None]:
# LightGBM regressor with default hyper-parameters and a fixed seed.
lgbmr = LGBMRegressor(random_state=48)
lgbmr.fit(xtrain, ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (lgbmr.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['LightGBM'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['LightGBM'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# Raw feature importances of the LightGBM model.
# lgbmr was already fitted on (xtrain, ytrain) when it was evaluated above, so the
# redundant second fit on the same data has been removed.
lgbmr.feature_importances_

In [None]:
# LightGBM feature importances labelled by column name, most important first.
lgbm_importance_table = pd.DataFrame({'feat': lgbmr.feature_importances_}, index=xtrain.columns)
lgbm_importance_table.sort_values(by='feat', ascending=False)

#### XGBoost:

In [None]:
# XGBoost regressor with default hyper-parameters and a fixed seed.
xg = XGBRegressor(random_state=48)
xg.fit(xtrain, ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (xg.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['XGBoost'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['XGBoost'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

In [None]:
# Raw feature importances of the XGBoost model.
# xg was already fitted on (xtrain, ytrain) when it was evaluated above, so the
# redundant second fit on the same data has been removed.
xg.feature_importances_

In [None]:
# XGBoost feature importances labelled by column name, most important first.
xgb_importance_table = pd.DataFrame({'feat': xg.feature_importances_}, index=xtrain.columns)
xgb_importance_table.sort_values(by='feat', ascending=False)

### K-Nearest Neighbor Regressor:

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# K-nearest-neighbours regressor with default hyper-parameters.
knn = KNeighborsRegressor()
knn.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (knn.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

In [None]:
# Record train/test RMSE and R^2 of the predictions made in the previous cell.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['KNN'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['KNN'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

### RMSE Scorecard:

In [None]:
# RMSE score card for all individual models (rows: Training / Testing).
result_rmse_score

### R-square Scorecard:

In [None]:
# R^2 score card for all individual models (rows: Training / Testing).
result_r2_score

**As we are getting the best results for the Random Forest Regressor and LightGBM, we will combine them — together with the tuned Extra Trees and XGBoost models — in a voting ensemble to get the best result.**

## Stacking Algorithms

In [None]:
# Fresh, unfitted copies of the four ensemble members.
rf_tuned = RandomForestRegressor(**rsearch_rf.best_params_)
etr_tuned = ExtraTreesRegressor(**rsearch_etr.best_params_)
lgbmr = LGBMRegressor(random_state=48)
xg = XGBRegressor(random_state=48)

estimators = [('rf_tuned', rf_tuned), ('etr_tuned', etr_tuned), ('lgbmr', lgbmr), ('xg', xg)]

# NOTE(review): VotingRegressor combines its members by averaging their predictions;
# it is not a StackingRegressor with a meta-learner, despite the "stack" naming.
stack1 = VotingRegressor(estimators=estimators)
stack1.fit(X=xtrain, y=ytrain)

# Predict both splits first, then report the metrics.
ytrain_pred, ytest_pred = (stack1.predict(split) for split in (xtrain, xtest))

print('RMSE on train data: ', rmse(ytrain, ytrain_pred))
print('R^2 on train data: ', r2_score(ytrain, ytrain_pred))

print('RMSE on test data: ', rmse(ytest, ytest_pred))
print('R^2 on test data: ', r2_score(ytest, ytest_pred))

# Add the ensemble to both score cards.
eval_pairs = ((ytrain, ytrain_pred), (ytest, ytest_pred))
result_rmse_score['Stacked Model'] = [rmse(actual, predicted) for actual, predicted in eval_pairs]
result_r2_score['Stacked Model'] = [r2_score(actual, predicted) for actual, predicted in eval_pairs]

### RMSE Scorecard:

In [None]:
# Final RMSE score card, including the 'Stacked Model' column.
result_rmse_score

### R-square Scorecard:

In [None]:
# Final R^2 score card, including the 'Stacked Model' column.
result_r2_score

In [None]:
# Transposed view: one row per model, columns Training / Testing.
result_rmse_score.T

In [None]:
# Model-per-row views of both score cards with the AdaBoost entry left out of the plots below.
rmse_1 = result_rmse_score.T.drop(index=['AdaBoost'])
r2_1 = result_r2_score.T.drop(index=['AdaBoost'])

In [None]:
# Train vs. test RMSE for every remaining model.
plt.figure(figsize=(18,5))
for split_column, legend_label in (('Training', 'Train'), ('Testing', 'Test')):
    plt.plot(rmse_1[split_column], label=legend_label)
plt.title('RMSE score')
plt.legend()

In [None]:
# Train vs. test R^2 for every remaining model.
plt.figure(figsize=(18,5))
# Legend label capitalised ('Train') to match the 'Test' label and the RMSE plot.
plt.plot(r2_1['Training'], label='Train')
plt.plot(r2_1['Testing'], label='Test')
plt.title('R2 score')
plt.legend()