# Issues faced:-
* "Store" and "Dept" are both categorical variables, but there are too many categories.
* Dates of week are unique. There are 143 total unique dates which have to be replaced with week number.
* It is not known which feature is important enough to stratify on when splitting the data using Stratified Split.
* Averaging the MarkDowns from Nov 2011 to Oct 2012 and filling them in for the respective weeks from Feb 2010 to Oct 2011.
* Binary Encoding "Dept" and "Store" due to presence of large number of classes in these variables.
* One Hot Encoding the "Type" variable as it had only 3 classes.
* Changed the Boolean entries to 1 and 0 in "IsHoliday" variable.
* Scaled the numerical values using Standard Scaler.
* Implemented a pipeline to apply the above numerical and categorical transformations to the data.
* Worst performance with linear regression.
* Hyperparameter tuning of Random Forest Regressor, Extra Trees Regressor and XGBoost Regressor.
* Individually, in increasing order of performance: Random Forest Regressor, XGBoost Regressor, Extra Trees Regressor.
* Tried to create a Voting Regressor of Random Forest Regressor, Extra Trees Regressor and XGBoost, but the kernel ran out of memory (exceeding 16 GB, although the data set is just 50 MB).
* Hence, tried out the combinations (Random Forest, Extra Trees) and (Extra Trees, XGBoost). Here, the latter performed better.
* Got the best results by stacking well-tuned XGBoost Regressor, Random Forest Regressor and Extra Trees Regressor as base estimators, with a Linear Regressor as the final estimator.

# Some Info
* CPI  = Price of basket of goods in Given Year/Price of basket of goods in base year
* CPI is an indicator of inflation. Hence, if the CPI is higher, people can purchase fewer goods with the same amount of money. (https://www.investopedia.com/terms/c/consumerpriceindex.asp)
* Unemployment Rate = Number of unemployed people divided by the total number of people in the labour force (employed + unemployed), expressed as a percentage (https://www.investopedia.com/terms/u/unemploymentrate.asp)

# Importing all the libraries

In [None]:
# Importing data processing libraries
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
import csv

# Importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Model libraries 
from sklearn.ensemble import VotingRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor
import xgboost
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Model evaluation 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Hyperparameter tunning libraries
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import random

# Importing Data

In [None]:
# Function to import dataset

def importData(feature = None):
    """Load the Walmart training CSV and split it 80/20 into train and test.

    Parameters
    ----------
    feature : str or None, default None
        Name of the column to stratify on. When None, a plain random split
        is made with ``train_test_split``. Otherwise a single
        ``StratifiedShuffleSplit`` is drawn on ``data[feature]``.

    Returns
    -------
    list
        ``[train, test]`` -- the two DataFrames, in that order.
    """
    data = pd.read_csv("../input/course-material-walmart-challenge/train.csv")
    if feature is None:
        train, test = train_test_split(data, test_size = 0.2, random_state = 21)
    else:
        splitter = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state = 21)
        # split() yields positional row numbers, so they are applied with
        # .iloc; label-based .loc only matches them when the frame happens
        # to carry a default 0..n-1 index.
        train_index, test_index = next(splitter.split(data, data[feature]))
        train = data.iloc[train_index]
        test = data.iloc[test_index]
    return [train, test]

In [None]:
# Splitting the data 80/20 with a stratified shuffle split on the "Date" column
train, test = importData("Date")

# Taking a quick look into the data

In [None]:
train.head(10)

In [None]:
train.info()

In [None]:
# Show the distinct store ids in the training split and how many there are
stores = train['Store'].unique()
print(stores)
print(f'Total number of stores = {len(stores)}')

In [None]:
# Show the distinct department ids in the training split and how many there are
departments = train['Dept'].unique()
print(departments)
print(f'Total number of departments = {len(departments)}')

In [None]:
# List every distinct recording date, then count how many fall in each year
date_list = train['Date'].unique()
print('Dates of the weeks on which data was recorded :- ')
print(date_list)
print('Total number of weeks = {}'.format(len(date_list)))

# One bucket per year; any year that is neither 2012 nor 2011 lands in year10
year10, year11, year12 = [], [], []
for stamp in date_list:
    year = [int(part) for part in stamp.split('-')][0]
    if year == 2012:
        bucket = year12
    elif year == 2011:
        bucket = year11
    else:
        bucket = year10
    bucket.append(year)

print('Number of Entries in Year 2010 = {}'.format(len(year10)))
print('Number of Entries in Year 2011 = {}'.format(len(year11)))
print('Number of Entries in Year 2012 = {}'.format(len(year12)))

**The entries are from February 2010 to October 2012, consisting of 143 entries. We can replace the dates by week number which will help us in visualization.**

# Replacing the "Date" variable with "Week" variable. 

In [None]:
class replaceDateWithWeek(BaseEstimator, TransformerMixin):
    """Replace the "Date" column with a 1-based chronological "Week" number.

    The numbering is derived from the distinct dates present in the frame
    passed to ``transform``: the earliest date becomes week 1, the next one
    week 2, and so on. A frame that lacks some dates is therefore numbered
    differently from one that contains them all.

    Parameters
    ----------
    arg : any, default None
        Stored on the instance but not used by any method.
    """
    def __init__(self,arg=None):
        self.arg = arg

    def fit(self,X,y=None):
        """Nothing is learned; returns the transformer unchanged."""
        return self

    def transform(self,data,y=None):
        """Add "Week" to ``data`` and drop "Date", modifying ``data`` in place.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a "Date" column of ``year-month-day`` strings.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The same object that was passed in.
        """
        dates = data['Date'].unique()
        print("Number of weeks in the dataset = {}".format(len(dates)))

        # Order the dates by their numeric (year, month, day) parts so the
        # ordering works for any year and with or without zero padding.
        ordered = sorted(dates, key = lambda date: tuple(int(part) for part in date.split('-')))

        # Dictionary lookup instead of list.index() for every row.
        week_of = {date: number for number, date in enumerate(ordered, start = 1)}
        data['Week'] = [week_of[date] for date in data['Date']]

        data.drop(['Date'], axis = 1, inplace = True)
        return data
    
# transform() edits the frame it is given in place (adds "Week", drops "Date"),
# so `train` itself is updated even though the return value is discarded here.
temp = replaceDateWithWeek()
temp.transform(train)
train.head(10)

**The above transformation can also be done using OrdinalEncoder**

# Performing EDA

# Checking the store wise sale in 3 years

In [None]:
# Total sales per store over the whole period, in millions of dollars.
# groupby sums all stores in one vectorised pass and uses whichever store ids
# are actually present instead of assuming they are exactly 1..45.
store_totals = train.groupby('Store')['Weekly_Sales'].sum() / 1000000
store_index = store_totals.index.tolist()
sales_sum = store_totals.tolist()

# The seaborn theme must be set before the figure is created to apply to it.
sns.set(style = 'darkgrid')
plt.figure(figsize = (15,10))
plt.title('Store wise sales in 3 years')
sns.barplot(x = store_index, y = sales_sum)
plt.xlabel("Stores")
plt.ylabel("Sales (In Million $)")
plt.show()

***Few stores have very high sales while few have comparatively very low sales***

# Testing difference in sales on Holiday Weeks

In [None]:
# Average weekly sales for holiday weeks versus normal weeks.
# The holiday mask is built once and reused; .mean() replaces the manual
# Python-level sum(...) / len(...) over the filtered column.
is_holiday = train['IsHoliday'] == True
x = ['Holiday', 'Normal']
y = [
    train.loc[is_holiday, 'Weekly_Sales'].mean(),
    train.loc[~is_holiday, 'Weekly_Sales'].mean(),
]

# The seaborn theme must be set before the figure is created to apply to it.
sns.set(style = 'darkgrid')
plt.figure(figsize = (10,8))
plt.title('Difference in average sales on Holiday Weeks')
sns.barplot(x = x, y = y)
plt.xlabel("Type of week")
plt.ylabel("Average Sales (In $)")
plt.show()

***Not much difference is seen in Holiday and Normal week Average Weekly Sales***

# Time Series subplot of Weekly Sales, CPI and unemployment rate

In [None]:
# Three stacked line plots sharing the week axis: sales, CPI and unemployment
fig, axes = plt.subplots(3, 1, figsize = (20,15), sharex = True, sharey = False)
fig.suptitle("Time Series subplot of Weekly Sales, CPI and unemployment rate")

# Upper two panels: (column, title, y-axis label)
upper_panels = [
    ('Weekly_Sales', "Average week wise sales(in $) in 3 years", "Average Weekly Sales"),
    ('CPI', "Average week wise CPI in 3 years", "Average CPI"),
]
for panel, (column, title, y_label) in zip(axes, upper_panels):
    sns.lineplot(ax = panel, x = 'Week', y = column, data = train)
    panel.set_title(title)
    panel.set_ylabel(y_label)

# Bottom panel additionally carries the x-axis label
bottom = axes[2]
sns.lineplot(ax = bottom, x = 'Week', y = 'Unemployment', data = train)
bottom.set_title("Average week wise unemployment rate in 3 years")
bottom.set_xlabel("Week")
bottom.set_ylabel("Average Unemployment")

**The following points are observed:**
* The weekly sales have taken a jump around 48th and 100th week.
* The weekly sales have declined just after the jump.
* The CPI is increasing overall.
* The unemployment rate is decreasing overall.
* No clear correlation is seen among the three variables.( Weekly Sales, CPI, Unemployment Rate)

# Checking the temperature wise weekly sales

In [None]:
# Three independent line plots relating sales, temperature and week
fig, axes = plt.subplots(3, 1, figsize = (20,20), sharex = False, sharey = False)
fig.suptitle("Time Series subplot of Weekly Sales and Temperature")

# First two panels: (x column, y column, title, x-axis label, y-axis label)
leading_panels = [
    ('Temperature', 'Weekly_Sales', "Average week wise sales(in $) based on temperature",
     "Temperature (In Fahrenheit)", "Average weekly sales"),
    ('Week', 'Temperature', "Variation of average Temperature with Week",
     "Week", "Average Temperature(In Fahrenheit)"),
]
for panel, (x_col, y_col, title, x_label, y_label) in zip(axes, leading_panels):
    sns.lineplot(ax = panel, x = x_col, y = y_col, data = train)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)

# Last panel: weekly sales over time, for comparison with the temperature curve
last = axes[2]
sns.lineplot(ax = last, x = 'Week', y = 'Weekly_Sales', data = train)
last.set_title("Average week wise sales(in $) in 3 years")
last.set_xlabel("Week")
last.set_ylabel("Average Weekly Sales")

***The steep rise in the graph of Average Weekly Sales vs Week may be during Christmas (i.e in December) as the temperature is seen to be very low at that time***

# Filling up Markdowns for 2010 and 2011 based on data after Nov 2011
* The entries for MarkDowns start on November 11, 2011 and run up to October 26, 2012.
* The missing entries from February 5, 2010 up to October 28, 2011 need to be filled in.

In [None]:
class fillMarkDowns(BaseEstimator, TransformerMixin):
    """Fill MarkDown1..MarkDown5 using per-week averages of weeks 92-143.

    Weeks 92 to 143 form a 52-week reference period. For each markdown
    column the mean is taken per reference week, counting a missing value
    as 0. Every week is then mapped onto the reference week that sits at
    the same position of the 52-week cycle (week w and week w + 52 share a
    position):

    * weeks 1 to 91 have the column overwritten with that average;
    * any other row only has its missing values replaced by it.

    Parameters
    ----------
    arg : any, default None
        Stored on the instance but not used by any method.
    """
    def __init__(self,arg = None):
        self.arg = arg

    def fit(self,X,y = None):
        """Nothing is learned; returns the transformer unchanged."""
        return self

    def transform(self,data, y = None):
        """Fill the markdown columns of ``data`` in place and return it.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain an integer "Week" column and the columns
            "MarkDown1" to "MarkDown5".
        y : ignored

        Returns
        -------
        pandas.DataFrame
            The same object that was passed in. A cycle position with no
            reference rows in ``data`` yields NaN for the rows mapped to it.
        """
        first_reference_week = 92
        last_reference_week = 143
        weeks_per_cycle = 52

        week = data['Week']
        # Position inside the 52-week cycle. Each reference week owns exactly
        # one position, so grouping by position is grouping by reference week.
        slot = (week - 1) % weeks_per_cycle
        in_reference = week.between(first_reference_week, last_reference_week).to_numpy()
        before_reference = week.between(1, first_reference_week - 1).to_numpy()
        reference_slots = slot.to_numpy()[in_reference]

        for number in range(1, 6):
            column = 'MarkDown' + str(number)

            # Average per reference week, with missing markdowns counted as 0.
            slot_average = (
                data.loc[in_reference, column]
                .fillna(0)
                .groupby(reference_slots)
                .mean()
            )

            # The same cycle position is used for every week, so a gap inside
            # the reference period receives the average of its own week.
            replacement = slot.map(slot_average).to_numpy()
            overwrite = before_reference | data[column].isna().to_numpy()
            data[column] = np.where(overwrite, replacement, data[column].to_numpy())

        return data

# transform() fills the MarkDown columns of `train` in place;
# info() then shows the resulting non-null counts per column.
temp = fillMarkDowns()
temp.transform(train)
train.info()

# Checking correlation

In [None]:
# Correlate only the numeric and boolean columns. Non-numeric columns (such as
# the categorical "Type") cannot be correlated, and recent pandas versions
# raise on them instead of silently skipping them.
corr_matrix = train.select_dtypes(include = ['number', 'bool']).corr()
plt.figure(figsize = (15,10))
sns.heatmap(corr_matrix, vmin = -1, vmax = 1, cmap = 'seismic')
# Hatch the axes background so cells without a value stand out
plt.gca().patch.set(hatch = "X", edgecolor = "#0080ff")
plt.show()

* As seen from graph CPI and unemployment rate are inversely correlated
* Markdown 2 and Temperature are inversely correlated
* There is a slight correlation between Store and CPI, Unemployment rate and Size.
* Week and fuel price are strongly correlated. It tells us that fuel prices are consistently rising.
* MarkDown4 and MarkDown1 are also strongly correlated.

# Replacing the boolean values in IsHoliday variable 

In [None]:
class replaceBoolean(BaseEstimator, TransformerMixin):
    """Encode the boolean "IsHoliday" column as integers (True -> 1, False -> 0).

    Parameters
    ----------
    arg : any, default None
        Stored on the instance but not used by any method.
    """
    def __init__(self,arg=None):
        self.arg = arg

    def fit(self,X,y=None):
        """Nothing is learned; returns the transformer unchanged."""
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` whose "IsHoliday" column holds 1/0 integers.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain an "IsHoliday" column that can be cast to int.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; the frame passed in is left untouched.
        """
        # Work on a copy so the caller's data is not modified, and cast the
        # whole column at once rather than writing ints into a boolean
        # column cell by cell.
        X = X.copy()
        X['IsHoliday'] = X['IsHoliday'].astype(int)
        return X

# Encoding Categorical Features

* "Date" is an ordinal categorical variable, so we can keep it as the "Week" variable.
* As "Store" and "Dept" have many categories, we will use Binary Encoding for them.
* The "Type" variable has only 3 classes, hence we will use One Hot Encoding for it.
* "IsHoliday" is a boolean variable, hence we will encode it using 1 and 0.

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the configured columns of its input."""

    def __init__(self, columns):
        # Column label(s) used to index the input in transform()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured columns."""
        selected = X[self.columns]
        return selected

In [None]:
#Creating Pipeline
def runPipeline(data):
    """Turn the raw feature frame into an all-float model input frame.

    Steps, in order:
      1. ``replaceDateWithWeek`` and ``fillMarkDowns`` are applied to the frame.
      2. A ``ColumnTransformer`` binary-encodes "Store"/"Dept", one-hot
         encodes "Type", standard-scales the numeric columns and converts
         "IsHoliday" with ``replaceBoolean``.
      3. The result is wrapped in a DataFrame whose columns are labelled
         1..n and cast to float64.

    NOTE(review): "Week" is not listed in any transformer and no ``remainder``
    is passed, so with ColumnTransformer's default it does not reach the
    output -- confirm this is intended.
    """
    # Frame-level transformers, applied one after the other
    for step in (replaceDateWithWeek(), fillMarkDowns()):
        data = step.transform(data)

    encoded_columns = ['Store','Dept']
    scaled_columns = [
        'Temperature','Fuel_Price',
        'MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5',
        'CPI','Unemployment','Size',
    ]
    column_steps = [
        ('binary_encoder', BinaryEncoder(cols = encoded_columns, return_df=True), encoded_columns),
        ('one_hot_encoder', ce.OneHotEncoder(), ['Type']),
        ('scalar', StandardScaler(), scaled_columns),
        ('boolean_converter', replaceBoolean(), ['IsHoliday']),
    ]
    completePipeline = ColumnTransformer(column_steps, n_jobs = -1, verbose = True)

    features = completePipeline.fit_transform(data)
    # Columns are labelled with consecutive integers starting at 1
    labels = list(range(1, len(features[0]) + 1))
    frame = pd.DataFrame(features, columns = labels)
    return frame.astype(dtype = np.float64)

# Building Full Pipeline with Final Model

In [None]:
def _printScores(model, X, y, error_template, r2_prefix):
    """Print the RMSE and the R^2 (as a percentage) of ``model`` on ``(X, y)``."""
    rmse = np.sqrt(mean_squared_error(y, model.predict(X)))
    print(error_template.format(rmse))
    r2_percent = round(model.score(X, y) * 100, 2)
    print(r2_prefix, str(r2_percent) + ' percent')


def fullPipeline():
    """Preprocess the data, fit the stacking regressor and print its scores.

    The CSV is read and passed through ``runPipeline``, then split 80/20.
    A ``StackingRegressor`` (random forest, extra trees and XGBoost as base
    estimators, linear regression as final estimator) is fitted on the
    training part. RMSE and R^2 are printed for both parts. Nothing is
    returned.

    NOTE(review): ``runPipeline`` fits its transformers on the full data set
    before the split, so the preprocessing statistics also see the test rows.
    """
    # Preprocessing and splitting the data into train and test set
    data = pd.read_csv("../input/course-material-walmart-challenge/train.csv")
    weekly_sales = data["Weekly_Sales"].copy()
    data.drop("Weekly_Sales", axis = 1, inplace = True)
    data = runPipeline(data)

    # Features and target are split together. This avoids attaching the
    # target as a column and dropping it in place from slices of the frame.
    X, X_test, y, y_test = train_test_split(
        data, weekly_sales.to_numpy(), test_size = 0.2, random_state = 21
    )

    # Initializing all the base models.
    rf_reg = RandomForestRegressor(max_depth=232,max_features=23,n_estimators=162)
    # NOTE: max_depth=150 differs from the randomized-search optimum (239)
    # recorded in the tuning notes of this notebook.
    ext_reg = ExtraTreesRegressor(max_depth=150,max_features=27,n_estimators=283)
    xgb_reg = xgboost.XGBRegressor(eta=0.15,gamma=0,max_depth=13,min_child_weight=5)
    lin_reg = LinearRegression()

    # Declaring the parameters of Stacking Regressor
    estimators = [('rf', rf_reg),('ext', ext_reg),('xgb',xgb_reg)]
    final_estimator = lin_reg

    # Initializing and fitting the Stacking Regressor
    stacking_reg = StackingRegressor(estimators=estimators,final_estimator=final_estimator,cv = 2)
    stacking_reg.fit(X,y)

    # Predicting on the training set
    _printScores(
        stacking_reg, X, y,
        "Training Error = {}",
        "Coefficient of determination R^2 of the prediction on the training set: ",
    )

    # Predicting on the test set
    _printScores(
        stacking_reg, X_test, y_test,
        "Test Error = {}",
        "Coefficient of determination R^2 of the prediction on the test set: ",
    )

In [None]:
fullPipeline()

In [None]:
# # Importing the whole data and applying the complete pipeline again on the data
# train, test = importData('Date')
# y = train['Weekly_Sales'].copy()
# X_raw = train.drop(['Weekly_Sales'],axis = 1)
# y_test = test['Weekly_Sales'].copy()
# X_test_raw = test.drop(['Weekly_Sales'], axis = 1)
# X = runPipeline(X_raw)
# X_test = runPipeline(X_test_raw)

In [None]:
# X.head(10)

In [None]:
# X.info()

# Trying out various regression models

In [None]:
# lin_reg = LinearRegression()
# rnf_reg = RandomForestRegressor()
# ext_reg = ExtraTreesRegressor()
# voting_reg = VotingRegressor(
#     estimators=[('rf', rnf_reg), ('et', ext_reg)]
# )

In [None]:
# train = X.copy()
# train['Weekly_Sales'] = y.tolist()
# X_train, X_valid = train_test_split(train, test_size = 0.2, random_state = 21)

# y_train = X_train['Weekly_Sales'].copy()
# X_train.drop(["Weekly_Sales"], axis = 1, inplace = True)
# y_valid = X_valid['Weekly_Sales'].copy()
# X_valid.drop(["Weekly_Sales"], axis = 1, inplace = True)


# # for reg in (lin_reg, rnf_reg, ext_reg, voting_reg):
# #     reg.fit(X_train,y_train)
# #     y_pred = reg.predict(X_train)
# #     y_valid_pred = reg.predict(X_valid)
# #     reg_mse = mean_squared_error(y_train, y_pred)
# #     reg_rmse = np.sqrt(reg_mse)
# #     print(reg.__class__.__name__,": ")
# #     print("Training Error = {}".format(reg_rmse))
# #     reg_mse = mean_squared_error(y_valid, y_valid_pred)
# #     reg_rmse = np.sqrt(reg_mse)
# #     print("Validation Error = {}".format(reg_rmse))

# Saving the training and validation data set

In [None]:
# rows = [[x for x in range(1,30)]]
# rows[0].append("Weekly_Sales")
# for i in range(225960):
#     rows.append([train[x][i] for x in range(1,30)])
#     rows[i+1].append(train['Weekly_Sales'][i])
# with open("train.csv", 'w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerows(rows)

# Loading the training data set

In [None]:
# train = pd.read_csv("./train.csv")
# train.head()

# Trying out Boosting Algorithms

In [None]:
# #Initializing and fitting the model
# gbrt = GradientBoostingRegressor()
# gbrt.fit(X_train, y_train)

# #Predicting on traing set
# y_pred = gbrt.predict(X_train)
# gbrt_mse = mean_squared_error(y_train, y_pred)
# gbrt_rmse = np.sqrt(gbrt_mse)
# print("Gradient Boosting Regressor Training Error = {}".format(gbrt_rmse))

# #Predicting on validation set
# y_pred = gbrt.predict(X_valid)
# gbrt_mse = mean_squared_error(y_valid, y_pred)
# gbrt_rmse = np.sqrt(gbrt_mse)
# print("Gradient Boosting Regressor Validation Error = {}".format(gbrt_rmse))

In [None]:
# #Initializing and fitting the model
# xgb_reg = xgboost.XGBRegressor()
# xgb_reg.fit(X_train, y_train)

# #Predicting on traing set
# y_pred = xgb_reg.predict(X_train)
# xgb_reg_mse = mean_squared_error(y_train, y_pred)
# xgb_reg_rmse = np.sqrt(xgb_reg_mse)
# print("Extreme Gradient Boosting Regressor Training Error = {}".format(xgb_reg_rmse))

# #Predicting on validation set
# y_pred = xgb_reg.predict(X_valid)
# xgb_reg_mse = mean_squared_error(y_valid, y_pred)
# xgb_reg_rmse = np.sqrt(xgb_reg_mse)
# print("Extreme Gradient Boosting Regressor Validation Error = {}".format(xgb_reg_rmse))

In [None]:
# #Initializing and fitting the model
# ext_reg = ExtraTreesRegressor(n_estimators = 20,max_depth = 15)
# adb_reg = AdaBoostRegressor(base_estimator = ext_reg)
# adb_reg.fit(X_train, y_train)

# #Predicting on traing set
# y_pred = adb_reg.predict(X_train)
# adb_reg_mse = mean_squared_error(y_train, y_pred)
# adb_reg_rmse = np.sqrt(adb_reg_mse)
# print("Adaptive Boosting Regressor Training Error = {}".format(adb_reg_rmse))

# #Predicting on validation set
# y_pred = adb_reg.predict(X_valid)
# adb_reg_mse = mean_squared_error(y_valid, y_pred)
# adb_reg_rmse = np.sqrt(adb_reg_mse)
# print("Adaptive Boosting Regressor Validation Error = {}".format(adb_reg_rmse))

# Fine-Tuning Random Forest Regressor 

In [None]:
# param_distribs = {
#     'n_estimators': randint(low=150, high=300),
#     'max_features': randint(low=15, high=30),
#     'max_depth':randint(low=120, high = 250)
# }

# forest_reg = RandomForestRegressor(random_state=21)
# forest_rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs, n_jobs = -1, return_train_score = True,
#                                 n_iter=15, cv=2, scoring='neg_mean_squared_error', random_state=21)
# forest_rnd_search.fit(X_train,y_train)

# cvres = forest_rnd_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [None]:
# Best Params:  {'max_depth': 232, 'max_features': 23, 'n_estimators': 162}
# Random Forest Regressor Validation Error = 5202.5457875042375

# print("Best Params: ", forest_rnd_search.best_params_)
# y_pred = forest_rnd_search.best_estimator_.predict(X_valid)
# forest_mse = mean_squared_error(y_valid, y_pred)
# forest_rmse = np.sqrt(forest_mse)
# print("Random Forest Regressor Validation Error = {}".format(forest_rmse))

# Fine-Tuning Extra Trees Regressor 

In [None]:
# param_distribs = {
#     'n_estimators': randint(low=150, high=300),
#     'max_features': randint(low=15, high=30),
#     'max_depth':randint(low=120, high = 250)
# }

# ext_reg = ExtraTreesRegressor(random_state=21)
# ext_rnd_search = RandomizedSearchCV(ext_reg, param_distributions=param_distribs, n_jobs = -1, return_train_score = True,
#                                 n_iter=15, cv=2, scoring='neg_mean_squared_error', random_state=21)
# ext_rnd_search.fit(X_train,y_train)

# cvres = ext_rnd_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [None]:
# Best Params:  {'max_depth': 239, 'max_features': 27, 'n_estimators': 283}

# # Extra Trees Regressor Validation Error = 4782.645726024579
# print("Best Params: ", ext_rnd_search.best_params_)
# y_pred = ext_rnd_search.best_estimator_.predict(X_valid)
# ext_mse = mean_squared_error(y_valid, y_pred)
# ext_rmse = np.sqrt(ext_mse)
# print("Extra Trees Regressor Validation Error = {}".format(ext_rmse))

# Hyperparameter tuning XGBoost

In [None]:
# param_distribs = {
#     'eta':[0.15,0.17,0.19,0.2,0.22,0.24,0.25,0.27,0.29,0.3],
#     'min_child_weight':randint(low = 1, high = 7),
#     'max_depth':randint(low=10, high = 25),
#     'gamma':randint(low = 0,high = 5),
# }

# xgb_reg = xgboost.XGBRegressor()
# xgb_rnd_search = RandomizedSearchCV(xgb_reg, param_distributions=param_distribs, n_jobs = -1, return_train_score = True,
#                                 n_iter=15, cv=2, scoring='neg_mean_squared_error', random_state=21)
# xgb_rnd_search.fit(X_train,y_train)

# cvres = xgb_rnd_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [None]:
# Best Params:  {'eta': 0.15, 'gamma': 0, 'max_depth': 13, 'min_child_weight': 5}

# print("Best Params: ", xgb_rnd_search.best_params_)
# y_pred = xgb_rnd_search.best_estimator_.predict(X_valid)
# xgb_mse = mean_squared_error(y_valid, y_pred)
# xgb_rmse = np.sqrt(xgb_mse)
# print("Extra Trees Regressor Validation Error = {}".format(xgb_rmse))

# Implementing Voting Regressor

In [None]:
# Training Error = 939.4560063275186 (ext+xgb)
# Validation Error = 4582.014285496773

# ext_reg = ExtraTreesRegressor(max_depth=239,max_features=27,n_estimators=283)
# xgb_reg = xgboost.XGBRegressor(eta=0.15,gamma=0,max_depth=13,min_child_weight=5)
# voting_reg = VotingRegressor(estimators=[('xgb', xgb_reg), ('et', ext_reg), ('rnf', rnf_reg)])

# voting_reg.fit(X_train, y_train)
# y_pred = voting_reg.predict(X_train)
# voting_train_mse = mean_squared_error(y_train, y_pred)
# voting_train_rmse = np.sqrt(voting_train_mse)
# print("Training Error = {}".format(voting_train_rmse))

In [None]:
# y_pred = voting_reg.predict(X_valid)
# voting_valid_mse = mean_squared_error(y_valid, y_pred)
# voting_valid_rmse = np.sqrt(voting_valid_mse)
# print("Validation Error = {}".format(voting_valid_rmse))

# Implementing Stacking Regressor

In [None]:
# # Training Error = 736.3944328793483
# # Validation Error = 4517.871439867831

# rf_reg = RandomForestRegressor(max_depth=232,max_features=23,n_estimators=162)
# ext_reg = ExtraTreesRegressor(max_depth=239,max_features=27,n_estimators=283)
# xgb_reg = xgboost.XGBRegressor(eta=0.15,gamma=0,max_depth=13,min_child_weight=5)
# lin_reg = LinearRegression()

# estimators = [('rf', rf_reg),('ext', ext_reg),('xgb',xgb_reg)]
# final_estimator = lin_reg

# stacking_reg = StackingRegressor(estimators=estimators,final_estimator=final_estimator,cv = 2)
# stacking_reg.fit(X_train,y_train)

# y_pred = stacking_reg.predict(X_train)
# stacking_train_mse = mean_squared_error(y_train, y_pred)
# stacking_train_rmse = np.sqrt(stacking_train_mse)
# print("Training Error = {}".format(stacking_train_rmse))
# acc_stacking_train = round( stacking_reg.score(X_train, y_train) * 100, 2)
# print ("Coefficient of determination R^2 of the prediction on the test set: ", str(acc_stacking_train) + ' percent')
    
# y_pred = stacking_reg.predict(X_valid)
# stacking_valid_mse = mean_squared_error(y_valid, y_pred)
# stacking_valid_rmse = np.sqrt(stacking_valid_mse)
# print("Validation Error = {}".format(stacking_valid_rmse))
# acc_stacking_test = round( stacking_reg.score(X_valid, y_valid) * 100, 2)
# print ("Coefficient of determination R^2 of the prediction on the test set: ", str(acc_stacking_test) + ' percent')