In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV

import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Remove pandas' column/row display limits so frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Read the training and test tables from the data directory in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/data_train.csv", "../data/data_test.csv")
)

In [3]:
# Split the training frame into the regression target and the predictors.
target_column = 'SalePrice'
y = train[target_column]
X = train.drop(columns=[target_column])

### Helper function - Compute summary vectors

In [4]:
# Function to calculate vectors of mean and standard deviation values, one entry per parameter set

def compute_vectors(grid_search, trials=None):
    '''
    Summarise a fitted grid search as per-parameter-set error statistics.

    Parameters
    ----------
    grid_search : object
        Fitted search object exposing ``cv_results_`` with the entries
        ``"mean_test_score"`` and ``"std_test_score"``.
    trials : int, optional
        Expected number of evaluated parameter sets. When omitted, the
        number is read from ``cv_results_``. When given, it must match the
        number of results; a mismatch would otherwise either overflow the
        output or leave misleading zero entries in it.

    Returns
    -------
    mean_vec : numpy.ndarray
        Mean test score of each parameter set with the sign flipped, so a
        negated-error scorer yields a positive error.
    std_vec : numpy.ndarray
        Standard deviation of the test score of each parameter set.

    Raises
    ------
    ValueError
        If ``trials`` is given and differs from the number of results.
    '''
    results = grid_search.cv_results_
    # np.array copies, so callers cannot mutate cv_results_ through the outputs
    mean_scores = np.array(results["mean_test_score"], dtype=float)
    std_vec = np.array(results["std_test_score"], dtype=float)

    if trials is not None and trials != len(mean_scores):
        raise ValueError(
            "trials=%r does not match the %d parameter sets in cv_results_"
            % (trials, len(mean_scores))
        )

    # negative sign turns the negated-error scores into positive errors
    mean_vec = -mean_scores
    return mean_vec, std_vec

# SVR

In [None]:
%time
from sklearn.svm import SVR
svr = SVR()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'epsilon': [0.01, 0.1, 1]}

svr_model = GridSearchCV(svr, param_grid, cv = 10, scoring = 'neg_mean_absolute_error')
svr_model.fit(X, y)

Wall time: 0 ns


In [None]:
# Number of parameter sets the search evaluated; read from the results so it
# stays in sync with param_grid instead of being hard-coded.
iterations = len(svr_model.cv_results_["params"])
# Calling compute_vectors to get the sign-flipped mean scores and their std
mean_vec, std_vec = compute_vectors(svr_model, iterations)
param_sets = np.arange(iterations) + 1

plt.figure(figsize=(12,10))
plt.title('Support Vector Regressor', fontsize= 20)
plt.plot(param_sets, mean_vec)
plt.errorbar(param_sets, mean_vec, yerr = std_vec)
plt.ylabel("MAE", fontsize= 20)
plt.xlabel("Parameter Set", fontsize= 20)
plt.show()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 'absolute_error' is the current name of the criterion formerly spelled
# 'mae'; the old spelling is rejected by current scikit-learn releases.
rf = RandomForestRegressor(n_jobs=-1, random_state=0, criterion = 'absolute_error')

# 5 values of ccp_alpha x 3 forest sizes x 3 max_features settings = 45 parameter sets
param_grid = {'ccp_alpha': [0.001, 0.01, 0.1, 1, 10],
             'n_estimators':[10, 100, 500],
             # 1.0 = consider every feature at each split, which is what the
             # removed 'auto' option meant for regressors
             'max_features': (1.0, 'sqrt', 'log2')
             }

# 10-fold cross-validation, scored with negated mean absolute error
rf_model = GridSearchCV(rf, param_grid, cv = 10, scoring = 'neg_mean_absolute_error')
rf_model.fit(X, y)

In [None]:
# Number of parameter sets the search evaluated; read from the results so it
# stays in sync with param_grid instead of being hard-coded.
iterations = len(rf_model.cv_results_["params"])
# Calling compute_vectors on the Random Forest search (not the SVR one) to get
# the sign-flipped mean scores and their std
mean_vec, std_vec = compute_vectors(rf_model, iterations)
param_sets = np.arange(iterations) + 1

plt.figure(figsize=(12,10))
plt.title('Random Forest Regressor', fontsize= 20)
plt.plot(param_sets, mean_vec)
plt.errorbar(param_sets, mean_vec, yerr = std_vec)
plt.ylabel("MAE", fontsize= 20)
plt.xlabel("Parameter Set", fontsize= 20)
plt.show()

In [None]:
# print values

# XGBoost

In [None]:
from xgboost import XGBRegressor

# Base XGBoost regressor that the grid search below tunes.
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    n_jobs=-1,
)

In [None]:
# Search space: 3 depths x 3 boosters x 3 gamma values = 27 parameter sets.
param_grid = dict(
    max_depth=[2, 4, 6],
    booster=('gbtree', 'gblinear', 'dart'),
    gamma=[0.01, 0.1, 1],
)

# 5-fold cross-validation, scored with negated mean absolute error.
xgb_model = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=4,
)
xgb_model.fit(X, y)

In [None]:
# Number of parameter sets the search evaluated; read from the results so it
# stays in sync with param_grid instead of being hard-coded.
iterations = len(xgb_model.cv_results_["params"])
# Calling compute_vectors to get the sign-flipped mean scores and their std
mean_vec, std_vec = compute_vectors(xgb_model, iterations)
param_sets = np.arange(iterations) + 1

plt.figure(figsize=(12,10))
plt.title('XGBoost Regressor', fontsize= 20)
plt.plot(param_sets, mean_vec)
plt.errorbar(param_sets, mean_vec, yerr = std_vec)
plt.ylabel("MAE", fontsize= 20)
plt.xlabel("Parameter Set", fontsize= 20)
plt.show()

In [None]:
#print vals

# LightGBM

In [None]:
import lightgbm
# LightGBM regressor created with no arguments (library defaults); it is the
# estimator handed to the grid search in the next cell.
lgb = lightgbm.LGBMRegressor()

In [None]:
# 3 leaf counts x 3 boosting types x 3 values of reg_alpha = 27 parameter sets
param_grid = {'num_leaves': [5,15,31],
              # 'goss' = gradient-based one-side sampling; the previous
              # spelling 'gross' is not a LightGBM boosting type
              'boosting_type': ('gbdt', 'dart', 'goss'),
              'reg_alpha': [0.01, 0.1, 1]
             }

# 5-fold cross-validation, scored with negated mean absolute error
lgb_model = GridSearchCV(lgb, param_grid, cv = 5, scoring = 'neg_mean_absolute_error', verbose = 4)
lgb_model.fit(X, y)

In [None]:
# Number of parameter sets the search evaluated; read from the results so it
# stays in sync with param_grid instead of being hard-coded.
iterations = len(lgb_model.cv_results_["params"])
# Calling compute_vectors to get the sign-flipped mean scores and their std
mean_vec, std_vec = compute_vectors(lgb_model, iterations)
param_sets = np.arange(iterations) + 1

plt.figure(figsize=(12,10))
# Title names the model actually plotted in this cell
plt.title('LightGBM Regressor', fontsize= 20)
plt.plot(param_sets, mean_vec)
plt.errorbar(param_sets, mean_vec, yerr = std_vec)
plt.ylabel("MAE", fontsize= 20)
plt.xlabel("Parameter Set", fontsize= 20)
plt.show()

# Stacking

![Stacking](stacking.png)

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Base learners of the stack. The tuned models are rebuilt from the best
# parameters found by their respective grid searches.
estimators = [
    # Ridge's `normalize` argument no longer exists in current scikit-learn;
    # scaling the features in a pipeline step is the documented replacement.
    # NOTE(review): this is not numerically identical to normalize=True, so
    # the effective regularisation strength differs - retune alpha if needed.
    ('ridge', make_pipeline(StandardScaler(), Ridge())),
    ('xgb', XGBRegressor(**xgb_model.best_params_)),
    # the comma after this entry was missing, which made Python try to call
    # the tuple and raise a TypeError
    ('svr', SVR(**svr_model.best_params_)),
    ('lgb', lightgbm.LGBMRegressor(**lgb_model.best_params_))]

reg = StackingRegressor(
    estimators=estimators,
    # fixed seed so repeated runs build the same meta-learner
    final_estimator=RandomForestRegressor(random_state=0)
)

In [None]:
# Fit the stacked regressor on the training features X and target y.
reg.fit(X, y)

# Model Evaluation

In [None]:
from sklearn.model_selection import cross_validate
# Evaluate on X, the feature matrix used everywhere else in this notebook;
# the previously referenced `X1` is never defined and raised a NameError.
# 5-fold cross-validation reporting R^2 and negated mean absolute error.
scores = cross_validate(reg, X, y, cv=5,
                        scoring=('r2', 'neg_mean_absolute_error'),
                        return_train_score=False)

In [None]:
# Per-fold test scores for the negated-MAE scorer requested in cross_validate.
scores['test_neg_mean_absolute_error']

In [None]:
# Average of the per-fold negated-MAE test scores.
scores['test_neg_mean_absolute_error'].mean()

In [None]:
# Spread of the per-fold negated-MAE scores. `scores` is a dict, so
# len(scores) is the number of result keys, not the number of folds;
# np.var divides by the number of fold scores instead.
fold_scores = scores['test_neg_mean_absolute_error']
variance = np.var(fold_scores)
# Standard deviation across folds
variance ** 0.5