In [13]:
!pip install vecstack



In [14]:
!pip install imblearn



In [88]:
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from imblearn.over_sampling import SMOTE 
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [65]:
# To upload our datasets from our working directory we need to mount our drive contents to the colab environment. 
# For the code to do so you can search “mount” in code snippets or use the code given below. 
# Our entire drive contents are now mounted on colab at the location “/gdrive”.

trainfile = r'train.csv'
train_data = pd.read_csv(trainfile)

testfile = r'test.csv'
test_data = pd.read_csv(testfile)

print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


In [66]:
# Decide which Train data should drop
train_missing = train_data.isna().sum() / train_data.shape[0]
train_missing[train_missing > 0.7]

Alley          0.937671
PoolQC         0.995205
Fence          0.807534
MiscFeature    0.963014
dtype: float64

In [67]:
# Decide which Test data should drop
test_missing = test_data.isna().sum() / test_data.shape[0]
test_missing[test_missing > 0.7]

Alley          0.926662
PoolQC         0.997944
Fence          0.801234
MiscFeature    0.965045
dtype: float64

In [68]:
train_data.loc[:, (train_data.isna().sum() / train_data.shape[0] > 0.7)].columns

Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [69]:
test_data.loc[:, (test_data.isna().sum() / test_data.shape[0] > 0.7)].columns

Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [70]:
# Drop Train data that the percentage of missing value is larger than 0.7
train_data.drop(train_missing[train_missing > 0.7].index, axis = 1, inplace = True)

In [71]:
# Drop Test data that the percentage of missing value is larger than 0.7
test_data.drop(test_missing[test_missing > 0.7].index, axis = 1, inplace = True)

In [72]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [75]:
train_data.loc[:, (train_data.isna().sum() > 0).values].isna().sum()

Series([], dtype: float64)

In [74]:
# Manage the missing value in Train dataset
train_data['LotFrontage'].interpolate(axis=0, inplace=True)
train_data[['MasVnrType']].fillna('None', inplace=True)
train_data.dropna(subset=['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond'], inplace=True)
train_data.dropna(subset=['BsmtQual', 'BsmtCond', 'BsmtFinType1'], inplace=True)
train_data.dropna(subset=['MasVnrType', 'MasVnrArea'], inplace=True)
train_data.drop('FireplaceQu', axis=1, inplace=True)
train_data['BsmtExposure'].fillna('No', inplace=True)
train_data['BsmtFinType2'].fillna('Unf', inplace=True)
train_data['Electrical'].fillna('SBrkr', inplace=True)

In [78]:
test_data.loc[:, (test_data.isna().sum() > 0).values].isna().sum()

Series([], dtype: float64)

In [77]:
# Manage the missing value in Test dataset
test_data.fillna(method='ffill', inplace=True)
test_data.fillna(method='bfill', inplace=True)

In [79]:
# Copy Train data excluding target
trainData_Copy = train_data.drop(['SalePrice', 'Id'], axis=1).copy()
testData_Copy = test_data.drop('Id', axis=1).copy()

# Combine Train and test for One Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys = [0,1])

# Do One Hot Encoding for categorical features
combined_Data = pd.get_dummies(combined_Data)

# Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)

y_train = train_data["SalePrice"]

print(X_train.shape)
print(X_test.head()) 

print(y_train.shape)

(1341, 274)
   1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  BsmtFinSF1  BsmtFinSF2  \
0       896         0          0             2       468.0       144.0   
1      1329         0          0             3       923.0         0.0   
2       928       701          0             3       791.0         0.0   
3       926       678          0             3       602.0         0.0   
4      1280         0          0             2       263.0         0.0   

   BsmtFullBath  BsmtHalfBath  BsmtUnfSF  EnclosedPorch  ...  SaleType_ConLD  \
0           0.0           0.0      270.0              0  ...               0   
1           0.0           0.0      406.0              0  ...               0   
2           0.0           0.0      137.0              0  ...               0   
3           0.0           0.0      324.0              0  ...               0   
4           0.0           0.0     1017.0              0  ...               0   

   SaleType_ConLI  SaleType_ConLw  SaleType_New  SaleType_Oth 

In [31]:
# Decision Tree - default mode
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
clf_predict = clf.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': clf_predict}).to_csv('result_decisiontree', index=None)

In [32]:
# Hyperparameter tuning done for DecisionTreeRegressor
parameters={'min_samples_split' : range(10,100,10),'max_depth': range(1,20,2)}
clf_random = RandomizedSearchCV(clf, parameters, n_iter=15)
clf_random.fit(X_train, y_train)
grid_parm = clf_random.best_params_
print(grid_parm)

# Using the parameters obtained from HyperParameterTuning in the DecisionTreeRegressor
clf = DecisionTreeRegressor(**grid_parm)
clf.fit(X_train, y_train)
clf_predict = clf.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': clf_predict}).to_csv('result_decisiontree_best', index=None)

{'min_samples_split': 10, 'max_depth': 11}


In [33]:
# Random Forest - default mode
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': rfc_predict}).to_csv('result_randomforest', index=None)

In [34]:
# Hyperparameter tuning for RandomForestRegressor
parameters = {'n_estimators': range(50,150,20), 'min_samples_split' : range(10,100,10), 'max_depth': range(1,20,2)}
rfc_random = RandomizedSearchCV(rfc, parameters, n_iter=15)
rfc_random.fit(X_train, y_train)
grid_parm_rfc = rfc_random.best_params_
print(grid_parm_rfc)

# Contruct Random Forest using the best parameters
rfc = RandomForestRegressor(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': rfc_predict}).to_csv('result_randomforest_best', index=None)

{'n_estimators': 110, 'min_samples_split': 10, 'max_depth': 17}


In [35]:
# MLP Regressor - default mode
mlp = MLPRegressor()
mlp.fit(X_train, y_train)
mlp_predict = mlp.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': mlp_predict}).to_csv('result_mlpregressor', index=None)

In [36]:
# Hyperparameter tuning for MLP Regressor
parameters = {'hidden_layer_sizes': range(100, 500, 10), 'activation': ['identity', 'logistic', 'tanh', 'relu'], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'learning_rate_init': [0.001, 0.01, 0.005]}
mlp_random = RandomizedSearchCV(mlp, parameters, n_iter=15)
mlp_random.fit(X_train, y_train)
grid_parm_mlp = mlp_random.best_params_
print(grid_parm_mlp)

# Contruct MLP Regressor using the best parameters
mlp = MLPRegressor(**grid_parm_mlp)
mlp.fit(X_train,y_train)
mlp_predict = mlp.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': mlp_predict}).to_csv('result_mlpregressor_best', index=None)

{'learning_rate_init': 0.001, 'learning_rate': 'constant', 'hidden_layer_sizes': 290, 'activation': 'relu'}


In [37]:
# Support Vector Regressor - default mode
svr = SVR()
svr.fit(X_train, y_train)
svr_predict = svr.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': svr_predict}).to_csv('result_svmregressor', index=None)

In [42]:
# Hyperparameter tuning for Support Vector Regressor
parameters = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'degree': range(1, 5, 1)}
svr_random = RandomizedSearchCV(svr, parameters, n_iter=15)
svr_random.fit(X_train, y_train)
grid_parm_svr = svr_random.best_params_
print(grid_parm_svr)

# Contruct Support Vector Regressor using the best parameters
svr = SVR(**grid_parm_svr)
svr.fit(X_train, y_train)
svr_predict = svr.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': svr_predict}).to_csv('result_svmregressor_best', index=None)

{'kernel': 'linear', 'degree': 2}


In [91]:
# Gradient Boosting Regressor - default mode
search_random = {'n_estimators': range(1000, 5000, 100), 'learning_rate':[0.01, 0.1, 0.05]}
abc = GradientBoostingRegressor()
abc.fit(X_train, y_train)
abc_predict = abc.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': abc_predict}).to_csv('result_gradientboosting', index=None)

In [93]:
# Hyperparameter tuning for Gradient Boosting Regressor
parameters = {'min_samples_split': range(10, 50, 2), 'max_depth': range(10, 50, 5)}
abc_random = RandomizedSearchCV(abc, parameters, n_iter=15)
abc_random.fit(X_train, y_train)
grid_parm_abc = abc_random.best_params_
print(grid_parm_abc)

# Contruct Gradient Boosting Regressor using the best parameters
abc = GradientBoostingRegressor(**grid_parm_abc)
abc.fit(X_train,y_train)
abc_predict = abc.predict(X_test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': abc_predict}).to_csv('result_gradientboosting_best', index=None)

{'min_samples_split': 46, 'max_depth': 30}


In [82]:
# Stacking Models
print("Ensemble Methods Predictions using MLP Regressor, Random Forest Regressor and Decision Tree Regressor\n")

models = [MLPRegressor(**grid_parm_mlp), RandomForestRegressor(**grid_parm_rfc), DecisionTreeRegressor(**grid_parm)]
      
S_Train, S_Test = stacking(models, X_train, y_train, X_test, regression=True, verbose=2)

Ensemble Methods Predictions using MLP Regressor, Random Forest Regressor and Decision Tree Regressor

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [MLPRegressor]
    fold  0:  [28806.51569566]
    fold  1:  [31058.52438161]
    fold  2:  [28650.69785378]
    fold  3:  [29497.06507736]
    ----
    MEAN:     [29503.20075210] + [952.78545572]
    FULL:     [29502.68122559]

model  1:     [RandomForestRegressor]
    fold  0:  [17959.35655317]
    fold  1:  [19663.28321144]
    fold  2:  [16679.64870497]
    fold  3:  [18794.90275060]
    ----
    MEAN:     [18274.29780505] + [1100.27311530]
    FULL:     [18274.06294953]

model  2:     [DecisionTreeRegressor]
    fold  0:  [27897.16431457]
    fold  1:  [27788.10229964]
    fold  2:  [25168.71578756]
    fold  3:  [26258.96423285]
    ----
    MEAN:     [26778.23665866] + [1132.69909935]
    FULL:     [26779.07105661]



In [83]:
# Stacking - Construct a Random Forest Regressor
model = RandomForestRegressor()  
model = model.fit(S_Train, y_train)
model_pred = model.predict(S_Test)

pd.DataFrame({'Id': test_data.Id, 'SalePrice': model_pred}).to_csv('result_stacking', index=None)