In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Goal: To predict sale price
## 1) Data Analysis

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
train.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


### Preprocessing data

In [4]:
# check for null values
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

### Selecing data

In [5]:
# only selecting numeric columns for inital set up
train_mod = train.select_dtypes(include=['int64'])

In [6]:
train_mod.columns

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

## 2) Model building

### Split data for training and tesing

In [7]:
 #Lets split training data into train and test
train_x = train_mod.iloc[:,train_mod.columns != 'SalePrice']
train_y = train_mod["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(train_x,train_y, test_size=0.33, random_state=42)

In [8]:
corr = train_mod.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

  corr.style.background_gradient(cmap='coolwarm').set_precision(2)


Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,1.0,0.01,-0.03,-0.03,0.01,-0.01,-0.02,-0.01,-0.01,-0.01,-0.02,0.01,0.01,-0.04,0.01,0.0,-0.02,0.01,0.01,0.04,0.0,0.03,-0.02,0.02,0.02,-0.03,-0.0,0.0,-0.05,0.0,0.06,-0.01,0.02,0.0,-0.02
MSSubClass,0.01,1.0,-0.14,0.03,-0.06,0.03,0.04,-0.07,-0.07,-0.14,-0.24,-0.25,0.31,0.05,0.07,0.0,-0.0,0.13,0.18,-0.02,0.28,0.04,-0.05,-0.04,-0.1,-0.01,-0.01,-0.01,-0.04,-0.03,0.01,-0.01,-0.01,-0.02,-0.08
LotArea,-0.03,-0.14,1.0,0.11,-0.01,0.01,0.01,0.21,0.11,-0.0,0.26,0.3,0.05,0.0,0.26,0.16,0.05,0.13,0.01,0.12,-0.02,0.19,0.27,0.15,0.18,0.17,0.08,-0.02,0.02,0.04,0.08,0.04,0.0,-0.01,0.26
OverallQual,-0.03,0.03,0.11,1.0,-0.09,0.57,0.55,0.24,-0.06,0.31,0.54,0.48,0.3,-0.03,0.59,0.11,-0.04,0.55,0.27,0.1,-0.18,0.43,0.4,0.6,0.56,0.24,0.31,-0.11,0.03,0.06,0.07,-0.03,0.07,-0.03,0.79
OverallCond,0.01,-0.06,-0.01,-0.09,1.0,-0.38,0.07,-0.05,0.04,-0.14,-0.17,-0.14,0.03,0.03,-0.08,-0.05,0.12,-0.19,-0.06,0.01,-0.09,-0.06,-0.02,-0.19,-0.15,-0.0,-0.03,0.07,0.03,0.05,-0.0,0.07,-0.0,0.04,-0.08
YearBuilt,-0.01,0.03,0.01,0.57,-0.38,1.0,0.59,0.25,-0.05,0.15,0.39,0.28,0.01,-0.18,0.2,0.19,-0.04,0.47,0.24,-0.07,-0.17,0.1,0.15,0.54,0.48,0.22,0.19,-0.39,0.03,-0.05,0.0,-0.03,0.01,-0.01,0.52
YearRemodAdd,-0.02,0.04,0.01,0.55,0.07,0.59,1.0,0.13,-0.07,0.18,0.29,0.24,0.14,-0.06,0.29,0.12,-0.01,0.44,0.18,-0.04,-0.15,0.19,0.11,0.42,0.37,0.21,0.23,-0.19,0.05,-0.04,0.01,-0.01,0.02,0.04,0.51
BsmtFinSF1,-0.01,-0.07,0.21,0.24,-0.05,0.25,0.13,1.0,-0.05,-0.5,0.52,0.45,-0.14,-0.06,0.21,0.65,0.07,0.06,0.0,-0.11,-0.08,0.04,0.26,0.22,0.3,0.2,0.11,-0.1,0.03,0.06,0.14,0.0,-0.02,0.01,0.39
BsmtFinSF2,-0.01,-0.07,0.11,-0.06,0.04,-0.05,-0.07,-0.05,1.0,-0.21,0.1,0.1,-0.1,0.01,-0.01,0.16,0.07,-0.08,-0.03,-0.02,-0.04,-0.04,0.05,-0.04,-0.02,0.07,0.0,0.04,-0.03,0.09,0.04,0.0,-0.02,0.03,-0.01
BsmtUnfSF,-0.01,-0.14,-0.0,0.31,-0.14,0.15,0.18,-0.5,-0.21,1.0,0.42,0.32,0.0,0.03,0.24,-0.42,-0.1,0.29,-0.04,0.17,0.03,0.25,0.05,0.21,0.18,-0.01,0.13,-0.0,0.02,-0.01,-0.04,-0.02,0.03,-0.04,0.21


### Build RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor on selected data

In [9]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score,max_error,RocCurveDisplay

In [10]:
regr = RandomForestRegressor(max_depth=200, random_state=42)
adr = AdaBoostRegressor(n_estimators=200, random_state=42)
gdr = GradientBoostingRegressor(n_estimators=200, random_state=42)

regr.fit(X_train, y_train)
adr.fit(X_train, y_train)
gdr.fit(X_train, y_train)

y_pred_rgr = regr.predict(X_test)
y_pred_adr = adr.predict(X_test)
y_pred_gdr = gdr.predict(X_test)

#metrics for exlaining  R^2 adj R^2,  metrics for model performance testing MSE, RMSE, MAE
print(f"RandomForestRegressor: R^2 = {r2_score(y_test, y_pred_rgr)}, max_error={max_error(y_test, y_pred_rgr) }");
print(f"AdaBoostRegressor: R^2 = {r2_score(y_test, y_pred_adr)}, max_error={max_error(y_test, y_pred_adr) }");
print(f"GradientBoostingRegressor: R^2 = {r2_score(y_test, y_pred_gdr)}, max_error={max_error(y_test, y_pred_gdr) }");

RandomForestRegressor: R^2 = 0.8696614368167593, max_error=271501.93
AdaBoostRegressor: R^2 = 0.8470105214360784, max_error=230699.09459459462
GradientBoostingRegressor: R^2 = 0.8546701458491424, max_error=332008.22382735857


### Build RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor using PCA

In [11]:
#using PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing

In [12]:

min_max_scaler = preprocessing.StandardScaler()
train_x_c = min_max_scaler.fit_transform(train_x.copy())
train_x_pca = PCA(n_components=5)

X_train, X_test, y_train, y_test = train_test_split(train_x_pca.fit_transform(train_x_c)\
                                                    ,train_y, test_size=0.33, random_state=42)

regr = RandomForestRegressor(max_depth=200, random_state=42)
adr = AdaBoostRegressor(n_estimators=200, random_state=42)
gdr = GradientBoostingRegressor(n_estimators=200, random_state=42)

regr.fit(X_train, y_train)
adr.fit(X_train, y_train)
gdr.fit(X_train, y_train)

y_pred_rgr = regr.predict(X_test)
y_pred_adr = adr.predict(X_test)
y_pred_gdr = gdr.predict(X_test)

#metrics for exlaining  R^2 adj R^2,  metrics for model performance testing MSE, RMSE, MAE
print(f"RandomForestRegressor: R^2 = {r2_score(y_test, y_pred_rgr)}, max_error={max_error(y_test, y_pred_rgr) }");
print(f"AdaBoostRegressor: R^2 = {r2_score(y_test, y_pred_adr)}, max_error={max_error(y_test, y_pred_adr) }");
print(f"GradientBoostingRegressor: R^2 = {r2_score(y_test, y_pred_gdr)}, max_error={max_error(y_test, y_pred_gdr) }");

RandomForestRegressor: R^2 = 0.8304115214120379, max_error=438376.49
AdaBoostRegressor: R^2 = 0.7601641133240579, max_error=560250.0
GradientBoostingRegressor: R^2 = 0.8052295893915117, max_error=552568.2852153287


### Build RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor using KernelPCA 

In [13]:
#using kernal PCA
from sklearn.decomposition import KernelPCA

min_max_scaler = preprocessing.StandardScaler()
train_x_c = min_max_scaler.fit_transform(train_x.copy())
train_x_pca = KernelPCA(n_components=10, kernel="poly")

X_train, X_test, y_train, y_test = train_test_split(train_x_pca.fit_transform(train_x_c)\
                                                    ,train_y, test_size=0.33, random_state=42)

regr = RandomForestRegressor(max_depth=200, random_state=42)
adr = AdaBoostRegressor(n_estimators=200, random_state=42)
gdr = GradientBoostingRegressor(n_estimators=200, random_state=42)

regr.fit(X_train, y_train)
adr.fit(X_train, y_train)
gdr.fit(X_train, y_train)

y_pred_rgr = regr.predict(X_test)
y_pred_adr = adr.predict(X_test)
y_pred_gdr = gdr.predict(X_test)

#metrics for exlaining  R^2 adj R^2,  metrics for model performance testing MSE, RMSE, MAE
print(f"RandomForestRegressor: R^2 = {r2_score(y_test, y_pred_rgr)}, max_error={max_error(y_test, y_pred_rgr) }");
print(f"AdaBoostRegressor: R^2 = {r2_score(y_test, y_pred_adr)}, max_error={max_error(y_test, y_pred_adr) }");
print(f"GradientBoostingRegressor: R^2 = {r2_score(y_test, y_pred_gdr)}, max_error={max_error(y_test, y_pred_gdr) }");


RandomForestRegressor: R^2 = 0.8241753131303865, max_error=392346.9
AdaBoostRegressor: R^2 = 0.7358631938393765, max_error=450651.4705882353
GradientBoostingRegressor: R^2 = 0.7764947057696819, max_error=407253.9751126041


### Build Neural Net using Scaled data

In [14]:
from sklearn.neural_network import MLPRegressor

min_max_scaler = preprocessing.StandardScaler()
train_x_c = min_max_scaler.fit_transform(train_x.copy())
train_x_pca = KernelPCA(n_components=10, kernel="poly")

X_train, X_test, y_train, y_test = train_test_split(train_x_pca.fit_transform(train_x_c)\
                                                    ,train_y, test_size=0.33, random_state=42)

regr = MLPRegressor(hidden_layer_sizes=(100,100,8), activation='relu', solver='adam',\
                    random_state=42, max_iter=500).fit(X_train, y_train)
y_pred_rgr = regr.predict(X_test)
regr.score(X_test, y_test)
print(f"Neural Network: R^2 = {r2_score(y_test, y_pred_rgr)}, max_error={max_error(y_test, y_pred_rgr) }");

Neural Network: R^2 = 0.7273929513035707, max_error=623451.3710058641


