# Ensemble Model

## Read the dataset

In [1]:
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this blanket filter also hides scikit-learn warnings raised
# while fitting models or running cross-validation — consider narrowing it.
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd
# Load the training data (features plus the SalePrice target) and preview it.
df = pd.read_csv('training_set.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Perform basic data quality checks

In [3]:
# Column names, non-null counts and dtypes for the whole frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Per-column count of missing entries; display only the columns that have any.
s = df.isnull().sum()
s.loc[s > 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

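Most of these are not genuinely missing values: for the categorical columns, NA represents "Not Available" as per the data description. Numeric columns such as LotFrontage do contain real gaps; both cases are handled by the imputers in the preprocessing pipeline below.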

In [5]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

There are no duplicated rows in the dataset, and the NaN entries listed above are handled during preprocessing.

## Separate X and Y (Target feature is SalePrice)

In [6]:
# Target: SalePrice, kept as a one-column DataFrame.
Y = df[['SalePrice']]
# Features: every column except the target and the Id column.
X = df.drop(['SalePrice', 'Id'], axis=1)

In [7]:
# Preview the feature frame.
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [8]:
# Preview the target frame.
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [14]:
# Split the feature names by dtype: object columns are treated as categorical,
# every other column as continuous.
is_object = X.dtypes == 'object'
cat = X.columns[is_object]
con = X.columns[~is_object]

## Create a pipeline for Data Preprocessing

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [10]:
# Continuous columns: fill gaps with the column mean, then standardise.
num_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_pipe = Pipeline(steps=num_steps)

In [12]:
# Categorical columns: replace gaps with the literal 'NotAvail' category, then
# one-hot encode. Categories unseen at fit time are ignored at transform time,
# and the encoder returns a dense array.
cat_steps = [
    ('impute', SimpleImputer(strategy='constant', fill_value='NotAvail')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipe = Pipeline(steps=cat_steps)

In [15]:
# Route continuous columns through num_pipe and categorical columns through
# cat_pipe; ask for a pandas DataFrame as the transform output.
column_routes = [
    ('num', num_pipe, con),
    ('cat', cat_pipe, cat),
]
pre = ColumnTransformer(column_routes)
pre = pre.set_output(transform='pandas')

In [16]:
# Display the assembled preprocessing transformer.
pre

In [17]:
# Fit the preprocessor on the full feature frame and transform it in one step.
# NOTE(review): fitting here, before the train/test split, lets test rows
# influence the imputer means, scaler statistics and one-hot categories —
# consider splitting first and fitting on the training rows only.
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Perform train test split on preprocessed X and Y

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(
    X_pre,
    Y,
    test_size=0.2,
    random_state=21,
)
xtrain, xtest, ytrain, ytest = split_parts

In [20]:
# Preview the training features.
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
710,-0.636078,-0.6381565,-0.640101,-2.241782,0.381743,-1.201217,0.878668,-0.57441,-0.973018,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1098,-0.163109,-0.9106796,-0.452686,-1.518467,0.381743,-1.168096,-1.689368,-0.57441,0.500854,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1286,-0.872563,6.454645e-16,-0.072844,-0.071836,-0.5172,-0.273836,-1.059473,1.924104,0.274948,0.213629,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
992,0.073375,0.4519361,-0.075851,-0.071836,2.179628,-0.240715,0.394133,1.30917,0.20257,0.436865,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
631,1.492282,-1.637408,-0.593999,1.374795,-0.5172,1.150356,1.024029,0.023903,-0.92038,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
# Preview the test features.
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
880,-0.872563,-0.456474,-0.350058,-0.795151,-0.5172,1.117235,1.024029,-0.57441,1.176379,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
605,0.073375,0.679039,0.309002,0.651479,0.381743,-0.207594,0.248772,0.40062,0.022723,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1166,-0.872563,-0.274792,-0.004192,1.374795,-0.5172,1.216598,1.120936,-0.175535,-0.973018,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
216,-0.872563,-0.229372,-0.207142,0.651479,-0.5172,1.084115,0.927122,0.899214,1.101808,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
970,-0.163109,-0.456474,0.02838,-1.518467,-1.416142,-0.737526,-1.689368,-0.57441,-0.973018,-0.288653,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Preview the training targets.
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [23]:
# Preview the test targets.
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


In [25]:
# (rows, columns) of the training features.
xtrain.shape

(1168, 303)

In [26]:
# (rows, columns) of the test features.
xtest.shape

(292, 303)

In [27]:
# (rows, columns) of the training targets.
ytrain.shape

(1168, 1)

In [28]:
# (rows, columns) of the test targets.
ytest.shape

(292, 1)

## Create an Ensemble Model: 
### 1) Bagging Method - Random Forest Regressor

In [30]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest: 100 shallow trees (max_depth=3) grown with the
# absolute-error split criterion.
# random_state fixes the bootstrap/feature sampling so the scores reported
# below are reproducible on Restart & Run All.
model = RandomForestRegressor(n_estimators=100,
                              max_depth=3,
                              min_samples_split=3,
                              min_samples_leaf=3,
                              criterion='absolute_error',
                              random_state=21)
# ytrain is a one-column DataFrame; flatten it to the 1-D target scikit-learn
# expects instead of relying on a (silenced) data-conversion warning.
model.fit(xtrain, ytrain.values.ravel())

In [31]:
# Training-set score of the baseline forest (R^2 for scikit-learn regressors).
model.score(xtrain,ytrain)

0.773970258396705

In [32]:
# Test-set score of the baseline forest (R^2 for scikit-learn regressors).
model.score(xtest,ytest)

0.7420972351020764

## Hyperparameter tuning

In [35]:
# Candidate values sampled by the random forest hyperparameter search.
params = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[2, 3, 4, 5],
    min_samples_split=[2, 4, 6, 8, 9, 10],
    criterion=['absolute_error', 'squared_error'],
)

In [36]:
from sklearn.model_selection import RandomizedSearchCV
# Randomised search over `params` with 5-fold cross-validation, ranking
# candidates by negative mean squared error.
# Seeding both the forest and the search makes the sampled candidates — and
# therefore best_params_ — reproducible across runs.
rfg = RandomForestRegressor(random_state=21)
rscv = RandomizedSearchCV(rfg,
                          param_distributions=params,
                          cv=5,
                          scoring='neg_mean_squared_error',
                          random_state=21)
# Flatten the one-column target frame to the 1-D array scikit-learn expects.
rscv.fit(xtrain, ytrain.values.ravel())

In [37]:
# Parameter combination with the best mean cross-validated score.
rscv.best_params_

{'n_estimators': 200,
 'min_samples_split': 4,
 'max_depth': 5,
 'criterion': 'squared_error'}

In [38]:
# Keep the estimator the search selected as best and display it.
best_rfg = rscv.best_estimator_
best_rfg

In [39]:
# Training-set score of the tuned forest (R^2 for scikit-learn regressors).
best_rfg.score(xtrain,ytrain)

0.9152191579139229

In [40]:
# Test-set score of the tuned forest (R^2 for scikit-learn regressors).
best_rfg.score(xtest,ytest)

0.8123224309046888

## Predict the saleprice 

In [41]:
# Predictions of the tuned forest on the training and test features, in that order.
ypred_train, ypred_test = (best_rfg.predict(features) for features in (xtrain, xtest))

In [42]:
# First five training predictions from the tuned forest.
ypred_train[:5]

array([ 94072.51289144, 110133.49349883, 154840.3072726 , 182249.05033376,
       238646.5432996 ])

In [43]:
# Actual training prices, for comparison with the predictions.
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [44]:
# First five test predictions from the tuned forest.
ypred_test[:5]

array([144270.47636583, 214061.29205731, 261577.9543019 , 195749.58300963,
        92321.09873214])

In [45]:
# Actual test prices, for comparison with the predictions.
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


### The model performs well; however, let's check the Gradient Boosting model and then decide which model to use for the final predictions

## 2) Boosting Method: Gradient Boost

In [46]:
from sklearn.ensemble import GradientBoostingRegressor
# Baseline gradient boosting model: 100 depth-3 trees at learning rate 0.1.
# random_state makes the fit, and the scores reported below, reproducible on
# Restart & Run All.
model2 = GradientBoostingRegressor(learning_rate=0.1,
                                   n_estimators=100,
                                   max_depth=3,
                                   min_samples_split=5,
                                   criterion='squared_error',
                                   random_state=21)
# ytrain is a one-column DataFrame; flatten it to the 1-D target scikit-learn
# expects instead of relying on a (silenced) data-conversion warning.
model2.fit(xtrain, ytrain.values.ravel())

In [47]:
# Training-set score of the baseline boosting model (R^2 for scikit-learn regressors).
model2.score(xtrain,ytrain)

0.9708578109021565

In [48]:
# Test-set score of the baseline boosting model (R^2 for scikit-learn regressors).
model2.score(xtest,ytest)

0.8472863390745814

## Hyperparameter tuning

In [50]:
# Candidate values sampled by the gradient boosting hyperparameter search.
# GradientBoostingRegressor's `criterion` only accepts 'friedman_mse' and
# 'squared_error'. The previous 'absolute_error' entry is rejected at fit
# time, so every sampled candidate using it failed and was scored NaN,
# wasting search budget while the warning stayed hidden.
params2 = {'learning_rate': [0.01, 0.1, 0.15, 0.2],
           'n_estimators': [10, 50, 100, 150, 200],
           'min_samples_split': [2, 4, 6, 8, 10],
           'criterion': ['friedman_mse', 'squared_error']}

In [51]:
# Randomised search over `params2` with 5-fold cross-validation, ranking
# candidates by negative mean squared error.
# Seeding both the booster and the search makes the sampled candidates — and
# therefore best_params_ — reproducible across runs.
gbr = GradientBoostingRegressor(random_state=21)
rscv2 = RandomizedSearchCV(gbr,
                           param_distributions=params2,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           random_state=21)
# Flatten the one-column target frame to the 1-D array scikit-learn expects.
rscv2.fit(xtrain, ytrain.values.ravel())

In [54]:
# Parameter combination with the best mean cross-validated score.
rscv2.best_params_

{'n_estimators': 100,
 'min_samples_split': 8,
 'learning_rate': 0.1,
 'criterion': 'squared_error'}

In [53]:
# Keep the estimator the search selected as best and display it.
best_gbr = rscv2.best_estimator_
best_gbr

In [55]:
# Training-set score of the tuned boosting model (R^2 for scikit-learn regressors).
best_gbr.score(xtrain,ytrain)

0.9709003982970159

In [56]:
# Test-set score of the tuned boosting model (R^2 for scikit-learn regressors).
best_gbr.score(xtest,ytest)

0.8497281676718593

## Predict the saleprice

In [57]:
# Predictions of the tuned boosting model on the training and test features, in that order.
ypred_train2, ypred_test2 = (best_gbr.predict(features) for features in (xtrain, xtest))

In [58]:
# First five training predictions from the tuned boosting model.
ypred_train2[:5]

array([ 70637.00681282, 107260.10963694, 150577.42447241, 198403.36726211,
       225836.76737848])

In [59]:
# Actual training prices, for comparison with the predictions.
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [60]:
# First five test predictions from the tuned boosting model.
ypred_test2[:5]

array([162404.19058024, 224900.50041062, 268118.70887843, 211666.40548301,
        78148.43445825])

In [67]:
# Actual test prices, for comparison with the predictions.
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


## Gradient Boosting performs better than Random Forest on the test set, so let's use Gradient Boosting for the final predictions

In [69]:
# Load the new sample set to be scored and preview it.
xnew = pd.read_csv('sample_set.csv')
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [70]:
# Apply the already-fitted preprocessor to the new rows (transform only, no
# refitting) so they are encoded the same way as the training features.
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,-0.872563,0.451936,0.110763,-0.795151,0.381743,-0.340077,-1.15638,-0.57441,0.053428,0.604293,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,0.497357,0.37585,-0.071836,0.381743,-0.43944,-1.30174,0.023903,1.051363,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.179413,0.332053,-0.795151,-0.5172,0.852269,0.6364,-0.57441,0.761852,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.073375,0.361095,-0.054002,-0.071836,0.381743,0.88539,0.6364,-0.463612,0.347326,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.492282,-1.228623,-0.552407,1.374795,-0.5172,0.686666,0.345679,-0.57441,-0.39619,-0.288653,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [71]:
# Predict sale prices for the new rows with the tuned boosting model and
# preview the first five.
ypreds = best_gbr.predict(xnew_pre)
ypreds[:5]

array([126570.45782994, 158835.90475827, 182341.56687388, 183788.07462816,
       190297.10412329])

In [72]:
# Attach the predictions as a new column (this mutates xnew in place) and preview.
xnew['SalePricePredicted'] = ypreds
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePricePredicted
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,126570.45783
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,158835.904758
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,182341.566874
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,183788.074628
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,190297.104123


In [73]:
# Save the sample set together with its predicted prices; index=False leaves
# the row index out of the file.
xnew.to_csv('GradientBoostPredicted_Results.csv',index=False)