In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_info_columns', 150)
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 400)

## 1.This project is focused mainly on exploring Feature Engineering aspects on a Regression Problem.
## 2.The dataset contains both Numerical as well as Categorial features almost equal numbers of cat and numerical features
## 3.This project also gives attention to Cross validation approach and Hyper tuning parameter approach for a regression problem.

In [3]:
# Load the train dataSet
dataSet=pd.read_csv('train.csv')

In [4]:
# About the dataSet: 43 Categorical Attributes and 38 Numerical attributes
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 82 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Null Values and % of Null value distributions
dataSet.isnull().sum()/len(dataSet)*100

Id                0.000000
MSSubClass        0.000000
MSZoning          0.000000
LotFrontage      17.739726
LotArea           0.000000
Street            0.000000
Alley            93.767123
LotShape          0.000000
LandContour       0.000000
Utilities         0.000000
LotConfig         0.000000
LandSlope         0.000000
Neighborhood      0.000000
Condition1        0.000000
Condition2        0.000000
BldgType          0.000000
HouseStyle        0.000000
OverallQual       0.000000
OverallCond       0.000000
YearBuilt         0.000000
YearRemodAdd      0.000000
RoofStyle         0.000000
RoofMatl          0.000000
Exterior1st       0.000000
Exterior2nd       0.000000
MasVnrType        0.547945
MasVnrArea        0.547945
ExterQual         0.000000
ExterCond         0.000000
Foundation        0.000000
BsmtQual          2.534247
BsmtCond          2.534247
BsmtExposure      2.602740
BsmtFinType1      2.534247
BsmtFinSF1        0.000000
BsmtFinType2      2.602740
BsmtFinSF2        0.000000
B

## Manage Null Values:

In [6]:

# After looking into the 'NA' values across all attrbutes, it is understood that these NA values are infact Valid value.
# We just need to rename the 'NA' with some other valid acronym.

dataSet.LotFrontage.fillna(round(dataSet.LotFrontage.mean(),2),inplace=True) # NA should be replaced with Mean value
dataSet.Alley.fillna('NOA',inplace=True)      #NA should be replaced with 'NOA' (NO Alley Access)

dataSet.MasVnrType.fillna('NMVT',inplace=True) #NA should be replaced with 'NMVT' (NO Masonry veneer)
dataSet.MasVnrArea.fillna(round(dataSet.MasVnrArea.mean(),2),inplace=True)      # Replace the 'NA' values with mean value

dataSet.BsmtQual.fillna('NBQ',inplace=True) #NA should be replaced with 'NBQ' (NO basement quality)
dataSet.BsmtCond.fillna('NBC',inplace=True) #NA should be replaced with 'NBC' (NO basement condition)
dataSet.BsmtExposure.fillna('NBE',inplace=True) #NA should be replaced with 'NBE' (NO basement exposure)

dataSet.BsmtFinType1.fillna('NBF1',inplace=True) #NA should be replaced with 'NBF1' (NO basement type1)
dataSet.BsmtFinType2.fillna('NBF2',inplace=True) #NA should be replaced with 'NBF2' (NO basement type2)

dataSet.Electrical.fillna(dataSet.Electrical.mode()[0],inplace=True) #NA Replace with Mode value

dataSet.FireplaceQu.fillna('NFP',inplace=True) #NA should be replaced with 'NFP' (NO fire place)

dataSet.GarageType.fillna('NG',inplace=True) #NA should be replaced with 'NG' (NO Garage)
dataSet.GarageYrBlt.fillna(0,inplace=True) #Replace the 'NA' values with zero (0)
dataSet.GarageFinish.fillna('NGF',inplace=True) #NA should be replaced with 'NGF' (NO Garage finish)
dataSet.GarageQual.fillna('NGQ',inplace=True) #NA should be replaced with 'NGQ' (NO Garage quality)
dataSet.GarageCond.fillna('NGC',inplace=True) #NA should be replaced with 'NGC' (NO Garage condition)

dataSet.PoolQC.fillna('NP',inplace=True) #NA should be replaced with 'NP' (NO Pool)
dataSet.Fence.fillna('NF',inplace=True) #NA should be replaced with 'NF' (NO Fence)
dataSet.MiscFeature.fillna('NM',inplace=True) #NA should be replaced with 'NM' (NO Miscellaneous)
#All NA values are now manged
dataSet.isnull().sum()/len(dataSet)*100

Id               0.0
MSSubClass       0.0
MSZoning         0.0
LotFrontage      0.0
LotArea          0.0
Street           0.0
Alley            0.0
LotShape         0.0
LandContour      0.0
Utilities        0.0
LotConfig        0.0
LandSlope        0.0
Neighborhood     0.0
Condition1       0.0
Condition2       0.0
BldgType         0.0
HouseStyle       0.0
OverallQual      0.0
OverallCond      0.0
YearBuilt        0.0
YearRemodAdd     0.0
RoofStyle        0.0
RoofMatl         0.0
Exterior1st      0.0
Exterior2nd      0.0
MasVnrType       0.0
MasVnrArea       0.0
ExterQual        0.0
ExterCond        0.0
Foundation       0.0
BsmtQual         0.0
BsmtCond         0.0
BsmtExposure     0.0
BsmtFinType1     0.0
BsmtFinSF1       0.0
BsmtFinType2     0.0
BsmtFinSF2       0.0
BsmtUnfSF        0.0
TotalBsmtSF      0.0
Heating          0.0
HeatingQC        0.0
CentralAir       0.0
Electrical       0.0
1stFlrSF         0.0
2ndFlrSF         0.0
LowQualFinSF     0.0
GrLivArea        0.0
BsmtFullBath 

## It is also observed that there are many Ordinal Categorical Features and these features can be converted## to Numeric features by assigning ordinal values to each features after carefully analyzing the data dictionary.

In [7]:
dataSet.ExterQual.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
dataSet.ExterQual.astype(dtype='int64')
dataSet.ExterCond.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
dataSet.BsmtQual.replace(['NBQ','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
dataSet.BsmtCond.replace(['NBE','No','Mn','Av','Gd'],[0,0,1,2,3],inplace=True)
dataSet.BsmtFinType1.replace(['NBF1','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],[0,1,2,3,4,5,6],inplace=True)
dataSet.BsmtFinType2.replace(['NBF2','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],[0,1,2,3,4,5,6],inplace=True)
dataSet.HeatingQC.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
dataSet.CentralAir.replace(['Y','N'],[1,0],inplace=True)
dataSet.KitchenQual.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
dataSet.FireplaceQu.replace(['NFP','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
dataSet.GarageFinish.replace(['NGF','Unf','RFn','Fin'],[0,1,2,3],inplace=True)
dataSet.GarageQual.replace(['NGQ','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
dataSet.GarageCond.replace(['NGC','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
dataSet.PavedDrive.replace(['N','P','Y'],[1,2,3],inplace=True)
dataSet.PoolQC.replace(['NP','Fa','TA','Gd','Ex'],[0,1,2,3,4],inplace=True)
dataSet.Fence.replace(['NF','MnWw','GdWo','MnPrv','GdPrv'],[0,1,2,3,4],inplace=True)
dataSet.Street.replace(['Grvl','Pave'],[1,2],inplace=True)
dataSet.Alley.replace(['NOA','Grvl','Pave'],[0,1,2],inplace=True)
dataSet.LotShape.replace(['IR3','IR2','IR1','Reg'],[1,2,3,4],inplace=True)
dataSet.LandContour.replace(['Low','HLS','Bnk','Lvl'],[1,2,3,4],inplace=True)
dataSet.Utilities.replace(['ELO','NoSeWa','NoSewr','AllPub'],[1,2,3,4],inplace=True)
dataSet.LotConfig.replace(['FR3','FR2','CulDSac','Corner','Inside'],[1,2,3,4,5],inplace=True)
dataSet.LandSlope.replace(['Sev','Mod','Gtl'],[1,2,3],inplace=True)
dataSet.BsmtCond.replace(['NBC','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
dataSet.BsmtExposure.replace(['NBE','No','Mn','Av','Gd'],[0,1,2,3,4],inplace=True)

In [8]:
dataSet.select_dtypes(include='object').columns

Index(['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'Functional',
       'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition', 'Type'],
      dtype='object')

## Feature Engineering  Using only Numerical Features

In [9]:
num_dataSet=dataSet.select_dtypes(exclude='object').drop(columns='Id') 
# Remove the ID column as its not adding any value to prediction
num_dataSet.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,2,0,4,4,4,5,3,7,5,2003,2003,196.0,4,3,4,3,1,6,706,1,0,150,856,5,1,856,854,0,1710,1,0,2,1,3,1,4,8,0,0,2003.0,2,2,548,3,3,3,0,61,0,0,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,2,0,4,4,4,2,3,6,8,1976,1976,0.0,3,3,4,3,4,5,978,1,0,284,1262,5,1,1262,0,0,1262,0,1,2,0,3,1,3,6,1,3,1976.0,2,2,460,3,3,3,298,0,0,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,2,0,3,4,4,5,3,7,5,2001,2002,162.0,4,3,4,3,2,6,486,1,0,434,920,5,1,920,866,0,1786,1,0,2,1,3,1,4,6,1,3,2001.0,2,2,608,3,3,3,0,42,0,0,0,0,0,0,0,9,2008,223500
3,70,60.0,9550,2,0,3,4,4,4,3,7,5,1915,1970,0.0,3,3,3,3,1,5,216,1,0,540,756,4,1,961,756,0,1717,1,0,1,0,3,1,4,7,1,4,1998.0,1,3,642,3,3,3,0,35,272,0,0,0,0,0,0,2,2006,140000
4,60,84.0,14260,2,0,3,4,4,2,3,8,5,2000,2000,350.0,4,3,4,3,3,6,655,1,0,490,1145,5,1,1145,1053,0,2198,1,0,2,1,4,1,4,9,1,3,2000.0,2,3,836,3,3,3,192,84,0,0,0,0,0,0,0,12,2008,250000


## Outliers

In [10]:
#Check for Outliers
numeric_columns=dataSet.select_dtypes(exclude='object').columns
outliers= dataSet.apply(lambda x: sum(
                                 (x<(x.quantile(0.25)-1.5*(x.quantile(0.75)-x.quantile(0.25))))|
                                 (x>(x.quantile(0.75)+1.5*(x.quantile(0.75)-x.quantile(0.25))))
                                 if x.name in numeric_columns else ''
                            ))
outliers.sort_values(ascending=False)
## IQR method suggested many outliers. Lets check this outliers on thise features which are important.
# First find out the important features with below approaches.

Fence            281
EnclosedPorch    208
BsmtFinType2     204
ExterCond        178
BsmtFinSF2       167
LandContour      149
GarageQual       149
BsmtExposure     134
GarageCond       134
OverallCond      125
PavedDrive       120
ScreenPorch      116
LotFrontage      106
MSSubClass       103
MasVnrArea        98
CentralAir        95
Alley             91
BsmtCond          84
BsmtHalfBath      82
GarageYrBlt       81
LandSlope         78
OpenPorchSF       77
LotArea           69
KitchenAbvGr      68
TotalBsmtSF       61
SalePrice         61
MiscVal           52
LotConfig         51
BsmtQual          37
BedroomAbvGr      35
WoodDeckSF        32
GrLivArea         31
TotRmsAbvGrd      30
BsmtUnfSF         29
LowQualFinSF      26
3SsnPorch         24
GarageArea        21
1stFlrSF          20
LotShape          10
BsmtFinSF1         7
PoolArea           7
YearBuilt          7
PoolQC             7
Street             6
Fireplaces         5
GarageCars         5
OverallQual        2
2ndFlrSF     

In [11]:
## Feature Selection Using Coorrelation Approach
num_dataSet.corr().round(3)*100
# from below correlation table, lets select the features having correlation > 50% with SalePrice. 
# Below Features can be selected
# OverallQual 79%
# YearBuilt 52.3% & YearRemodAdd-50.7%
#ExterQual-68.3%
#BsmtQual-58.5 %
#TotalBsmtSF-61.4%
#1stFlrSF-60.6%
#GrLivArea -70.9 %
#FullBath -56.1%
#KitchenQual-66%
#TotRmsAbvGrd-53.4%
#FireplaceQu-52%
# GarageFinish-54.9%
#GarageCars-64%
#GarageArea-62.3%


#plt.figure(figsize=(50,50))
#sns.heatmap(num_dataSet.corr(),annot=True)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,MoSold,YrSold,SalePrice
MSSubClass,100.0,-35.7,-14.0,-2.5,17.9,11.5,2.5,2.3,3.8,2.6,3.3,-5.9,2.8,4.1,2.3,1.6,-6.5,5.1,-1.7,5.1,2.7,-7.0,-2.4,-6.6,-14.1,-23.9,-1.9,-10.2,-25.2,30.8,4.6,7.5,0.3,-0.2,13.2,17.7,-2.3,28.2,-1.2,4.0,-4.6,-3.9,-8.1,-3.3,-4.0,-9.9,-7.3,-9.0,-6.0,-1.3,-0.6,-1.2,-4.4,-2.6,0.8,1.6,-8.5,-0.8,-1.4,-2.1,-8.4
LotFrontage,-35.7,100.0,30.7,-3.7,-18.5,-19.7,-8.6,-0.0,-5.1,-6.8,23.4,-5.3,11.8,8.3,17.9,16.5,-1.6,14.2,5.0,16.2,7.4,21.6,1.9,4.3,12.2,36.3,8.9,6.9,41.4,7.2,3.7,36.8,9.1,-0.6,18.0,4.8,23.7,-0.6,16.9,32.0,23.6,22.6,10.0,21.8,27.0,32.4,10.9,10.4,8.9,7.7,13.7,1.0,6.2,3.8,18.1,23.5,3.1,0.1,1.0,0.7,33.5
LotArea,-14.0,30.7,100.0,-19.7,-8.4,-31.5,-33.9,-1.0,-13.1,-43.7,10.6,-0.6,1.4,1.4,10.4,5.6,1.5,7.2,2.8,22.9,5.9,21.4,8.7,11.1,-0.3,26.1,0.4,5.0,29.9,5.1,0.5,26.3,15.8,4.8,12.6,1.4,12.0,-1.8,6.8,19.0,27.1,18.5,7.3,12.5,15.5,18.0,7.9,7.6,1.5,17.2,8.5,-1.8,2.0,4.3,7.8,9.1,-4.1,3.8,0.1,-1.4,26.4
Street,-2.5,-3.7,-19.7,100.0,1.6,1.0,10.4,-0.2,-0.7,17.9,5.9,4.3,2.1,6.5,1.7,10.0,-1.5,1.1,-1.4,-9.3,1.7,-1.6,-7.8,-3.8,3.5,0.5,4.3,7.0,0.6,4.7,0.8,4.4,-5.1,1.5,4.6,2.8,2.9,1.4,6.6,4.7,-0.5,2.3,3.2,1.6,-2.0,-4.8,2.8,2.8,2.5,-1.8,-0.6,2.3,0.7,-3.3,0.4,0.4,3.0,-2.3,0.4,-2.5,4.1
Alley,17.9,-18.5,-8.4,1.6,100.0,7.9,-0.2,0.6,7.9,2.2,-2.7,3.8,-17.2,1.0,-2.0,-3.3,-1.6,-4.9,-0.6,-11.1,-12.9,-13.0,-5.5,-5.5,4.2,-11.3,1.9,-14.5,-12.9,10.4,5.7,-0.4,-9.6,-3.5,-0.5,-0.2,-0.5,2.4,-1.8,-4.3,-11.4,-10.3,-3.5,-12.3,-4.7,-6.3,-4.6,-5.8,-17.8,-11.7,7.5,15.3,-2.8,-2.2,-1.7,-1.6,-1.2,-1.1,-2.1,-1.2,-9.3
LotShape,11.5,-19.7,-31.5,1.0,7.9,100.0,20.4,2.7,27.7,14.4,-19.9,3.4,-22.9,-17.5,-8.9,-18.9,-1.4,-21.2,-8.5,-18.4,-11.1,-15.8,-6.2,-6.0,-1.4,-20.0,-12.0,-9.9,-18.9,-8.9,0.8,-21.3,-6.5,-5.8,-18.4,-11.7,-6.0,9.5,-15.8,-13.7,-20.2,-19.3,-9.3,-23.7,-19.5,-17.3,-10.9,-10.3,-10.4,-16.2,-9.3,9.5,-3.4,-6.5,-4.7,-5.3,6.2,-2.9,-2.7,3.7,-26.8
LandContour,2.5,-8.6,-33.9,10.4,-0.2,20.4,100.0,-0.8,1.0,60.6,0.2,0.7,4.8,2.5,3.5,4.4,2.8,-1.7,2.3,-25.0,-1.3,-10.6,0.3,-2.0,5.8,-5.9,5.1,2.9,-7.5,1.2,-1.6,-4.7,-8.7,-3.0,3.3,2.0,4.3,-1.7,1.3,-0.1,-9.1,-7.3,3.6,-0.0,-3.3,-3.9,2.6,3.2,5.9,-8.5,3.6,-0.0,-4.8,-1.8,0.8,0.3,6.5,1.4,-5.3,2.6,-7.3
Utilities,2.3,-0.0,-1.0,-0.2,0.6,2.7,-0.8,100.0,5.4,-0.6,0.2,-1.0,1.2,3.4,-6.3,1.8,0.6,1.5,-0.6,1.5,-3.0,1.9,-5.1,-5.0,1.3,1.4,0.4,-0.7,-1.2,2.1,0.3,0.9,2.1,-10.3,2.7,2.0,-0.4,0.6,-1.9,-0.8,-1.6,-3.1,-0.5,-0.8,-0.8,-0.6,-0.7,-0.7,-0.8,2.0,-2.8,0.9,0.3,-10.2,0.2,0.2,1.2,0.2,5.2,-2.3,1.4
LotConfig,3.8,-5.1,-13.1,-0.7,7.9,27.7,1.0,5.4,100.0,1.0,-6.4,-1.2,-8.3,-7.6,-4.2,-5.1,-2.1,-7.8,-1.2,-6.8,-5.0,-6.6,-1.4,-1.1,2.8,-4.4,-5.1,-5.6,-5.0,-3.2,2.4,-6.1,-2.8,-4.5,-6.0,-4.1,-3.6,2.5,-3.5,-2.7,-4.3,-3.6,-4.1,-6.8,-9.7,-6.9,-3.7,-3.7,-7.2,-5.6,-5.8,2.1,-5.9,1.6,-3.0,-3.9,-0.1,-3.2,2.5,-1.1,-9.2
LandSlope,2.6,-6.8,-43.7,17.9,2.2,14.4,60.6,-0.6,1.0,100.0,6.6,-1.0,7.4,5.9,2.2,8.7,4.0,0.7,-1.5,-26.2,-5.1,-11.4,-7.6,-8.6,11.6,-3.2,5.7,1.1,-6.8,0.9,-1.4,-4.4,-11.6,-7.0,4.7,-0.1,4.8,3.6,3.9,3.4,-12.4,-5.2,1.4,2.5,0.3,-0.5,0.6,1.6,2.5,-9.5,3.3,0.9,-0.9,-5.3,1.6,1.4,4.2,0.4,-0.7,0.2,-5.1


## 1. Feature Selection Using Correlation

In [12]:
feature_corr=dataSet.loc[:,['OverallQual','YearBuilt','ExterQual','BsmtQual','TotalBsmtSF',
                           '1stFlrSF','GrLivArea','FullBath','KitchenQual','TotRmsAbvGrd',
                           'FireplaceQu','GarageFinish','GarageCars','GarageArea']].values
label=dataSet.loc[:,'SalePrice'].values

## 2. Feature Selection Using BackwardElimination Technique(by OLS)
#### Step 1: Perform All In   
#### step 2: Decide SL - 0.05
#### Step 3: Perform OLS (Calc P-value)
#### Step 4: Select the feature with highest p-value which is greater than 0.05 and remove this feature 
#### Step 5: recreate the new features and perform OLS once again and perform step4
#### Repeat this step until we get all features with p-value < 0.05

In [13]:
from statsmodels.formula.api import ols
# create model object

feature_ols = ' + '.join(num_dataSet.drop(columns=
                                          ['SalePrice','1stFlrSF','3SsnPorch','2ndFlrSF','BsmtHalfBath'
                                          ,'MiscVal','LandContour','MoSold','Fence','EnclosedPorch'
                                          ,'LotShape','LotConfig','Alley','CentralAir','Fireplaces'
                                          ,'GarageCond','GarageCond','PavedDrive','LandSlope','YrSold'
                                          ,'BsmtUnfSF','HalfBath','BsmtFinType2','BsmtFinSF2','GarageArea'
                                          ,'GarageFinish','OpenPorchSF','PoolQC','PoolArea','FullBath'
                                          ,'Street','KitchenAbvGr','TotalBsmtSF','HeatingQC','YearRemodAdd'
                                          ,'LowQualFinSF','ExterCond','BsmtFullBath','GarageQual'],
                                          axis = 1).columns)
model_ols = ols('SalePrice ~ ' + feature_ols , data = num_dataSet)
# fit the model
LR=model_ols.fit()
LR.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.833
Model:,OLS,Adj. R-squared:,0.83
Method:,Least Squares,F-statistic:,311.1
Date:,"Sun, 13 Sep 2020",Prob (F-statistic):,0.0
Time:,15:38:39,Log-Likelihood:,-17238.0
No. Observations:,1460,AIC:,34520.0
Df Residuals:,1436,BIC:,34650.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5.514e+05,1.14e+05,-4.855,0.000,-7.74e+05,-3.29e+05
MSSubClass,-238.7063,23.072,-10.346,0.000,-283.964,-193.449
LotFrontage,-111.7385,47.916,-2.332,0.020,-205.730,-17.747
LotArea,0.4443,0.096,4.631,0.000,0.256,0.633
Utilities,3.313e+04,1.66e+04,1.995,0.046,557.900,6.57e+04
OverallQual,1.252e+04,1192.742,10.501,0.000,1.02e+04,1.49e+04
OverallCond,4916.6878,897.920,5.476,0.000,3155.313,6678.063
YearBuilt,163.6849,48.262,3.392,0.001,69.014,258.356
MasVnrArea,29.7522,5.519,5.391,0.000,18.925,40.579

0,1,2,3
Omnibus:,579.986,Durbin-Watson:,1.969
Prob(Omnibus):,0.0,Jarque-Bera (JB):,92396.79
Skew:,-0.736,Prob(JB):,0.0
Kurtosis:,41.945,Cond. No.,1950000.0


### From above results the features from Backward elimination Technique are below

In [14]:

feature_OLS=num_dataSet.drop(columns=['SalePrice','1stFlrSF','3SsnPorch','2ndFlrSF','BsmtHalfBath'
                          ,'MiscVal','LandContour','MoSold','Fence','EnclosedPorch'
                          ,'LotShape','LotConfig','Alley','CentralAir','Fireplaces'
                          ,'GarageCond','GarageCond','PavedDrive','LandSlope','YrSold'
                          ,'BsmtUnfSF','HalfBath','BsmtFinType2','BsmtFinSF2','GarageArea'
                          ,'GarageFinish','OpenPorchSF','PoolQC','PoolArea','FullBath'
                          ,'Street','KitchenAbvGr','TotalBsmtSF','HeatingQC','YearRemodAdd'
                          ,'LowQualFinSF','ExterCond','BsmtFullBath','GarageQual']).values


## Verify the performance of different models using Stratified-KFold Cross Validation With the features from Correlation

In [15]:
from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import f1_score,classification_report,confusion_matrix
from sklearn.metrics import mean_squared_error,median_absolute_error,r2_score

def stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y):
    global df_model_selection
    
    skf = StratifiedKFold(n_splits, random_state=12,shuffle=True)
    
    weighted_r2_score = []
    #print(skf.split(X,y))
    for train_index, test_index in skf.split(X,y):
        X_train, X_test = X[train_index], X[test_index] 
        y_train, y_test = y[train_index], y[test_index]
        
        
        model_obj.fit(X_train, y_train)##### HERE ###
        test_ds_predicted = model_obj.predict( X_test ) ##### HERE ####   
        #print( metrics.classification_report( y_test, test_ds_predicted ) )    
        weighted_r2_score.append(round(r2_score(y_true=y_test, y_pred=test_ds_predicted),2))
        
    sd_weighted_r2_score = np.std(weighted_r2_score, ddof=1)
    range_of_r2_scores = "{}-{}".format(min(weighted_r2_score),max(weighted_r2_score))    
    df_model_selection = pd.concat([df_model_selection,pd.DataFrame([[process,model_name,sorted(weighted_r2_score),range_of_r2_scores,sd_weighted_r2_score]], columns =COLUMN_NAMES) ])
    

In [16]:
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from xgboost import XGBRFRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR

In [17]:
COLUMN_NAMES = ["Process","Model Name", "r2 Scores","Range of r2 Scores","Std Deviation of r2 Scores"]
df_model_selection = pd.DataFrame(columns=COLUMN_NAMES)

process='Correlation'
n_splits = 10
SC=StandardScaler()
X=SC.fit_transform(feature_corr) # Perform feature scaling on features
y=label


# 1.LinearRegression
model_LR=LinearRegression()
model_obj=model_LR
model_name='LinearRegression'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 2.DecisionTreeRegressor
model_DTR=DecisionTreeRegressor()
model_obj=model_DTR
model_name='DecisionTreeRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 3.RandomForestRegressor
model_RFR=RandomForestRegressor()
model_obj=model_RFR
model_name='RandomForestRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 4.GradientBoostingRegressor
model_GBR=GradientBoostingRegressor()
model_obj=model_GBR
model_name='GradientBoostingRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 5.XGBRegressor
model_XGBR=XGBRegressor()
model_obj=model_XGBR
model_name='XGBRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 6.XGBRFRegressor
model_XGBRFR=XGBRFRegressor()
model_obj=model_XGBRFR
model_name='XGBRFRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 7.KNeighborsRegressor
# model_KNNR=KNeighborsRegressor()
# model_obj=model_KNNR
# model_name='KNeighborsRegressor'
# stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 8.GaussianProcessRegressor
model_GPR = GaussianProcessRegressor()
model_obj=model_GPR
model_name='GaussianProcessRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 9.SGDRegressor
model_SGDR = SGDRegressor()
model_obj=model_SGDR
model_name='SGDRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 10.Support Vector Machine Regressor
model_SVR=SVR()
model_obj=model_SVR
model_name='Support Vector Machine Regressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

#Exporting the results to csv
df_model_selection.to_csv("Model_statistics.csv",index = False)


df_model_selection

Unnamed: 0,Process,Model Name,r2 Scores,Range of r2 Scores,Std Deviation of r2 Scores
0,Correlation,LinearRegression,"[0.31, 0.77, 0.8, 0.81, 0.81, 0.81, 0.83, 0.83...",0.31-0.85,0.161331
0,Correlation,DecisionTreeRegressor,"[0.06, 0.71, 0.72, 0.72, 0.74, 0.75, 0.77, 0.7...",0.06-0.83,0.225608
0,Correlation,RandomForestRegressor,"[0.31, 0.84, 0.85, 0.86, 0.87, 0.88, 0.88, 0.8...",0.31-0.91,0.179679
0,Correlation,GradientBoostingRegressor,"[0.35, 0.83, 0.83, 0.86, 0.87, 0.88, 0.88, 0.9...",0.35-0.91,0.16776
0,Correlation,XGBRegressor,"[0.34, 0.79, 0.81, 0.83, 0.83, 0.83, 0.84, 0.8...",0.34-0.9,0.16303
0,Correlation,XGBRFRegressor,"[0.33, 0.83, 0.84, 0.84, 0.86, 0.86, 0.87, 0.8...",0.33-0.88,0.168592
0,Correlation,GaussianProcessRegressor,"[-1.33, -1.13, -0.96, -0.89, -0.88, -0.71, -0....",-1.33--0.33,0.335925
0,Correlation,SGDRegressor,"[0.32, 0.77, 0.79, 0.8, 0.81, 0.81, 0.82, 0.83...",0.32-0.84,0.156688
0,Correlation,Support Vector Machine Regressor,"[-0.1, -0.08, -0.08, -0.05, -0.04, -0.04, -0.0...",-0.1--0.02,0.02708


## Verify the performance of different models using Stratified-KFold Cross Validation With the features from Backward Ellimination Technique

In [18]:
COLUMN_NAMES = ["Process","Model Name", "r2 Scores","Range of r2 Scores","Std Deviation of r2 Scores"]
df_model_selection = pd.DataFrame(columns=COLUMN_NAMES)

process='Backward Elimination-OLS'
n_splits = 10
SC=StandardScaler()
X=SC.fit_transform(feature_OLS) # Perform feature scaling on features
y=label

# 1.LinearRegression
model_LR=LinearRegression()
model_obj=model_LR
model_name='LinearRegression'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 2.DecisionTreeRegressor
model_DTR=DecisionTreeRegressor()
model_obj=model_DTR
model_name='DecisionTreeRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 3.RandomForestRegressor
model_RFR=RandomForestRegressor()
model_obj=model_RFR
model_name='RandomForestRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 4.GradientBoostingRegressor
model_GBR=GradientBoostingRegressor()
model_obj=model_GBR
model_name='GradientBoostingRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 5.XGBRegressor
model_XGBR=XGBRegressor()
model_obj=model_XGBR
model_name='XGBRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 6.XGBRFRegressor
model_XGBRFR=XGBRFRegressor()
model_obj=model_XGBRFR
model_name='XGBRFRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 7.KNeighborsRegressor
# model_KNNR=KNeighborsRegressor()
# model_obj=model_KNNR
# model_name='KNeighborsRegressor'
# stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 8.GaussianProcessRegressor
model_GPR = GaussianProcessRegressor()
model_obj=model_GPR
model_name='GaussianProcessRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 9.SGDRegressor
model_SGDR = SGDRegressor()
model_obj=model_SGDR
model_name='SGDRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# 10.Support Vector Machine Regressor
model_SVR=SVR()
model_obj=model_SVR
model_name='Support Vector Machine Regressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

#Exporting the results to csv
#df_model_selection.to_csv("Model_statistics.csv",index = False)


df_model_selection

Unnamed: 0,Process,Model Name,r2 Scores,Range of r2 Scores,Std Deviation of r2 Scores
0,Backward Elimination-OLS,LinearRegression,"[0.25, 0.78, 0.85, 0.85, 0.85, 0.86, 0.86, 0.8...",0.25-0.89,0.193402
0,Backward Elimination-OLS,DecisionTreeRegressor,"[0.35, 0.61, 0.74, 0.76, 0.77, 0.77, 0.78, 0.7...",0.35-0.8,0.140321
0,Backward Elimination-OLS,RandomForestRegressor,"[0.37, 0.77, 0.87, 0.87, 0.87, 0.88, 0.89, 0.9...",0.37-0.92,0.166203
0,Backward Elimination-OLS,GradientBoostingRegressor,"[0.22, 0.85, 0.87, 0.88, 0.88, 0.9, 0.92, 0.92...",0.22-0.93,0.216076
0,Backward Elimination-OLS,XGBRegressor,"[0.33, 0.74, 0.85, 0.86, 0.87, 0.87, 0.89, 0.8...",0.33-0.93,0.178154
0,Backward Elimination-OLS,XGBRFRegressor,"[0.38, 0.7, 0.84, 0.85, 0.85, 0.86, 0.86, 0.87...",0.38-0.9,0.156876
0,Backward Elimination-OLS,GaussianProcessRegressor,"[-2.87, -2.72, -2.65, -2.45, -2.42, -2.36, -2....",-2.87--1.84,0.341981
0,Backward Elimination-OLS,SGDRegressor,"[0.22, 0.75, 0.85, 0.85, 0.85, 0.86, 0.86, 0.8...",0.22-0.88,0.202221
0,Backward Elimination-OLS,Support Vector Machine Regressor,"[-0.1, -0.08, -0.08, -0.05, -0.05, -0.04, -0.0...",-0.1--0.02,0.024585


## 3. Feature Selection Using SELECT FROM MODEL

In [19]:
# From above Cross validation results it seems that RandomForest Regressor seems to be performing relatively well
# Hence we can select RandomForest Regressor as a base model for this feature selection
from sklearn.feature_selection import SelectFromModel
model_RFR=RandomForestRegressor()
SFM=SelectFromModel(estimator=model_RFR)
SFM.fit(X=num_dataSet.drop(columns=['SalePrice']),y=num_dataSet.loc[:,'SalePrice'])
#Get the support
print(num_dataSet.columns)
print(SFM.get_support())

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscVal', 'MoSold',
       'YrSold', 'SalePrice'],
      dtype='object')
[False False False False False False False False False F

In [20]:
## Select FROM Model says below are the important features 
feature_FromModel=dataSet.loc[:,['OverallQual','BsmtFinSF1','TotalBsmtSF','1stFlrSF','2ndFlrSF'
                                ,'GrLivArea','GarageCars']].values

In [21]:
# Perform Crosaa validation for RandomForestRegressor
COLUMN_NAMES = ["Process","Model Name", "r2 Scores","Range of r2 Scores","Std Deviation of r2 Scores"]
df_model_selection = pd.DataFrame(columns=COLUMN_NAMES)

process='Select From Model'
n_splits = 10
SC=StandardScaler()
X=SC.fit_transform(feature_FromModel) # Perform feature scaling on features
y=label

model_RFR=RandomForestRegressor()
model_obj=model_RFR
model_name='RandomForestRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection

Unnamed: 0,Process,Model Name,r2 Scores,Range of r2 Scores,Std Deviation of r2 Scores
0,Select From Model,RandomForestRegressor,"[0.42, 0.82, 0.82, 0.85, 0.87, 0.87, 0.87, 0.8...",0.42-0.9,0.142708


## Conclusion SO-FAR From APROACH-1( onsidering only Numerical Variables)

In [None]:
# If we consider only numerical features 
# 1. Correlation Analysis Feature selection approach suggests GradientBoostingRegressor to be the best model with max test score as 91%. 15 Features.
# 2. Backward Elimination-OLS Feature selection approach suggests RandomForestRegressor to be the best model with max test score as 93%.23 Features.
# 3. Select From Model Feature selection approach suggests RandomForestRegressor to be the best model with max test score as 90%. 7 Features.

# APPROACH -2: 
# Let us now Include Categorical features and Numerical Features both and perform the Test

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_info_columns', 300)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 400)

In [3]:
# Since the number of Catgories may not be same across both train and Test data set, its better to concatenate both 
#test and train dataSet and perform the OneHOtEncoding.
# while concatenating we should include an additional column to differentite 
dataSetTrain=pd.read_csv('train.csv')
dataSetTest=pd.read_csv('test.csv')
print('Train Dataset Shape:',dataSetTrain.shape)
print('Test Dataset Shape:',dataSetTest.shape)

feature_TrainTest=pd.concat([dataSetTrain.drop(columns=['SalePrice','Id']),dataSetTest.drop(columns='Id')],axis=0)
#feature_TrainTest.head()

Train Dataset Shape: (1460, 82)
Test Dataset Shape: (1459, 81)


## Manage NULL Values

In [4]:
#feature_TrainTest.isnull().sum()/len(feature_TrainTest)*100

# After looking into the 'NA' values across all attrbutes, it is understood that these NA values are infact Valid value.
# We just need to rename the 'NA' with some other valid acronym.

feature_TrainTest.LotFrontage.fillna(round(feature_TrainTest.LotFrontage.mean(),2),inplace=True) # NA should be replaced with Mean value
feature_TrainTest.Alley.fillna('NOA',inplace=True)      #NA should be replaced with 'NOA' (NO Alley Access)

feature_TrainTest.MasVnrType.fillna('NMVT',inplace=True) #NA should be replaced with 'NMVT' (NO Masonry veneer)
feature_TrainTest.MasVnrArea.fillna(round(feature_TrainTest.MasVnrArea.mean(),2),inplace=True)      # Replace the 'NA' values with mean value

feature_TrainTest.BsmtQual.fillna('NBQ',inplace=True) #NA should be replaced with 'NBQ' (NO basement quality)
feature_TrainTest.BsmtCond.fillna('NBC',inplace=True) #NA should be replaced with 'NBC' (NO basement condition)
feature_TrainTest.BsmtExposure.fillna('NBE',inplace=True) #NA should be replaced with 'NBE' (NO basement exposure)

feature_TrainTest.BsmtFinType1.fillna('NBF1',inplace=True) #NA should be replaced with 'NBF1' (NO basement type1)
feature_TrainTest.BsmtFinType2.fillna('NBF2',inplace=True) #NA should be replaced with 'NBF2' (NO basement type2)

feature_TrainTest.Electrical.fillna(feature_TrainTest.Electrical.mode()[0],inplace=True) #NA Replace with Mode value

feature_TrainTest.FireplaceQu.fillna('NFP',inplace=True) #NA should be replaced with 'NFP' (NO fire place)

feature_TrainTest.GarageType.fillna('NG',inplace=True) #NA should be replaced with 'NG' (NO Garage)
feature_TrainTest.GarageYrBlt.fillna(0,inplace=True) #Replace the 'NA' values with zero (0)
feature_TrainTest.GarageFinish.fillna('NGF',inplace=True) #NA should be replaced with 'NGF' (NO Garage finish)
feature_TrainTest.GarageQual.fillna('NGQ',inplace=True) #NA should be replaced with 'NGQ' (NO Garage quality)
feature_TrainTest.GarageCond.fillna('NGC',inplace=True) #NA should be replaced with 'NGC' (NO Garage condition)

feature_TrainTest.PoolQC.fillna('NP',inplace=True) #NA should be replaced with 'NP' (NO Pool)
feature_TrainTest.Fence.fillna('NF',inplace=True) #NA should be replaced with 'NF' (NO Fence)
feature_TrainTest.MiscFeature.fillna('NM',inplace=True) #NA should be replaced with 'NM' (NO Miscellaneous)

feature_TrainTest.MSZoning.fillna(feature_TrainTest.MSZoning.mode()[0],inplace=True) #NAN should be replaced with Mode

feature_TrainTest.Utilities.fillna(feature_TrainTest.Utilities.mode()[0],inplace=True) #NAN should be replaced with Mode
feature_TrainTest.Exterior1st.fillna(feature_TrainTest.Exterior1st.mode()[0],inplace=True) #NAN should be replaced with Mode
feature_TrainTest.Exterior2nd.fillna(feature_TrainTest.Exterior2nd.mode()[0],inplace=True) #NAN should be replaced with Mode

feature_TrainTest.BsmtFinSF1.fillna(round(feature_TrainTest.BsmtFinSF1.mean(),2),inplace=True) # NA should be replaced with Mean value
feature_TrainTest.BsmtFinSF2.fillna(round(feature_TrainTest.BsmtFinSF2.mean(),2),inplace=True) # NA should be replaced with Mean value


feature_TrainTest.BsmtUnfSF.fillna(round(feature_TrainTest.BsmtUnfSF.mean(),2),inplace=True)# NA should be replaced with Mean value
feature_TrainTest.TotalBsmtSF.fillna(round(feature_TrainTest.TotalBsmtSF.mean(),2),inplace=True) # NA should be replaced with Mean value

feature_TrainTest.BsmtFullBath.fillna(feature_TrainTest.BsmtFullBath.mode()[0],inplace=True) #NAN should be replaced with Mode
feature_TrainTest.BsmtHalfBath.fillna(feature_TrainTest.BsmtHalfBath.mode()[0],inplace=True) #NAN should be replaced with Mode
feature_TrainTest.KitchenQual.fillna(feature_TrainTest.KitchenQual.mode()[0],inplace=True) #NAN should be replaced with Mode
feature_TrainTest.Functional.fillna(feature_TrainTest.Functional.mode()[0],inplace=True) #NAN should be replaced with Mode

feature_TrainTest.GarageCars.fillna(feature_TrainTest.GarageCars.mode()[0],inplace=True) #NAN should be replaced with Mode
feature_TrainTest.GarageArea.fillna(round(feature_TrainTest.GarageArea.mean(),2),inplace=True) # NA should be replaced with Mean value
feature_TrainTest.SaleType.fillna(feature_TrainTest.SaleType.mode()[0],inplace=True) #NAN should be replaced with Mode

#All NA values are now manged
#feature_TrainTest.isnull().sum()/len(feature_TrainTest)*100

## It is also observed that there are many Ordinal Categorical Features and these features can be converted## to Numeric features by assigning ordinal values to each features after carefully analyzing the data dictionary.

In [5]:
feature_TrainTest.ExterQual.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
feature_TrainTest.ExterCond.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
feature_TrainTest.BsmtQual.replace(['NBQ','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
feature_TrainTest.BsmtCond.replace(['NBE','No','Mn','Av','Gd'],[0,0,1,2,3],inplace=True)
feature_TrainTest.BsmtFinType1.replace(['NBF1','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],[0,1,2,3,4,5,6],inplace=True)
feature_TrainTest.BsmtFinType2.replace(['NBF2','Unf','LwQ','Rec','BLQ','ALQ','GLQ'],[0,1,2,3,4,5,6],inplace=True)
feature_TrainTest.HeatingQC.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
feature_TrainTest.CentralAir.replace(['Y','N'],[1,0],inplace=True)
feature_TrainTest.KitchenQual.replace(['Po','Fa','TA','Gd','Ex'],[1,2,3,4,5],inplace=True)
feature_TrainTest.FireplaceQu.replace(['NFP','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
feature_TrainTest.GarageFinish.replace(['NGF','Unf','RFn','Fin'],[0,1,2,3],inplace=True)
feature_TrainTest.GarageQual.replace(['NGQ','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
feature_TrainTest.GarageCond.replace(['NGC','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
feature_TrainTest.PavedDrive.replace(['N','P','Y'],[1,2,3],inplace=True)
feature_TrainTest.PoolQC.replace(['NP','Fa','TA','Gd','Ex'],[0,1,2,3,4],inplace=True)
feature_TrainTest.Fence.replace(['NF','MnWw','GdWo','MnPrv','GdPrv'],[0,1,2,3,4],inplace=True)
feature_TrainTest.Street.replace(['Grvl','Pave'],[1,2],inplace=True)
feature_TrainTest.Alley.replace(['NOA','Grvl','Pave'],[0,1,2],inplace=True)
feature_TrainTest.LotShape.replace(['IR3','IR2','IR1','Reg'],[1,2,3,4],inplace=True)
feature_TrainTest.LandContour.replace(['Low','HLS','Bnk','Lvl'],[1,2,3,4],inplace=True)
feature_TrainTest.Utilities.replace(['ELO','NoSeWa','NoSewr','AllPub'],[1,2,3,4],inplace=True)
feature_TrainTest.LotConfig.replace(['FR3','FR2','CulDSac','Corner','Inside'],[1,2,3,4,5],inplace=True)
feature_TrainTest.LandSlope.replace(['Sev','Mod','Gtl'],[1,2,3],inplace=True)
feature_TrainTest.BsmtCond.replace(['NBC','Po','Fa','TA','Gd','Ex'],[0,1,2,3,4,5],inplace=True)
feature_TrainTest.BsmtExposure.replace(['NBE','No','Mn','Av','Gd'],[0,1,2,3,4],inplace=True)

## Apply One HotEncoding

In [6]:
feature_TrainTest.select_dtypes(include='object').columns

Index(['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'Functional',
       'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition', 'Type'],
      dtype='object')

In [6]:
finalFeature_TrainTest=pd.get_dummies(feature_TrainTest,drop_first=True,columns=['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'Functional',
       'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition'])

In [7]:
finalFeature_TrainTest.shape

(2919, 203)

In [8]:
finalFeature_Train=finalFeature_TrainTest[finalFeature_TrainTest.Type=='Train'].copy()
finalFeature_Train.drop(columns='Type',inplace=True)
finalFeature_Test=finalFeature_TrainTest[finalFeature_TrainTest.Type=='Test'].copy()
finalFeature_Test.drop(columns='Type',inplace=True)
label=dataSetTrain.loc[:,'SalePrice']
#del finalFeature_TrainTest

In [14]:
type(finalFeature_Train)

pandas.core.frame.DataFrame

In [16]:
print('Train Feature Shape:',finalFeature_Train.shape)
print('Test Dataset Shape:',finalFeature_Test.shape)
print('Test Label Shape:',label.shape)

Train Feature Shape: (1460, 202)
Test Dataset Shape: (1459, 202)
Test Label Shape: (1460,)


## Lets Apply Principal Component Analysis and extratct the best features

In [20]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#PCA Steps
#1. Perform Feature Scaling
#2. Perform PCA

In [26]:
SC=StandardScaler()
features = SC.fit_transform(finalFeature_Train)
pca=PCA()
pca.fit(features,label)
np.cumsum(pca.explained_variance_ratio_)

array([0.0742503 , 0.1076183 , 0.13518547, 0.15902883, 0.1793871 ,
       0.1989152 , 0.21635351, 0.23225355, 0.24720202, 0.26089971,
       0.2739901 , 0.28675258, 0.29918675, 0.31109784, 0.32278935,
       0.33419586, 0.34508024, 0.35567842, 0.36624124, 0.37660294,
       0.38666229, 0.39661935, 0.40643641, 0.41613682, 0.425642  ,
       0.43481233, 0.44381808, 0.45269325, 0.46130229, 0.46981809,
       0.47823022, 0.48645978, 0.49452436, 0.50240224, 0.51019686,
       0.51789929, 0.52558814, 0.5330477 , 0.54036843, 0.54752533,
       0.55464137, 0.56160107, 0.56845142, 0.57523798, 0.58196829,
       0.5886004 , 0.59520509, 0.60172379, 0.60809679, 0.6144021 ,
       0.62065317, 0.62682682, 0.63292888, 0.63896052, 0.64490895,
       0.65082834, 0.65667448, 0.66237747, 0.6680515 , 0.67367787,
       0.67925363, 0.68480437, 0.69021724, 0.69556917, 0.70087849,
       0.70615399, 0.71140482, 0.71660806, 0.72177385, 0.72688864,
       0.73188883, 0.73688157, 0.74181808, 0.74670147, 0.75156

In [28]:
pca=PCA(n_components=125)
pca.fit(features,label)
features_PCA=pca.transform(features)

## Perform Stratified-KFold Cross Validation With the features from PCA

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from xgboost import XGBRFRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR

from sklearn.model_selection import StratifiedKFold
#from sklearn.metrics import f1_score,classification_report,confusion_matrix
from sklearn.metrics import mean_squared_error,median_absolute_error,r2_score

def stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y):
    global df_model_selection
    
    skf = StratifiedKFold(n_splits, random_state=12,shuffle=True)
    
    weighted_r2_score = []
    #print(skf.split(X,y))
    for train_index, test_index in skf.split(X,y):
        X_train, X_test = X[train_index], X[test_index] 
        y_train, y_test = y[train_index], y[test_index]
        
        
        model_obj.fit(X_train, y_train)##### HERE ###
        test_ds_predicted = model_obj.predict( X_test ) ##### HERE ####   
        #print( metrics.classification_report( y_test, test_ds_predicted ) )    
        weighted_r2_score.append(round(r2_score(y_true=y_test, y_pred=test_ds_predicted),2))
        
    sd_weighted_r2_score = np.std(weighted_r2_score, ddof=1)
    range_of_r2_scores = "{}-{}".format(min(weighted_r2_score),max(weighted_r2_score))    
    df_model_selection = pd.concat([df_model_selection,pd.DataFrame([[process,model_name,sorted(weighted_r2_score),range_of_r2_scores,sd_weighted_r2_score]], columns =COLUMN_NAMES) ])

In [44]:
COLUMN_NAMES = ["Process","Model Name", "r2 Scores","Range of r2 Scores","Std Deviation of r2 Scores"]
df_model_selection = pd.DataFrame(columns=COLUMN_NAMES)

process='PCA'
n_splits = 10
#SC=StandardScaler()
#X=SC.fit_transform(features_PCA) # Perform feature scaling on features
X=features_PCA
y=label

# # 1.LinearRegression
model_LR=LinearRegression()
model_obj=model_LR
model_name='LinearRegression'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # # 2.DecisionTreeRegressor
model_DTR=DecisionTreeRegressor()
model_obj=model_DTR
model_name='DecisionTreeRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # # 3.RandomForestRegressor
model_RFR=RandomForestRegressor()
model_obj=model_RFR
model_name='RandomForestRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # # 4.GradientBoostingRegressor
model_GBR=GradientBoostingRegressor()
model_obj=model_GBR
model_name='GradientBoostingRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # # 5.XGBRegressor
model_XGBR=XGBRegressor()
model_obj=model_XGBR
model_name='XGBRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # 6.XGBRFRegressor
# model_XGBRFR=XGBRFRegressor()
# model_obj=model_XGBRFR
# model_name='XGBRFRegressor'
# stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # 7.KNeighborsRegressor
# # model_KNNR=KNeighborsRegressor()
# # model_obj=model_KNNR
# # model_name='KNeighborsRegressor'
# # stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # 8.GaussianProcessRegressor
# model_GPR = GaussianProcessRegressor()
# model_obj=model_GPR
# model_name='GaussianProcessRegressor'
# stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # 9.SGDRegressor
# model_SGDR = SGDRegressor()
# model_obj=model_SGDR
# model_name='SGDRegressor'
# stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# # 10.Support Vector Machine Regressor
# model_SVR=SVR()
# model_obj=model_SVR
# model_name='Support Vector Machine Regressor'
# stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)

# #Exporting the results to csv
# #df_model_selection.to_csv("Model_statistics.csv",index = False)


df_model_selection


Unnamed: 0,Process,Model Name,r2 Scores,Range of r2 Scores,Std Deviation of r2 Scores
0,PCA,XGBRegressor,"[0.42, 0.74, 0.79, 0.79, 0.84, 0.86, 0.88, 0.8...",0.42-0.9,0.142423


In [45]:
# Process	     Model Name	        r2 Scores	                                      Range of r2 Scores	Std Deviation of r2 Scores	
# Correlation	LinearRegression	[0.31, 0.77, 0.8, 0.81, 0.81, 0.81, 0.83, 0.83...	0.31-0.85	        0.161331	15 Features
# Correlation	DecisionTreeRegressor	[0.09, 0.63, 0.7, 0.71, 0.72, 0.73, 0.75, 0.81...	0.09-0.84	    0.218317	
# Correlation	RandomForestRegressor	[0.33, 0.84, 0.85, 0.86, 0.86, 0.86, 0.88, 0.8...	0.33-0.91	    0.172437	
# Correlation	GradientBoostingRegressor	[0.35, 0.83, 0.83, 0.86, 0.87, 0.88, 0.88, 0.9...	0.35-0.91	0.16776	
# Correlation	XGBRegressor	[0.34, 0.79, 0.81, 0.83, 0.83, 0.83, 0.84, 0.8...	0.34-0.9	            0.16303	
# Correlation	XGBRFRegressor	[0.34, 0.84, 0.84, 0.84, 0.86, 0.86, 0.87, 0.8...	0.34-0.89	            0.166132	
# Correlation	KNeighborsRegressor	[0.43, 0.69, 0.71, 0.72, 0.72, 0.73, 0.73, 0.7...	0.43-0.78	        0.100466	
					
# Backward Elimination-OLS	LinearRegression	[0.25, 0.78, 0.85, 0.85, 0.85, 0.86, 0.86, 0.8...	0.25-0.89	1.90E-01	23 Features
# Backward Elimination-OLS	DecisionTreeRegressor	[0.32, 0.5, 0.64, 0.73, 0.75, 0.75, 0.77, 0.78...	0.32-0.81	1.60E-01	
# Backward Elimination-OLS	RandomForestRegressor	[0.39, 0.79, 0.86, 0.87, 0.87, 0.88, 0.9, 0.91...	0.39-0.93	1.60E-01	
# Backward Elimination-OLS	GradientBoostingRegressor	[0.22, 0.84, 0.88, 0.88, 0.88, 0.9, 0.92, 0.92...	0.22-0.93	2.20E-01	
# Backward Elimination-OLS	XGBRegressor	[0.33, 0.74, 0.85, 0.86, 0.87, 0.87, 0.89, 0.8...	0.33-0.93	1.80E-01	
# Backward Elimination-OLS	XGBRFRegressor	[0.4, 0.73, 0.84, 0.85, 0.85, 0.86, 0.86, 0.88...	0.4-0.9	1.50E-01	
# Select From Model	RandomForestRegressor	[0.4, 0.83, 0.83, 0.85, 0.88, 0.88, 0.88, 0.88...	0.4-0.9	0.15	 7 Features
# PCA	LinearRegression	[-0.91, -0.55, -0.3, -0.12, -0.0, 0.7, 0.75, 0..	.-91 to 0.85	0.651768	125 Features
# PCA	DecisionTreeRegressor	[0.1, 0.58, 0.65, 0.67, 0.69, 0.7, 0.72, 0.75,...	0.1-0.76	0.197079	
# PCA	RandomForestRegressor	[0.54, 0.78, 0.8, 0.84, 0.85, 0.86, 0.87, 0.87...	0.54-0.9	0.104504	
# PCA	GradientBoostingRegressor	[0.79, 0.82, 0.85, 0.87, 0.88]	0.79-0.88	0.037014	
# PCA	XGBRegressor	[0.79, 0.81, 0.83, 0.84, 0.88]	0.79-0.88	0.033912	
# PCA	GradientBoostingRegressor	[0.42, 0.8, 0.83, 0.86, 0.87, 0.87, 0.89, 0.9,...	0.42-0.93	0.148309	
# PCA	XGBRegressor	[0.42, 0.74, 0.79, 0.79, 0.84, 0.86, 0.88, 0.8...	0.42-0.9	0.142423	


In [None]:
## From Both Approaches it seems that Including catagorical features does not significantly changes the prediction performance
# Rather its is increasing the complexity with many number of features ~ 125 features (PCA). 
# So PCA techniqe is rejected in this case.


## Approach-3: Select features FROM Model considering all Numerical and Categorical Features

In [11]:
# From previous Cross validation results it seems that RandomForest Regressor seems to be performing relatively well
# Hence we can select RandomForest Regressor as a base model for this feature selection FROM Model
from sklearn.feature_selection import SelectFromModel
model_RFR=RandomForestRegressor()
SFM=SelectFromModel(estimator=model_RFR)
SFM.fit(X=finalFeature_Train,y=label)
#Get the support
print(finalFeature_Train.columns)
print(SFM.get_support())

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=202)
[False  True  True False False False False False False False  True False
  True  True  True False False False False False False  True False False
  True  True False False  True  True False  True False False  True False
 False False False  True False False  True False  True  True False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False Fals

In [49]:
finalFeature_Train.head(1)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscVal,MoSold,YrSold,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,...,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkFace,MasVnrType_NMVT,MasVnrType_None,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NG,MiscFeature_NM,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,2,0,4,4,4,5,3,7,5,2003,2003,196.0,4,3,4,3,1,6,706.0,1,0.0,150.0,856.0,5,1,856,854,0,1710,1.0,0.0,2,1,3,1,4,8,0,0,2003.0,2,2.0,548.0,3,3,3,0,61,0,0,0,0,0,0,0,2,2008,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [45]:
## Select FROM Model says below are the important features 
feature_FromModel=finalFeature_Train.loc[:,['LotFrontage','LotArea','OverallQual','YearBuilt','YearRemodAdd',
                                 'MasVnrArea','BsmtQual','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
                                 '1stFlrSF','2ndFlrSF','GrLivArea','FullBath','TotRmsAbvGrd',
                                 'GarageYrBlt','GarageFinish','GarageCars','GarageArea']].values

# Perform Cross validation for RandomForestRegressor
COLUMN_NAMES = ["Process","Model Name", "r2 Scores","Range of r2 Scores","Std Deviation of r2 Scores"]
df_model_selection = pd.DataFrame(columns=COLUMN_NAMES)

process='Select From Model'
n_splits = 10
SC=StandardScaler()
X=SC.fit_transform(feature_FromModel) # Perform feature scaling on features
y=label

model_RFR=RandomForestRegressor()
model_obj=model_RFR
model_name='RandomForestRegressor'
stratified_K_fold_validation(model_obj, model_name, process, n_splits, X, y)
df_model_selection


Unnamed: 0,Process,Model Name,r2 Scores,Range of r2 Scores,Std Deviation of r2 Scores
0,Select From Model,RandomForestRegressor,"[0.47, 0.85, 0.85, 0.87, 0.87, 0.88, 0.88, 0.8...",0.47-0.92,0.131047


## Conclusion: From Approach-1(Numerical features), From Approach-2(PCA features) and Approach-3 It is concluded that
# RandomForestRegressor provided better performance with features
#'LotFrontage','LotArea','OverallQual','YearBuilt','YearRemodAdd',
#'MasVnrArea','BsmtQual','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
#'1stFlrSF','2ndFlrSF','GrLivArea','FullBath','TotRmsAbvGrd',
#'GarageYrBlt','GarageFinish','GarageCars','GarageArea'

# Now lets try to get the Scores using StratifiedKFold Cross Validation

In [46]:
# Now lets try to get the Scores using StratifiedKFold Cross Validation

#Initialize the algo
model=RandomForestRegressor()
feature_FromModel=SC.fit_transform(feature_FromModel)

#Initialize StratifiedKFold Method
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=10, 
              random_state=1,
              shuffle=True)

#Initialize For Loop 

i=0
for train,test in kfold.split(feature_FromModel,label):
    i = i+1
    X_train,X_test = feature_FromModel[train],feature_FromModel[test]
    y_train,y_test = label[train],label[test]
    
    model.fit(X_train,y_train)
    test_ds_predicted=model.predict(X_test)
    train_ds_predicted=model.predict(X_train)
    
    test_r2_score=round(r2_score(y_true=y_test, y_pred=test_ds_predicted ),2)
    train_r2_score=round(r2_score(y_true=y_train, y_pred=train_ds_predicted ),2)
    
    print("Train r2-Score: {}, Test r2-score: {}, for Sample Split: {}".format(train_r2_score,test_r2_score,i))

Train r2-Score: 0.98, Test r2-score: 0.91, for Sample Split: 1
Train r2-Score: 0.98, Test r2-score: 0.86, for Sample Split: 2
Train r2-Score: 0.98, Test r2-score: 0.92, for Sample Split: 3
Train r2-Score: 0.98, Test r2-score: 0.86, for Sample Split: 4
Train r2-Score: 0.98, Test r2-score: 0.89, for Sample Split: 5
Train r2-Score: 0.98, Test r2-score: 0.71, for Sample Split: 6
Train r2-Score: 0.98, Test r2-score: 0.89, for Sample Split: 7
Train r2-Score: 0.98, Test r2-score: 0.79, for Sample Split: 8
Train r2-Score: 0.98, Test r2-score: 0.9, for Sample Split: 9
Train r2-Score: 0.98, Test r2-score: 0.89, for Sample Split: 10


In [47]:
#Lets extract the Train and Test sample for split 3
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=10, #n_splits should be equal to no of cv value in cross_val_score
              random_state=1,
              shuffle=True)
i=0
for train,test in kfold.split(feature_FromModel,label):
    i = i+1
    if i == 3:
        X_train,X_test,y_train,y_test = feature_FromModel[train],feature_FromModel[test],label[train],label[test]

In [44]:
#Final Model
finalModel=RandomForestRegressor()
finalModel.fit(X_train,y_train)

test_ds_predicted=model.predict(X_test)
train_ds_predicted=model.predict(X_train)


test_r2_score=round(r2_score(y_true=y_test, y_pred=test_ds_predicted ),2)
train_r2_score=round(r2_score(y_true=y_train, y_pred=train_ds_predicted ),2)

print("Train r2-Score: {}, Test r2-score: {}".format(train_r2_score,test_r2_score))


train_score=np.round(finalModel.score(X_train,y_train),2)
test_score=np.round(finalModel.score(X_test,y_test),2)
print('Train Accuracy Score is:{} and  Test Accuracy Score:{}'.format(train_score,test_score))

Train r2-Score: 0.96, Test r2-score: 0.98
Train Accuracy Score is:0.98 and  Test Accuracy Score:0.9


## Lets try to see if we can improve the model performance by Hyper parameter Tuning using Randomized Grid Search

In [31]:
from sklearn.model_selection import RandomizedSearchCV
#min_child_weight
#colsample_bylevel
#colsample_bytree
#gamma
param_grid={
            #'booster':['gbtree','gblinear','dart'],
            'n_estimators':[50,100,120,150,200],
            'max_depth':[5,6,7,8,9,10,15,20,30],
            'min_samples_split':[2,3,4,5],
            'min_samples_leaf':[1,2,3,4],
            #'learning_rate':[0.05,0.1,0.5,0.9,1],
            #'loss':['deviance','exponential'],
            #'criterion':[0.2,0.4,0.6,0.8,1.0],
            #'colsample_bynode':[0.2,0.4,0.6,0.8,1.0],
            #'subsample':[0.5,0.6,0.7,0.8,0.9,1]
            #'gamma':[0.1,0.3,0.5,0.8,1]
           }
finalModel=RandomForestRegressor()
RS = RandomizedSearchCV(estimator=finalModel,param_distributions=param_grid,cv=10,n_iter=10)

RS.fit(feature_FromModel,label) 

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                             

In [32]:
RS.best_score_

0.8608998145448382

In [33]:
RS.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=4, min_weight_fraction_leaf=0.0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

## Prediction

In [38]:
test_data=finalFeature_Test.loc[:,['LotFrontage','LotArea','OverallQual','YearBuilt','YearRemodAdd',
                                'MasVnrArea','BsmtQual','BsmtFinSF1','BsmtUnfSF','TotalBsmtSF',
                                '1stFlrSF','2ndFlrSF','GrLivArea','FullBath','TotRmsAbvGrd',
                                'GarageYrBlt','GarageFinish','GarageCars','GarageArea']]
test_data=SC.fit_transform(test_data)
predict=finalModel.predict(test_data)
predict=pd.DataFrame(predict)
predict['Id']=dataSetTest[['Id']]
predict.to_csv('submission.csv')