## House SalePrice Prediction 

### Read the dataset

In [72]:
from warnings import filterwarnings
filterwarnings('ignore')

In [73]:
import pandas as pd
df = pd.read_csv('training_set.csv',na_values=[''],keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Perform basic Data quality checks

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1460 non-null   int64 
 1   MSSubClass     1460 non-null   int64 
 2   MSZoning       1460 non-null   object
 3   LotFrontage    1460 non-null   object
 4   LotArea        1460 non-null   int64 
 5   Street         1460 non-null   object
 6   Alley          1460 non-null   object
 7   LotShape       1460 non-null   object
 8   LandContour    1460 non-null   object
 9   Utilities      1460 non-null   object
 10  LotConfig      1460 non-null   object
 11  LandSlope      1460 non-null   object
 12  Neighborhood   1460 non-null   object
 13  Condition1     1460 non-null   object
 14  Condition2     1460 non-null   object
 15  BldgType       1460 non-null   object
 16  HouseStyle     1460 non-null   object
 17  OverallQual    1460 non-null   int64 
 18  OverallCond    1460 non-null

In [75]:
df.shape

(1460, 81)

In [76]:
s=df.isna().sum()
s[s>0]

Series([], dtype: int64)

In [77]:
df.duplicated().sum()

0

### There are no missing values and duplicated values in this dataset

## Define X and Y features

In [78]:
## Removing statistically insignificant columns for X. Y is our target feaure i.e SalePrice
X = df.drop(columns=['Id','SalePrice'])
Y = df[['SalePrice']]

In [79]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [80]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


## Determine categorical and continuous features 

In [81]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [82]:
cat[:5]

['MSZoning', 'LotFrontage', 'Street', 'Alley', 'LotShape']

In [83]:
con[:5]

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']

## Feature Selection Pipeline
Categorical: Ordinal Encoded

Continuous: Standard Scaler

In [84]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [85]:
num_pipe1 = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                            ('scaler',StandardScaler())])

In [86]:
cat_pipe1 = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='NotAvail')),
                            ('ordinal',OrdinalEncoder())])

In [87]:
pre1 = ColumnTransformer([('num',num_pipe1,con),
                          ('cat',cat_pipe1,cat)]).set_output(transform='pandas')

In [88]:
pre1

In [89]:
X_pre1 = pre1.fit_transform(X)
X_pre1.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__GarageYrBlt,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.575425,-0.288653,-0.944591,-0.459303,...,89.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.429577,1.171992,-0.288653,-0.641228,0.466465,...,62.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.830215,0.092907,-0.288653,-0.301643,-0.313369,...,87.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.499274,-0.288653,-0.06167,-0.687324,...,84.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.733308,0.463568,-0.288653,-0.174865,0.19968,...,86.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0


## Checking which feature selector gives us best score : forward selection, backward elimination

In [90]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [91]:
lr = LinearRegression()
forward_sel = SequentialFeatureSelector(lr,direction='forward')
backward_sel = SequentialFeatureSelector(lr,direction='backward',n_features_to_select='auto')

In [92]:
forward_sel.fit_transform(X_pre1,Y)
sel1_cols = forward_sel.get_feature_names_out()
print(len(sel1_cols),'\n',sel1_cols)

39 
 ['num__MSSubClass' 'num__LotArea' 'num__OverallQual' 'num__OverallCond'
 'num__YearBuilt' 'num__BsmtFinSF1' 'num__GrLivArea' 'num__BsmtFullBath'
 'num__Fireplaces' 'num__GarageCars' 'num__WoodDeckSF' 'num__OpenPorchSF'
 'num__ScreenPorch' 'num__PoolArea' 'num__YrSold' 'cat__LotFrontage'
 'cat__Street' 'cat__LandContour' 'cat__Utilities' 'cat__Neighborhood'
 'cat__BldgType' 'cat__HouseStyle' 'cat__RoofStyle' 'cat__RoofMatl'
 'cat__Exterior1st' 'cat__ExterQual' 'cat__BsmtQual' 'cat__BsmtCond'
 'cat__BsmtExposure' 'cat__HeatingQC' 'cat__Electrical' 'cat__KitchenQual'
 'cat__Functional' 'cat__FireplaceQu' 'cat__GarageYrBlt' 'cat__GarageCond'
 'cat__PavedDrive' 'cat__Fence' 'cat__MiscFeature']


In [93]:
imp_cols1 = []
for i in sel1_cols:
    s=i.split('__')[1]
    imp_cols1.append(s)
imp_cols1[:5]

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']

## Get the dataframe for features selected using Forward Selection

In [94]:
X_forward = X[imp_cols1]
X_forward.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,BsmtFinSF1,GrLivArea,BsmtFullBath,Fireplaces,GarageCars,...,HeatingQC,Electrical,KitchenQual,Functional,FireplaceQu,GarageYrBlt,GarageCond,PavedDrive,Fence,MiscFeature
0,60,8450,7,5,2003,706,1710,1,0,2,...,Ex,SBrkr,Gd,Typ,,2003,TA,Y,,
1,20,9600,6,8,1976,978,1262,0,1,2,...,Ex,SBrkr,TA,Typ,TA,1976,TA,Y,,
2,60,11250,7,5,2001,486,1786,1,1,2,...,Ex,SBrkr,Gd,Typ,TA,2001,TA,Y,,
3,70,9550,7,5,1915,216,1717,1,1,3,...,Gd,SBrkr,Gd,Typ,Gd,1998,TA,Y,,
4,60,14260,8,5,2000,655,2198,1,1,3,...,Ex,SBrkr,Gd,Typ,TA,2000,TA,Y,,


## Get categorical and continous features from this dataframe

In [95]:
cat_forward = list(X_forward.columns[X_forward.dtypes=='object'])
con_forward = list(X_forward.columns[X_forward.dtypes!='object'])
print(f"cat: {cat_forward} '\n' con: {con_forward}")

cat: ['LotFrontage', 'Street', 'LandContour', 'Utilities', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageYrBlt', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature'] '
' con: ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'BsmtFinSF1', 'GrLivArea', 'BsmtFullBath', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch', 'PoolArea', 'YrSold']


## Create a final pipeline for model building

In [96]:
from sklearn.preprocessing import OneHotEncoder

In [97]:
num_pipe2 = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                            ('scaler',StandardScaler())])

In [98]:
cat_pipe2 = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='NotAvail')),
                            ('ohe',OneHotEncoder(handle_unknown='ignore',sparse_output=False))])

In [99]:
pre2 = ColumnTransformer([('num',num_pipe2,con_forward),
                          ('cat',cat_pipe2,cat_forward)]).set_output(transform='pandas')

In [100]:
pre2

In [101]:
X_forward_pre = pre2.fit_transform(X_forward)
X_forward_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__Fireplaces,num__GarageCars,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.575425,0.370333,1.10781,-0.951226,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,1.171992,-0.482512,-0.819964,0.600495,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.092907,0.515013,1.10781,0.600495,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.499274,0.383659,1.10781,0.600495,1.650307,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.463568,1.299326,1.10781,0.600495,1.650307,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


## Apply train test split on this dataframe
training 80%

testing 20%

In [102]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_forward_pre,Y,test_size=0.2,train_size=0.8,random_state=21)

In [103]:
xtrain.shape

(1168, 365)

In [104]:
xtest.shape

(292, 365)

## Build the model

In [105]:
model = LinearRegression()
model.fit(xtrain,ytrain)

In [106]:
model.score(xtrain,ytrain)

0.9485768706107399

In [107]:
model.score(xtest,ytest)

-2.454105357220054e+19

## Model is performing worse with forward feature selection

In [108]:
backward_sel.fit_transform(X_pre1,Y)
sel2_cols = backward_sel.get_feature_names_out()
print(len(sel2_cols),'\n',sel2_cols)

40 
 ['num__MSSubClass' 'num__LotArea' 'num__OverallQual' 'num__OverallCond'
 'num__YearBuilt' 'num__BsmtFinSF1' 'num__BsmtFinSF2' 'num__BsmtUnfSF'
 'num__TotalBsmtSF' 'num__1stFlrSF' 'num__2ndFlrSF' 'num__LowQualFinSF'
 'num__GrLivArea' 'num__TotRmsAbvGrd' 'num__Fireplaces' 'num__GarageCars'
 'num__WoodDeckSF' 'num__ScreenPorch' 'num__PoolArea' 'num__YrSold'
 'cat__LotFrontage' 'cat__LandContour' 'cat__Utilities' 'cat__LandSlope'
 'cat__Neighborhood' 'cat__BldgType' 'cat__HouseStyle' 'cat__RoofStyle'
 'cat__RoofMatl' 'cat__Exterior1st' 'cat__ExterQual' 'cat__BsmtQual'
 'cat__BsmtCond' 'cat__BsmtExposure' 'cat__HeatingQC' 'cat__KitchenQual'
 'cat__Functional' 'cat__FireplaceQu' 'cat__GarageYrBlt'
 'cat__MiscFeature']


In [110]:
imp_cols2 = []
for i in sel2_cols:
    s=i.split('__')[1]
    imp_cols2.append(s)
imp_cols2[:5]

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']

In [111]:
X_backward = X[imp_cols2]
X_backward.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,ExterQual,BsmtQual,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageYrBlt,MiscFeature
0,60,8450,7,5,2003,706,0,150,856,856,...,Gd,Gd,TA,No,Ex,Gd,Typ,,2003,
1,20,9600,6,8,1976,978,0,284,1262,1262,...,TA,Gd,TA,Gd,Ex,TA,Typ,TA,1976,
2,60,11250,7,5,2001,486,0,434,920,920,...,Gd,Gd,TA,Mn,Ex,Gd,Typ,TA,2001,
3,70,9550,7,5,1915,216,0,540,756,961,...,TA,TA,Gd,No,Gd,Gd,Typ,Gd,1998,
4,60,14260,8,5,2000,655,0,490,1145,1145,...,Gd,Gd,TA,Av,Ex,Gd,Typ,TA,2000,


## separate cat and con features

In [117]:
cat_backward= list(X_backward.columns[X_backward.dtypes=='object'])
con_backward = list(X_backward.columns[X_backward.dtypes!='object'])
print(f"cat : {cat_backward} \ncon : {con_backward}")

cat : ['LotFrontage', 'LandContour', 'Utilities', 'LandSlope', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageYrBlt', 'MiscFeature'] 
con : ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'PoolArea', 'YrSold']


## create final pipeline

In [118]:
num_pipe3 = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                            ('scaler',StandardScaler())])

In [119]:
cat_pipe3 = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='NotAvail')),
                            ('ohe',OneHotEncoder(handle_unknown='ignore',sparse_output=False))])

In [120]:
pre3 = ColumnTransformer([('num',num_pipe3,con_backward),
                          ('cat',cat_pipe3,cat_backward)]).set_output(transform='pandas')

In [121]:
pre3

In [122]:
X_backward_pre = pre3.fit_transform(X_backward)
X_backward_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,num__1stFlrSF,...,cat__GarageYrBlt_2007,cat__GarageYrBlt_2008,cat__GarageYrBlt_2009,cat__GarageYrBlt_2010,cat__GarageYrBlt_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.575425,-0.288653,-0.944591,-0.459303,-0.793434,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,1.171992,-0.288653,-0.641228,0.466465,0.25714,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.092907,-0.288653,-0.301643,-0.313369,-0.627826,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.499274,-0.288653,-0.06167,-0.687324,-0.521734,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.463568,-0.288653,-0.174865,0.19968,-0.045611,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Perform tarin test split
training 80%

testing 20%

In [129]:
xtrain1, xtest1, ytrain1, ytest1 = train_test_split(X_backward_pre,Y,train_size=0.8,test_size=0.2,random_state=21)

In [130]:
xtrain1.shape

(1168, 351)

In [131]:
xtest1.shape

(292, 351)

## Build the model

In [132]:
model2 = LinearRegression()
model2.fit(xtrain1,ytrain1)

In [133]:
model2.score(xtrain1,ytrain1)

0.9498309682008202

In [134]:
model.score(xtest1,ytest1)

-1.8960690336902515e+17

## Model is performing worse with backward feature selection

# Model is performing worse with feature selection methods. 