# Requirements: 
- Dataset for Project
1. Target Feature - SalePrice

- Steps 
1. Feature Selection Use any one forward or backward
2. On Selected features apply ridge or lasso and select best model
3. Perform model evaluation on best model
4. Save the model object and preprocessor

In [133]:
from warnings import filterwarnings

filterwarnings("ignore")

# Step 01 - Data Injestion


In [134]:
import pandas as pd

df = pd.read_csv("training_set.csv", na_values=["", "NA"], keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Step 02 - Perform basic data quality checks


In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [136]:
m = df.isna().sum()
m[m > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [137]:
df.duplicated().sum()

np.int64(0)

# Step 03 - Separate X AND Y

In [138]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [139]:
X = df.drop(columns=["Id", "SalePrice"])

In [140]:
Y = df[["SalePrice"]]

In [141]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [142]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


# Step 04 - Preprocess data for feature selection

In [143]:
cat = list(X.columns[X.dtypes == "object"])
con = list(X.columns[X.dtypes != "object"])

In [144]:
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [145]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [146]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [147]:
num_pipe1 = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

In [148]:
cat_pipe1 = make_pipeline(SimpleImputer(strategy="most_frequent"), OrdinalEncoder())

In [149]:
pre1 = ColumnTransformer([("num", num_pipe1, con), ("cat", cat_pipe1, cat)]).set_output(
    transform="pandas"
)

In [150]:
X_pre = pre1.fit_transform(X)

In [151]:
X_pre

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.517200,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.574410,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,-0.093110,0.073480,0.651479,-0.517200,0.984752,0.830215,0.323060,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.517200,-1.863632,-0.720298,-0.574410,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.517200,0.951632,0.733308,1.364570,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.365633,-0.260560,-0.071836,-0.517200,0.918511,0.733308,-0.574410,-0.973018,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1456,-0.872563,0.679039,0.266407,-0.071836,0.381743,0.222975,0.151865,0.084843,0.759659,0.722112,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1457,0.309859,-0.183951,-0.147810,0.651479,3.078570,-1.002492,1.024029,-0.574410,-0.369871,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,0.0,2.0,8.0,4.0
1458,-0.872563,-0.093110,-0.080160,-0.795151,0.381743,-0.704406,0.539493,-0.574410,-0.865548,6.092188,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


In [152]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


In [153]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

base_model = LinearRegression()
sel = SequentialFeatureSelector(
    estimator=base_model, n_features_to_select="auto", direction="forward", n_jobs=-1
)
sel.fit(X_pre, Y)
imp_cols = sel.get_feature_names_out()
print(imp_cols)

['num__MSSubClass' 'num__LotArea' 'num__OverallQual' 'num__OverallCond'
 'num__YearBuilt' 'num__MasVnrArea' 'num__BsmtFinSF1' 'num__GrLivArea'
 'num__BsmtFullBath' 'num__KitchenAbvGr' 'num__TotRmsAbvGrd'
 'num__Fireplaces' 'num__GarageCars' 'num__WoodDeckSF' 'num__OpenPorchSF'
 'num__ScreenPorch' 'num__PoolArea' 'num__YrSold' 'cat__Street'
 'cat__LandContour' 'cat__Utilities' 'cat__Neighborhood' 'cat__BldgType'
 'cat__HouseStyle' 'cat__RoofStyle' 'cat__RoofMatl' 'cat__Exterior1st'
 'cat__MasVnrType' 'cat__ExterQual' 'cat__BsmtQual' 'cat__BsmtCond'
 'cat__BsmtExposure' 'cat__HeatingQC' 'cat__KitchenQual' 'cat__Functional'
 'cat__GarageCond' 'cat__PavedDrive' 'cat__Fence' 'cat__MiscFeature']


In [154]:
X_sel = X_pre[imp_cols]
X_sel

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__KitchenAbvGr,...,cat__BsmtQual,cat__BsmtCond,cat__BsmtExposure,cat__HeatingQC,cat__KitchenQual,cat__Functional,cat__GarageCond,cat__PavedDrive,cat__Fence,cat__MiscFeature
0,0.073375,-0.207142,0.651479,-0.517200,1.050994,0.511418,0.575425,0.370333,1.107810,-0.211454,...,2.0,3.0,3.0,0.0,2.0,6.0,4.0,2.0,2.0,2.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.574410,1.171992,-0.482512,-0.819964,-0.211454,...,2.0,3.0,1.0,0.0,3.0,6.0,4.0,2.0,2.0,2.0
2,0.073375,0.073480,0.651479,-0.517200,0.984752,0.323060,0.092907,0.515013,1.107810,-0.211454,...,2.0,3.0,2.0,0.0,2.0,6.0,4.0,2.0,2.0,2.0
3,0.309859,-0.096897,0.651479,-0.517200,-1.863632,-0.574410,-0.499274,0.383659,1.107810,-0.211454,...,3.0,1.0,3.0,2.0,2.0,6.0,4.0,2.0,2.0,2.0
4,0.073375,0.375148,1.374795,-0.517200,0.951632,1.364570,0.463568,1.299326,1.107810,-0.211454,...,2.0,3.0,0.0,0.0,2.0,6.0,4.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.073375,-0.260560,-0.071836,-0.517200,0.918511,-0.574410,-0.973018,0.250402,-0.819964,-0.211454,...,2.0,3.0,3.0,0.0,3.0,6.0,4.0,2.0,2.0,2.0
1456,-0.872563,0.266407,-0.071836,0.381743,0.222975,0.084843,0.759659,1.061367,1.107810,-0.211454,...,2.0,3.0,3.0,4.0,3.0,2.0,4.0,2.0,2.0,2.0
1457,0.309859,-0.147810,0.651479,3.078570,-1.002492,-0.574410,-0.369871,1.569647,-0.819964,-0.211454,...,3.0,1.0,3.0,0.0,2.0,6.0,4.0,2.0,0.0,2.0
1458,-0.872563,-0.080160,-0.795151,0.381743,-0.704406,-0.574410,-0.865548,-0.832788,1.107810,-0.211454,...,3.0,3.0,2.0,2.0,2.0,6.0,4.0,2.0,2.0,2.0


In [155]:
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


# Feature selection is complete

In [156]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    X_sel, Y, test_size=0.2, random_state=10
)

In [157]:
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__KitchenAbvGr,...,cat__BsmtQual,cat__BsmtCond,cat__BsmtExposure,cat__HeatingQC,cat__KitchenQual,cat__Functional,cat__GarageCond,cat__PavedDrive,cat__Fence,cat__MiscFeature
1216,0.782828,-0.159035,-0.071836,-0.5172,0.222975,-0.57441,-0.973018,0.735839,-0.819964,4.328579,...,3.0,3.0,3.0,4.0,3.0,6.0,4.0,2.0,2.0,2.0
339,-0.872563,0.188735,-0.071836,1.280685,-0.43944,0.40062,0.31004,-0.571985,-0.819964,-0.211454,...,3.0,0.0,3.0,4.0,3.0,6.0,4.0,2.0,2.0,2.0
1057,0.073375,1.948534,0.651479,0.381743,0.752907,-0.57441,0.331973,0.636848,-0.819964,-0.211454,...,2.0,3.0,3.0,0.0,2.0,6.0,4.0,2.0,2.0,2.0
482,0.309859,-0.803463,0.651479,2.179628,-1.863632,-0.57441,-0.317233,0.591159,1.10781,-0.211454,...,3.0,3.0,3.0,0.0,0.0,3.0,1.0,2.0,2.0,2.0
529,-0.872563,2.220035,-0.071836,-2.315085,-0.47256,0.0,1.700568,1.90279,1.10781,4.328579,...,3.0,3.0,3.0,4.0,3.0,0.0,4.0,2.0,2.0,2.0


In [158]:
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__KitchenAbvGr,...,cat__BsmtQual,cat__BsmtCond,cat__BsmtExposure,cat__HeatingQC,cat__KitchenQual,cat__Functional,cat__GarageCond,cat__PavedDrive,cat__Fence,cat__MiscFeature
854,-0.872563,0.741961,-0.795151,-1.416142,-0.538802,-0.57441,-0.30188,0.501687,1.10781,-0.211454,...,3.0,3.0,2.0,4.0,3.0,6.0,4.0,2.0,2.0,2.0
381,-0.872563,-0.332419,0.651479,-0.5172,1.150356,-0.57441,-0.973018,-0.408269,1.10781,-0.211454,...,2.0,1.0,3.0,0.0,2.0,6.0,4.0,2.0,2.0,2.0
816,-0.872563,0.091019,-0.795151,0.381743,-0.571923,-0.57441,0.092907,-0.966045,-0.819964,-0.211454,...,3.0,3.0,3.0,2.0,3.0,6.0,4.0,2.0,2.0,2.0
577,0.546344,0.126297,-0.795151,0.381743,-0.174474,-0.037036,-0.253628,-0.372099,1.10781,-0.211454,...,3.0,3.0,0.0,0.0,3.0,6.0,4.0,2.0,2.0,2.0
35,0.073375,0.290761,1.374795,-0.5172,1.084115,0.156862,-0.973018,1.782859,-0.819964,-0.211454,...,0.0,3.0,0.0,0.0,2.0,6.0,4.0,2.0,2.0,2.0


In [159]:
ytrain.head()

Unnamed: 0,SalePrice
1216,112000
339,155000
1057,248000
482,155000
529,200624


In [160]:
ytest.head()

Unnamed: 0,SalePrice
854,170000
381,187750
816,137000
577,164500
35,309000


# Step 06 - Ridge and lasso?

In [161]:
from sklearn.linear_model import LinearRegression

model1 = LinearRegression()
model1.fit(xtrain, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [162]:
base_model.fit(xtrain, ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [163]:
base_model.score(xtrain, ytrain)

0.8388775833038841

In [164]:
base_model.score(xtest, ytest)

0.8592325864678617

In [165]:
from sklearn.linear_model import Ridge

model2 = Ridge(alpha=1.5)
model2.fit(xtrain, ytrain)

0,1,2
,alpha,1.5
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [166]:
model2.score(xtrain, ytrain)

0.8386996962812749

In [167]:
model2.score(xtest, ytest)

0.8597446210471582

# Cross validation

In [168]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model2, xtrain, ytrain, cv=5, scoring="r2")

In [169]:
scores

array([0.86350663, 0.85073508, 0.64663679, 0.71695441, 0.86804182])

In [170]:
scores.mean()

np.float64(0.7891749470568653)

# Hyperparameter tuning for Ridge

In [171]:
import numpy as np

In [172]:
params = {"alpha": np.arange(start=1, stop=60, step=0.5)}

In [173]:
params

{'alpha': array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,
         6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. , 10.5, 11. , 11.5,
        12. , 12.5, 13. , 13.5, 14. , 14.5, 15. , 15.5, 16. , 16.5, 17. ,
        17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5, 21. , 21.5, 22. , 22.5,
        23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. , 27.5, 28. ,
        28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 32. , 32.5, 33. , 33.5,
        34. , 34.5, 35. , 35.5, 36. , 36.5, 37. , 37.5, 38. , 38.5, 39. ,
        39.5, 40. , 40.5, 41. , 41.5, 42. , 42.5, 43. , 43.5, 44. , 44.5,
        45. , 45.5, 46. , 46.5, 47. , 47.5, 48. , 48.5, 49. , 49.5, 50. ,
        50.5, 51. , 51.5, 52. , 52.5, 53. , 53.5, 54. , 54.5, 55. , 55.5,
        56. , 56.5, 57. , 57.5, 58. , 58.5, 59. , 59.5])}

In [174]:
from sklearn.model_selection import GridSearchCV

ridge1 = Ridge()
gscv_ridge = GridSearchCV(ridge1, param_grid=params, cv=5, scoring="r2")
gscv_ridge.fit(xtrain, ytrain)

0,1,2
,estimator,Ridge()
,param_grid,"{'alpha': array([ 1. , ..., 59. , 59.5])}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(59.5)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [175]:
gscv_ridge.best_params_

{'alpha': np.float64(59.5)}

In [176]:
gscv_ridge.best_score_

np.float64(0.7943871975954964)

In [177]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

0,1,2
,alpha,np.float64(59.5)
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [178]:
best_ridge.score(xtrain, ytrain)

0.837236062920763

In [179]:
best_ridge.score(xtest, ytest)

0.8592248374723704

# Lasso model

In [180]:
from sklearn.linear_model import Lasso

model3 = Lasso(alpha=0.1)
model3.fit(xtrain, ytrain)

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [181]:
model3.score(xtrain, ytrain)

0.8388775805659645

In [182]:
model3.score(xtest, ytest)

0.8592350248486864

# Hyperparameter tuning on Lasso

In [183]:
params

{'alpha': array([ 1. ,  1.5,  2. ,  2.5,  3. ,  3.5,  4. ,  4.5,  5. ,  5.5,  6. ,
         6.5,  7. ,  7.5,  8. ,  8.5,  9. ,  9.5, 10. , 10.5, 11. , 11.5,
        12. , 12.5, 13. , 13.5, 14. , 14.5, 15. , 15.5, 16. , 16.5, 17. ,
        17.5, 18. , 18.5, 19. , 19.5, 20. , 20.5, 21. , 21.5, 22. , 22.5,
        23. , 23.5, 24. , 24.5, 25. , 25.5, 26. , 26.5, 27. , 27.5, 28. ,
        28.5, 29. , 29.5, 30. , 30.5, 31. , 31.5, 32. , 32.5, 33. , 33.5,
        34. , 34.5, 35. , 35.5, 36. , 36.5, 37. , 37.5, 38. , 38.5, 39. ,
        39.5, 40. , 40.5, 41. , 41.5, 42. , 42.5, 43. , 43.5, 44. , 44.5,
        45. , 45.5, 46. , 46.5, 47. , 47.5, 48. , 48.5, 49. , 49.5, 50. ,
        50.5, 51. , 51.5, 52. , 52.5, 53. , 53.5, 54. , 54.5, 55. , 55.5,
        56. , 56.5, 57. , 57.5, 58. , 58.5, 59. , 59.5])}

In [184]:
lasso1 = Lasso()
gscv_Lasso = GridSearchCV(lasso1, param_grid=params, cv=5, scoring="r2")
gscv_Lasso.fit(xtrain, ytrain)

0,1,2
,estimator,Lasso()
,param_grid,"{'alpha': array([ 1. , ..., 59. , 59.5])}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,alpha,np.float64(59.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [185]:
gscv_Lasso.best_params_

{'alpha': np.float64(59.5)}

In [186]:
gscv_Lasso.best_score_

np.float64(0.789379935957341)

In [187]:
best_lasso = gscv_Lasso.best_estimator_

In [188]:
best_lasso

0,1,2
,alpha,np.float64(59.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [189]:
best_lasso.score(xtrain, ytrain)

0.83832769401607

In [190]:
best_lasso.score(xtest, ytest)

0.8601258734002802

### from the above result we can say that lasso model is the best!!

# Step 07 - Evaluate best model in detail

In [191]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)

In [192]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)

    mae = mean_absolute_error(y, ypred)
    rmse = root_mean_squared_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)

    print(f"RMSE : {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"Mape: {mape: .2%}")
    print(f"r2_score: {r2:.2%}")

In [193]:
evaluate_model(best_lasso, xtrain, ytrain)

RMSE : 31871.46
MAE: 18747.07
Mape:  10.96%
r2_score: 83.83%


In [194]:
evaluate_model(best_ridge, xtrain, ytrain)

RMSE : 31978.88
MAE: 18739.27
Mape:  10.90%
r2_score: 83.72%


In [195]:
evaluate_model(best_lasso, xtest, ytest)

RMSE : 29912.73
MAE: 20935.46
Mape:  12.27%
r2_score: 86.01%


### Both model evaluation has score >= 0.8, so it is the best model for sample prediction

# Step 08 - Out of sample prediction

In [196]:
xnew = pd.read_csv("testing_set.csv", na_values=["", "NA"], keep_default_na=False)

In [197]:
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [198]:
pre1

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [199]:
xnew_pre = pre1.transform(xnew)

In [200]:
xnew_pre

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,-0.872563,0.451936,0.110763,-0.795151,0.381743,-0.340077,-1.156380,-0.574410,0.053428,0.604293,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.497357,0.375850,-0.071836,0.381743,-0.439440,-1.301740,0.023903,1.051363,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,0.0,8.0,4.0
2,0.073375,0.179413,0.332053,-0.795151,-0.517200,0.852269,0.636400,-0.574410,0.761852,-0.288653,...,1.0,0.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.073375,0.361095,-0.054002,-0.071836,0.381743,0.885390,0.636400,-0.463612,0.347326,-0.288653,...,1.0,0.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
4,1.492282,-1.228623,-0.552407,1.374795,-0.517200,0.686666,0.345679,-0.574410,-0.396190,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-2.227875,-0.859988,-1.518467,1.280685,-0.041991,-0.720298,-0.574410,-0.973018,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1455,2.438219,-2.227875,-0.864197,-1.518467,-0.517200,-0.041991,-0.720298,-0.574410,-0.420316,-0.288653,...,4.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
1456,-0.872563,4.085578,0.950423,-0.795151,1.280685,-0.373198,0.539493,-0.574410,1.711535,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
1457,0.664586,-0.365633,-0.007600,-0.795151,-0.517200,0.686666,0.345679,-0.574410,-0.233889,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


In [201]:
xnew_pre.isna().sum()

num__MSSubClass       0
num__LotFrontage      0
num__LotArea          0
num__OverallQual      0
num__OverallCond      0
                     ..
cat__PoolQC           0
cat__Fence            0
cat__MiscFeature      0
cat__SaleType         0
cat__SaleCondition    0
Length: 79, dtype: int64

In [202]:
sel

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,-1

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [204]:
xnew_pre

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,-0.872563,0.451936,0.110763,-0.795151,0.381743,-0.340077,-1.156380,-0.574410,0.053428,0.604293,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.497357,0.375850,-0.071836,0.381743,-0.439440,-1.301740,0.023903,1.051363,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,0.0,8.0,4.0
2,0.073375,0.179413,0.332053,-0.795151,-0.517200,0.852269,0.636400,-0.574410,0.761852,-0.288653,...,1.0,0.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.073375,0.361095,-0.054002,-0.071836,0.381743,0.885390,0.636400,-0.463612,0.347326,-0.288653,...,1.0,0.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
4,1.492282,-1.228623,-0.552407,1.374795,-0.517200,0.686666,0.345679,-0.574410,-0.396190,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.438219,-2.227875,-0.859988,-1.518467,1.280685,-0.041991,-0.720298,-0.574410,-0.973018,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1455,2.438219,-2.227875,-0.864197,-1.518467,-0.517200,-0.041991,-0.720298,-0.574410,-0.420316,-0.288653,...,4.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
1456,-0.872563,4.085578,0.950423,-0.795151,1.280685,-0.373198,0.539493,-0.574410,1.711535,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
1457,0.664586,-0.365633,-0.007600,-0.795151,-0.517200,0.686666,0.345679,-0.574410,-0.233889,-0.288653,...,1.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


In [205]:
xnew_selected = sel.transform(xnew_pre)

In [208]:
preds = best_ridge.predict(xnew_selected)

In [209]:
preds

array([108155.11705928, 154081.42466675, 163064.1768974 , ...,
       178751.02056467, 116229.22558013, 241730.46898695], shape=(1459,))

In [211]:
xnew["SalesPrice"] = preds.round(2)

In [212]:
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Weight,SalesPrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,,MnPrv,,0,6,2010,WD,Normal,108155.12,108155.12
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,,,Gar2,12500,6,2010,WD,Normal,154081.42,154081.42
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,,MnPrv,,0,3,2010,WD,Normal,163064.18,163064.18
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,,,,0,6,2010,WD,Normal,181613.09,181613.09
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,,,,0,1,2010,WD,Normal,183310.25,183310.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,,,,0,6,2006,WD,Normal,74062.69,74062.69
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,,,,0,4,2006,WD,Abnorml,75679.81,75679.81
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,,,,0,9,2006,WD,Abnorml,178751.02,178751.02
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,,MnPrv,Shed,700,7,2006,WD,Normal,116229.23,116229.23


In [215]:
xnew = xnew.drop(columns=["Weight"])

In [216]:
xnew

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalesPrice
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,108155.12
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,154081.42
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,163064.18
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,181613.09
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,183310.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,74062.69
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,75679.81
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,178751.02
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,116229.23


In [217]:
xnew.to_csv("Project_Result.csv", index=False)

# Step 9 - Save the models

In [218]:
import joblib as jb

In [220]:
pre1

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [221]:
best_lasso

0,1,2
,alpha,np.float64(59.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [222]:
sel

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,-1

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [223]:
jb.dump(pre1, "preprocessing.joblib")

['preprocessing.joblib']

In [224]:
jb.dump(sel, "Featureselection.joblib")

['Featureselection.joblib']

In [225]:
jb.dump(best_lasso, "lassomodel.joblib")

['lassomodel.joblib']

In [226]:
p = jb.load("preprocessing.joblib")

In [227]:
p

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [228]:
r = jb.load("lassomodel.joblib")

In [229]:
f = jb.load("Featureselection.joblib")

In [230]:
r

0,1,2
,alpha,np.float64(59.5)
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [231]:
f

0,1,2
,estimator,LinearRegression()
,n_features_to_select,'auto'
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,-1

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False
