# Import all the necessary libraries and packages

In [1]:
# warnings
from warnings import filterwarnings
filterwarnings('ignore')

# os,pandas,numpy
import os
import pandas as pd
import numpy as np

# preprocessing and data cleaning
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# feature selection
from sklearn.feature_selection import SequentialFeatureSelector

# train test split
from sklearn.model_selection import train_test_split,GridSearchCV

# import necessary regression models
from sklearn.linear_model import LinearRegression,Ridge,Lasso

# evaluation metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

# Data Ingestion

In [2]:
path = r"C:\Users\Saurav Mali\Downloads\BasicPython\Python Recording\python Class\MACHINE LEARNING\ML PDF AND CSV\MLR PROJECT\training_set.csv"

In [3]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Basic Data Quality Checks

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
df.shape

(1460, 81)

In [6]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

# Search missing data

In [7]:
s = df.isna().sum()
s[s>0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [9]:
cnt = df["Id"].count()
cnt

np.int64(1460)

In [10]:
b = (s[s>0]/cnt)*100
b[b>50]

Alley          93.767123
MasVnrType     59.726027
PoolQC         99.520548
Fence          80.753425
MiscFeature    96.301370
dtype: float64

## If the columns are not important and have missing values for more than 50% of rows, then you can drop that column

In [11]:
df.drop(columns=list(b[b>50].index),inplace=True)
# df = df.drop(column=list(b[b>50].index))

In [12]:
df["Id"].count()

np.int64(1460)

In [13]:

df.shape

(1460, 76)

In [14]:
X = df.drop(columns=["Id","SalePrice"])
Y = df[["SalePrice"]]

In [15]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [16]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


# Feature Engineering
    Feature Selection process : Forward

In [17]:
# Separate cat and con features
cat = list(X.columns[X.dtypes=="object"])
con = list(X.columns[X.dtypes!="object"])

# 1.Data Preprocessing and data cleaning

In [18]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

In [19]:
con_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [20]:
pre = ColumnTransformer(
    [
        ("cat",cat_pipe,cat),
        ("con",con_pipe,con)
    ]
).set_output(transform="pandas")

In [21]:
pre

In [22]:
Xpre = pre.fit_transform(X)
Xpre.head()

Unnamed: 0,cat__MSZoning,cat__Street,cat__LotShape,cat__LandContour,cat__Utilities,cat__LotConfig,cat__LandSlope,cat__Neighborhood,cat__Condition1,cat__Condition2,...,con__GarageArea,con__WoodDeckSF,con__OpenPorchSF,con__EnclosedPorch,con__3SsnPorch,con__ScreenPorch,con__PoolArea,con__MiscVal,con__MoSold,con__YrSold
0,3.0,1.0,3.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,...,0.351,-0.752176,0.216503,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,0.138777
1,3.0,1.0,3.0,3.0,0.0,2.0,0.0,24.0,1.0,2.0,...,-0.060731,1.626195,-0.704483,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,-0.48911,-0.614439
2,3.0,1.0,0.0,3.0,0.0,4.0,0.0,5.0,2.0,2.0,...,0.631726,-0.752176,-0.070361,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,0.990891,0.138777
3,3.0,1.0,0.0,3.0,0.0,0.0,0.0,6.0,2.0,2.0,...,0.790804,-0.752176,-0.176048,4.092524,-0.116339,-0.270208,-0.068692,-0.087688,-1.599111,-1.367655
4,3.0,1.0,0.0,3.0,0.0,2.0,0.0,15.0,2.0,2.0,...,1.698485,0.780197,0.56376,-0.359325,-0.116339,-0.270208,-0.068692,-0.087688,2.100892,0.138777


In [23]:
model = LinearRegression()
sel = SequentialFeatureSelector(model,direction="forward",n_features_to_select="auto")
sel.fit(Xpre,Y)

In [24]:
imp_cols = sel.get_feature_names_out()
imp_cols

array(['cat__Street', 'cat__LandContour', 'cat__Utilities',
       'cat__Neighborhood', 'cat__BldgType', 'cat__HouseStyle',
       'cat__RoofStyle', 'cat__RoofMatl', 'cat__Exterior1st',
       'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtCond',
       'cat__BsmtExposure', 'cat__HeatingQC', 'cat__Electrical',
       'cat__KitchenQual', 'cat__Functional', 'cat__GarageCond',
       'cat__PavedDrive', 'con__MSSubClass', 'con__LotArea',
       'con__OverallQual', 'con__OverallCond', 'con__YearBuilt',
       'con__MasVnrArea', 'con__BsmtFinSF1', 'con__GrLivArea',
       'con__BsmtFullBath', 'con__KitchenAbvGr', 'con__TotRmsAbvGrd',
       'con__Fireplaces', 'con__GarageCars', 'con__WoodDeckSF',
       'con__OpenPorchSF', 'con__ScreenPorch', 'con__PoolArea',
       'con__YrSold'], dtype=object)

In [26]:
imp_cols[0].split("__")[1]

'Street'

In [27]:
sel_cols = []
for i in imp_cols:
    sel_cols.append(i.split("__")[1])
sel_cols

['Street',
 'LandContour',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageCond',
 'PavedDrive',
 'MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

In [28]:

len(sel_cols)

37

In [30]:
X_sel = X[sel_cols]
X_sel.head()

Unnamed: 0,Street,LandContour,Utilities,Neighborhood,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,ExterQual,...,BsmtFullBath,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,WoodDeckSF,OpenPorchSF,ScreenPorch,PoolArea,YrSold
0,Pave,Lvl,AllPub,CollgCr,1Fam,2Story,Gable,CompShg,VinylSd,Gd,...,1,1,8,0,2,0,61,0,0,2008
1,Pave,Lvl,AllPub,Veenker,1Fam,1Story,Gable,CompShg,MetalSd,TA,...,0,1,6,1,2,298,0,0,0,2007
2,Pave,Lvl,AllPub,CollgCr,1Fam,2Story,Gable,CompShg,VinylSd,Gd,...,1,1,6,1,2,0,42,0,0,2008
3,Pave,Lvl,AllPub,Crawfor,1Fam,2Story,Gable,CompShg,Wd Sdng,TA,...,1,1,7,1,3,0,35,0,0,2006
4,Pave,Lvl,AllPub,NoRidge,1Fam,2Story,Gable,CompShg,VinylSd,Gd,...,1,1,9,1,3,192,84,0,0,2008


In [31]:
# Data preprocessing and data cleaning

In [32]:
cat_sel = list(X_sel.columns[X_sel.dtypes=="object"])
con_sel = list(X_sel.columns[X_sel.dtypes!="object"])

In [33]:
cat_pipe1 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore',sparse_output=False)
)

In [34]:
con_pipe1 = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

In [35]:
pre1 = ColumnTransformer(
    [
        ("cat",cat_pipe1,cat_sel),
        ("con",con_pipe1,con_sel)
    ]
).set_output(transform="pandas")

In [36]:
pre1

In [37]:

Xpre1 = pre1.fit_transform(X_sel)
Xpre1.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__LandContour_Bnk,cat__LandContour_HLS,cat__LandContour_Low,cat__LandContour_Lvl,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,...,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__OpenPorchSF,con__ScreenPorch,con__PoolArea,con__YrSold
0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,0.91221,-0.951226,0.311725,-0.752176,0.216503,-0.270208,-0.068692,0.138777
1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,-0.318683,0.600495,0.311725,1.626195,-0.704483,-0.270208,-0.068692,-0.614439
2,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,-0.318683,0.600495,0.311725,-0.752176,-0.070361,-0.270208,-0.068692,0.138777
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,0.296763,0.600495,1.650307,-0.752176,-0.176048,-0.270208,-0.068692,-1.367655
4,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,1.527656,0.600495,1.650307,0.780197,0.56376,-0.270208,-0.068692,0.138777


## Train Test Split

In [38]:
xtrain,xtest,ytrain,ytest = train_test_split(Xpre1,Y,train_size=0.8,random_state=21)

In [39]:
xtrain.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__LandContour_Bnk,cat__LandContour_HLS,cat__LandContour_Low,cat__LandContour_Lvl,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,...,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__OpenPorchSF,con__ScreenPorch,con__PoolArea,con__YrSold
710,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,-0.93413,-0.951226,-2.36544,-0.752176,-0.704483,-0.270208,-0.068692,0.138777
1098,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,-0.318683,-0.951226,-1.026858,-0.752176,-0.704483,-0.270208,-0.068692,0.891994
1286,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,-0.318683,2.152216,0.311725,-0.752176,-0.311932,-0.270208,-0.068692,1.64521
992,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,0.296763,0.600495,0.311725,1.155309,-0.070361,-0.270208,-0.068692,-0.614439
631,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,-0.318683,0.600495,0.311725,0.492877,0.397681,-0.270208,-0.068692,-0.614439


In [40]:
xtest.head()

Unnamed: 0,cat__Street_Grvl,cat__Street_Pave,cat__LandContour_Bnk,cat__LandContour_HLS,cat__LandContour_Low,cat__LandContour_Lvl,cat__Utilities_AllPub,cat__Utilities_NoSeWa,cat__Neighborhood_Blmngtn,cat__Neighborhood_Blueste,...,con__BsmtFullBath,con__KitchenAbvGr,con__TotRmsAbvGrd,con__Fireplaces,con__GarageCars,con__WoodDeckSF,con__OpenPorchSF,con__ScreenPorch,con__PoolArea,con__YrSold
880,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,-0.93413,-0.951226,0.311725,-0.752176,0.035326,-0.270208,-0.068692,-0.614439
605,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,0.296763,3.703938,0.311725,-0.752176,-0.070361,3.120637,-0.068692,0.891994
1166,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,0.296763,-0.951226,1.650307,0.524802,-0.206245,-0.270208,-0.068692,1.64521
216,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.10781,-0.211454,0.91221,-0.951226,0.311725,0.357198,0.775134,-0.270208,-0.068692,0.138777
970,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,-0.819964,-0.211454,-0.318683,-0.951226,-2.36544,-0.752176,-0.704483,-0.270208,-0.068692,-1.367655


In [41]:

ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [42]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [43]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


# Model Building

In [44]:
model.fit(xtrain,ytrain)

In [45]:
model.score(xtrain,ytrain)

0.9231559516591814

In [46]:
model.score(xtest,ytest)

0.8117158490420748

## Hyperparameter tuning

In [47]:
params = {
    "alpha" : np.arange(start=0.1,stop=100,step=0.1)
}

In [48]:
base_model = Ridge()
gscv = GridSearchCV(base_model,params,cv=3,scoring="r2")
gscv.fit(xtrain,ytrain)

In [49]:
gscv.best_params_

{'alpha': np.float64(11.200000000000001)}

In [50]:

gscv.best_score_

np.float64(0.8448996927032733)

In [51]:
best_ridge = gscv.best_estimator_
best_ridge

In [53]:
best_ridge.score(xtrain,ytrain)

0.8883447861898661

In [54]:
best_ridge.score(xtest,ytest)

0.836142240122025

## Lasso

In [55]:
base_model2 = Lasso()
gscv2 = GridSearchCV(base_model2,params,cv=3,scoring="r2")
gscv2.fit(xtrain,ytrain)

In [57]:

gscv2.best_params_

{'alpha': np.float64(99.9)}

In [56]:
gscv2.best_score_

np.float64(0.8534467438170795)

In [58]:
best_lasso = gscv2.best_estimator_
best_lasso

In [59]:
best_lasso.score(xtrain,ytrain)

0.9146825720317181

In [60]:
best_lasso.score(xtest,ytest)

0.8251415057873493