# Project - House price prediction using Linear Regression and Lasso Penalty

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

## Reading the dataset

In [2]:
data=pd.read_csv("C:/Vaibhav Sir/HouseTraining.csv")

In [3]:
data.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

### Droping Unwanted Column

In [4]:
data.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [5]:
data=data.drop(labels=['Id'],axis=1)

## Missing Data Treatment

In [6]:
def replacer(df):
    import pandas as pd
    Q=pd.DataFrame(df.isna().sum(),columns=["ct"])
    for i in Q[Q.ct>0].index:
                   if(df[i].dtypes!="object"):
                       x=df[i].mean()
                       df[i]=df[i].fillna(x)

#i have replaced only continuous data because in my dataset i got some problem
# In my dataset values which are Nan are actually 'NO' typo error in dataset
# Thats why i cnanged 'Nan' to 'No' with fillna()

In [7]:
replacer(data)

In [8]:
data.isna().sum()
pd.set_option('display.max_rows',None)

In [9]:
data.isna().sum()

MSSubClass          0
MSZoning            0
LotFrontage         0
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          0
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea 

In [10]:
data.Electrical = data.Electrical.fillna("SBrkr")   #i got only one column which have true null values so i replaced with mode

In [11]:
data=data.fillna('No')    # here i changed 'Nan' to 'no'

In [12]:
data.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual 

## Divide data into X and Y 

In [14]:
X=data.drop(labels=['SalePrice'],axis=1)
Y=data[["SalePrice"]]
X.head(20)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,No,Reg,Lvl,AllPub,Inside,...,0,0,No,No,No,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,No,Reg,Lvl,AllPub,FR2,...,0,0,No,No,No,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,No,IR1,Lvl,AllPub,Inside,...,0,0,No,No,No,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,No,IR1,Lvl,AllPub,Corner,...,0,0,No,No,No,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,No,IR1,Lvl,AllPub,FR2,...,0,0,No,No,No,0,12,2008,WD,Normal
5,50,RL,85.0,14115,Pave,No,IR1,Lvl,AllPub,Inside,...,0,0,No,MnPrv,Shed,700,10,2009,WD,Normal
6,20,RL,75.0,10084,Pave,No,Reg,Lvl,AllPub,Inside,...,0,0,No,No,No,0,8,2007,WD,Normal
7,60,RL,70.049958,10382,Pave,No,IR1,Lvl,AllPub,Corner,...,0,0,No,No,Shed,350,11,2009,WD,Normal
8,50,RM,51.0,6120,Pave,No,Reg,Lvl,AllPub,Inside,...,0,0,No,No,No,0,4,2008,WD,Abnorml
9,190,RL,50.0,7420,Pave,No,Reg,Lvl,AllPub,Corner,...,0,0,No,No,No,0,1,2008,WD,Normal


In [15]:
Y.head(20)

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
5,143000
6,307000
7,200000
8,129900
9,118000


In [14]:
X.shape

(1460, 79)

In [15]:
#x=1
#import matplotlib.pyplot as plt
#import seaborn as sns
#for i in X.columns:
#    plt.figure(figsize=(8,140))
#   if X[i].dtypes!='object':
#       plt.subplot(79,1,x)
#       x=x+1
#       sns.scatterplot(X[i],Y.SalePrice)
#       plt.show()
#   else:
#       plt.subplot(79,1,x)
#       x=x+1
#       sns.boxplot(X[i],Y.SalePrice)
#        plt.show()

### For important  continuous columns i have used correlation of SalePrice with all other columns and selected some columns

In [16]:
imp_con_columns=data.corr()['SalePrice'].sort_values().tail(11).index.drop('SalePrice')
imp_con_columns

Index(['YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF',
       'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual'],
      dtype='object')

In [17]:
c1=[]
c2=[]
for i in X.columns:
    if X[i].dtypes=='object':
        c1.append(i)
    else:
        c2.append(i)

In [18]:
imp_con_columns

Index(['YearRemodAdd', 'YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF',
       'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual'],
      dtype='object')

In [19]:
#c2

### For Categorical columns i've used ANOVA and on the basis of pvalue discarted some columns

In [20]:
def ANOVA(df,con,cat):    
    from pandas import DataFrame
    from statsmodels.api import OLS
    from statsmodels.formula.api import ols
    rel=con+"~"+cat
    model=ols(rel,df).fit()
    from statsmodels.stats.anova import anova_lm
    anova_results=anova_lm(model)
    Q=DataFrame(anova_results)
    a = Q['PR(>F)'][cat]
    return round(a,5)
        
    

In [21]:
imp_cat_columns=[]
for i in X.columns:
    if data[i].dtypes=='object':
        pv=ANOVA(data,'SalePrice',i)
        if pv<0.05 :
            print('Sale Price vs',i,'--> ',pv)
            imp_cat_columns.append(i)
            

Sale Price vs MSZoning -->  0.0
Sale Price vs Alley -->  0.0
Sale Price vs LotShape -->  0.0
Sale Price vs LandContour -->  0.0
Sale Price vs LotConfig -->  0.0
Sale Price vs Neighborhood -->  0.0
Sale Price vs Condition1 -->  0.0
Sale Price vs Condition2 -->  0.04343
Sale Price vs BldgType -->  0.0
Sale Price vs HouseStyle -->  0.0
Sale Price vs RoofStyle -->  0.0
Sale Price vs RoofMatl -->  0.0
Sale Price vs Exterior1st -->  0.0
Sale Price vs Exterior2nd -->  0.0
Sale Price vs MasVnrType -->  0.0
Sale Price vs ExterQual -->  0.0
Sale Price vs ExterCond -->  0.0
Sale Price vs Foundation -->  0.0
Sale Price vs BsmtQual -->  0.0
Sale Price vs BsmtCond -->  0.0
Sale Price vs BsmtExposure -->  0.0
Sale Price vs BsmtFinType1 -->  0.0
Sale Price vs BsmtFinType2 -->  0.0
Sale Price vs Heating -->  0.00075
Sale Price vs HeatingQC -->  0.0
Sale Price vs CentralAir -->  0.0
Sale Price vs Electrical -->  0.0
Sale Price vs KitchenQual -->  0.0
Sale Price vs Functional -->  0.00048
Sale Price vs F

In [22]:
imp_cat_columns

['MSZoning',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [23]:
imp_con_columns=list(data.corr()['SalePrice'].sort_values().tail(11).index)

In [24]:
imp_cols = []             
imp_cols.extend(imp_con_columns)
imp_cols.extend(imp_cat_columns)

In [25]:
imp_cols     #these are the important columns i got from ANOVA and CORR()

['YearRemodAdd',
 'YearBuilt',
 'TotRmsAbvGrd',
 'FullBath',
 '1stFlrSF',
 'TotalBsmtSF',
 'GarageArea',
 'GarageCars',
 'GrLivArea',
 'OverallQual',
 'SalePrice',
 'MSZoning',
 'Alley',
 'LotShape',
 'LandContour',
 'LotConfig',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [26]:
imp_cols.remove("SalePrice")

In [27]:
imp_con_columns.remove("SalePrice")

# Deal with skew

In [28]:
X[imp_cols].skew()

YearRemodAdd   -0.503562
YearBuilt      -0.613461
TotRmsAbvGrd    0.676341
FullBath        0.036562
1stFlrSF        1.376757
TotalBsmtSF     1.524255
GarageArea      0.179981
GarageCars     -0.342549
GrLivArea       1.366560
OverallQual     0.216944
dtype: float64

In [29]:
for i in imp_con_columns:
    W=[]
    for j in X[i]:
        if j!=0:
            W.append(np.log(j))
        else:
            W.append(0)
    X[i] = W

In [30]:
X[imp_con_columns].skew()

YearRemodAdd   -0.511009
YearBuilt      -0.641144
TotRmsAbvGrd   -0.190746
FullBath       -0.084139
1stFlrSF        0.079157
TotalBsmtSF    -5.152705
GarageArea     -3.480590
GarageCars     -0.355098
GrLivArea      -0.006995
OverallQual    -0.929781
dtype: float64

In [31]:
catUP=[]
conUP=[]
for i in X[imp_cols].columns:
    if X[i].dtypes=='object':
        catUP.append(i)
    else:
        conUP.append(i) 
Xupdated=X[imp_cols]
Xupdated.shape

(1460, 50)

# Preprocessing

In [32]:
X1=pd.get_dummies(Xupdated[catUP])
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
X2=pd.DataFrame(ss.fit_transform(Xupdated[conUP]),columns=conUP)
Xnew=X2.join(X1)
Xnew.shape

(1460, 269)

# Spliting data into training and test set

In [33]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=31)

# OLS Model

In [34]:
from statsmodels.api import OLS,add_constant
x_const=add_constant(xtrain)
ol=OLS(ytrain,x_const)
model1=ol.fit()
model1.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.914
Model:,OLS,Adj. R-squared:,0.894
Method:,Least Squares,F-statistic:,45.95
Date:,"Tue, 17 May 2022",Prob (F-statistic):,0.0
Time:,14:34:33,Log-Likelihood:,-13370.0
No. Observations:,1168,AIC:,27180.0
Df Residuals:,947,BIC:,28300.0
Df Model:,220,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.31e+04,3079.021,7.503,0.000,1.71e+04,2.91e+04
YearRemodAdd,4186.1703,1317.298,3.178,0.002,1601.010,6771.330
YearBuilt,3790.6837,2576.960,1.471,0.142,-1266.529,8847.896
TotRmsAbvGrd,1263.5404,1729.259,0.731,0.465,-2130.083,4657.163
FullBath,1752.9853,1381.557,1.269,0.205,-958.282,4464.253
1stFlrSF,2891.5076,3491.616,0.828,0.408,-3960.692,9743.707
TotalBsmtSF,1.612e+04,5590.631,2.883,0.004,5145.598,2.71e+04
GarageArea,1.11e+04,7135.317,1.556,0.120,-2903.832,2.51e+04
GarageCars,2725.9178,1936.327,1.408,0.160,-1074.070,6525.905

0,1,2,3
Omnibus:,374.475,Durbin-Watson:,2.061
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5928.616
Skew:,1.041,Prob(JB):,0.0
Kurtosis:,13.839,Cond. No.,1.02e+16


### Deleted some columns on the basis of Adj R Squared Value ( Backward Elimination)

In [35]:
deleted_columns=[]

In [164]:
w=pd.DataFrame(model1.pvalues,columns=['Pvalue'])
w=w.drop('const')
maxPV=w.Pvalue.max()
col_to_drop=w[w.Pvalue==maxPV].index[0]
Xnew=Xnew.drop(labels=(col_to_drop),axis=1)
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=31)
from statsmodels.api import OLS,add_constant
xconst=add_constant(xtrain)
ol=OLS(ytrain,xconst)
model1=ol.fit()
print(col_to_drop)
deleted_columns.append(col_to_drop)
model1.summary()


LandContour_Bnk


0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.913
Model:,OLS,Adj. R-squared:,0.902
Method:,Least Squares,F-statistic:,76.76
Date:,"Tue, 17 May 2022",Prob (F-statistic):,0.0
Time:,14:39:28,Log-Likelihood:,-13376.0
No. Observations:,1168,AIC:,27040.0
Df Residuals:,1026,BIC:,27760.0
Df Model:,141,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.819e+04,8519.540,3.309,0.001,1.15e+04,4.49e+04
YearRemodAdd,4373.5431,1134.738,3.854,0.000,2146.870,6600.216
YearBuilt,3379.6221,2006.805,1.684,0.092,-558.289,7317.534
FullBath,1808.0239,1260.384,1.435,0.152,-665.201,4281.249
1stFlrSF,2578.1289,2727.370,0.945,0.345,-2773.731,7929.989
TotalBsmtSF,1.574e+04,4860.238,3.238,0.001,6198.859,2.53e+04
GarageArea,1e+04,6509.660,1.536,0.125,-2771.838,2.28e+04
GarageCars,3078.2172,1775.604,1.734,0.083,-406.012,6562.447
GrLivArea,2.612e+04,2912.727,8.967,0.000,2.04e+04,3.18e+04

0,1,2,3
Omnibus:,370.494,Durbin-Watson:,2.052
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5750.204
Skew:,1.031,Prob(JB):,0.0
Kurtosis:,13.673,Cond. No.,1.03e+16


In [165]:
#xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.2,random_state=31)

In [166]:
#from statsmodels.api import OLS,add_constant
#x_const=add_constant(xtrain)
#ol=OLS(ytrain,x_const)
#model1=ol.fit()
#model1.summary()
Xnew.shape

(1460, 154)

# Linear Regression Model

In [167]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lrmodel=lr.fit(xtrain,ytrain)
tr_pred=lrmodel.predict(xtrain)
ts_pred=lrmodel.predict(xtest)


## Evoluation of model

In [168]:
from sklearn.metrics import mean_absolute_error
tr_err=mean_absolute_error(ytrain,tr_pred)
ts_err=mean_absolute_error(ytest,ts_pred)
tr_err  #training error

15528.894691780823

In [169]:
ts_err  #testing error

21782.969178082192

# LASSO

In [170]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

In [171]:
Q=[]
x=134
for i in range(1,200,2):
    x=x+1
    x=round(x,4)
    Q.append(x)

### Finding optimal value for alpha with the help of GridSearchCV

In [172]:
#grid_search
tuning_grid={"alpha":Q}
ls=Lasso()
cv=GridSearchCV(ls,tuning_grid,scoring="neg_mean_absolute_error",cv=4)
cvmodel=cv.fit(xtrain,ytrain)
cvmodel.best_params_

{'alpha': 140}

## Lasso Penalty (Regularization)

In [173]:
ls=Lasso(alpha=140)
model=ls.fit(xtrain,ytrain)
pred_tr=model.predict(xtrain)
pred_ts=model.predict(xtest)

In [174]:
from sklearn.metrics import mean_absolute_error
tr_err1=mean_absolute_error(ytrain,pred_tr)
ts_err1=mean_absolute_error(ytest,pred_ts)
tr_err1

17334.204408376012

In [175]:
ts_err1   # got almost optimal value

20217.508081934186