## Ridge Regularisation Model

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Read the dataset

In [2]:
import pandas as pd
df = pd.read_csv('training_set.csv',na_values=[''],keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Perform basic data quality checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1460 non-null   int64 
 1   MSSubClass     1460 non-null   int64 
 2   MSZoning       1460 non-null   object
 3   LotFrontage    1460 non-null   object
 4   LotArea        1460 non-null   int64 
 5   Street         1460 non-null   object
 6   Alley          1460 non-null   object
 7   LotShape       1460 non-null   object
 8   LandContour    1460 non-null   object
 9   Utilities      1460 non-null   object
 10  LotConfig      1460 non-null   object
 11  LandSlope      1460 non-null   object
 12  Neighborhood   1460 non-null   object
 13  Condition1     1460 non-null   object
 14  Condition2     1460 non-null   object
 15  BldgType       1460 non-null   object
 16  HouseStyle     1460 non-null   object
 17  OverallQual    1460 non-null   int64 
 18  OverallCond    1460 non-null

In [4]:
s=df.isna().sum()
s[s>0]

Series([], dtype: int64)

In [5]:
df.duplicated().sum()

0

#### there are no missing values and duplicated values in this dataset

## Separate X and Y features.
Remove statistically insignificant columns in X

Y will be the target feature i.e Sale price

In [6]:
X = df.drop(columns=['Id','SalePrice'])
Y = df[['SalePrice']]

In [7]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [8]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


## Categorical and Continuous features

In [9]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [10]:
cat[:5]

['MSZoning', 'LotFrontage', 'Street', 'Alley', 'LotShape']

In [11]:
con[:5]

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']

## Preprocess X using pipeline

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:
num_pipe = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean')),
                           ('scaler',StandardScaler())])

In [14]:
cat_pipe = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='NotAvial')),
                           ('ohe',OneHotEncoder(handle_unknown='ignore',sparse_output=False))])

In [15]:
pre = ColumnTransformer([('num',num_pipe,con),
                         ('cat',cat_pipe,cat)]).set_output(transform='pandas')

In [16]:
pre

In [17]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__SaleType_ConLw,cat__SaleType_New,cat__SaleType_Oth,cat__SaleType_WD,cat__SaleCondition_Abnorml,cat__SaleCondition_AdjLand,cat__SaleCondition_Alloca,cat__SaleCondition_Family,cat__SaleCondition_Normal,cat__SaleCondition_Partial
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.575425,-0.288653,-0.944591,-0.459303,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.429577,1.171992,-0.288653,-0.641228,0.466465,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.830215,0.092907,-0.288653,-0.301643,-0.313369,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.499274,-0.288653,-0.06167,-0.687324,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.733308,0.463568,-0.288653,-0.174865,0.19968,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## Train test split
training 80%

testing 20%

In [18]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain , ytest = train_test_split(X_pre,Y,train_size=0.8,test_size=0.2,random_state=21)

In [19]:
xtrain.shape

(1168, 838)

In [20]:
xtest.shape

(292, 838)

## Build Linear Regression model and Rdge model

In [21]:
from sklearn.linear_model import LinearRegression,Ridge

In [22]:
model = LinearRegression()
model.fit(xtrain,ytrain)

In [23]:
model.score(xtrain,ytrain)

0.9856742115813273

In [24]:
model.score(xtest,ytest)

-4.491377558475041e+19

In [25]:
model2 = Ridge(alpha=2)
model2.fit(xtrain,ytrain)

In [26]:
model2.score(xtrain,ytrain)

0.9577711858998837

In [27]:
model2.score(xtest,ytest)

0.8321323782418545

## Hyperparameter tuning( alpha tuning)

In [28]:
import numpy as np
params = {'alpha': np.arange(start=0.1,stop=100,step=0.1)}

In [29]:
rr = Ridge()

In [30]:
from sklearn.model_selection import GridSearchCV
gscv = GridSearchCV(estimator=rr,param_grid=params,cv=5,scoring='neg_mean_squared_error')
gscv.fit(xtrain,ytrain)

In [31]:
gscv.best_params_

{'alpha': 47.900000000000006}

In [32]:
gscv.best_score_

-1088296141.5900366

In [33]:
gscv.best_estimator_

In [34]:
best_ridge=gscv.best_estimator_
best_ridge

## Evaluate tuned model

In [35]:
best_ridge.score(xtrain,ytrain)

0.8974584211274285

In [36]:
best_ridge.score(xtest,ytest)

0.8285366409675925

## cross validate r2 for above model

In [37]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(best_ridge,xtrain,ytrain,cv=5,scoring='r2')
scores

array([0.59133855, 0.84042547, 0.90452153, 0.87200777, 0.92807259])

In [38]:
scores.mean()

0.8272731811582531

## Ridge model is performing well in training and testing data. Lets use this to predict SalePrice out of Sample data

In [39]:
xnew = pd.read_csv('sample_set.csv',na_values=[''],keep_default_na=False)
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [40]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'NA'