<a href="https://colab.research.google.com/github/soujanya-vattikolla/ML-Basics-Definitions/blob/main/L1andL2Regularization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

L1 and L2 Regularization

Lasso and Ridge Regularization

In [1]:
# import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the Melbourne housing CSV (path is relative to the working directory)
# and preview the first five rows.

house_df = pd.read_csv(filepath_or_buffer="Melbourne_housing.csv")
house_df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
# Count the distinct values in each column of the dataset.

house_df.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [5]:
# Dimensions of the dataset as a (rows, columns) tuple.

house_df.shape

(34857, 21)

There are 34,857 records and 21 columns.

In [6]:
# List every column label in the dataset.
house_df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [8]:
# Columns kept for modelling: 14 predictors plus the target, 'Price'.
# Address, Date, Postcode, YearBuilt, Lattitude and Longtitude are left out.
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG','Distance', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'CouncilArea', 'Regionname', 'Propertycount','Price']

# .copy() makes the selection an independent frame. Without it, the column
# assignments performed in later cells operate on a selection of the original
# frame and trigger pandas' SettingWithCopyWarning, which the blanket
# warnings filter in the imports cell would silently hide.
house_df = house_df[cols_to_use].copy()
house_df.head(2)

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount,Price
0,Abbotsford,2,h,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0,
1,Abbotsford,2,h,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0,1480000.0


In [10]:
# Dimensions of the frame after the column selection.
house_df.shape

(34857, 15)

Now we can see that the number of columns has been reduced from 21 to 15.

In [11]:
# Count the missing (null) entries in each column.

house_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
Price             7610
dtype: int64

We can see that there are many null values present in the dataset.

In [13]:
# Replace missing entries with 0 in the columns listed below, then re-count
# the nulls that remain in every column.

cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']

# A {column: fill_value} mapping restricts the fill to the listed columns.
house_df = house_df.fillna(dict.fromkeys(cols_to_fill_zero, 0))

house_df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        0
Price             7610
dtype: int64

In [14]:
# Impute the gaps in 'Landsize' and 'BuildingArea' with each column's own mean.

landsize_mean = house_df['Landsize'].mean()
building_area_mean = house_df['BuildingArea'].mean()

house_df['Landsize'] = house_df['Landsize'].fillna(landsize_mean)
house_df['BuildingArea'] = house_df['BuildingArea'].fillna(building_area_mean)

In [15]:
# Re-count the missing entries per column after the fills above.
house_df.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
CouncilArea         3
Regionname          3
Propertycount       0
Price            7610
dtype: int64

In [16]:
# Discard every row that still contains a missing value, then confirm that
# no nulls remain.

house_df = house_df.dropna()
house_df.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
Price            0
dtype: int64

We can observe that there are no null values left in the dataset.

In [17]:
# Preview the first rows of the frame after the null handling.
house_df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount,Price
1,Abbotsford,2,h,S,Biggin,2.5,2.0,1.0,1.0,202.0,160.2564,Yarra City Council,Northern Metropolitan,4019.0,1480000.0
2,Abbotsford,2,h,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0,1035000.0
4,Abbotsford,3,h,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0,1465000.0
5,Abbotsford,3,h,PI,Biggin,2.5,3.0,2.0,1.0,94.0,160.2564,Yarra City Council,Northern Metropolitan,4019.0,850000.0
6,Abbotsford,4,h,VB,Nelson,2.5,3.0,1.0,2.0,120.0,142.0,Yarra City Council,Northern Metropolitan,4019.0,1600000.0


We can observe that there are a few columns with string values; we need to convert them to numeric data.

In [18]:
# One-hot encode the string-valued columns. drop_first=True leaves out the
# first category of each encoded column.

house_df = pd.get_dummies(data=house_df, drop_first=True)
house_df.head(n=2)

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Price,Suburb_Aberfeldie,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,1480000.0,0,...,0,1,0,0,1,0,0,0,0,0
2,2,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,1035000.0,0,...,0,1,0,0,1,0,0,0,0,0


In [19]:
# Separate the target ('Price') from the predictor columns.
y = house_df['Price']
X = house_df.drop(columns='Price')

In [20]:
# Show the predictor-matrix shape and the target shape, one per line.
print(X.shape, y.shape, sep='\n')

(27244, 744)
(27244,)


In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [23]:
from sklearn.linear_model import LinearRegression

# Baseline model: plain linear regression, fitted on the training split.
lineareg = LinearRegression()
lineareg.fit(X=X_train, y=y_train)

LinearRegression()

In [24]:
# Score (R^2) of the linear model on the training split.
lineareg.score(X_train,y_train)

0.6827792395792723

In [25]:
# Score (R^2) of the linear model on the held-out test split.
lineareg.score(X_test,y_test)

0.13853683161630603

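We can observe that the test set gives a very low R² score of about 0.14, compared with about 0.68 on the training set. Note that `score` returns R², not accuracy. This large gap between training and test scores indicates that the model is overfitting.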

In [27]:
# Lasso (L1-regularized) regression

from sklearn import linear_model

# Hyperparameters gathered in one place: penalty strength, iteration cap and
# stopping tolerance.
lasso_params = {'alpha': 50, 'max_iter': 100, 'tol': 0.1}
lasso_reg = linear_model.Lasso(**lasso_params)

lasso_reg.fit(X_train, y_train)

Lasso(alpha=50, max_iter=100, tol=0.1)

In [30]:
# Score (R^2) of the Lasso model on the training split.
lasso_reg.score(X_train,y_train)

0.6110572241699475

In [29]:
# Score (R^2) of the Lasso model on the held-out test split.
lasso_reg.score(X_test,y_test)

0.6971988445473387

Now we can observe that the test score has increased, from about 0.14 with plain linear regression to about 0.70 with Lasso.

In [31]:
# Ridge (L2-regularized) regression

# Same hyperparameter values as the Lasso cell.
# NOTE(review): with Ridge's default solver selection, max_iter and tol may
# have no effect on dense input — confirm against the Ridge documentation.
ridge_params = {'alpha': 50, 'max_iter': 100, 'tol': 0.1}
ridge_reg = linear_model.Ridge(**ridge_params)

ridge_reg.fit(X_train, y_train)

Ridge(alpha=50, max_iter=100, tol=0.1)

In [33]:
# Score (R^2) of the Ridge model on the training split.
ridge_reg.score(X_train,y_train)

0.6622376739684328

In [32]:
# Score (R^2) of the Ridge model on the held-out test split.
ridge_reg.score(X_test,y_test)

0.6670848945194958