In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# L1 and L2 regularization are used to prevent the model from overfitting

![image](https://analystprep.com/study-notes/wp-content/uploads/2021/03/Img_13.jpg)

In [16]:
# Raw GitHub location of the Melbourne housing CSV used throughout this notebook.
housing_url = (
    "https://raw.githubusercontent.com/codebasics/py/refs/heads/master/"
    "ML/16_regularization/Melbourne_housing_FULL.csv"
)

In [17]:
import pandas as pd
import numpy as np

# Fetch the CSV at `housing_url` into a DataFrame, then show its first
# five rows as the cell output.
housing_df = pd.read_csv(filepath_or_buffer=housing_url)
housing_df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [18]:
# Columns kept for modelling: the predictors plus the 'Price' target.
cols_to_use = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount',
    'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
    'Price',
]
# Take an explicit copy so that the later column assignments and row drops
# operate on an independent frame rather than on a selection of the loaded
# data (avoids pandas' view-vs-copy ambiguity / SettingWithCopyWarning).
housing_df = housing_df[cols_to_use].copy()

In [19]:
# Count the missing values in each column.
housing_df.isna().sum(axis=0)

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,3
Propertycount,3
Distance,1
CouncilArea,3
Bedroom2,8217


In [20]:
# Columns whose missing entries are replaced with 0.
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
zero_filled = housing_df[cols_to_fill_zero].fillna(value=0)
housing_df[cols_to_fill_zero] = zero_filled

# Re-check the per-column NaN counts after the fill.
housing_df.isna().sum(axis=0)

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,3
Propertycount,0
Distance,0
CouncilArea,3
Bedroom2,0


In [21]:
# Replace missing area values with the mean of the respective column.
# NOTE(review): the means are computed over all rows, before the train/test
# split performed later in this notebook, so test rows influence the imputed
# values -- consider imputing after the split.
for col in ('Landsize', 'BuildingArea'):
    housing_df[col] = housing_df[col].fillna(housing_df[col].mean())

In [22]:
# Drop every row that still contains a missing value. The result is rebound
# to the name instead of mutating the frame with inplace=True, which keeps
# the step chainable and avoids modifying a frame in place.
housing_df = housing_df.dropna()
# Confirm that no NaNs remain in any column.
housing_df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Method,0
SellerG,0
Regionname,0
Propertycount,0
Distance,0
CouncilArea,0
Bedroom2,0


In [23]:
# Preview the cleaned frame (first five rows).
housing_df.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,160.2564,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,94.0,160.2564,850000.0
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,1.0,2.0,120.0,142.0,1600000.0


In [24]:
# One-hot encode the categorical columns as integer 0/1 indicators;
# drop_first=True omits the first level of each encoded column.
housing_df = pd.get_dummies(data=housing_df, dtype=int, drop_first=True)
housing_df.head(n=5)

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,160.2564,850000.0,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [25]:
# Separate the target column (op) from the remaining feature columns (ip).
op = housing_df['Price']
ip = housing_df.drop(columns='Price')

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
ip_train, ip_test, op_train, op_test = train_test_split(
    ip,
    op,
    test_size=0.3,
    random_state=2,
)

In [27]:
# Fit a plain (unregularized) linear regression on the training split.
reg = LinearRegression()
reg = reg.fit(X=ip_train, y=op_train)

In [28]:
# Score of the linear model on the held-out test split.
reg.score(X=ip_test, y=op_test)

0.1385368316165222

In [29]:
# Score of the linear model on the data it was trained on.
reg.score(X=ip_train, y=op_train)

0.6827792395792723

In [30]:
# The score is very low on the test set but much higher on the training set, which is a clear sign of overfitting.

In [32]:
from sklearn.linear_model import Lasso # Lasso is l1 regularization

# Fit an L1-regularized (Lasso) regression on the training split.
lasso_params = {"alpha": 50, "max_iter": 10000, "tol": 0.1}
lasso_reg = Lasso(**lasso_params)
lasso_reg.fit(X=ip_train, y=op_train)

In [33]:
# Score of the Lasso model on the held-out test split.
lasso_reg.score(X=ip_test, y=op_test)

0.6636280170612745

In [34]:
# Score of the Lasso model on the data it was trained on.
lasso_reg.score(X=ip_train, y=op_train)

0.6767149418617553

In [35]:
# The test score has improved considerably compared to plain linear regression and is now close to the training score.

In [36]:
from sklearn.linear_model import Ridge # Ridge is l2 regularization

# Fit an L2-regularized (Ridge) regression on the training split.
# NOTE(review): with Ridge's default solver on dense input, max_iter and tol
# presumably have no effect -- confirm against the Ridge documentation.
ridge_params = {"alpha": 50, "max_iter": 10000, "tol": 0.1}
ridge_reg = Ridge(**ridge_params)
ridge_reg.fit(X=ip_train, y=op_train)

In [37]:
# Score of the Ridge model on the held-out test split.
ridge_reg.score(X=ip_test, y=op_test)

0.6670848945194958

In [38]:
# Score of the Ridge model on the data it was trained on.
ridge_reg.score(X=ip_train, y=op_train)

0.6622376739684328