# L1 & L2 Regularization Exercise using Melbourne Housing Dataset

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [67]:
# Suppress Warnings for clean notebook
import warnings
warnings.filterwarnings('ignore')

In [68]:
# Load the full Melbourne housing CSV (path is relative to the working directory)
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="Melbourne_housing_FULL.csv")
data.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [69]:
# Number of distinct values per column, to spot the high-cardinality columns.
data.nunique(axis=0)

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [70]:
# Keep only the columns that are useful for the price model ('Price' is the target).
cols_to_use = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount',
    'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price',
]
# Take an explicit copy: later cells assign into `dataset`, and assigning into a
# selection of `data` triggers pandas' SettingWithCopyWarning.
dataset = data[cols_to_use].copy()

In [71]:
# Preview the reduced frame (first five rows).
dataset.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [72]:
# (rows, columns) of the reduced frame, before any rows are dropped.
dataset.shape

(34857, 15)

Checking for missing values

In [73]:
# Count the missing values in each column of the reduced frame.
dataset.isna().sum(axis=0)

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [74]:
# Disabled experiment: label-encode the sale date.
# NOTE: 'Date' is not listed in cols_to_use, so `dataset` has no 'Date' column;
# add it there before re-enabling the lines below.
#from sklearn.preprocessing import LabelEncoder

# Fit and transform the dates to numerical labels
#dataset['Date'] = LabelEncoder().fit_transform(dataset['Date'])

In [75]:
# Some missing values can be treated as zero (an extra "NA / feature absent" class):
#   - 0 for Propertycount or Bedroom2 marks the value as unknown
#   - 0 for Car means the house has no car parking
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
fill_values = dict.fromkeys(cols_to_fill_zero, 0)

# The other continuous features are imputed with their column mean: quick and good
# enough here, since the focus is on reducing overfitting with Lasso and Ridge.
# NOTE(review): the means are computed on the full data before the train/test
# split (mild leakage), and Distance=0 reads as a real distance - confirm both
# are acceptable for this exercise.
fill_values['Landsize'] = dataset['Landsize'].mean()
fill_values['BuildingArea'] = dataset['BuildingArea'].mean()

# A single fillna call that returns a new frame, instead of assigning into column
# selections (which triggers SettingWithCopyWarning when `dataset` is a slice).
dataset = dataset.fillna(fill_values)

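Drop the rows that still contain missing values. `Price` is our target variable, so we do not impute it; the few rows with a missing `Regionname` or `CouncilArea` are dropped as well.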

In [76]:
# Drop every row that still has a missing value (Price is deliberately not imputed).
# Rebinding instead of inplace=True keeps this safe when `dataset` is a selection
# of another frame and avoids mutating an object that earlier cells displayed.
dataset = dataset.dropna()

In [77]:
# Sanity check: `dataset` is still a DataFrame after the rows were dropped.
type(dataset)

pandas.core.frame.DataFrame

Encoding categorical features first

In [78]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each one. The column index is displayed to show the resulting width.
dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset.columns

Index(['Rooms', 'Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'Price', 'Suburb_Aberfeldie',
       ...
       'CouncilArea_Moorabool Shire Council',
       'CouncilArea_Moreland City Council',
       'CouncilArea_Nillumbik Shire Council',
       'CouncilArea_Port Phillip City Council',
       'CouncilArea_Stonnington City Council',
       'CouncilArea_Whitehorse City Council',
       'CouncilArea_Whittlesea City Council',
       'CouncilArea_Wyndham City Council', 'CouncilArea_Yarra City Council',
       'CouncilArea_Yarra Ranges Shire Council'],
      dtype='object', length=745)

In [79]:
import seaborn as sns
from ydata_profiling import ProfileReport

In [80]:
# Build a profiling report object for the encoded dataset.
# The report is only constructed here; none of the visible cells renders or exports it.
profile = ProfileReport(
    dataset,
    title="Melbourne Housing Dataset",
    html={'style': {'full_width': True}},
    sort="Ascending",
)

In [81]:
# Split the frame into the feature matrix and the target column.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

In [82]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [83]:
# Baseline: ordinary least squares with no regularization.
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [84]:
# R^2 of the baseline model on the held-out test set.
reg.score(X=X_test, y=y_test)

0.138536831613972

In [85]:
# R^2 of the baseline model on the training set, for comparison with the test score.
reg.score(X=X_train, y=y_train)

0.6827792395792723

The plain linear regression is clearly overfitting: its training R² is about 0.68, but its test R² is only about 0.14, which is very low.

## Using Lasso (L1 Regularization) Regression Model

In [86]:
# Lasso = linear regression with an L1 penalty; alpha sets the penalty strength.
from sklearn.linear_model import Lasso

lasso_reg = Lasso(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
lasso_reg.fit(X=X_train, y=y_train)

In [87]:
# R^2 of the Lasso model on the held-out test set.
lasso_reg.score(X=X_test, y=y_test)

0.6636111369404489

In [88]:
# R^2 of the Lasso model on the training set.
lasso_reg.score(X=X_train, y=y_train)

0.6766985624766824

In [89]:
# Predict on the test set, report the mean squared error, and show the fitted
# coefficients (the L1 penalty sets some of them to exactly zero).
y_pred = lasso_reg.predict(X=X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean squared error: ", mse)
print("Coefficients: ", lasso_reg.coef_)

Mean squared error:  133653603705.64285
Coefficients:  [ 2.70967779e+05  4.63745956e+00 -3.03357705e+04 -8.48946624e+04
  1.23220366e+05  4.17503159e+04  2.56283488e+00  8.65129010e+01
  2.19220733e+05 -1.20065548e+05 -0.00000000e+00  3.24095470e+05
  1.55485391e+05  2.97071682e+05  1.38140476e+05 -9.41251645e+04
 -1.30121499e+05 -0.00000000e+00  1.51646664e+05 -9.60503347e+04
 -1.74328672e+04 -0.00000000e+00  2.29652095e+04 -1.42906742e+05
 -0.00000000e+00 -2.72295256e+05  0.00000000e+00 -3.51581901e+04
  2.31085764e+05  6.08260117e+04  3.20753304e+04 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  1.90154504e+05 -1.90416055e+05
  3.35876186e+04 -8.78863211e+04  5.12664874e+04  2.57628933e+05
  1.86342954e+05 -5.53309467e+03 -0.00000000e+00  6.29880653e+04
  1.70718008e+04  0.00000000e+00  2.46076632e+05 -1.20882624e+05
 -0.00000000e+00  4.88763467e+05 -0.00000000e+00  1.27428350e+04
  0.00000000e+00  1.92776878e+05  3.58671817e+04  6.42019523e+04
 -0.00000000e+00  0.00000000e+00 -1

## Using Ridge (L2 Regularization) Regression 

In [90]:
# Ridge = linear regression with an L2 penalty; alpha sets the penalty strength.
# NOTE(review): scikit-learn documents that `tol` has no effect for the 'svd' and
# 'cholesky' solvers; with the default solver='auto', confirm that max_iter/tol
# actually influence this fit.
from sklearn.linear_model import Ridge

ridge_reg = Ridge(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
ridge_reg.fit(X=X_train, y=y_train)

In [91]:
# R^2 of the Ridge model on the held-out test set.
ridge_reg.score(X=X_test, y=y_test)

0.6670848945194958

In [92]:
# R^2 of the Ridge model on the training set.
ridge_reg.score(X=X_train, y=y_train)

0.6622376739684328

In [93]:
# Predict on the test set, report the mean squared error, and show the fitted
# Ridge coefficients.
y_pred = ridge_reg.predict(X=X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean squared error: ", mse)
print("Coefficients: ", ridge_reg.coef_)

Mean squared error:  132273414674.95776
Coefficients:  [ 2.74565399e+05  1.43900376e+00 -3.08679934e+04 -8.54802356e+04
  1.30784473e+05  3.79897031e+04  3.01027203e+00  3.48784768e+01
  1.28807037e+05 -6.43936116e+04 -2.20191147e+04  1.41225935e+05
  6.01541283e+04  1.36604921e+05  6.82450737e+04 -4.48758586e+04
 -1.18261081e+05 -1.40375061e+04  9.47702189e+04 -5.29144382e+04
 -4.41704919e+04 -4.98005912e+04  2.40852122e+04 -3.83936672e+04
 -1.58353300e+04 -1.68201754e+05  8.08493825e+03 -6.27880702e+04
  1.48801899e+05  3.01077775e+04  2.59806529e+04 -1.38083800e+04
  1.17008911e+04  9.42462899e+02  7.84154905e+04 -6.16676997e+04
  3.86961098e+04 -5.72485218e+04  4.67434700e+04  6.78005371e+04
  7.05287853e+04 -3.93760752e+04 -2.74948996e+04  2.49850015e+04
  2.99162838e+04  0.00000000e+00  1.14984883e+05 -7.42920919e+04
 -1.36014255e+04  3.69878814e+05 -4.72146099e+04 -8.03342124e+03
  2.80914008e+03  5.43349020e+04  5.11640785e+04  4.72233711e+04
  2.35059755e+03  0.00000000e+00 -9