In [64]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [65]:
data = pd.read_csv('./data/Melbourne/Melbourne_housing_FULL.csv')
data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


## Data cleaning

In [66]:
data.isna().sum(axis=0)

Suburb               0
Address              0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
YearBuilt        19306
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
dtype: int64

We see some missing data in the target feature. Filling with mean/mode can affect model performance on new real data.
Instead, just drop data-entries with unkown target values.

In [67]:
data.dropna(subset=['Price'], axis=0, inplace=True)
target = data['Price']
data.drop(['Price', 'Method', 'Date', 'SellerG', 'Postcode'], axis=1, inplace=True)

data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,Suburb,Address,Rooms,Type,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,2.5,3.0,2.0,1.0,94.0,,,Yarra City Council,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27242,Yarraville,13 Burns St,4,h,6.3,4.0,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
27243,Yarraville,29A Murray St,2,h,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
27244,Yarraville,147A Severn St,2,t,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
27245,Yarraville,12/37 Stephen St,3,h,6.3,,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


In [68]:
geo_features = ['Lattitude', 'Longtitude', 'CouncilArea', 'Address', 'Suburb']
position_data = data[geo_features]
data.drop(geo_features, axis=1, inplace=True)

In [69]:
cat_feat = [f for f in data.columns if data.dtypes[f] == 'object']
num_feat = [f for f in data.columns if f not in cat_feat]

for f in cat_feat:
    data[f] = data[f].fillna(method='ffill')
    
for f in num_feat:
    data[f] = data[f].fillna(0)

In [70]:
print(geo_features)
data

['Lattitude', 'Longtitude', 'CouncilArea', 'Address', 'Suburb']


Unnamed: 0,Rooms,Type,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Regionname,Propertycount
0,2,h,2.5,2.0,1.0,1.0,202.0,0.0,0.0,Northern Metropolitan,4019.0
1,2,h,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Northern Metropolitan,4019.0
2,3,h,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Northern Metropolitan,4019.0
3,3,h,2.5,3.0,2.0,1.0,94.0,0.0,0.0,Northern Metropolitan,4019.0
4,4,h,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...
27242,4,h,6.3,4.0,1.0,3.0,593.0,0.0,0.0,Western Metropolitan,6543.0
27243,2,h,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Western Metropolitan,6543.0
27244,2,t,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Western Metropolitan,6543.0
27245,3,h,6.3,0.0,0.0,0.0,0.0,0.0,0.0,Western Metropolitan,6543.0


## Data preprocessing

One-hot encoding is applied to categorical features "Regionname" and "Type"

In [71]:
ohe = OneHotEncoder()

encoded_cat = ohe.fit_transform(data[cat_feat])
encoded_cat = pd.DataFrame(encoded_cat.todense(), columns=[str(i) for i in range(11)])
data.drop(cat_feat, axis=1, inplace=True)

data = pd.concat([data, encoded_cat], axis=1)

For feature "YearBuilt" - apply segmentation into bins

In [72]:
data['YearBuilt'] = pd.cut(data['YearBuilt'], bins=20, labels=range(20))

In [73]:
data

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,0,1,2,3,4,5,6,7,8,9,10
0,2,2.5,2.0,1.0,1.0,202.0,0.0,0,4019.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2,2.5,2.0,1.0,0.0,156.0,79.0,18,4019.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3,2.5,3.0,2.0,0.0,134.0,150.0,18,4019.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,3,2.5,3.0,2.0,1.0,94.0,0.0,0,4019.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4,2.5,3.0,1.0,2.0,120.0,142.0,19,4019.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27242,4,6.3,4.0,1.0,3.0,593.0,0.0,0,6543.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
27243,2,6.3,2.0,2.0,1.0,98.0,104.0,19,6543.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
27244,2,6.3,2.0,1.0,2.0,220.0,120.0,19,6543.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
27245,3,6.3,0.0,0.0,0.0,0.0,0.0,0,6543.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Generate polynomial features for continuous data

In [74]:
from sklearn.preprocessing import PolynomialFeatures

pol_features = ['Rooms','Distance','Bedroom2','Bathroom','Car','Landsize','BuildingArea','YearBuilt','Propertycount']

pol = PolynomialFeatures(degree=2)
pol_extended = pol.fit_transform(data[pol_features])

data.drop(pol_features, axis=1, inplace=True)
data = pd.concat([data, pd.DataFrame(pol_extended, columns=[str(i) for i in range(pol_extended.shape[1])])], axis=1)

## Modelling

In [75]:
from sklearn.linear_model import RidgeCV, LinearRegression, Lasso, ElasticNetCV

X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.8, random_state=42)

In [76]:
import warnings
warnings.filterwarnings(action='ignore')

model_1 = LinearRegression().fit(X_train, y_train)
model_2 = RidgeCV(alphas=np.linspace(0.05, 0.5, 10), cv=5).fit(X_train, y_train)
model_3 = Lasso(alpha=0.1).fit(X_train, y_train)
model_4 = ElasticNetCV(alphas=np.linspace(0.05, 1, 10), l1_ratio=0.5).fit(X_train, y_train)

In [77]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print(f'MSE Regression: \tR^2 score = {model_1.score(X_test, y_test):.4f} | MSE = {mean_squared_error(y_test, model_1.predict(X_test)):.4e} | MAE = {mean_absolute_error(y_test, model_1.predict(X_test)):.4e}')
print(f'Ridge Regression: \tR^2 score = {model_2.score(X_test, y_test):.4f} | MSE = {mean_squared_error(y_test, model_2.predict(X_test)):.4e} | MAE = {mean_absolute_error(y_test, model_2.predict(X_test)):.4e}')      
print(f'Lasso Regression: \tR^2 score = {model_3.score(X_test, y_test):.4f} | MSE = {mean_squared_error(y_test, model_3.predict(X_test)):.4e} | MAE = {mean_absolute_error(y_test, model_3.predict(X_test)):.4e}')      
print(f'ElasticNet: \t\tR^2 score = {model_4.score(X_test, y_test):.4f} | MSE = {mean_squared_error(y_test, model_4.predict(X_test)):.4e} | MAE = {mean_absolute_error(y_test, model_4.predict(X_test)):.4e}')      

MSE Regression: 	R^2 score = 0.5707 | MSE = 1.8345e+11 | MAE = 2.5946e+05
Ridge Regression: 	R^2 score = 0.5702 | MSE = 1.8367e+11 | MAE = 2.5947e+05
Lasso Regression: 	R^2 score = 0.4045 | MSE = 2.5446e+11 | MAE = 2.6317e+05
ElasticNet: 		R^2 score = -0.3349 | MSE = 5.7042e+11 | MAE = 2.9459e+05
