In [4]:
import os
import sys
from pathlib import Path

import pandas as pd
import numpy as np


sys.path.append( (Path(os.getcwd()).parent).as_posix() )
np.set_printoptions(linewidth=120)

In [21]:
DATA_ROOT = Path("../data")
TARGET_DATA = "Melbourne_housing_FULL.csv"

data = pd.read_csv(DATA_ROOT / "raw/melbourne" / TARGET_DATA)
data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


## Data cleaning

In [22]:
data.dropna(subset=['Price', 'Longtitude', 'Lattitude'], axis=0, inplace=True)

# Features that do not affect the price
data.drop(['Method', 'Date', 'SellerG', 'Postcode'], axis=1, inplace=True)

# Features that had 2/3 of missing data. Data filling methods at this point are useless
data.drop(['YearBuilt', 'BuildingArea'], axis=1, inplace=True)

data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,2.5,2.0,1.0,1.0,202.0,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,2.5,3.0,2.0,1.0,94.0,Yarra City Council,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,Yarra City Council,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20988,Yarraville,78 Bayview Rd,3,h,1101000.0,6.3,3.0,1.0,,288.0,Maribyrnong City Council,-37.81095,144.88516,Western Metropolitan,6543.0
20989,Yarraville,13 Burns St,4,h,1480000.0,6.3,4.0,1.0,3.0,593.0,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
20990,Yarraville,29A Murray St,2,h,888000.0,6.3,2.0,2.0,1.0,98.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
20991,Yarraville,147A Severn St,2,t,705000.0,6.3,2.0,1.0,2.0,220.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0


In [23]:
geo_features = ['Longtitude', 'Lattitude', 'CouncilArea', 'Address', 'Suburb']
position_data = data[geo_features]
data.drop(geo_features, axis=1, inplace=True)

In [24]:
cat_feat = [f for f in data.columns if data.dtypes[f] == 'object']
num_feat = [f for f in data.columns if f not in cat_feat]

for f in cat_feat:
    data[f] = data[f].fillna(method='ffill')
    
for f in num_feat:
    data[f] = data[f].fillna(data[f].median())

  data[f] = data[f].fillna(method='ffill')


In [25]:
data.isna().sum(axis=0)

Rooms            0
Type             0
Price            0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
Regionname       0
Propertycount    0
dtype: int64

## Data preprocessing

- standardization (for numerical features)
- OHE (for categorical)
- separate spatial data from the rest

In [26]:
from scipy import stats

data = data[(np.abs(stats.zscore(data[num_feat])) <= 3).all(axis=1)]

target = data["Price"].to_numpy().reshape(-1, 1)
data.drop(columns=["Price"], inplace=True)
num_feat.remove("Price")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=["Price"], inplace=True)


In [27]:
print(geo_features)
position_data = position_data.iloc[data.index].reset_index(drop=True)
data.reset_index(drop=True, inplace=True)

['Longtitude', 'Lattitude', 'CouncilArea', 'Address', 'Suburb']


In [28]:
cat_data = data[cat_feat]
num_data = data[num_feat]

In [29]:
from sklearn.preprocessing import StandardScaler, Normalizer

std_scaler = StandardScaler()
norm = Normalizer()

num_data = std_scaler.fit_transform(num_data)
num_data.shape

(19267, 7)

One-hot encoding is applied to categorical features "Regionname" and "Type"

In [30]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
encoded_cat_data = ohe.fit_transform(data[cat_feat])
encoded_cat_data = pd.DataFrame(encoded_cat_data.todense())

data = np.hstack((num_data, encoded_cat_data))

In [31]:
position_data = position_data[['Longtitude', 'Lattitude']].to_numpy()

In [32]:
print("Num ", num_data.shape)
print("Cat ", encoded_cat_data.shape)
print("Data ", data.shape)
print("Pos ", position_data.shape)
print("Target ", target.shape)

Num  (19267, 7)
Cat  (19267, 11)
Data  (19267, 18)
Pos  (19267, 2)
Target  (19267, 1)


In [36]:
with open(DATA_ROOT / "processed/melbourne" / TARGET_DATA.replace(".csv", ".npz"), "xb") as f:
    np.savez(
        f,
        data=data,
        target=target,
        spatial=position_data,
    )   