In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('./data/Melbourne/Melbourne_housing_FULL.csv')
data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,...,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,...,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,...,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


## Data cleaning

In [3]:
data.isna().sum(axis=0)

Suburb               0
Address              0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
YearBuilt        19306
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
dtype: int64

We see some missing data in the target feature. Filling with mean/mode can affect model performance on new real data.
Instead, just drop data-entries with unkown target values.

In [4]:
data.dropna(subset=['Price'], axis=0, inplace=True)
target = data['Price']
data.drop(['Price', 'Method', 'Date', 'SellerG', 'Postcode'], axis=1, inplace=True)

data.reset_index(inplace=True, drop=True)
data

Unnamed: 0,Suburb,Address,Rooms,Type,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,2.5,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,2.5,3.0,2.0,1.0,94.0,,,Yarra City Council,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27242,Yarraville,13 Burns St,4,h,6.3,4.0,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
27243,Yarraville,29A Murray St,2,h,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
27244,Yarraville,147A Severn St,2,t,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
27245,Yarraville,12/37 Stephen St,3,h,6.3,,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


In [5]:
geo_features = ['Lattitude', 'Longtitude', 'CouncilArea', 'Address', 'Suburb']
position_data = data[geo_features]
data.drop(geo_features, axis=1, inplace=True)

In [6]:
cat_feat = [f for f in data.columns if data.dtypes[f] == 'object']
num_feat = [f for f in data.columns if f not in cat_feat]

for f in cat_feat:
    data[f] = data[f].fillna(method='ffill')
    
for f in num_feat:
    data[f] = data[f].fillna(0)

  data[f] = data[f].fillna(method='ffill')


In [7]:
print(geo_features)
data

['Lattitude', 'Longtitude', 'CouncilArea', 'Address', 'Suburb']


Unnamed: 0,Rooms,Type,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Regionname,Propertycount
0,2,h,2.5,2.0,1.0,1.0,202.0,0.0,0.0,Northern Metropolitan,4019.0
1,2,h,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Northern Metropolitan,4019.0
2,3,h,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Northern Metropolitan,4019.0
3,3,h,2.5,3.0,2.0,1.0,94.0,0.0,0.0,Northern Metropolitan,4019.0
4,4,h,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...
27242,4,h,6.3,4.0,1.0,3.0,593.0,0.0,0.0,Western Metropolitan,6543.0
27243,2,h,6.3,2.0,2.0,1.0,98.0,104.0,2018.0,Western Metropolitan,6543.0
27244,2,t,6.3,2.0,1.0,2.0,220.0,120.0,2000.0,Western Metropolitan,6543.0
27245,3,h,6.3,0.0,0.0,0.0,0.0,0.0,0.0,Western Metropolitan,6543.0


## Data preprocessing

One-hot encoding is applied to categorical features "Regionname" and "Type"

In [8]:
ohe = OneHotEncoder()

encoded_cat = ohe.fit_transform(data[cat_feat])
encoded_cat = pd.DataFrame(encoded_cat.todense(), columns=[str(i) for i in range(11)])
data.drop(cat_feat, axis=1, inplace=True)

data = pd.concat([data, encoded_cat], axis=1)

For feature "YearBuilt" - apply segmentation into bins

In [9]:
data['YearBuilt'] = pd.cut(data['YearBuilt'], bins=20, labels=range(20))

In [10]:
data.shape

(27247, 20)

Generate edge index for our graph of real estate.<br>
Creates edge between two entities, if distance between them is under the predefined threshold

In [11]:
import os
import pickle
from GraphEncoder.encoder import DistanceGraphEncoder
edge_index_datafile = "edge_index.pickle"

if os.path.exists(edge_index_datafile):
    index_file = open(edge_index_datafile, 'rb')
    edge_index = pickle.load(index_file)
else:
    EDGE_CREATING_DISTANCE = 3

    graph_encoder = DistanceGraphEncoder(position_data, EDGE_CREATING_DISTANCE)
    edge_index = graph_encoder.transform()
    
    with open("edge_index.pickle", "wb") as output:
        pickle.dump(edge_index, output)

Create DataLoader which devides nodes into set of batches (for learning).<br>
Also, creates custom data-splitter for dividing data into train/val/test parts.

In [12]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from torch_geometric.transforms import RandomNodeSplit

data = Data(
    torch.tensor(data.to_numpy(), dtype=torch.float32),
    edge_index=torch.tensor(edge_index), 
    y=torch.tensor(target.to_numpy(), dtype=torch.float32)
)
node_splitter = RandomNodeSplit(num_val=0.2, num_test=0.1)
graph = node_splitter(data)

data_loader = NeighborLoader(graph, 
                             batch_size=64, 
                             num_neighbors=[30], 
                             shuffle=True,
                             input_nodes=graph.train_mask)

  edge_index=torch.tensor(edge_index),


## Applying GNN

In [22]:
import torch
import torch.nn as nn
import torch_geometric.nn as nn_geom
from torch_geometric.data import Data

class GCN(nn.Module):
    def __init__(self, input_size: int, hidden_layer: int):
        super().__init__()
        self.conv1 = nn_geom.SAGEConv(input_size, hidden_layer)
        self.conv2 = nn_geom.SAGEConv(hidden_layer, hidden_layer)
        self.dropout = nn.Dropout(p=0.3)
        
        self.linear1 = nn.Linear(hidden_layer, 2*hidden_layer)
        self.linear2 = nn.Linear(2*hidden_layer, 2*hidden_layer)
        self.linear3 = nn.Linear(2*hidden_layer, 1)
    
    def forward(self, data: Data, edge_index: torch.Tensor) -> torch.Tensor:
        x = self.dropout(self.conv1(data, edge_index).relu())
        x = self.conv1(data, edge_index).relu()
        x = self.linear1(x).relu()
        x = self.linear2(x).relu()
        
        return self.linear3(x)

In [23]:
model = GCN(graph.x.shape[1], 128)

loss_criteria = nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), lr=0.01)

In [24]:
model

GCN(
  (conv1): SAGEConv(20, 128, aggr=mean)
  (conv2): SAGEConv(128, 128, aggr=mean)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear1): Linear(in_features=128, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=1, bias=True)
)

In [25]:
import math
from sklearn.metrics import r2_score
import torch_sparse

import warnings
warnings.filterwarnings(action='ignore')

n_epochs = 30
best_mse, best_weights = math.inf, None

for epoch in range(n_epochs):
    for batch in data_loader:
        model.train()
        optim.zero_grad()
        
        predicts = model(batch.x, batch.edge_index)
        loss = loss_criteria(predicts, batch.y)
        loss.backward()
        optim.step()
    
    model.eval()
    predicts = model(graph.x, graph.edge_index)
    mse = loss_criteria(predicts[graph.val_mask], graph.y[graph.val_mask])
    r2 = r2_score(graph.y[graph.val_mask].detach(), predicts[graph.val_mask].detach())
    
    print(f'Epoch #{epoch} | MSE_Loss = {mse:.4e} | R^2 = {r2}')
    
    if mse < best_mse:
        best_mse, best_weights = mse, model.state_dict()

Epoch #0 | MSE_Loss = 7.2065e+11 | R^2 = -0.6113359402339504
Epoch #1 | MSE_Loss = 5.5968e+11 | R^2 = -0.328103389549842
Epoch #2 | MSE_Loss = 5.2675e+11 | R^2 = -0.19577634320575443
Epoch #3 | MSE_Loss = 4.7787e+11 | R^2 = -0.054645429647152355
Epoch #4 | MSE_Loss = 4.6470e+11 | R^2 = -0.03775234153273299
Epoch #5 | MSE_Loss = 5.5192e+11 | R^2 = -0.23928460329947354
Epoch #6 | MSE_Loss = 4.5142e+11 | R^2 = 0.00728422130647377


KeyboardInterrupt: 

In [17]:
model.eval()
predicts = model(graph.x, graph.edge_index)
mse = loss_criteria(predicts[graph.test_mask], graph.y[graph.test_mask])
r2 = r2_score(graph.y[graph.test_mask].detach(), predicts[graph.test_mask].detach())

print(f'GNN : MSE_Loss = {mse:.4e} | R^2 = {r2}')

GNN : MSE_Loss = 5.0320e+11 | R^2 = -0.12797071515259462


In [18]:
predicts[graph.test_mask] - graph.y[graph.test_mask]

tensor([[-384348.6250, -231348.6250, -560848.6250,  ...,   50651.3750,
         -216348.6250, -534348.6250],
        [-417744.0625, -264744.0625, -594244.0625,  ...,   17255.9375,
         -249744.0625, -567744.0625],
        [-248806.2500,  -95806.2500, -425306.2500,  ...,  186193.7500,
          -80806.2500, -398806.2500],
        ...,
        [-113177.6875,   39822.3125, -289677.6875,  ...,  321822.3125,
           54822.3125, -263177.6875],
        [ -86374.0625,   66625.9375, -262874.0625,  ...,  348625.9375,
           81625.9375, -236374.0625],
        [ -96074.1875,   56925.8125, -272574.1875,  ...,  338925.8125,
           71925.8125, -246074.1875]], grad_fn=<SubBackward0>)