In [20]:
import os
import sys
from importlib import reload
from pathlib import Path

import numpy as np
import torch
from torch.nn import ReLU, Linear
from torch_geometric.nn import SAGEConv, GCNConv, GATConv
from torch import from_numpy
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# NOTE: Boilerplate setup for Jupyter imports.
# Makes the sibling ``src`` directory importable and force-reloads the project
# modules so that edits made on disk are picked up without a kernel restart.

root = Path.cwd().parent
src_dir = (root / "src").as_posix()
# Guarded so that re-running this cell does not keep appending the same entry.
if src_dir not in sys.path:
    sys.path.append(src_dir)

import configs as config_module
import enhancer as enhancer_module
import strategies as encoders_module
import schema.edges as edges_module
import schema.network as network_module
import schema.data as data_module
import utils.metrics as utils_module

# Reload order is kept exactly as it was.
# NOTE(review): reload() does not refresh names that another module pulled in
# with ``from x import y``; if these modules import from each other, confirm
# that dependencies are reloaded before their dependents.
for _module in (
    edges_module,
    config_module,
    enhancer_module,
    encoders_module,
    network_module,
    data_module,
    utils_module,
):
    reload(_module)

# Re-bind the public names from the freshly reloaded modules.
PathConfig = config_module.PathConfig
TrainConfig = config_module.TrainConfig
Enhancer = enhancer_module.Enhancer
get_default_encoders = encoders_module.get_default_encoders
NetworkConfig = network_module.NetworkConfig
EnhancerData = data_module.EnhancerData
GraphSetup = edges_module.GraphSetup
euclid_metric = utils_module.euclid_dist
haversine_metric = utils_module.haversine_dist

In [22]:
# Location of the preprocessed Melbourne housing arrays, relative to the data root.
MELBOURNE_NPZ = "processed/np/Melbourne_housing_FULL.npz"

path_config = PathConfig(data_root="../data")
path_config.target_data = path_config.data_root / MELBOURNE_NPZ

In [23]:
# Read the three arrays stored in the .npz archive while the file is still open.
with open(path_config.target_data, "rb") as npz_file:
    unpacked = np.load(npz_file)
    data = unpacked["data"]
    target = unpacked["target"]
    spatial = unpacked["spatial"]

In [24]:
# Standardise the spatial columns (fit, then transform the same array).
# NOTE(review): the scaler is fit on every row, before any train/test split,
# so held-out rows contribute to the mean/variance used for scaling.
scaler = StandardScaler().fit(spatial)
spatial_processed = scaler.transform(spatial)

In [25]:
# Seed shared by both splits so the partitions are reproducible across
# kernel restarts (train_test_split shuffles randomly when unseeded).
SPLIT_SEED = 42

# Step 1: 80 / 20 split into (train + validation) and test.
(
    trainval_data, test_data,
    trainval_target, test_target,
    trainval_spatial, test_spatial,
) = train_test_split(
    data, target, spatial,
    test_size=0.2, random_state=SPLIT_SEED,
)

# Step 2: 12.5% of the remaining 80% becomes validation (10% of all rows),
# leaving 70% for training.
(
    train_data, val_data,
    train_target, val_target,
    train_spatial, val_spatial,
) = train_test_split(
    trainval_data, trainval_target, trainval_spatial,
    test_size=0.125, random_state=SPLIT_SEED,
)

# Standardise the spatial columns with statistics from the training rows only,
# so that validation/test rows never influence the scaling (no leakage).
split_scaler = StandardScaler().fit(train_spatial)
train_spatial_pcs = split_scaler.transform(train_spatial)
val_spatial_pcs = split_scaler.transform(val_spatial)
test_spatial_pcs = split_scaler.transform(test_spatial)

In [26]:
# Sanity check: share of the full dataset held by train, test and validation.
n_samples = data.shape[0]
print(*(split.shape[0] / n_samples for split in (train_data, test_data, val_data)))

0.6999532880053978 0.2000311413297348 0.1000155706648674


## Fixed network architecture

In [27]:
# Widths of the hidden layers; read by the baseline Model and by the
# estimator head built for the Enhancer further down.
HIDDEN = [128, 128, 64]
# Activation *class* (not an instance): it is instantiated once per layer.
ACTIVATION_T = ReLU

# Loss used by the baseline training loop and handed to TrainConfig.
LOSS = torch.nn.MSELoss()
# Learning rate for Adam (baseline) and TrainConfig.learn_rate.
LR = 1e-4
# Mini-batch size for the baseline DataLoader and TrainConfig.batch_size.
BATCH_SIZE = 128
# Number of training epochs for the baseline loop and TrainConfig.n_epochs.
EPOCHS = 50

## Classical DL

In [28]:
def _as_float_tensors(features, spatial_pcs, target):
    """Build an ``(X, y)`` pair of float32 tensors.

    ``X`` is ``features`` and ``spatial_pcs`` joined with ``np.hstack``;
    ``y`` is ``target``. Both are cast to float32 before conversion.
    """
    stacked = np.hstack((features, spatial_pcs)).astype(np.float32)
    return torch.from_numpy(stacked), torch.from_numpy(target.astype(np.float32))


X_train, y_train = _as_float_tensors(train_data, train_spatial_pcs, train_target)
X_test, y_test = _as_float_tensors(test_data, test_spatial_pcs, test_target)
X_val, y_val = _as_float_tensors(val_data, val_spatial_pcs, val_target)

In [29]:
class Model(torch.nn.Module):
    """Fully connected regressor: Linear layers with an activation in between.

    The network is ``Linear -> act -> ... -> Linear``; no activation is applied
    after the final (output) layer.

    Args:
        input_dim: Number of input features.
        output_dim: Number of outputs produced by the last Linear layer.
        hidden: Optional iterable of hidden-layer widths. Defaults to the
            notebook-level ``HIDDEN`` constant when omitted.
        activation: Optional zero-argument callable returning an activation
            module (e.g. ``torch.nn.ReLU``). Defaults to the notebook-level
            ``ACTIVATION_T`` when omitted.
    """

    def __init__(self, input_dim: int, output_dim: int, hidden=None, activation=None):
        super().__init__()

        # Fall back to the shared notebook constants only when not overridden,
        # so existing ``Model(input_dim, output_dim)`` calls behave as before.
        hidden_sizes = HIDDEN if hidden is None else list(hidden)
        activation_t = ACTIVATION_T if activation is None else activation

        sizes = [input_dim, *hidden_sizes, output_dim]
        last_idx = len(sizes) - 2  # index of the output Linear layer

        layers = []
        for idx, (layer_in, layer_out) in enumerate(zip(sizes, sizes[1:])):
            layers.append(Linear(layer_in, layer_out))
            # The output layer stays linear: no activation after it.
            if idx < last_idx:
                layers.append(activation_t())

        # Attribute name kept as ``hidden`` so state-dict keys do not change.
        self.hidden = torch.nn.Sequential(*layers)

    def forward(self, x: torch.Tensor):
        """Run ``x`` through the stacked layers and return the raw output."""
        return self.hidden(x)

In [30]:
# Mini-batch loader for the baseline MLP.
# shuffle=True re-draws the sample order every epoch; without it each epoch
# would present exactly the same batches in exactly the same order.
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(X_train, y_train),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Baseline network with a single regression output, optimised with Adam.
model = Model(X_train.shape[1], 1)
optim = torch.optim.Adam(model.parameters(), lr=LR)

In [31]:
# Baseline training: one optimisation pass over the mini-batches per epoch,
# followed by a validation MSE computed with gradients disabled.
for epoch in range(EPOCHS):
    model.train()
    for batch_x, batch_y in train_loader:
        optim.zero_grad()
        # squeeze() on both sides so prediction and target shapes line up.
        loss = LOSS(model(batch_x).squeeze(), batch_y.squeeze())
        loss.backward()
        optim.step()

    model.eval()
    with torch.no_grad():
        output = model(X_val)
    val_loss = mean_squared_error(y_val.squeeze(), output.squeeze())
    print(f"Epoch {epoch}: Validation loss = {val_loss:.3E}")

Epoch 0: Validation loss = 1.356E+12
Epoch 1: Validation loss = 1.356E+12
Epoch 2: Validation loss = 1.356E+12
Epoch 3: Validation loss = 1.355E+12
Epoch 4: Validation loss = 1.355E+12
Epoch 5: Validation loss = 1.354E+12
Epoch 6: Validation loss = 1.353E+12
Epoch 7: Validation loss = 1.351E+12
Epoch 8: Validation loss = 1.348E+12
Epoch 9: Validation loss = 1.344E+12
Epoch 10: Validation loss = 1.338E+12
Epoch 11: Validation loss = 1.332E+12
Epoch 12: Validation loss = 1.324E+12
Epoch 13: Validation loss = 1.314E+12
Epoch 14: Validation loss = 1.302E+12
Epoch 15: Validation loss = 1.288E+12
Epoch 16: Validation loss = 1.273E+12
Epoch 17: Validation loss = 1.255E+12
Epoch 18: Validation loss = 1.234E+12
Epoch 19: Validation loss = 1.212E+12
Epoch 20: Validation loss = 1.187E+12
Epoch 21: Validation loss = 1.160E+12
Epoch 22: Validation loss = 1.130E+12
Epoch 23: Validation loss = 1.099E+12
Epoch 24: Validation loss = 1.065E+12
Epoch 25: Validation loss = 1.029E+12
Epoch 26: Validation l

In [32]:
# Final evaluation of the baseline on the held-out test split.
with torch.no_grad():
    test_output = model(X_test)

mse = mean_squared_error(y_test.squeeze(), test_output.squeeze())
print(f"FINAL: MSE = {mse:.3E}")

FINAL: MSE = 2.043E+11


## Enhancer

In [33]:
# Wrap the full (unsplit) arrays as float32 tensors, passed positionally in
# the order: data, target, spatial.
enhancer_data = EnhancerData(
    *(from_numpy(array.astype(np.float32)) for array in (data, target, spatial))
)

In [34]:
# Training settings for the Enhancer; epochs, learning rate, batch size and
# loss reuse the constants shared with the baseline model.
train_params = {
    "n_epochs": EPOCHS,
    "learn_rate": LR,
    "batch_size": BATCH_SIZE,
    "loss_criteria": LOSS,
    "val_ratio": 0.1,
    "test_ratio": 0.2,
}
train_config = TrainConfig(**train_params)

In [35]:
# Width of the node embedding produced by the graph encoder.
encoder_dim = 64

# Graph encoder: two GraphSAGE convolutions, input width taken from the
# feature tensor.
encoder = [
    SAGEConv(enhancer_data.features.shape[1], encoder_dim),
    SAGEConv(encoder_dim, encoder_dim),
]

# Estimator head: Linear + activation pairs over the shared HIDDEN widths,
# ending in a single regression output.
estimator = []
layer_sizes = [encoder_dim, *HIDDEN, 1]
for layer_in, layer_out in zip(layer_sizes, layer_sizes[1:]):
    estimator.append(Linear(layer_in, layer_out))
    estimator.append(ACTIVATION_T())
# NOTE(review): unlike the baseline Model, this list keeps an activation after
# the final Linear(..., 1) - confirm that NetworkConfig expects this.

gnn_setup = NetworkConfig(encoder, estimator)

In [36]:
# Edge-construction strategies to compare; every strategy caches into the
# same directory under its own cache id.
edge_cache_dir = path_config.edge_cache

knn_strategy = encoders_module.KNNStrategy(
    K=5,
    dist_metric=euclid_metric,
    cache_dir=edge_cache_dir,
    cache_id="melbourne_knn_5",
)
threshold_strategy = encoders_module.ThresholdStrategy(
    dist_metric=haversine_metric,
    max_dist=3,
    cache_dir=edge_cache_dir,
    cache_id="melbourne_threshold_3",
)
anchor_strategy = encoders_module.AnchorStrategy(
    n_repr=100,
    cluster_sample_rate=0.03,
    cache_dir=edge_cache_dir,
    cache_id="melbourne_anchors_100",
)
grid_strategy = encoders_module.GridStrategy(
    intra_edge_ratio=0.01,
    source_inter_ratio=0.01,
    k_connectivity=2,
    bins=4,
    cache_dir=edge_cache_dir,
    cache_id="melbourne_grid",
)

# One GraphSetup per strategy, all over the same EnhancerData, in the order
# KNN, threshold, anchors, grid.
input_strategies = [
    GraphSetup(strategy, enhancer_data)
    for strategy in (knn_strategy, threshold_strategy, anchor_strategy, grid_strategy)
]

In [37]:
# Run the comparison over every GraphSetup in `input_strategies` with the
# shared network and training configuration.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# `result` shown by the next cell must come from an earlier, complete run -
# re-run this cell to completion before trusting the table.
result = Enhancer.compare_strategies(gnn_setup, train_config, input_strategies)

GNN training:   6%|▌         | 3/50 [00:01<00:28,  1.63epoch/s, val_loss=1.3e+12] 


KeyboardInterrupt: 

In [None]:
# Print the per-strategy comparison, scored with sklearn's mean_squared_error.
# NOTE(review): presumably get_comparison returns a preformatted text table
# (the saved output is plain text) - confirm against the Enhancer module.
print(result.get_comparison([mean_squared_error]))

Option                   mean_squared_error      density    average degree    n connected components    largest component
---------------------  --------------------  -----------  ----------------  ------------------------  -------------------
melbourne_knn_5                 1.9681e+11   0.000322345           6.21031                        25                17602
melbourne_threshold_3           9.40949e+10  0.0317267           611.056                           5                19020
melbourne_anchors_100           1.17623e+11  0.000782952          15.0616                        102                  451
melbourne_grid                  1.08393e+11  0.00414286           79.7003                          6                19224
