In [None]:
import torch
import random
import os
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from model import HeteroSAGEFull
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from urbanity.data_class import UrbanGraph
from utils import to_pyg_graph
from sklearn.metrics import precision_score, recall_score

# Seed each random number generator used by the imported libraries (Python's
# `random`, NumPy, and torch on CPU and on every CUDA device) with one value.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Prefer CUDA when torch reports it as available; otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Placeholder: replace with the project root before running. The working
# directory is switched to it, so the relative `data/...` and `output/...`
# paths used by later cells resolve against this folder.
project_path = 'YOUR_PROJECT_PATH'
os.chdir(project_path)

### **01. Parameter settings**

In [2]:
# --- Input locations (relative to the working directory) ---------------------
city = 'Washington'
label_path = f'data/building_type/{city}.csv'   # CSV with building ids and labels
graph_path = f'data/urban_graph/{city}.zip'     # archive passed to UrbanGraph.load_graph

# --- Output locations; both folders are created up front if missing ----------
output_path = 'output'
cache_path = f'{output_path}/cache'
os.makedirs(output_path, exist_ok=True)
os.makedirs(cache_path, exist_ok=True)

# --- Model / optimisation hyper-parameters -----------------------------------
lr = 0.005          # learning rate handed to the Adam optimizer
hidden_dim = 256    # hidden channel width of the model
batch_size = 64     # NOTE(review): not referenced by the cells visible here
EPOCH = 500         # upper bound on the number of training epochs

# --- Graph construction: forwarded to graph.initialize_edges -----------------
knn = 10
distance = 100      # NOTE(review): unit is defined by UrbanGraph - confirm

# Column of the label CSV that holds the building type to predict.
label_col_name = 'building_type_osm'

### **02. Split into train, validation, and test sets**

In [3]:
# ---- Labels -----------------------------------------------------------------
# One row per building id. Ids are handled as strings everywhere in this cell;
# the label column becomes a pandas categorical whose category order defines
# the integer class ids.
df_labels = pd.read_csv(label_path).drop_duplicates('building_id')
df_labels['building_id'] = df_labels['building_id'].astype(str)
df_labels[label_col_name] = df_labels[label_col_name].astype('category')
# Counted BEFORE the graph filter below, so num_classes covers every category
# in the CSV, including any that end up with no building in the graph.
num_classes = len(df_labels[label_col_name].cat.categories)

# ---- Urban graph ------------------------------------------------------------
graph = UrbanGraph()
graph.load_graph(graph_path)
graph.initialize_edges(building_neighbours='knn', knn=knn, distance=distance)
buildings = graph.building

# Graph-side building ids, cast to str so they are comparable with
# df_labels['building_id']; a building's node index is its position here.
all_bids_graph = list(buildings['bid'].astype(str).values)
bid2node = {bid: idx for idx, bid in enumerate(all_bids_graph)}

# Keep only labelled buildings that exist in the graph. The comparison uses the
# string-cast ids: matching against the raw `bid` column would silently drop
# every row whenever `bid` is not stored as strings.
df_labels = df_labels[df_labels['building_id'].isin(all_bids_graph)]
if df_labels.empty:
    raise ValueError(
        f"No building_id from '{label_path}' matches a building in '{graph_path}'."
    )

# ---- 60 / 20 / 20 stratified split -------------------------------------------
# 40% is held out first, then halved into validation and test; both splits are
# stratified on the label column and seeded for repeatability.
train_df, temp_df = train_test_split(
    df_labels, test_size=0.4,
    stratify=df_labels[label_col_name], random_state=seed
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5,
    stratify=temp_df[label_col_name], random_state=seed
)

all_bids = list(df_labels['building_id'].astype(str).values)

train_bids = train_df.building_id.tolist()
val_bids   = val_df.building_id.tolist()
test_bids  = test_df.building_id.tolist()

# Building ids -> node indices of the graph.
train_idx = [bid2node[bid] for bid in train_bids]
val_idx   = [bid2node[bid] for bid in val_bids]
test_idx  = [bid2node[bid] for bid in test_bids]

# ---- Node-level target vector -----------------------------------------------
# One entry per graph building; -1 marks buildings without a label.
y_target = np.full(len(all_bids_graph), -1, dtype=int)
class_map = {cat: i for i, cat in enumerate(df_labels[label_col_name].cat.categories)}
for bid, cat in zip(df_labels.building_id, df_labels[label_col_name]):
    y_target[bid2node[bid]] = class_map[cat]

data = to_pyg_graph(graph.geo_store, graph.edge_store, target_col='building', target_value=y_target.tolist())

# Per-class count of labelled nodes (entries >= 0) in the assembled graph.
valid_y = data['building'].y[data['building'].y >= 0]
unique, counts = np.unique(valid_y.cpu().numpy(), return_counts=True)
value_counts = dict(zip(unique, counts))

### **03. Training**

In [13]:
# ---- Model and optimizer ----------------------------------------------------
data = data.to(device)
model = HeteroSAGEFull(hidden_channels=hidden_dim, out_channels=num_classes, dropout_rate=0.2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

# ---- Class weights from the training labels ---------------------------------
# weight_c = n_train / (num_classes * n_train_c): rarer classes weigh more.
train_set = df_labels[df_labels['building_id'].isin(train_bids)]
labels_train = np.asarray(
    train_set[label_col_name].map(lambda x: class_map[x]).values, dtype=np.int64
)
# minlength keeps one slot per class even when the highest class ids have no
# training sample; otherwise the weight vector would be shorter than the number
# of model outputs and F.cross_entropy would reject it.
class_counts = np.bincount(labels_train, minlength=num_classes)
print("Class counts:", class_counts)
total = labels_train.size
# Classes with no training sample get weight 0 instead of a division by zero;
# they never occur as a training target, so the weighted loss is unaffected.
weights = np.zeros(num_classes, dtype=np.float32)
present = class_counts > 0
weights[present] = total / (num_classes * class_counts[present].astype(np.float32))

class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
print("Class weights:", class_weights)

# Index tensors are built once on the target device instead of converting the
# Python lists on every indexing operation inside the loop.
train_index = torch.as_tensor(train_idx, dtype=torch.long, device=device)
val_index = torch.as_tensor(val_idx, dtype=torch.long, device=device)

# ---- Early-stopping state ---------------------------------------------------
best_epoch = 0
best_val_loss = float('inf')   # plain float, not a device tensor
patience_counter = 0
PATIENCE = 50                  # epochs without improvement before stopping
WARMUP_EPOCHS = 10             # epochs 0..WARMUP_EPOCHS never checkpoint

for epoch in range(EPOCH):
    # -- one full-graph optimisation step on the training nodes --
    model.train()
    optimizer.zero_grad()

    out = model(data.x_dict, data.edge_index_dict)
    train_loss = F.cross_entropy(
        out[train_index],
        data['building'].y[train_index],
        weight=class_weights
    )
    train_loss.backward()
    optimizer.step()

    # -- validation: fresh forward pass in eval mode, without gradients --
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        val_logits = out[val_index]
        val_targets = data['building'].y[val_index]
        # Unweighted, unlike the training loss.
        val_loss = F.cross_entropy(val_logits, val_targets)
        val_loss_value = val_loss.item()
        all_preds = val_logits.argmax(dim=1).cpu().numpy()
        all_labels = val_targets.cpu().numpy()
        avg_acc = (all_preds == all_labels).mean()
        # zero_division=0 yields the same score as the default for classes that
        # are never predicted, but without emitting a warning every epoch.
        macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

        if epoch % 10 == 0:
            print(f'Epoch: {epoch} | Train Loss: {train_loss.item():.5f}')
            print(f'Epoch: {epoch} | Val Loss: {val_loss_value:.5f} | Accuracy: {avg_acc:.5f} | Macro F1: {macro_f1:.5f}')

        # Early Stopping Check. Warm-up epochs are skipped entirely so they
        # neither checkpoint nor consume patience (previously they were counted
        # as "no improvement").
        if epoch > WARMUP_EPOCHS:
            if val_loss_value < best_val_loss:
                best_val_loss = val_loss_value
                best_epoch = epoch
                torch.save(model.state_dict(), os.path.join(output_path, 'best_model.pth'))
                print(f"Best model saved for epoch {epoch} with Val Loss: {best_val_loss:.5f}")
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter % 10 == 0:
                    print(f"No improvement for {patience_counter} epochs.")

    if patience_counter >= PATIENCE:
        print(f"Early stopping triggered at epoch {epoch} after {PATIENCE} epochs without improvement.")
        break

# Make it visible when this run wrote no checkpoint (e.g. EPOCH <= WARMUP_EPOCHS
# + 1): a later load of best_model.pth would fail or pick up a file from an
# earlier run.
if best_val_loss == float('inf'):
    print("Warning: no checkpoint was saved during this run.")

Class counts: [  449   238 18893   173   194    21   131    41    73  8852]
Class weights: tensor([  6.4733,  12.2122,   0.1538,  16.8006,  14.9820, 138.4048,  22.1870,
         70.8902,  39.8151,   0.3283], device='cuda:0')
Epoch: 0 | Train Loss: 2.30139
Epoch: 0 | Val Loss: 2.37925 | Accuracy: 0.00444 | Macro F1: 0.00986
No improvement for 10 epochs.
Epoch: 10 | Train Loss: 0.98737
Epoch: 10 | Val Loss: 0.62794 | Accuracy: 0.80687 | Macro F1: 0.46002
Best model saved for epoch 11 with Val Loss: 0.61831
Best model saved for epoch 12 with Val Loss: 0.54486
Best model saved for epoch 13 with Val Loss: 0.51009
Best model saved for epoch 14 with Val Loss: 0.50583
Best model saved for epoch 15 with Val Loss: 0.48924
Best model saved for epoch 16 with Val Loss: 0.42766
Best model saved for epoch 17 with Val Loss: 0.39731
Epoch: 20 | Train Loss: 0.62387
Epoch: 20 | Val Loss: 0.39609 | Accuracy: 0.86427 | Macro F1: 0.49800
Best model saved for epoch 20 with Val Loss: 0.39609
Best model saved 

### **04. Evaluate on test set**

In [14]:
# Restore the checkpoint written by the training cell and report metrics on the
# held-out test nodes.
print("\nEvaluating best model on test set...")
checkpoint_path = os.path.join(output_path, 'best_model.pth')
# map_location lets a checkpoint saved on a CUDA device be restored in a
# CPU-only session (and vice versa).
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.eval()
with torch.no_grad():
    out = model(data.x_dict, data.edge_index_dict)

    # Slice the test nodes once and reuse the result for every metric.
    test_index = torch.as_tensor(test_idx, dtype=torch.long, device=device)
    test_logits = out[test_index]
    test_targets = data['building'].y[test_index]

    # Unweighted cross-entropy, matching how the validation loss was computed.
    test_loss = F.cross_entropy(test_logits, test_targets).item()
    all_test_preds = test_logits.argmax(dim=1).cpu().numpy()
    all_test_labels = test_targets.cpu().numpy()
    test_acc = (all_test_preds == all_test_labels).mean()
    # zero_division=0 on all three macro metrics: a class that is never
    # predicted scores 0 consistently and without warnings.
    test_macro_f1 = f1_score(all_test_labels, all_test_preds, average='macro', zero_division=0)
    test_precision = precision_score(all_test_labels, all_test_preds, average='macro', zero_division=0)
    test_recall = recall_score(all_test_labels, all_test_preds, average='macro', zero_division=0)

    # Top-2 accuracy: the true class is among the two highest-scoring outputs.
    # k is capped at the number of output columns so topk cannot raise when the
    # model has a single output.
    top2 = test_logits.topk(min(2, test_logits.size(1)), dim=1).indices
    true_labels = test_targets.unsqueeze(1)
    test_top2_acc = (top2.eq(true_labels).sum(dim=1) > 0).float().mean().item()

    print(f'Test Loss: {test_loss:.5f} | Test Accuracy: {test_acc:.5f} | Test Macro F1: {test_macro_f1:.5f}')
    print(f'Test Precision: {test_precision:.5f} | Test Recall: {test_recall:.5f}')
    print(f'Test Top2 Accuracy: {test_top2_acc:.5f}')


Evaluating best model on test set...
Test Loss: 0.20524 | Test Accuracy: 0.94169 | Test Macro F1: 0.64736
Test Precision: 0.63573 | Test Recall: 0.66457
Test Top2 Accuracy: 0.98906
