# Exploitation - Label Prediction

In the exploitation task, we aim to predict the type of edges (transport routes). We start with Hand crafted features, followed by node embedding and finally use GNNs. In this notebook, we will work using GNNs.

## Task - 2

In the second task, we predict the edge labels between the given nodes. 

**Imports**

In [1]:
import glob
import pickle
import pathlib
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns

from enum import Enum
from tqdm import tqdm
from matplotlib import pyplot as plt
from typing import Union, List, Dict, Literal, Tuple

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from skmultilearn.model_selection import iterative_train_test_split

from node2vec import Node2Vec

#### Paths for input and output

In [2]:
rel_data_folder_path = pathlib.Path("./../../../data")
transport_data_path = rel_data_folder_path.joinpath('transport_data')
city_network = rel_data_folder_path.joinpath('network_graphs')
city_network_graphs = city_network.joinpath('graphs')
city_network_graphs_dir = city_network_graphs.joinpath('directed_graphs')
city_network_bones = city_network.joinpath('nodes-edges')

checkpoints_folder_path = rel_data_folder_path.joinpath("checkpoints")
city_network_graphs_dir_label_pred_node2vec = checkpoints_folder_path.joinpath('node2vec-label-pred')

In [3]:
# Define enum for route types
class RouteType(Enum):
    tram, subway, rail, bus, ferry, cablecar, gondola = range(7)

def load_city_graphs(city_name: str, graphs_folder: pathlib.Path) -> Dict[str, Union[float, List[List[int]], nx.Graph]]:
    with open(graphs_folder.joinpath(city_name.lower() + '.gpickle'), 'rb') as f:
        city_graph = pickle.load(f)
    return city_graph

def load_all_cities_graphs(cities: List[str], graphs_folder: pathlib.Path) -> Dict[str, Dict[str, Union[float, List[List[int]], nx.Graph]]]:
    return {city: load_city_graphs(city, graphs_folder) for city in cities}

In [4]:
import torch_geometric as pyg
from torch_geometric.data import Data
from torch import nn
import torch

"""
Graph neural network module. It comprises of a series of `pyg.nn.GraphConv` Graph convolutional layers
followed by the pooling layer that uses addition based reduction.
"""
class GNN(nn.Module):
    """
    Initialize the GNN model layers.
    Args:
        num_node_features: int -> Dimension of the edge-feature vector.
        num_classes: int -> Number of classes to consider for the final linear layer's output, the output vector dimension

    Returns:
        nn.Module -> GNN model

    """
    def __init__(self, num_node_features: int, num_classes: int):
        super().__init__()

        self.conv1 = pyg.nn.GraphConv(num_node_features, 16)

        self.linear1 = nn.Linear(32, num_classes)
    
    """
    Forward pass function for the GNN model.
    Args:
        x -> Node feature matrix
        edge_index -> connectivity tensor
        batch -> batch vector that assigns a node to a specific data sample.
    """
    def forward(self, x, edge_index, batch=None):
        x = self.conv1(x, edge_index).relu()
        x_edges = x[edge_index]
        x_edges = torch.cat((x_edges[0], x_edges[1]), dim=1)
        return self.linear1(x_edges)

In [5]:
from sklearn.metrics import f1_score
"""
Method to evaluate the trained model against test data
to compute the test loss as well as the F1-score metrics.

Args:
    model: nn.Module -> GNN model
    loss_fcn: torch_geometric.nn.loss -> Loss function that was used for training.
    device: str -> 'cpu' or 'cuda' to mention the device to use for evaluation.
    dataloader -> torch_geometric.loader.DataLoader -> Dataloader for test dataset

Returns:
    np.float64 -> Average F1 score on test dataset
"""
def evaluate(model, loss_fcn, device, dataloader):

    score_list_batch = []

    model.eval()
    for i, batch in enumerate(dataloader):
        batch = batch.to(device)
        output = torch.sigmoid(model(batch.x, batch.edge_index))

        loss_test = loss_fcn(output, batch.y)
        print("Test loss {}".format(loss_test.mean()))
        predict = np.where(output.detach().cpu().numpy() >= 0.5, 1, 0)
        labels = batch.y.cpu().numpy()

        score = f1_score(labels, predict, average="weighted")
        score_list_batch.append(score)

    return np.array(score_list_batch).mean()

In [6]:
"""
Implementation of train method for GNNs

Args:
    model: nn.Module -> Model to use for training and validation
    loss_fcn: nn.BCELoss() -> Binary cross entropy or any loss function for the training task
    device: 'str' ['cpu' | 'cuda' ] -> Device to use for training
    optimizer: torch.optim.<Optimizer> -> Optimizer
    max_epocs: int -> Number of epochs (max) to run training for.
    train_dataloader: torch_geometric.loader.Dataloader -> Dataloader for training samples  
"""
def train(model, loss_fcn, device, optimizer, max_epochs, train_dataloader):
    epoch_list = []
    scores_list = []
    print("Training the model")

    # loop over epochs
    for epoch in range(max_epochs):
        model.train()
        losses = []
        # loop over batches
        for i, train_batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            train_batch_device = train_batch.to(device)
            # logits is the output of the model
            logits = model(train_batch_device.x, train_batch_device.edge_index)
            # compute the loss
            loss = loss_fcn(logits, train_batch_device.y)
            # optimizer step
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        loss_data = np.array(losses).mean()
        print("Epoch {:05d} | Loss: {:.4f}".format(epoch + 1, loss_data))

    return epoch_list, scores_list

In [7]:
# We only consider the full route type as it has the data regarding
# the target values for all the various types of transport.
route_type = 'full'
num_targets = len(RouteType)

In [8]:
"""
Generates target labels for the edges in a graph

Args:
    graph: nx.MultiDiGraph -> Mulit directed-graph with parallel edges corresponding to different target labels.
    num_targets: int -> Number of target (distinct) labels.

Returns:
    np.ndarray: [num_edges, num_targets] -> Target label mask for each edge of the graph
"""
def generate_edge_targets(graph: nx.Graph, num_targets: int) -> np.ndarray:
    targets = []
    edges_unique = []
    for edge in graph.edges():
        if edge not in edges_unique:
            edges_unique.append(edge)

    for node_1, node_2 in edges_unique:
        target = np.zeros(num_targets)
        edge_data = graph.get_edge_data(node_1, node_2)
        for edge_attr in edge_data.values():
            target[edge_attr['route_type']] = 1
        targets.append(target)

    return np.array(targets)

In [9]:
def Node2Vec_node_feature_extraction(graph: nx.Graph, num_features: int, p: float, q: float, seed: int) -> Dict[float, np.ndarray]:
    ''' 
    INPUT:
    graph: the graph
    num_features: dimension of node2vec embeddings, int
    p: float
    q: float
    seed: please always set to 0

    OUTPUT:
    features: feature matrix of dimensions (N, D) (N: number of samples; D: dimension of Node2Vec embeddings) 
    '''

    node2vec_ = Node2Vec(graph, dimensions=num_features, p=p, q=q, seed=seed)
    model = node2vec_.fit()
    features_dict = {node: model.wv[idx] for idx, node in enumerate(graph.nodes())}
    return features_dict

In [14]:
import torch_geometric
from torch_geometric.data import Data
def get_torch_data(graph: nx.MultiDiGraph, num_targets: int) -> torch_geometric.data.Data:
    graph_nodes = list(graph.nodes())
    node_features = Node2Vec_node_feature_extraction(graph, num_features=25, p=1.00, q=5.00, seed=0)
    node_features = torch.tensor(np.array([ node_features[node] for node in graph.nodes() ]))

    # Labels for the graph edges
    targets = []
    edges_unique = []
    node_to_idx_map = {}

    for idx, node in enumerate(graph_nodes):
        node_to_idx_map[int(node)] = idx

    for edge in graph.edges():
        if edge not in edges_unique:
            edges_unique.append((edge[0], edge[1]))

    # Construct edge index (COO type) corresponding to a complete directed graph
    num_edges_graph = len(edges_unique)
    coo_vector = np.zeros(shape=(2, 2 * num_edges_graph), dtype=int)
    coo_vector = torch.tensor(coo_vector, dtype=torch.long)
    idx = 0
    targets = []
    # print("Number of nodes: {}".format(num_nodes))
    # print("Max node value {}, min node value {}".format(min(graph_nodes), max(graph_nodes)))
    # Add data entries for edges and non-edges equally
    
    num_non_edges = 0
    num_edges_visited = 0
    for node_1 in graph_nodes:
        for node_2 in graph_nodes:
            if node_1 == node_2:
                continue
            
            target = np.zeros(num_targets, dtype=np.float32)
            if (node_1, node_2) in edges_unique:
                coo_vector[0][idx] = node_to_idx_map[node_1]
                coo_vector[1][idx] = node_to_idx_map[node_2]
                idx = idx + 1
                edge_data = graph.get_edge_data(node_1, node_2)
                for edge_attr in edge_data.values():
                    target[edge_attr['route_type']] = 1
                targets.append(target)
                num_edges_visited = num_edges_visited + 1
            else:
                if num_non_edges < num_edges_graph:
                    coo_vector[0][idx] = node_to_idx_map[node_1]
                    coo_vector[1][idx] = node_to_idx_map[node_2]
                    idx = idx + 1
                    num_non_edges = num_non_edges + 1
                    targets.append(target)
            if (num_edges_visited == num_edges_graph) and (num_non_edges == num_edges_graph):
                break
            
    targets = np.array(targets)
    print("Number of targets {}".format(len(targets)))
    return Data(x=node_features, edge_index=coo_vector, y=torch.tensor(np.array(targets), dtype=torch.float32))

In [16]:
# Split the graphs for train and test and create the dataset and dataloaders for each
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx
import torch

# Create a random number generator
from numpy.random import default_rng

rng = default_rng(seed=2106)

test_split = 0.3
device = 'cpu'

cities = sorted([x.stem for x in city_network_graphs_dir.glob('*.gpickle')])
city_graphs_dir = load_all_cities_graphs(cities, city_network_graphs_dir)
city_routes = {}
dataset_train = []
dataset_test  = []

for city, city_graphs in city_graphs_dir.items():
    routes_ = []
    for route_type, route_graph in city_graphs.items():
        if ((route_type in ['full', 'cablecar']) or (route_graph is None)):
            continue
        routes_.append(RouteType[route_type].value)
    if len(routes_) > 1:
        city_routes[city] = routes_
print(city_routes)

for city, city_graphs in tqdm(city_graphs_dir.items()):
    if city.lower() == 'sydney':
        continue

    if city.lower() != 'grenoble':
        continue

    city_graph_scores = {}
    if city in city_routes:
        route_graph = city_graphs[route_type]
        num_targets = len(RouteType)

        torch_geometric_data = get_torch_data(graph=route_graph, num_targets=num_targets)
        num_nodes = len(route_graph.nodes())
        # Torch data generated corresponds to an equivalent complete graph
        num_edges = len(torch_geometric_data.edge_index[0])
        torch_geometric_data = torch_geometric_data.to(device)

        # Split train and test data
        train_mask = np.zeros(num_edges, dtype=int)
        train_mask[:int((1.000 - test_split) * num_edges)] = True
        rng.shuffle(train_mask)
        test_mask = ~train_mask

        train_mask = torch.tensor(train_mask, device=device).bool()

        train_data = Data(x=torch_geometric_data.x, edge_index=torch_geometric_data.edge_index[:, train_mask], y=torch_geometric_data.y[train_mask])
        test_data  = Data(x=torch_geometric_data.x, edge_index=torch_geometric_data.edge_index[:, test_mask], y=torch_geometric_data.y[test_mask])

        dataloader_train = DataLoader([train_data], shuffle=True)
        dataloader_test = DataLoader([test_data], shuffle=True)

        # GNN model for label prediction in edges
        model = GNN(25, num_targets)
        model = model.to(device)
        loss_fn = nn.BCEWithLogitsLoss().to(device)
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

        # Train
        train(model, loss_fn, device=device, optimizer=optimizer, max_epochs=50, train_dataloader=dataloader_train)

        print("Training complete for city {}".format(city))
        print("Evaluating the model on testing data")
        # Evaluate the model
        print(evaluate(model, loss_fcn=loss_fn, device=device, dataloader=dataloader_test))
        

{'adelaide': [0, 2, 3], 'berlin': [0, 1, 2, 3, 4], 'bordeaux': [0, 3, 4], 'brisbane': [2, 3, 4], 'canberra': [2, 3], 'dublin': [0, 2, 3], 'grenoble': [0, 3], 'helsinki': [0, 1, 2, 3, 4], 'lisbon': [1, 2, 3, 4], 'luxembourg': [2, 3], 'melbourne': [0, 2, 3], 'nantes': [0, 3], 'paris': [0, 1, 2, 3], 'prague': [0, 1, 3, 4], 'rennes': [1, 3], 'rome': [0, 1, 2, 3], 'sydney': [0, 2, 3, 4], 'toulouse': [0, 1, 3], 'venice': [0, 3, 4]}


  0%|          | 0/25 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/697 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:01<00:00,  6.91it/s]


Number of targets 3092
Training the model
Epoch 00001 | Loss: 0.9465
Epoch 00002 | Loss: 0.9444
Epoch 00003 | Loss: 0.9423
Epoch 00004 | Loss: 0.9403
Epoch 00005 | Loss: 0.9382
Epoch 00006 | Loss: 0.9362
Epoch 00007 | Loss: 0.9342
Epoch 00008 | Loss: 0.9322
Epoch 00009 | Loss: 0.9302
Epoch 00010 | Loss: 0.9282
Epoch 00011 | Loss: 0.9263
Epoch 00012 | Loss: 0.9243
Epoch 00013 | Loss: 0.9224
Epoch 00014 | Loss: 0.9205
Epoch 00015 | Loss: 0.9186
Epoch 00016 | Loss: 0.9168
Epoch 00017 | Loss: 0.9149
Epoch 00018 | Loss: 0.9131
Epoch 00019 | Loss: 0.9112
Epoch 00020 | Loss: 0.9094
Epoch 00021 | Loss: 0.9076
Epoch 00022 | Loss: 0.9058
Epoch 00023 | Loss: 0.9040
Epoch 00024 | Loss: 0.9023
Epoch 00025 | Loss: 0.9005
Epoch 00026 | Loss: 0.8988
Epoch 00027 | Loss: 0.8971
Epoch 00028 | Loss: 0.8954
Epoch 00029 | Loss: 0.8937
Epoch 00030 | Loss: 0.8920
Epoch 00031 | Loss: 0.8903
Epoch 00032 | Loss: 0.8887
Epoch 00033 | Loss: 0.8870
Epoch 00034 | Loss: 0.8854
Epoch 00035 | Loss: 0.8838
Epoch 00036 |

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
100%|██████████| 25/25 [00:19<00:00,  1.27it/s]

Epoch 00050 | Loss: 0.8611
Training complete for city grenoble
Evaluating the model on testing data
Test loss 1.0818166732788086
1.0





In [None]:
print(evaluate(model, loss_fcn=loss_fn, device=device, dataloader=dataloader_test))

Test loss 1.0474982261657715
0.0


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
