# Exploitation - Label Prediction

In the exploitation task, we aim to predict the type of edges (transport routes). We start with Hand crafted features, followed by node embedding and finally use GNNs. In this notebook, we will work using GNNs.

## Task - 2

In the second task, we predict the edge labels between the given nodes. 

**Imports**

In [1]:
import glob
import pickle
import pathlib
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns

from enum import Enum
from tqdm import tqdm
from matplotlib import pyplot as plt
from typing import Union, List, Dict, Literal, Tuple

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from skmultilearn.model_selection import iterative_train_test_split

from node2vec import Node2Vec

#### Paths for input and output

In [2]:
rel_data_folder_path = pathlib.Path("./../../../data")
transport_data_path = rel_data_folder_path.joinpath('transport_data')
city_network = rel_data_folder_path.joinpath('network_graphs')
city_network_graphs = city_network.joinpath('graphs')
city_network_graphs_dir = city_network_graphs.joinpath('directed_graphs')
city_network_bones = city_network.joinpath('nodes-edges')

checkpoints_folder_path = rel_data_folder_path.joinpath("checkpoints")
city_network_graphs_dir_label_pred_node2vec = checkpoints_folder_path.joinpath('node2vec-label-pred')

In [3]:
# Define enum for route types
class RouteType(Enum):
    tram, subway, rail, bus, ferry, cablecar, gondola = range(7)

def load_city_graphs(city_name: str, graphs_folder: pathlib.Path) -> Dict[str, Union[float, List[List[int]], nx.Graph]]:
    with open(graphs_folder.joinpath(city_name.lower() + '.gpickle'), 'rb') as f:
        city_graph = pickle.load(f)
    return city_graph

def load_all_cities_graphs(cities: List[str], graphs_folder: pathlib.Path) -> Dict[str, Dict[str, Union[float, List[List[int]], nx.Graph]]]:
    return {city: load_city_graphs(city, graphs_folder) for city in cities}

In [4]:
import torch_geometric as pyg
from torch_geometric.data import Data
from torch import nn
import torch

"""
Graph neural network module. It comprises of a series of `pyg.nn.GraphConv` Graph convolutional layers
followed by the pooling layer that uses addition based reduction.
"""
class GNN(nn.Module):
    """
    Initialize the GNN model layers.
    Args:
        num_node_features: int -> Dimension of the edge-feature vector.
        num_classes: int -> Number of classes to consider for the final linear layer's output, the output vector dimension

    Returns:
        nn.Module -> GNN model

    """
    def __init__(self, num_node_features: int, num_classes: int):
        super().__init__()

        self.conv1 = pyg.nn.GraphConv(num_node_features, 32)
        self.conv2 = pyg.nn.GraphConv(32, 64)

        self.linear1 = nn.Linear(128, num_classes)
    
    """
    Forward pass function for the GNN model.
    Args:
        x -> Node feature matrix
        edge_index -> connectivity tensor
        batch -> batch vector that assigns a node to a specific data sample.
    """
    def forward(self, x, edge_index, batch=None):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x_edges = x[edge_index]
        x_edges = torch.cat((x_edges[0], x_edges[1]), dim=1)
        return self.linear1(x_edges)

In [5]:
from sklearn.metrics import f1_score
"""
Method to evaluate the trained model against test data
to compute the test loss as well as the F1-score metrics.

Args:
    model: nn.Module -> GNN model
    loss_fcn: torch_geometric.nn.loss -> Loss function that was used for training.
    device: str -> 'cpu' or 'cuda' to mention the device to use for evaluation.
    dataloader -> torch_geometric.loader.DataLoader -> Dataloader for test dataset

Returns:
    np.float64 -> Average F1 score on test dataset
"""
def evaluate(model, loss_fcn, device, dataloader):

    score_list_batch = []

    model.eval()
    for i, batch in enumerate(dataloader):
        batch = batch.to(device)
        output = model(batch.x, batch.edge_index)
        loss_test = loss_fcn(output, batch.y)
        predict = np.where(output.detach().cpu().numpy() >= 0, 1, 0)
        score = f1_score(batch.y.cpu().numpy(), predict, average="micro")
        score_list_batch.append(score)

    return np.array(score_list_batch).mean()

In [6]:
"""
Implementation of train method for GNNs

Args:
    model: nn.Module -> Model to use for training and validation
    loss_fcn: nn.CrossEntropyLoss() -> Cross entropy or any loss function for the training task
    device: 'str' ['cpu' | 'cuda' ] -> Device to use for training
    optimizer: torch.optim.<Optimizer> -> Optimizer
    max_epocs: int -> Number of epochs (max) to run training for.
    train_dataloader: torch_geometric.loader.Dataloader -> Dataloader for training samples
    val_dataloader: torch_geometric.loader.Dataloader -> Dataloader for test samples.  
"""
def train(model, loss_fcn, device, optimizer, max_epochs, train_dataloader, val_dataloader):
    epoch_list = []
    scores_list = []

    # loop over epochs
    for epoch in range(max_epochs):
        model.train()
        losses = []
        # loop over batches
        for i, train_batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            train_batch_device = train_batch.to(device)
            # logits is the output of the model
            logits = model(train_batch_device.x, train_batch_device.edge_index)
            # compute the loss
            loss = loss_fcn(logits, train_batch_device.y)
            # optimizer step
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        loss_data = np.array(losses).mean()
        print("Epoch {:05d} | Loss: {:.4f}".format(epoch + 1, loss_data))

        if epoch % 5 == 0:
            # evaluate the model on the validation set
            # computes the f1-score (see next function)
            score = evaluate(model, loss_fcn, device, val_dataloader)
            print("F1-Score: {:.4f}".format(score))
            scores_list.append(score)
            epoch_list.append(epoch)

    return epoch_list, scores_list

In [7]:
# We only consider the full route type as it has the data regarding
# the target values for all the various types of transport.
route_type = 'full'
num_targets = len(RouteType)

In [1]:
"""
Generates target labels for the edges in a graph

Args:
    graph: nx.MultiDiGraph -> Mulit directed-graph with parallel edges corresponding to different target labels.
    num_targets: int -> Number of target (distinct) labels.

Returns:
    np.ndarray: [num_edges, num_targets] -> Target label mask for each edge of the graph
"""
def generate_edge_targets(graph: nx.Graph, num_targets: int) -> np.ndarray:
    targets = []
    edges_unique = []
    for edge in graph.edges():
        if edge not in edges_unique:
            edges_unique.append(edge)
    
    for node_1, node_2 in edges_unique:
        target = np.zeros(num_targets)
        edge_data = graph.get_edge_data(node_1, node_2)
        for edge_attr in edge_data.values():
            target[edge_attr['route_type']] = 1
        targets.append(target)

    return np.array(targets)

NameError: name 'nx' is not defined

In [9]:
def Node2Vec_node_feature_extraction(graph: nx.Graph, num_features: int, p: float, q: float, seed: int) -> Dict[float, np.ndarray]:
    ''' 
    INPUT:
    graph: the graph
    num_features: dimension of node2vec embeddings, int
    p: float
    q: float
    seed: please always set to 0

    OUTPUT:
    features: feature matrix of dimensions (N, D) (N: number of samples; D: dimension of Node2Vec embeddings) 
    '''
     
    node2vec_ = Node2Vec(graph, dimensions=num_features, p=p, q=q, seed=seed)
    model = node2vec_.fit()
    features_dict = {node: model.wv[idx] for idx, node in enumerate(graph.nodes())}
    return features_dict

In [10]:
# Split the graphs for train and test and create the dataset and dataloaders for each
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx
import torch

# Create a random number generator
from numpy.random import default_rng

rng = default_rng(seed=2106)

test_split = 0.3
device = 'cpu'

cities = sorted([x.stem for x in city_network_graphs_dir.glob('*.gpickle')])
city_graphs_dir = load_all_cities_graphs(cities, city_network_graphs_dir)
city_routes = {}
dataset_train = []
dataset_test  = []

for city, city_graphs in city_graphs_dir.items():
    routes_ = []
    for route_type, route_graph in city_graphs.items():
        if ((route_type in ['full', 'cablecar']) or (route_graph is None)):
            continue
        routes_.append(RouteType[route_type].value)
    if len(routes_) > 1:
        city_routes[city] = routes_

for city, city_graphs in tqdm(city_graphs_dir.items()):
    if city.lower() == 'sydney':
        continue
    city_graph_scores = {}
    if city in city_routes:
        route_graph = city_graphs[route_type]
        num_targets = len(RouteType)

        targets_edge_all = generate_edge_targets(route_graph, num_targets=num_targets)
        targets_edge = np.array(targets_edge_all)
        num_edges = targets_edge.shape[0]
        torch_data = from_networkx(route_graph)

        # Take the complement of the `route_graph` and augment equal number of edges as original graph to the dataset.
        # As this is computationally expensive, we find some of these edges from its adjacency matrix
        adj_matrix = nx.adjacency_matrix(route_graph).todense()
        complement_edges = set()
        for row_idx in range(adj_matrix.shape[0]):
            for col_idx in range(adj_matrix.shape[0]):
                if row_idx == col_idx:
                    continue
                if (row_idx, col_idx) not in complement_edges:
                    complement_edges.add((row_idx, col_idx))
                if len(complement_edges) == num_edges:
                    break
        complement_edges = list(complement_edges)
        complement_edges_row = [edge[0] for edge in complement_edges]
        complement_edges_col = [edge[1] for edge in complement_edges]
        del complement_edges

        targets_complement = np.zeros(shape=(targets_edge.shape))
        complement_edge_index = torch.tensor([np.array(complement_edges_row), np.array(complement_edges_col)], dtype=torch.long)
        del complement_edges_row
        del complement_edges_col

        # Initial input features as node2vec features
        node_features = Node2Vec_node_feature_extraction(route_graph, num_features=10, p=1.00, q=5.00, seed=0)
        node_features = np.array([ node_features[node] for node in route_graph.nodes() ])

        dataset = Data(x=node_features, edge_index=torch.cat((torch_data.edge_index, complement_edge_index), dim=1), y=torch.cat((targets_edge, targets_complement)))
        dataset = dataset.to(device)

        num_edges_dataset = 2 * num_edges

        # Split train and test data
        train_mask = np.zeros(num_edges_dataset, dtype=int)
        train_mask[:int((1.000 - test_split) * num_edges_dataset)] = True
        rng.shuffle(train_mask)
        test_mask = ~train_mask

        train_mask = torch.tensor(train_mask, device=device).bool()

        train_data = Data(x=dataset.x, edge_index=dataset.edge_index[:, train_mask], y=dataset.y[train_mask])
        test_data  = Data(x=dataset.x, edge_index=dataset.edge_index[:, test_mask], y=dataset.y[test_mask])

        dataset_train.append(train_data)
        dataset_test.append(test_data)

  adj_matrix = nx.adjacency_matrix(route_graph).todense()


: 

: 

In [None]:
# Define the dataloaders
dataloader_train = DataLoader(dataset_train, shuffle=True)
dataloader_test  = DataLoader(dataset_test, shuffle=True)

# GNN model for label prediction in edges
model = GNN(10, num_targets)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

# Train
train(model, loss_fn, device=device, optimizer=optimizer, max_epochs=50, train_dataloader=dataloader_train, val_dataloader=dataloader_test)