In [None]:
# %%
# import libraries
import os
import sys
import argparse
import time
from zipfile import ZipFile
import networkx as nx
import pickle
import json
from collections import Counter
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import config
from utils.data_utils import (
    seed_everything,
    get_num_workers,
    get_edge_attributes,
    get_node_attributes,
    load_graph,
)
from utils.plot_utils import (
    plot_data_distribution,
    plot_learning_curves,
    plot_targets_and_predictions,
    plot_targets_and_mre_boxplots,
)
from utils.torch_utils import (
    get_available_device,
    get_optimizer,
    get_criterion,
    save_dataset,
    load_dataset,
    save_model,
    save_dictionary,
)

In [None]:
data_dir = "../data"
data_names = ["W_Beijing", "W_Chengdu", "W_Chicago", "W_Jinan", "W_NewYork", "W_Shenzhen"]

# Sanity-check each dataset: the node file and the edge file must describe the
# same set of node IDs, and every ID used in the groundtruth must be a known node.
for data_name in data_names:
    print(f"Processing dataset: {data_name}")
    node_path = f"{data_dir}/{data_name}/{data_name}.nodes"
    edge_path = f"{data_dir}/{data_name}/{data_name}.edges"
    query_path = f"{data_dir}/{data_name}/{data_name}.queries"
    groundtruth_path = f"{data_dir}/{data_name}/{data_name}.groundtruth"

    # Report missing files up front. This is informational only: the node, edge
    # and groundtruth files are read below (the query file is only checked for existence).
    for path in [node_path, edge_path, query_path, groundtruth_path]:
        if not os.path.exists(path):
            print(f"File does not exist: {path}")

    # Node file: column 0 holds the node IDs. Report which indexing scheme they follow.
    nodes = pd.read_csv(node_path, sep=",", header=None)
    node_list_from_nodes = nodes[0].sort_values().tolist()
    num_nodes_from_nodes = len(node_list_from_nodes)
    # Duplicate IDs would make the node count (and the percentages printed below)
    # meaningless, so fail early with an explicit message.
    assert nodes[0].is_unique, "Duplicate node IDs found in node file"
    if node_list_from_nodes == list(range(num_nodes_from_nodes)):
        print("  - Nodes are indexed from 0 to n-1")
    elif node_list_from_nodes == list(range(1, num_nodes_from_nodes + 1)):
        print("  - Nodes are indexed from 1 to n")
    else:
        print("  - Node IDs are not continuous")

    # Edge file: columns 0 and 1 hold the endpoint node IDs; lines starting with "#" are skipped.
    # np.unique already returns its values sorted, so no extra sort is needed.
    edges = pd.read_csv(edge_path, sep=",", header=None, comment="#")
    node_list_from_edges = np.unique(edges[[0, 1]].values.ravel()).tolist()
    num_nodes_from_edges = len(node_list_from_edges)
    if node_list_from_edges == list(range(num_nodes_from_edges)):
        print("  - Edges are indexed from 0 to n-1")
    elif node_list_from_edges == list(range(1, num_nodes_from_edges + 1)):
        print("  - Edges are indexed from 1 to n")
    else:
        print("  - Edge IDs are not continuous")

    # Equal ID lists also imply equal node counts, so a single check covers both.
    assert node_list_from_nodes == node_list_from_edges, "Node IDs do not match between nodes and edges"

    # Groundtruth file: columns 0 and 1 hold node IDs, which must all exist in the node file.
    groundtruth = pd.read_csv(groundtruth_path, sep=",", header=None)
    node_list_from_groundtruth = np.unique(groundtruth[[0, 1]].values.ravel()).tolist()
    num_nodes_from_groundtruth = len(node_list_from_groundtruth)
    assert set(node_list_from_groundtruth).issubset(set(node_list_from_nodes)), "Groundtruth node IDs are not a subset of node IDs"
    print(f"  - Number of nodes: {len(nodes)}")
    print(f"  - Number of edges: {len(edges)}")
    print(f"  - Number of queryies: {len(groundtruth)} (equivalent to {len(groundtruth)/num_nodes_from_nodes:.1f} landmarks)")
    print(f"  - Groundtruth contains {num_nodes_from_groundtruth/num_nodes_from_nodes*100:.2f}% of nodes")

Processing dataset: W_Beijing
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 74383
  - Number of edges: 206894
  - Number of queryies: 16505 (equivalent to 0.2 landmarks)
  - Groundtruth contains 8.47% of nodes
Processing dataset: W_Chengdu
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 17567
  - Number of edges: 50590
  - Number of queryies: 19033 (equivalent to 1.1 landmarks)
  - Groundtruth contains 24.08% of nodes
Processing dataset: W_Chicago
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 386533
  - Number of edges: 1099270
  - Number of queryies: 774289 (equivalent to 2.0 landmarks)
  - Groundtruth contains 0.07% of nodes
Processing dataset: W_Jinan
  - Nodes are indexed from 0 to n-1
  - Edges are indexed from 0 to n-1
  - Number of nodes: 8908
  - Number of edges: 28140
  - Number of queryies: 117726 (equivalent to 13.2 landmarks)
  - Groundtruth contain

In [None]:
def reindex_nodes_edges_groundtruth(nodes_df, edges_df, groundtruth_df, start_index=1):
    """Remap node IDs to a contiguous range and apply the mapping to all three tables.

    The mapping is built from the distinct values in column 0 of ``nodes_df``,
    taken in ascending order: the smallest old ID becomes ``start_index``, the
    next one ``start_index + 1``, and so on. With the default ``start_index=1``
    the new IDs therefore run from 1 to n.

    Args:
        nodes_df: DataFrame whose column ``0`` holds the node IDs.
        edges_df: DataFrame whose columns ``0`` and ``1`` hold node IDs.
        groundtruth_df: DataFrame whose columns ``0`` and ``1`` hold node IDs.
        start_index: First new ID to assign (default 1).

    Returns:
        Tuple ``(nodes, edges, groundtruth)`` of reindexed copies; the input
        frames are not modified. All other columns are carried over unchanged.

    Raises:
        ValueError: If an ID column of any table contains a value that does not
            appear in column 0 of ``nodes_df``. Without this check such IDs
            would silently become NaN.
    """
    # np.unique returns the distinct IDs already sorted in ascending order.
    old_ids = np.unique(nodes_df[0].values).tolist()
    id_map = {old_id: new_id for new_id, old_id in enumerate(old_ids, start=start_index)}

    def _remap(frame, columns, label):
        # Return a copy of `frame` with the given ID columns translated through id_map.
        remapped = frame.copy()
        for column in columns:
            mapped = remapped[column].map(id_map)
            unmapped = mapped.isna()
            if unmapped.any():
                examples = remapped.loc[unmapped, column].unique()[:5].tolist()
                raise ValueError(
                    f"{label} column {column} contains IDs missing from the node table, e.g. {examples}"
                )
            remapped[column] = mapped
        return remapped

    nodes_df_reindexed = _remap(nodes_df, [0], "nodes")
    edges_df_reindexed = _remap(edges_df, [0, 1], "edges")
    groundtruth_df_reindexed = _remap(groundtruth_df, [0, 1], "groundtruth")

    return nodes_df_reindexed, edges_df_reindexed, groundtruth_df_reindexed

In [None]:
data_dir = "../data"
# NOTE: this cell rewrites the dataset files in place. Keep a copy of the raw
# files elsewhere if the original node IDs are still needed.
for data_name in ["W_Shenzhen", "W_Jinan"]:
    print(f"Processing {data_name}...")
    node_path = f"{data_dir}/{data_name}/{data_name}.nodes"
    edge_path = f"{data_dir}/{data_name}/{data_name}.edges"
    groundtruth_path = f"{data_dir}/{data_name}/{data_name}.groundtruth"

    # Load data. The edge file is read with comment="#" so that it is parsed the
    # same way as in the validation cells of this notebook.
    nodes_df = pd.read_csv(node_path, sep=",", header=None)
    edges_df = pd.read_csv(edge_path, sep=",", header=None, comment="#")
    groundtruth_df = pd.read_csv(groundtruth_path, sep=",", header=None)

    # Reindex
    nodes_new, edges_new, groundtruth_new = reindex_nodes_edges_groundtruth(nodes_df, edges_df, groundtruth_df)

    # Refuse to overwrite anything if a reindexed ID column contains missing
    # values (for example an ID that could not be mapped). All three tables are
    # checked before the first file is written, so a failure leaves the data untouched.
    id_columns = {
        "nodes": nodes_new[[0]],
        "edges": edges_new[[0, 1]],
        "groundtruth": groundtruth_new[[0, 1]],
    }
    for label, id_frame in id_columns.items():
        if id_frame.isna().any().any():
            raise ValueError(f"{data_name}: reindexed {label} contain missing node IDs; files were not overwritten")

    # Save back (overwrite original files)
    nodes_new.to_csv(node_path, index=False, header=False)
    edges_new.to_csv(edge_path, index=False, header=False)
    groundtruth_new.to_csv(groundtruth_path, index=False, header=False)
    print(f"Reindexed and saved: {data_name}")

Processing W_Shenzhen...
Reindexed and saved: W_Shenzhen
Processing W_Jinan...
Reindexed and saved: W_Jinan


In [None]:
data_dir = "../data"
data_names = [
    "W_Jinan",
    "W_Shenzhen",
    "W_Chengdu",
    "W_Beijing",
    "W_NewYork",
    "W_Chicago",
]

# Re-run the consistency checks on every dataset and print summary statistics,
# including the number of distinct nodes that appear in the groundtruth.
for data_name in data_names:
    print(f"Processing dataset: {data_name}")
    node_path = f"{data_dir}/{data_name}/{data_name}.nodes"
    edge_path = f"{data_dir}/{data_name}/{data_name}.edges"
    query_path = f"{data_dir}/{data_name}/{data_name}.queries"
    groundtruth_path = f"{data_dir}/{data_name}/{data_name}.groundtruth"

    # Informational only: list any expected file that is absent. The query file
    # is checked for existence but not read in this cell.
    for path in [node_path, edge_path, query_path, groundtruth_path]:
        if not os.path.exists(path):
            print(f"File does not exist: {path}")

    # Node IDs live in column 0 of the node file; report their indexing scheme.
    nodes = pd.read_csv(node_path, sep=",", header=None)
    node_list_from_nodes = nodes[0].sort_values().tolist()
    num_nodes_from_nodes = len(node_list_from_nodes)
    # A repeated ID would inflate the node count used in the ratios below.
    assert nodes[0].is_unique, "Duplicate node IDs found in node file"
    if node_list_from_nodes == list(range(num_nodes_from_nodes)):
        print("  - Nodes are indexed from 0 to n-1")
    elif node_list_from_nodes == list(range(1, num_nodes_from_nodes + 1)):
        print("  - Nodes are indexed from 1 to n")
    else:
        print("  - Node IDs are not continuous")

    # Endpoint IDs live in columns 0 and 1 of the edge file ("#" lines are skipped).
    # np.unique yields the distinct IDs in ascending order.
    edges = pd.read_csv(edge_path, sep=",", header=None, comment="#")
    node_list_from_edges = np.unique(edges[[0, 1]].values.ravel()).tolist()
    num_nodes_from_edges = len(node_list_from_edges)
    if node_list_from_edges == list(range(num_nodes_from_edges)):
        print("  - Edges are indexed from 0 to n-1")
    elif node_list_from_edges == list(range(1, num_nodes_from_edges + 1)):
        print("  - Edges are indexed from 1 to n")
    else:
        print("  - Edge IDs are not continuous")

    # Identical ID lists necessarily have identical lengths, so one assert suffices.
    assert node_list_from_nodes == node_list_from_edges, "Node IDs do not match between nodes and edges"

    # Every node ID referenced in columns 0 and 1 of the groundtruth must be a known node.
    groundtruth = pd.read_csv(groundtruth_path, sep=",", header=None)
    node_list_from_groundtruth = np.unique(groundtruth[[0, 1]].values.ravel()).tolist()
    num_nodes_from_groundtruth = len(node_list_from_groundtruth)
    assert set(node_list_from_groundtruth).issubset(set(node_list_from_nodes)), "Groundtruth node IDs are not a subset of node IDs"
    print(f"  - Number of nodes: {len(nodes)}")
    print(f"  - Number of edges: {len(edges)}")
    print(f"  - Number of queryies: {len(groundtruth)} (Num nodes: {len(node_list_from_groundtruth)}) (equivalent to {len(groundtruth)/num_nodes_from_nodes:.1f} landmarks)")
    print(f"  - Groundtruth contains {num_nodes_from_groundtruth/num_nodes_from_nodes*100:.2f}% of nodes")

Processing dataset: W_Jinan
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 8908
  - Number of edges: 28140
  - Number of queryies: 117726 (Num nodes: 1141) (equivalent to 13.2 landmarks)
  - Groundtruth contains 12.81% of nodes
Processing dataset: W_Shenzhen
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 11933
  - Number of edges: 37808
  - Number of queryies: 90273 (Num nodes: 656) (equivalent to 7.6 landmarks)
  - Groundtruth contains 5.50% of nodes
Processing dataset: W_Chengdu
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 17567
  - Number of edges: 50590
  - Number of queryies: 19033 (Num nodes: 4231) (equivalent to 1.1 landmarks)
  - Groundtruth contains 24.08% of nodes
Processing dataset: W_Beijing
  - Nodes are indexed from 1 to n
  - Edges are indexed from 1 to n
  - Number of nodes: 74383
  - Number of edges: 206894
  - Number of queryies: 16505 (Num n