In [13]:
# import libraries
import os
import argparse
import networkx as nx

import numpy as np
import pandas as pd

import config
from data_utils import (
    seed_everything,
    read_figshare_data,
    graph_from_edgelist,
    preprocess_graph,
    get_edge_attributes,
    get_node_attributes,
    save_graph,
    AllPairsDataset,
    LandmarkPairsDataset,
    RandomPairsDataset,
    save_dataset,
)


# Parse command line arguments
def parse_landmarks(text):
    """Convert the --landmarks option into a landmark count or a node fraction.

    Whole numbers >= 1 are returned as int so that a value supplied on the
    command line ("100") has the same type as the integer default (100).
    Every other value (e.g. "0.05") is returned as a float unchanged.

    Raises ValueError when `text` is not numeric; argparse reports that as a
    regular usage error.
    """
    value = float(text)
    if value >= 1 and value.is_integer():
        return int(value)
    return value


parser = argparse.ArgumentParser(description='Prepare data for shortest distance calculation.')
parser.add_argument('--data_name', type=str, default='Surat',
                    help='Name of the dataset')
parser.add_argument('--data_strategy', type=str, default='landmark',
                    choices=['all', 'landmark', 'random'],
                    help='Strategy for creating dataset')
parser.add_argument('--data_version', type=str, default='v0',
                    help='Version of the preprocessed dataset (e.g., v1, v2, etc.)')
# Implicit string concatenation keeps the help text free of the indentation
# whitespace that a backslash continuation inside the literal would embed.
parser.add_argument('--landmarks', type=parse_landmarks, default=100,
                    help='Number of landmarks for landmark strategy. '
                         'If >=1, treated as an absolute number. '
                         'If <1, treated as a percentage of total nodes.')
parser.add_argument('--random_pairs', type=int, default=1_000_000,
                    help='Number of random pairs for random strategy')
parser.add_argument('--seed', type=int, default=42,
                    help='Random seed for reproducibility')
# An explicit argument list is passed so argparse does not read sys.argv;
# edit this list to change the notebook's parameters.
args = parser.parse_args([
    '--data_name', 'Moscow',
])
print("Arguments:")
for arg, value in vars(args).items():
    print(f"  - {arg}: {value}")

# Expose the parsed options as notebook-level names for the cells below
data_name, data_strategy, data_version = (
    args.data_name,
    args.data_strategy,
    args.data_version,
)
landmarks, random_pairs, seed = args.landmarks, args.random_pairs, args.seed

# Fix the random seed up front so the run is reproducible
seed_everything(seed)

Arguments:
  - data_name: Moscow
  - data_strategy: landmark
  - data_version: v0
  - landmarks: 100
  - random_pairs: 1000000
  - seed: 42
Seed set to 42


In [14]:
# Set directories: raw input comes from the figshare dump, processed output is
# grouped by dataset version and then by dataset name
RAW_DATA_DIR = config.FIGSHARE_DATA_DIR
PROCESSED_DATA_DIR = os.path.join(config.PROCESSED_DATA_DIR, data_version, data_name)

# Read the edgelist data for the specified dataset
edgelist = read_figshare_data(data_name, dir_name=RAW_DATA_DIR)

# Display the first few rows of the edgelist
print(edgelist.head())

# Create a graph from the edgelist
G = graph_from_edgelist(edgelist)

# Add metadata to the graph
G.graph['data_name'] = data_name

# Print the first 2 nodes with their attributes
print("Node data in graph: ", list(G.nodes(data=True))[:2])

# Print the first 2 edges with their attributes
print("Edge data in graph: ", list(G.edges(data=True))[:2])

# NOTE(review): plot_subgraph / plot_degree_distribution are not among the
# names imported at the top of this notebook; import them before re-enabling
# any of the commented-out plotting calls below.

# # Plot the graph around a specific node
# plot_subgraph(G, 1)

# # Plot the graph around max degree node with radius 2
# max_degree_node = max(dict(G.degree()).items(), key=lambda x: x[1])[0]
# print(f"Max degree node: {max_degree_node} (degree={G.degree(max_degree_node)})")
# plot_subgraph(G, max_degree_node, radius=2)

# # Plot the degree distribution before preprocessing
# plot_degree_distribution(G)

# Preprocess the graph; the result is kept separately as G_prime so the raw G
# stays available (reindex=True presumably relabels the nodes — confirm in data_utils)
G_prime = preprocess_graph(G, reindex=True)

# # Plot the degree distribution after preprocessing
# plot_degree_distribution(G_prime)

[92mReading data: ../data/raw/figshare-2061897/Moscow.zip[0m
Finished reading data: Moscow
Before conversion:
          XCoord        YCoord  START_NODE  END_NODE  EDGE      LENGTH
0  480133.684133  6.009245e+06           1         2     1  386.540729
1  480330.093606  6.009578e+06           2         1     1  386.540729
2  480330.093606  6.009578e+06           2         4     2  263.981363
3  480330.093606  6.009578e+06           2         6     3   98.542220
4  480549.811762  6.009453e+06           4         2     2  263.981363
After conversion:
          XCoord        YCoord  START_NODE  END_NODE  EDGE  LENGTH
0  480133.684133  6.009245e+06           1         2     1     386
1  480330.093606  6.009578e+06           2         1     1     386
2  480330.093606  6.009578e+06           2         4     2     263
3  480330.093606  6.009578e+06           2         6     3      98
4  480549.811762  6.009453e+06           4         2     2     263
Finished checking: graph has no self loops

In [15]:
# Save the preprocessed graph (G_prime, not the raw G) under PROCESSED_DATA_DIR
save_graph(G_prime, data_name, dir_name=PROCESSED_DATA_DIR)

[92mSaving graph: ../data/processed/v0/Moscow/Moscow_nx_graph.pkl[0m


In [16]:
# NOTE(review): this lists the Rome graph file, while this notebook run processes
# data_name='Moscow' — confirm whether the file under PROCESSED_DATA_DIR was intended.
!ls -alh ../data/processed/v0/Rome/Rome_nx_graph.pkl

-rw-r--r-- 1 gchoudha student 12M Jul 16 00:25 ../data/processed/v0/Rome/Rome_nx_graph.pkl


In [26]:
# Dump G_prime as a plain-text edge list, one "src dst weight" record per line
file_name = os.path.join(PROCESSED_DATA_DIR, f"{data_name}.edgelist")
with open(file_name, "w") as edgelist_out:
    # Edges that carry no "weight" attribute are written with weight 1
    edgelist_out.writelines(
        f"{src} {dst} {attrs.get('weight', 1)}\n"
        for src, dst, attrs in G_prime.edges(data=True)
    )
print(f"Graph written to {file_name}")

# Report the on-disk size in MB
edgelist_size_mb = os.path.getsize(file_name) / 1024 / 1024
print(f"Size of edgelist: {edgelist_size_mb:.2f} MB")

Graph written to ../data/processed/v0/Moscow/Moscow.edgelist
Size of edgelist: 13.84 MB


In [27]:
import gzip

# Stream G_prime's edges as "src dst weight" lines through gzip in text mode
compressed_file_name = os.path.join(PROCESSED_DATA_DIR, f"{data_name}.edgelist.gz")
with gzip.open(compressed_file_name, "wt") as gz_out:
    for src, dst, attrs in G_prime.edges(data=True):
        # Edges that carry no "weight" attribute are written with weight 1
        gz_out.write(f"{src} {dst} {attrs.get('weight', 1)}\n")
print(f"Compressed edgelist written to {compressed_file_name}")

# Report the on-disk size in MB
compressed_size_mb = os.path.getsize(compressed_file_name) / 1024 / 1024
print(f"Size of compressed edgelist: {compressed_size_mb:.2f} MB")

Compressed edgelist written to ../data/processed/v0/Moscow/Moscow.edgelist.gz
Size of compressed edgelist: 3.37 MB


In [28]:
# Export the node attributes of G_prime as a CSV file (index column included)
node_attr_file = os.path.join(PROCESSED_DATA_DIR, f"{data_name}_node_attributes.csv")
node_attributes_prime = get_node_attributes(G_prime)
node_attributes_prime.to_csv(node_attr_file, index=True)
print(f"Node attributes written to {node_attr_file}")

# Report the on-disk size in MB
node_attr_size_mb = os.path.getsize(node_attr_file) / 1024 / 1024
print(f"Size of node attributes file: {node_attr_size_mb:.2f} MB")

Node attributes written to ../data/processed/v0/Moscow/Moscow_node_attributes.csv
Size of node attributes file: 17.33 MB


In [None]:
import gzip

# Write the node attributes as a gzip-compressed CSV.
# pandas performs the compression itself instead of writing through a text-mode
# gzip handle: a handle opened with "wt" and no explicit encoding/newline makes
# the output encoding and line endings depend on the locale and platform,
# whereas a path-based to_csv uses pandas' own defaults.
# Assumes node_attributes_prime is a pandas object (it already exposes to_csv).
compressed_node_attr_file = os.path.join(PROCESSED_DATA_DIR, f"{data_name}_node_attributes.csv.gz")
node_attributes_prime.to_csv(compressed_node_attr_file, index=True, compression="gzip")
print(f"Compressed node attributes written to {compressed_node_attr_file}")

# Report the on-disk size in MB
print(f"Size of compressed node attributes file: {os.path.getsize(compressed_node_attr_file)/1024/1024:.2f} MB")

Compressed node attributes written to ../data/processed/v0/Moscow/Moscow_node_attributes.csv.gz
Size of compressed node attributes file: 5.77 MB


In [1]:
# NOTE(review): hard-coded to the v2-urban-landmark output directory rather than
# PROCESSED_DATA_DIR (this run uses data_version='v0'); the execution count is also
# out of sequence with the surrounding cells — confirm this inspection cell is still wanted.
!ls -alh ../data/processed/v2-urban-landmark/Moscow/

total 745M
drwxr-xr-x  2 gchoudha student 4.0K Jun  5 10:02 .
drwxr-xr-x 32 gchoudha student 4.0K Jun  5 09:56 ..
-rw-r--r--  1 gchoudha student 346M Jun  5 10:02 Moscow_landmark_test_torch_dataset.pth
-rw-r--r--  1 gchoudha student 348M Jun  5 10:02 Moscow_landmark_train_torch_dataset.pth
-rw-r--r--  1 gchoudha student  53M Jun  5 09:56 Moscow_nx_graph.pkl


In [None]:


# Tabulate the edge attributes of the preprocessed graph and preview them
edge_attributes_prime = get_edge_attributes(G_prime)
print("Edgelist.shape: ", edge_attributes_prime.shape)
print(edge_attributes_prime.head())

# Tabulate the node attributes of the preprocessed graph and preview them
node_attributes_prime = get_node_attributes(G_prime)
print("Node Attributes.shape: ", node_attributes_prime.shape)
print(node_attributes_prime.head())


def _all_pairs_splits():
    """Return one AllPairsDataset used as both the train and the test split."""
    shared = AllPairsDataset(G_prime)
    return shared, shared


def _landmark_splits():
    """Return (train, test) landmark datasets built with different seeds."""
    # Seed offsets keep the dataset seeds different from the model seeds
    train = LandmarkPairsDataset(G_prime, l=landmarks, seed=seed+1234)
    test = LandmarkPairsDataset(G_prime, l=landmarks, seed=seed+2345)
    return train, test


def _random_splits():
    """Return (train, test) random-pair datasets built with different seeds."""
    train = RandomPairsDataset(G_prime, k=random_pairs, seed=seed+3456)
    test = RandomPairsDataset(G_prime, k=random_pairs, seed=seed+4567)
    return train, test


# Map each strategy name to its progress message and its (train, test) builder
strategy_table = {
    "all": ("Creating train and test datasets using all pairs strategy...", _all_pairs_splits),
    "landmark": ("Creating train and test datasets using landmark strategy...", _landmark_splits),
    "random": ("Creating train and test datasets using random pairs strategy...", _random_splits),
}

selected = strategy_table.get(data_strategy)
if selected is None:
    raise ValueError("Invalid data_strategy")

progress_message, build_splits = selected
print(progress_message)
train_dataset, test_dataset = build_splits()

# TODO: Shall we limit the dataset size to 1M pairs?
# e.g., train_dataset = Subset(train_dataset, 1_000_000)

# Save both splits, train first and then test
for split_name, split_dataset in (("train", train_dataset), ("test", test_dataset)):
    save_dataset(split_dataset, data_name, data_strategy, data_split=split_name, dir_name=PROCESSED_DATA_DIR)
