# TO DO:

1. Dataset Loading
Implements the pipeline to load a dataset from the src domain. Since the challenge repository doesn’t allow storing large files, loaders must download datasets from external sources into the datasets/ folder.
This pipeline is provided for several graph-based datasets. For any other src domain, participants are allowed to transform graph datasets into the corresponding domain through our provided lifting mappings, or by simply dropping their connectivity to get point clouds.
(Bonus) Designing a loader for a new dataset (one that is not already provided in the tutorials) will be positively taken into consideration in the final evaluation.

2. Pre-processing the Dataset
Applies the lifting transform to the dataset.
Needs to be done through the PreProcessor, which we provide in
modules/data/preprocess/preprocessor.py.

3. Running a Model over the Lifted Dataset 
Creates a Neural Network model that operates over the destination (dst) domain, leveraging TopoModelX for higher-order topologies or torch_geometric for graphs.
Runs the model on the lifted dataset.

In [1]:
import csv
import time
import torch
import numpy as np
import networkx as nx
import scipy.sparse as sp
import pyflagsercount as pfc
import sys

sys.path.append("../../")
from modules.transforms.liftings.graph2combinatorial.sp_lifting import (
    DirectedFlagComplex as dfc,
)

# from datasets.data_loading import get_dataset, get_dataset_split

In [2]:
# With this cell any imported module is reloaded before each cell execution
%load_ext autoreload
%autoreload 2
from modules.data.load.loaders import GraphLoader
from modules.data.preprocess.preprocessor import PreProcessor
from modules.utils.utils import (
    describe_data,
    load_dataset_config,
    load_model_config,
    load_transform_config,
)

In [3]:
# Dataset name identifiers kept as string constants. In the cells visible in
# this notebook they are only referenced by the commented-out helper cell at
# the bottom (its `dataset_list` and the `qij(WISCONSIN, ...)` call); the
# active cells load "cocitation_cora" through `dataset_name` instead.
CHAMELEON = "chameleon"
CORNELL = "Cornell"
WISCONSIN = "Wisconsin"
TEXAS = "Texas"
ROMAN_EMPIRE = "directed-roman-empire"
SQUIRREL = "squirrel"
OGBN_ARXIV = "ogbn-arxiv"
SNAP_PATENTS = "snap-patents"
CORA_ML = "cora_ml"
CITESEER_FULL = "citeseer_full"
ARXIV_YEAR = "arxiv-year"
SYN_DIR = "syn-dir"

In [4]:
# Name of the dataset configuration to load.
dataset_name = "cocitation_cora"
# Load that configuration (the call also prints it, see the cell output) and
# build the graph loader from it.
dataset_config = load_dataset_config(dataset_name)
loader = GraphLoader(dataset_config)


Dataset configuration for cocitation_cora:

{'data_domain': 'graph',
 'data_type': 'cocitation',
 'data_name': 'Cora',
 'data_dir': 'datasets/graph/cocitation',
 'num_features': 1433,
 'num_classes': 7,
 'task': 'classification',
 'loss_type': 'cross_entropy',
 'monitor_metric': 'accuracy',
 'task_level': 'node'}


In [5]:
# Load the dataset through the loader and print a short summary of it
# (number of vertices/edges, feature dimensions, isolated nodes).
dataset = loader.load()
describe_data(dataset)


Dataset only contains 1 sample:
 - Graph with 2708 vertices and 10556 edges.
 - Features dimensions: [1433, 0]
 - There are 0 isolated nodes.



In [6]:
# Display the shape of the dataset's edge index tensor.
dataset.edge_index.shape

torch.Size([2, 10556])

In [11]:
# Define transformation type and id
transform_type = "liftings"
# If the transform is a topological lifting, the id should include both the
# type of the lifting and its identifier, written as "<lifting type>/<identifier>".
transform_id = "graph2combinatorial/sp_lifting"

# Read yaml file and store the loaded configuration under the "lifting" key
# (the call also prints the configuration, see the cell output).
transform_config = {
    "lifting": load_transform_config(transform_type, transform_id)
    # other transforms (e.g. data manipulations, feature liftings) can be added here
}


Transform configuration for graph2combinatorial/sp_lifting:

{'transform_type': 'lifting',
 'transform_name': 'Graph2CombinatorialLifting',
 'd1': 2,
 'd2': 2,
 'q': 1,
 'i': 0,
 'j': 2,
 'complex_dim': 2,
 'offset': 'torch.tensor([[0], [0]])',
 'chunk_size': 1024,
 'save_path': 'None',
 'threshold': 1}


In [None]:
# Build the pre-processed (lifted) dataset by passing the loaded dataset, the
# transform configuration and the loader's data directory to the PreProcessor,
# then print a short summary of the result.
lifted_dataset = PreProcessor(dataset, transform_config, loader.data_dir)
describe_data(lifted_dataset)

In [None]:
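# # %%

# NOTE(review): everything below in this cell is disabled legacy helper code.
# Before re-enabling it, be aware that it calls `get_dataset`, whose import is
# commented out in the first cell, and that it uses `dfc.DirectedFlagComplex`
# although `dfc` is imported above as an alias of `DirectedFlagComplex` itself.
# Presumably those calls should become `dfc(...)`; confirm against sp_lifting.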

# def create_csv_datasets(dataset_name, dataset_dir="../dataset/"):
#     dataset, evaluator = get_dataset(dataset_name, dataset_dir)
#     source = dataset.edge_index[0].tolist()  # source
#     target = dataset.edge_index[1].tolist()  # target

#     csv_file_name = "./dataset/vis/original/" + dataset_name + ".csv"

#     with open(csv_file_name, "w", newline="") as file:
#         writer = csv.writer(file)

#         # Write the list content as rows
#         for a, b in zip(source, target):
#             writer.writerow([a, b])

#     print(f'CSV file "{csv_file_name}" created successfully.')


# def create_csv_condensations(dataset_name):

#     dataset_digraph = create_digraph_from_dataset(dataset_name)
#     condensation_digraph = nx.condensation(dataset_digraph)

#     condensation_digraph_edges = list(condensation_digraph.edges)

#     if dataset_name == "cora_ml":
#         dataset_name = "cora-ml"

#     if dataset_name == "citeseer_full":
#         dataset_name = "citeseer-full"

#     csv_file_name = "./dataset/vis/condensations/" + dataset_name + "-condensation.csv"

#     with open(csv_file_name, "w", newline="") as file:
#         writer = csv.writer(file)

#         # Write the list content as rows
#         for e in condensation_digraph_edges:
#             writer.writerow([e[0], e[1]])

#     print(f'CSV file "{csv_file_name}" created successfully.')


# def create_csv_condensations_from_dataset():

#     dataset_list = [
#         CHAMELEON,
#         ROMAN_EMPIRE,
#         SQUIRREL,
#         OGBN_ARXIV,
#         CORA_ML,
#         CITESEER_FULL,
#         ARXIV_YEAR,
#     ]

#     for dataset in dataset_list:
#         create_csv_datasets(dataset)
#         create_csv_condensations(dataset)


# def flagser_count(dataset_name, complex_dim=2, num_threads=4):
#     dataset_digraph = create_digraph_from_dataset(dataset_name)

#     sparse_adjacency_matrix = nx.to_scipy_sparse_array(dataset_digraph, format="csr")

#     start_time = time.time()

#     X = pfc.flagser_count(
#         sparse_adjacency_matrix,
#         threads=num_threads,
#         return_simplices=True,
#         max_dim=complex_dim,
#     )
#     end_time = time.time()
#     print("Time elapsed: ", end_time - start_time)
#     return X


# def create_digraph_from_dataset(dataset_name, dataset_dir="../dataset/"):
#     dataset, evaluator = get_dataset(dataset_name, dataset_dir)
#     dataset_digraph = nx.DiGraph()
#     dataset_digraph.add_edges_from(
#         list(zip(dataset.edge_index[0].tolist(), dataset.edge_index[1].tolist()))
#     )
#     print("Number of nodes: ", dataset_digraph.number_of_nodes(), " Number of edges: ", dataset_digraph.number_of_edges())
#     return dataset_digraph


# def create_flag_complex_from_dataset(dataset_name, dataset_dir, complex_dim=2):
#     dataset_digraph = create_digraph_from_dataset(dataset_name, dataset_dir)
#     flag_complex = dfc.DirectedFlagComplex(dataset_digraph, complex_dim)
#     return flag_complex


# def create_condensed_digraph_from_dataset(dataset_name):
#     dataset_digraph = create_digraph_from_dataset(dataset_name)
#     condensation_digraph = nx.condensation(dataset_digraph)
#     return condensation_digraph


# def dataset_stats(dataset_name, complex_dim=2):
#     G = create_digraph_from_dataset(dataset_name)
#     FlG = dfc.DirectedFlagComplex(G, complex_dim)
#     for d in range(complex_dim + 1):
#         print(dataset_name + " number of " + str(d) + "-simplices", len(FlG.complex[d]))
#     return FlG


# def qij(
#     dataset_name, dataset_dir, q, i, j, complex_dim=2, chunk_size=1024, save_path=None
# ):
#     FlG = create_flag_complex_from_dataset(dataset_name, dataset_dir, complex_dim)
#     return FlG.qij(q, i, j, chunk_size, save_path)


# if __name__ == "__main__":
#     DATASET_DIR = "../../dataset/"
#     qij(
#         WISCONSIN,
#         DATASET_DIR,
#         1,
#         0,
#         2,
#         complex_dim=2,
#         chunk_size=100,
#         # save_path="../../dataset/cornell/102.pt",
#         save_path=None
#     )
#     pass