### GraphEm installation

In [None]:
# Fetch the graphem-rapids sources from GitHub into the current directory
!git clone https://github.com/sashakolpakov/graphem-rapids.git
# need to be in the repo root
%cd graphem-rapids
# Editable install of the package found in the current directory (the repo root)
%pip install -e .

In [None]:
# Installs necessary for comparison and benchmarking
!pip install umap-learn trimap pacmap

### Imports

In [None]:
import plotly.io as pio
# Plotly settings: either interactive images or stills
# NOTE(review): 'colab' presumably targets Google Colab; another renderer name is
# likely needed when this notebook runs elsewhere -- confirm against the Plotly docs.
pio.renderers.default = 'colab'  # interactive plots

In [None]:
import numpy as np
import networkx as nx
from time import time

import graphem_rapids as gr
from graphem_rapids import create_graphem
from graphem_rapids.benchmark import benchmark_correlations
from graphem_rapids.visualization import report_full_correlation_matrix

import loguru

# Module-level alias for the shared loguru logger
logger = loguru.logger
# NOTE(review): per the loguru docs, `level(name)` called with only a name looks up
# and returns that level; it does not look like it changes what gets emitted.
# If the intent is a DEBUG threshold, that is set on a sink
# (`logger.add(sink, level="DEBUG")`) -- confirm.
loguru.logger.level("DEBUG")

### Testing the GraphEm layout

In [None]:
"""
Test the layout on graphs generated above.
"""

def graph_test(graph_generator,
               graph_params,
               dim=3,
               num_iterations=20,
               L_min=10.0,
               k_attr=0.5,
               k_inter=0.1,
               n_neighbors=15,
               edge_width=0.5,
               node_size=8,
               sample_size=512,
               batch_size=None,
               backend='auto',
               logger=None):
    """
    Generate a graph, embed it with GraphEm and display the layout before and
    after running the layout algorithm.

    Parameters:
        graph_generator: callable invoked as ``graph_generator(**graph_params)``;
            its result must provide ``shape``, ``sum(axis=1)`` and ``nnz``
            (presumably a SciPy sparse adjacency matrix -- confirm against the
            generators that are passed in).
        graph_params: dict of keyword arguments for ``graph_generator``.
        dim: forwarded to ``gr.create_graphem`` as ``n_components``.
        num_iterations: forwarded to ``run_layout``.
        L_min, k_attr, k_inter, n_neighbors, sample_size, batch_size, backend:
            forwarded unchanged to ``gr.create_graphem``.
        edge_width, node_size: forwarded to ``display_layout``.
        logger: optional logger exposing ``debug`` and ``info``; also handed to
            ``gr.create_graphem`` as ``logger_instance``. With the default
            ``None`` this function emits no log messages of its own.

    Returns:
        None. The two layouts are shown through ``display_layout``.
    """
    # Generate adjacency matrix
    adj = graph_generator(**graph_params)
    n = adj.shape[0]

    # Row sums of the adjacency matrix, rescaled to [0, 1] for use as node colors
    deg = np.array(adj.sum(axis=1)).flatten()
    deg_min = np.min(deg)
    deg_range = np.max(deg) - deg_min
    if deg_range > 0:
        deg_normalized = (deg - deg_min) / deg_range
    else:
        # All degrees are equal (e.g. a regular graph): dividing by the zero
        # range would yield NaN colors, so use one uniform color value instead.
        deg_normalized = np.zeros(deg.shape, dtype=float)

    # NOTE(review): nnz counts stored non-zero entries; if the adjacency matrix
    # is symmetric, each undirected edge is counted twice here -- confirm.
    edges_count = adj.nnz
    if logger:
        logger.debug(f"Vertices {n}, edges {edges_count}")

    # Create graphem embedder
    gm = gr.create_graphem(
        adjacency=adj,
        n_components=dim,
        backend=backend,
        L_min=L_min,
        k_attr=k_attr,
        k_inter=k_inter,
        n_neighbors=n_neighbors,
        sample_size=sample_size,
        batch_size=batch_size,
        verbose=True,
        logger_instance=logger
    )

    # Display initial layout
    if logger:
        logger.info("Initial layout")
    gm.display_layout(edge_width=edge_width, node_size=node_size, node_colors=deg_normalized)

    # Run layout algorithm; its return value is not needed here
    _ = gm.run_layout(num_iterations)

    # Display final layout
    if logger:
        logger.info("Final layout")
    gm.display_layout(edge_width=edge_width, node_size=node_size, node_colors=deg_normalized)

#### Bipartite graph (complete)

In [None]:
"""
Bipartite graph
"""
graph_test(
    graph_generator=gr.generate_complete_bipartite_graph,
    graph_params={"n_top": 50, "n_bottom": 100},
    dim=2,
    L_min=10,
    num_iterations=10,
)
#

#### Balanced tree

In [None]:
"""
Balanced tree
"""
graph_test(
    graph_generator=gr.generate_balanced_tree,
    graph_params={"r": 3, "h": 8},
    dim=2,
    sample_size=2048,
    num_iterations=10,
)
#

#### Grid graph / road network graph

In [None]:
"""
Grid (road network) graph
"""
graph_test(
    graph_generator=gr.generate_road_network,
    graph_params={"width": 30, "height": 40},
    dim=2,
    num_iterations=20,
)
#

#### Power law cluster graph

In [None]:
"""
Test power cluster graph
"""
graph_test(
    graph_generator=gr.generate_power_cluster,
    graph_params={"n": 1000, "m": 5, "p": 0.75},
    dim=2,
    sample_size=2048,
    num_iterations=20,
)

#### Barabási–Albert graph

In [None]:
"""
Test Barabási–Albert graph
"""
graph_test(
    graph_generator=gr.generate_ba,
    graph_params={"n": 1000, "m": 5},
    dim=2,
    L_min=20,
    sample_size=2048,
    num_iterations=20,
)

#### Stochastic Block Model graph

In [None]:
"""
Test SBM graph
"""
graph_test(
    graph_generator=gr.generate_sbm,
    graph_params={"n_per_block": 100, "num_blocks": 4, "p_in": 0.15, "p_out": 0.01},
    dim=2,
    L_min=60,
    sample_size=1024,
    num_iterations=20,
)

#### Watts–Strogatz "small world" graph

In [None]:
"""
Watts-Strogaz graph
"""
graph_test(
    graph_generator=gr.generate_ws,
    graph_params={"n": 1000, "k": 6, "p": 0.3},
    dim=2,
    L_min=60,
    sample_size=1024,
    num_iterations=20,
)
#

#### Erdős–Rényi random graph

In [None]:
"""
Erdos-Renyi graph
"""
graph_test(
    graph_generator=gr.generate_er,
    graph_params={"n": 1000, "p": 0.1},
    dim=2,
    L_min=40,
    sample_size=2048,
    num_iterations=20,
)
#

### Centrality measures correlation benchmarks

#### Erdős–Rényi random graph

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_er, {"n": 1000, "p": 0.1},
    dim=2, L_min=40, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_er, {"n": 1000, "p": 0.1},
    dim=3, L_min=40, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_er, {"n": 1000, "p": 0.1},
    dim=4, L_min=40, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_er, {"n": 1000, "p": 0.1},
    dim=10, L_min=40, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Watts–Strogatz "small world" graph

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ws, {"n": 1000, "k": 6, "p": 0.3},
    dim=2, L_min=60, sample_size=1024, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ws, {"n": 1000, "k": 6, "p": 0.3},
    dim=3, L_min=60, sample_size=1024, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ws, {"n": 1000, "k": 6, "p": 0.3},
    dim=4, L_min=60, sample_size=1024, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ws, {"n": 1000, "k": 6, "p": 0.3},
    dim=10, L_min=60, sample_size=1024, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Stochastic Block Model graph

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_sbm,
    {"n_per_block": 100, "num_blocks": 4, "p_in": 0.15, "p_out": 0.01},
    dim=2, L_min=60, sample_size=1024, num_iterations=80,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_sbm,
    {"n_per_block": 100, "num_blocks": 4, "p_in": 0.15, "p_out": 0.01},
    dim=3, L_min=60, sample_size=1024, num_iterations=80,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_sbm,
    {"n_per_block": 100, "num_blocks": 4, "p_in": 0.15, "p_out": 0.01},
    dim=4, L_min=60, sample_size=1024, num_iterations=80,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_sbm,
    {"n_per_block": 100, "num_blocks": 4, "p_in": 0.15, "p_out": 0.01},
    dim=10, L_min=60, sample_size=1024, num_iterations=80,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Barabási–Albert graph

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ba, {"n": 1000, "m": 5},
    dim=2, L_min=20, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ba, {"n": 1000, "m": 5},
    dim=3, L_min=20, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ba, {"n": 1000, "m": 5},
    dim=4, L_min=20, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_ba, {"n": 1000, "m": 5},
    dim=10, L_min=20, sample_size=2048, num_iterations=30,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Power law cluster graph

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_power_cluster, {"n": 1000, "m": 5, "p": 0.75},
    dim=2, sample_size=2048, num_iterations=20,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_power_cluster, {"n": 1000, "m": 5, "p": 0.75},
    dim=3, sample_size=2048, num_iterations=20,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_power_cluster, {"n": 1000, "m": 5, "p": 0.75},
    dim=4, sample_size=2048, num_iterations=20,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_power_cluster, {"n": 1000, "m": 5, "p": 0.75},
    dim=10, sample_size=2048, num_iterations=20,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Grid graph / road network graph

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_road_network, {"width": 30, "height": 40},
    dim=2, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_road_network, {"width": 30, "height": 40},
    dim=3, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_road_network, {"width": 30, "height": 40},
    dim=4, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_road_network, {"width": 30, "height": 40},
    dim=10, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Random Delaunay triangulation

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_delaunay_triangulation, {"n": 250},
    dim=2, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_delaunay_triangulation, {"n": 250},
    dim=3, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_delaunay_triangulation, {"n": 250},
    dim=4, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_delaunay_triangulation, {"n": 250},
    dim=10, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Bipartite graph (not necessarily complete)

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_bipartite_graph, {"n_top": 50, "n_bottom": 100},
    dim=2, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_bipartite_graph, {"n_top": 50, "n_bottom": 100},
    dim=3, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_bipartite_graph, {"n_top": 50, "n_bottom": 100},
    dim=4, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_bipartite_graph, {"n_top": 50, "n_bottom": 100},
    dim=10, L_min=20, num_iterations=5,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

#### Balanced tree

In [None]:
"""
Embedding in dimension 2
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_balanced_tree, {"r": 3, "h": 8},
    dim=2, sample_size=2048, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 3
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_balanced_tree, {"r": 3, "h": 8},
    dim=3, sample_size=2048, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 4
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_balanced_tree, {"r": 3, "h": 8},
    dim=4, sample_size=2048, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

In [None]:
"""
Embedding in dimension 10
"""

# Run the correlation benchmark for this graph model
results = benchmark_correlations(
    gr.generate_balanced_tree, {"r": 3, "h": 8},
    dim=10, sample_size=2048, num_iterations=60,
)

# Report the full correlation matrix; the result entries are passed positionally
corr_matrix = report_full_correlation_matrix(
    *(results[key] for key in (
        'radii', 'degree', 'betweenness', 'eigenvector',
        'pagerank', 'closeness', 'node_load',
    ))
)

### Real-world datasets

#### General Relativity and Quantum Cosmology collaboration network

In [None]:
# need to be in graphem-rapids/examples
# (the path is relative, so this only works while the working directory is the repo root)
%cd examples

In [None]:
from real_world_datasets_example import *

In [None]:
analyze_dataset(
    'snap-ca-GrQc',
    sample_size=None,
    dim=3,
    num_iterations=30,
)

In [None]:
# Best effort: report a failure as text so the remaining cells can still run
try:
    analyze_dataset(
        'snap-ca-GrQc',
        sample_size=None,
        dim=4,
        num_iterations=30,
    )
except Exception as e:
    print(e)

In [None]:
# Best effort: report a failure as text so the remaining cells can still run
try:
    analyze_dataset(
        'snap-ca-GrQc',
        sample_size=None,
        dim=6,
        num_iterations=30,
    )
except Exception as e:
    print(e)

#### Social Circles: Facebook

In [None]:
analyze_dataset(
    'snap-facebook_combined',
    sample_size=None,
    dim=3,
    num_iterations=30,
)

In [None]:
# Best effort: report a failure as text so the remaining cells can still run
try:
    analyze_dataset(
        'snap-facebook_combined',
        sample_size=None,
        dim=4,
        num_iterations=30,
    )
except Exception as e:
    print(e)

In [None]:
# Best effort: report a failure as text so the remaining cells can still run
try:
    analyze_dataset(
        'snap-facebook_combined',
        sample_size=None,
        dim=6,
        num_iterations=30,
    )
except Exception as e:
    print(e)

#### Wikipedia vote network

In [None]:
analyze_dataset(
    'snap-wiki-vote',
    sample_size=None,
    dim=3,
    num_iterations=30,
)

In [None]:
# Best effort: report a failure as text so the remaining cells can still run
try:
    analyze_dataset(
        'snap-wiki-vote',
        sample_size=None,
        dim=4,
        num_iterations=30,
    )
except Exception as e:
    print(e)

In [None]:
# Best effort: report a failure as text so the remaining cells can still run
try:
    analyze_dataset(
        'snap-wiki-vote',
        sample_size=None,
        dim=6,
        num_iterations=30,
    )
except Exception as e:
    print(e)

### Node influence maximization

In [None]:
from time import time
from graphem_rapids.influence import graphem_seed_selection, greedy_seed_selection, ndlib_estimated_influence

In [None]:
import ndlib.models.ModelConfig as mc
import ndlib.models.epidemics as ep

def ndlib_estimated_influence(G, seeds, p=0.1, iterations_count=200):
    """
    Run NDlib's Independent Cascades model on graph G, starting with the given seeds,
    and return the estimated final influence (number of nodes in state 2) and
    the number of iterations executed.

    Parameters:
        G: graph handed to ``ep.IndependentCascadesModel``; its ``edges()`` are
            each given the same threshold.
        seeds: iterable of nodes of G that are infected at the start.
        p: threshold assigned to every edge.
        iterations_count: number of iterations requested from ``iteration_bunch``.

    Returns:
        influence: count of nodes in state 2 reported by the last iteration
            (0 if no iteration was run).
        iterations: the number of iteration records returned by NDlib.
    """
    model = ep.IndependentCascadesModel(G)
    config = mc.Configuration()
    # Start the cascade from the requested seed set. Configuring a random
    # 'fraction_infected' instead would make the result independent of `seeds`.
    config.add_model_initial_configuration("Infected", list(seeds))
    for e in G.edges():
        config.add_edge_configuration("threshold", e, p)
    model.set_initial_status(config)
    sim_iterations = model.iteration_bunch(iterations_count)
    if not sim_iterations:
        # No iteration record to read a node count from
        return 0, 0
    final_count = sim_iterations[-1]['node_count']
    influence = final_count.get(2, 0)
    return influence, len(sim_iterations)

def greedy_seed_selection(G, k, p=0.1, iterations_count=200):
    """
    Greedy seed selection using NDlib influence estimation.
    For each candidate node evaluation, it calls NDlib's simulation and accumulates
    the total number of iterations used across all evaluations.

    Parameters:
        G: graph exposing ``nodes()``; passed on to ``ndlib_estimated_influence``.
        k: number of seeds requested. If G has fewer than k nodes, every node
            is selected and the result is shorter than k.
        p: forwarded to ``ndlib_estimated_influence``.
        iterations_count: forwarded to ``ndlib_estimated_influence``.

    Returns:
        seeds: the selected seed set (list of nodes)
        total_iters: the total number of NDlib iterations run during selection.
    """
    seeds = []
    candidate_nodes = set(G.nodes())
    total_iters = 0
    for _ in range(k):
        if not candidate_nodes:
            # Nothing left to pick: stop instead of appending None and
            # failing on candidate_nodes.remove(None).
            break
        best_candidate = None
        best_spread = -1
        # Evaluate each candidate's spread when added to the current seed set.
        for node in candidate_nodes:
            current_seeds = seeds + [node]
            spread, iters = ndlib_estimated_influence(G, current_seeds, p=p, iterations_count=iterations_count)
            total_iters += iters  # accumulate iterations used for this simulation
            if spread > best_spread:
                best_spread = spread
                best_candidate = node
        seeds.append(best_candidate)
        candidate_nodes.remove(best_candidate)
    return seeds, total_iters

#### Synthetic dataset benchmark (Erdős–Rényi random graph)

In [None]:
# Settings for the Erdos-Renyi benchmark
n_nodes, p_edge = 256, 0.05   # number of nodes, probability of edge
ic_prob = 0.1                 # node activation probability
k_seeds = 10                  # number of seeds
sample_size = 10              # sample size for benchmark stats


def run_benchmark_synthetic():
    """Run GraphEm and greedy seed selection once on a seeded Erdos-Renyi graph.

    Returns the tuple (gm_seeds, gm_influence, gm_iter_count, gm_time,
    greedy_seeds, greedy_influence, greedy_iters, greedy_time).
    """
    # Sample graph (generator seed fixed at 42) and its NetworkX counterpart,
    # the latter being what the influence estimation works on
    er_adjacency = gr.generate_er(n_nodes, p_edge, seed=42)
    nx_graph = nx.from_scipy_sparse_array(er_adjacency)

    # --- GraphEm influence maximization ---
    embedder_options = {
        "n_components": 2,
        "L_min": 10.0,
        "k_attr": 0.5,
        "k_inter": 0.1,
        "n_neighbors": 15,
        "sample_size": 256,
        "batch_size": None,
        "verbose": True,
    }
    graph_embedder = gr.GraphEmbedderPyTorch(adjacency=er_adjacency, **embedder_options)

    # Only the seed selection is timed; building the embedder is not
    tic = time()
    gm_seeds = gr.graphem_seed_selection(graph_embedder, k_seeds, num_iterations=10)
    gm_time = time() - tic
    gm_influence, gm_iter_count = gr.ndlib_estimated_influence(
        nx_graph, gm_seeds, p=ic_prob, iterations_count=200
    )

    # --- Greedy influence maximization ---
    tic = time()
    greedy_seeds, greedy_iters = gr.greedy_seed_selection(
        nx_graph, k_seeds, p=ic_prob, iterations_count=200
    )
    greedy_time = time() - tic
    greedy_influence, final_iters = gr.ndlib_estimated_influence(
        nx_graph, greedy_seeds, p=ic_prob, iterations_count=200
    )
    # The final evaluation run counts towards the greedy iteration total
    greedy_iters += final_iters

    return (
        gm_seeds, gm_influence, gm_iter_count, gm_time,
        greedy_seeds, greedy_influence, greedy_iters, greedy_time,
    )

# One entry per benchmark repetition is appended to each of these lists
gm_seeds_stats = []
gm_influence_stats = []
gm_iter_count_stats = []
gm_time_stats = []
greedy_seeds_stats = []
greedy_influence_stats = []
greedy_iters_stats = []
greedy_time_stats = []

# A named loop variable is used because its value is printed; `_` is kept for
# throwaway values (and is the last-result variable in IPython).
for trial in range(sample_size):

    print("Iteration", trial + 1, "of", sample_size)

    (gm_seeds, gm_influence, gm_iter_count, gm_time,
     greedy_seeds, greedy_influence, greedy_iters, greedy_time) = run_benchmark_synthetic()

    gm_seeds_stats.append(gm_seeds)
    gm_influence_stats.append(gm_influence)
    gm_iter_count_stats.append(gm_iter_count)
    gm_time_stats.append(gm_time)

    greedy_seeds_stats.append(greedy_seeds)
    greedy_influence_stats.append(greedy_influence)
    greedy_iters_stats.append(greedy_iters)
    greedy_time_stats.append(greedy_time)

# Convert to arrays so that mean() and std() are available below
gm_seeds_stats = np.array(gm_seeds_stats)
gm_influence_stats = np.array(gm_influence_stats)
gm_iter_count_stats = np.array(gm_iter_count_stats)
gm_time_stats = np.array(gm_time_stats)

greedy_seeds_stats = np.array(greedy_seeds_stats)
greedy_influence_stats = np.array(greedy_influence_stats)
greedy_iters_stats = np.array(greedy_iters_stats)
greedy_time_stats = np.array(greedy_time_stats)

# Mean and standard deviation over the repetitions, per method
print("\nGraphEm Embedding Method:")
print("  Estimated Influence Spread:", gm_influence_stats.mean(), "(sigma)", gm_influence_stats.std())
print("  NDlib Iterations:", gm_iter_count_stats.mean(), "(sigma)", gm_iter_count_stats.std())
print("  Runtime (s):", gm_time_stats.mean(), "(sigma)", gm_time_stats.std())

print("\nGreedy Influence Maximization Method:")
print("  Estimated Influence Spread:", greedy_influence_stats.mean(), "(sigma)", greedy_influence_stats.std())
print("  NDlib Iterations:", greedy_iters_stats.mean(), "(sigma)", greedy_iters_stats.std())
print("  Runtime (s):", greedy_time_stats.mean(), "(sigma)", greedy_time_stats.std())

#### Real-world dataset benchmark (SNAP arXiv collaboration network)

In [None]:
from collections import deque

# Benchmark parameters for the dataset
ic_prob = 0.1  # node activation probability
k_seeds = 10   # number of seeds
sample_size = 10  # sample size for benchmark stats
subsample_nodes = 256  # subsample to this many nodes for faster greedy

# subsample before running iterations
print("Loading and subsampling dataset...")
vertices, edges = gr.load_dataset('snap-ca-GrQc')

# Create NetworkX graph and take the largest connected component,
# then relabel its nodes as consecutive integers
G_full = nx.Graph()
G_full.add_nodes_from(vertices)
G_full.add_edges_from(edges)
G_full = G_full.subgraph(max(nx.connected_components(G_full), key=len)).copy()
G_full = nx.convert_node_labels_to_integers(G_full)

# Better subsampling: Use BFS from a random node to get a connected subgraph
np.random.seed(42)  # Fixed seed for reproducibility
start_node = np.random.choice(list(G_full.nodes()))

# BFS to collect at most subsample_nodes connected nodes.
# A deque gives O(1) pops from the front; list.pop(0) shifts the whole list.
visited = {start_node}
queue = deque([start_node])

while queue and len(visited) < subsample_nodes:
    node = queue.popleft()
    neighbors = list(G_full.neighbors(node))
    # Visit the neighbors in random order (driven by the seed set above)
    np.random.shuffle(neighbors)
    for neighbor in neighbors:
        if neighbor not in visited and len(visited) < subsample_nodes:
            visited.add(neighbor)
            queue.append(neighbor)

# Create subgraph from visited nodes
G_sub = G_full.subgraph(visited).copy()
G_sub = nx.convert_node_labels_to_integers(G_sub)

print(f"Subsampled graph: {G_sub.number_of_nodes()} nodes, {G_sub.number_of_edges()} edges")
print()

def run_benchmark_snap():
    """Run one iteration of the benchmark on the fixed subsampled graph.

    Reads the module-level ``G_sub``, ``k_seeds`` and ``ic_prob``.

    Returns:
        tuple: ``(gm_seeds, gm_influence, gm_iter_count, gm_time,
        greedy_seeds, greedy_influence, greedy_iters, greedy_time)``.
        ``gm_time`` and ``greedy_time`` are elapsed seconds of the respective
        seed-selection call only (embedder construction and the final
        influence estimation are not timed). ``greedy_iters`` is the
        iteration count reported by the greedy selection plus the one from
        the final influence estimation.
    """
    # perf_counter is monotonic, so elapsed times cannot be skewed by
    # wall-clock adjustments the way time.time() differences can
    from time import perf_counter

    # Use the pre-subsampled graph
    G_nx = G_sub

    # Get adjacency matrix for embedder
    adjacency = nx.to_scipy_sparse_array(G_nx, format='csr')

    # ------------------------------
    # GraphEm Influence Maximization
    # ------------------------------
    embedder = gr.GraphEmbedderPyTorch(
        adjacency=adjacency,
        n_components=6,
        L_min=4.0,
        k_attr=0.5,
        k_inter=0.1,
        n_neighbors=15,
        sample_size=512,
        batch_size=1024,
        verbose=False
    )

    start_time = perf_counter()
    gm_seeds = gr.graphem_seed_selection(embedder, k_seeds, num_iterations=10)
    gm_time = perf_counter() - start_time
    gm_influence, gm_iter_count = gr.ndlib_estimated_influence(G_nx, gm_seeds, p=ic_prob, iterations_count=200)

    # -----------------------------
    # Greedy Influence Maximization
    # -----------------------------
    start_time = perf_counter()
    greedy_seeds, greedy_iters = gr.greedy_seed_selection(G_nx, k_seeds, p=ic_prob, iterations_count=200)
    greedy_time = perf_counter() - start_time
    greedy_influence, iters = gr.ndlib_estimated_influence(G_nx, greedy_seeds, p=ic_prob, iterations_count=200)
    greedy_iters += iters  # accumulate iterations used for the final simulation

    return gm_seeds, gm_influence, gm_iter_count, gm_time, greedy_seeds, greedy_influence, greedy_iters, greedy_time

# Per-run results: one list per value returned by run_benchmark_snap()
gm_seeds_stats = []
gm_influence_stats = []
gm_iter_count_stats = []
gm_time_stats = []
greedy_seeds_stats = []
greedy_influence_stats = []
greedy_iters_stats = []
greedy_time_stats = []

# Use a named index: `_` is conventionally a throwaway name and, in
# IPython/Jupyter, assigning it shadows the "last result" variable
for run_idx in range(sample_size):

    print("Iteration", run_idx + 1, "of", sample_size)

    (gm_seeds, gm_influence, gm_iter_count, gm_time,
     greedy_seeds, greedy_influence, greedy_iters, greedy_time) = run_benchmark_snap()

    gm_seeds_stats.append(gm_seeds)
    gm_influence_stats.append(gm_influence)
    gm_iter_count_stats.append(gm_iter_count)
    gm_time_stats.append(gm_time)

    greedy_seeds_stats.append(greedy_seeds)
    greedy_influence_stats.append(greedy_influence)
    greedy_iters_stats.append(greedy_iters)
    greedy_time_stats.append(greedy_time)

# Convert to arrays so mean/std can be taken directly
gm_seeds_stats = np.array(gm_seeds_stats)
gm_influence_stats = np.array(gm_influence_stats)
gm_iter_count_stats = np.array(gm_iter_count_stats)
gm_time_stats = np.array(gm_time_stats)

greedy_seeds_stats = np.array(greedy_seeds_stats)
greedy_influence_stats = np.array(greedy_influence_stats)
greedy_iters_stats = np.array(greedy_iters_stats)
greedy_time_stats = np.array(greedy_time_stats)

# Summary: mean and standard deviation over the sample_size runs
print("\nGraphEm Embedding Method:")
print("  Estimated Influence Spread:", gm_influence_stats.mean(), "(sigma)", gm_influence_stats.std())
print("  NDlib Iterations:", gm_iter_count_stats.mean(), "(sigma)", gm_iter_count_stats.std())
print("  Runtime (s):", gm_time_stats.mean(), "(sigma)", gm_time_stats.std())

print("\nGreedy Influence Maximization Method:")
print("  Estimated Influence Spread:", greedy_influence_stats.mean(), "(sigma)", greedy_influence_stats.std())
print("  NDlib Iterations:", greedy_iters_stats.mean(), "(sigma)", greedy_iters_stats.std())
print("  Runtime (s):", greedy_time_stats.mean(), "(sigma)", greedy_time_stats.std())


### Comparison to other embeddings

In [None]:
import scipy.sparse.linalg as spla
import umap, trimap, pacmap
from scipy.stats import spearmanr
from graphem_rapids.datasets import load_dataset_as_networkx
import plotly.graph_objects as go

In [None]:
#
# Plot the graph embedding
#
def plot_graph_embedding(positions, G, title="Embedding"):
    """
    Show an interactive Plotly scatter of a node embedding.

    Only the first two coordinates of each position are drawn; markers are
    colored by min-max normalized node degree and the hover text shows the
    node index and its raw degree.

    Args:
        positions: np.ndarray (n_nodes, dim)
        G: networkx.Graph
        title: figure title
    """
    # Rows of `positions` are assumed to follow the order of G.degree()
    node_degrees = np.array([deg for _, deg in G.degree()])
    lowest = node_degrees.min()
    # The small epsilon keeps the division finite when all degrees are equal
    degree_color = (node_degrees - lowest) / (node_degrees.max() - lowest + 1e-9)

    hover_labels = [
        f"Node {idx}<br>Degree: {node_deg}"
        for idx, node_deg in enumerate(node_degrees)
    ]

    marker_style = {
        'size': 6,
        'color': degree_color,
        'colorscale': 'Bluered',
        'colorbar': {'title': 'Degree'},
        'showscale': True,
        'line': {'width': 0},
    }

    points = go.Scattergl(
        x=positions[:, 0],
        y=positions[:, 1],
        mode='markers',
        marker=marker_style,
        text=hover_labels,
        hoverinfo='text',
    )

    fig = go.Figure(data=points)

    # Hide the axes and use a plain white canvas
    hidden_axis = {'visible': False}
    fig.update_layout(
        title=title,
        width=800,
        height=700,
        xaxis=hidden_axis,
        yaxis=hidden_axis,
        plot_bgcolor='white',
        margin={'l': 10, 'r': 10, 't': 40, 'b': 10},
    )

    fig.show()

In [None]:
import numpy as np
import networkx as nx
from scipy.stats import spearmanr

def bootstrap_ci(x, y, n_boot=1000, ci=95):
    """
    Bootstraps Spearman correlation confidence interval.

    Paired samples ``(x[i], y[i])`` are resampled with replacement ``n_boot``
    times using NumPy's global random state, and the Spearman correlation is
    computed on each resample.

    Args:
        x: array-like of length n
        y: array-like of length n, paired element-wise with ``x``
        n_boot: number of bootstrap resamples
        ci: confidence level in percent (e.g. 95)
    Returns:
        tuple: ``(mean_corr, (lower, upper))`` - the mean of the bootstrap
        correlations and the percentile confidence interval bounds.
    Raises:
        ValueError: if ``x`` and ``y`` have different lengths.
    """
    # Accept any array-like (lists, tuples, ...): index-array selection below
    # only works on NumPy arrays
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    if len(y) != n:
        raise ValueError(
            f"x and y must have the same length, got {n} and {len(y)}"
        )

    corrs = np.empty(n_boot)
    for b in range(n_boot):
        idx = np.random.choice(n, size=n, replace=True)
        corr, _pval = spearmanr(x[idx], y[idx])
        corrs[b] = corr

    # np.percentile does not need sorted input
    alpha = 100 - ci
    lower = np.percentile(corrs, alpha / 2)
    upper = np.percentile(corrs, 100 - alpha / 2)
    return np.mean(corrs), (lower, upper)

def compute_and_display_correlations(G, positions, bootstrap=True, n_boot=1000):
    """
    Compute radial distances and correlate with various centrality measures.

    The embedding is centered at its mean, each node's distance from that
    center is taken as its radius, and the Spearman correlation between the
    radii and each centrality measure is printed as a table.

    Args:
        G: networkx.Graph
        positions: np.ndarray (n_nodes, dim); rows are assumed to follow the
            node order of G
        bootstrap: if True, also print a bootstrap confidence interval
        n_boot: number of bootstrap resamples passed to bootstrap_ci
    """
    # Out-of-place subtraction: leaves the caller's array untouched and also
    # works for integer-typed positions (in-place `-=` with a float mean fails)
    positions = np.asarray(positions)
    positions = positions - np.mean(positions, axis=0)
    radii = np.linalg.norm(positions, axis=1)

    degree = np.array([d for _, d in G.degree()])

    # Betweenness is only computed for graphs with fewer than 5000 nodes;
    # an all-zero array marks a measure as skipped in the table below
    if G.number_of_nodes() < 5000:
        betweenness = np.array(list(nx.betweenness_centrality(G).values()))
    else:
        betweenness = np.zeros_like(radii)

    try:
        eigenvector = np.array(list(nx.eigenvector_centrality_numpy(G).values()))
    except Exception:
        # Best effort: report the measure as skipped rather than abort, but do
        # not swallow KeyboardInterrupt/SystemExit like a bare `except:` would
        eigenvector = np.zeros_like(radii)

    pagerank = np.array(list(nx.pagerank(G).values()))
    closeness = np.array(list(nx.closeness_centrality(G).values()))

    try:
        node_load = np.array(list(nx.load_centrality(G).values()))
    except Exception:
        # Same best-effort fallback as for eigenvector centrality
        node_load = np.zeros_like(radii)

    measures = {
        "Degree": degree,
        "Betweenness": betweenness,
        "Eigenvector": eigenvector,
        "PageRank": pagerank,
        "Closeness": closeness,
        "Node Load": node_load
    }

    print("\nCorrelation of Radial Distance with Centralities:")
    print("-" * 80)
    print(f"{'Centrality':<15} {'ρ':>7} {'p-value':>12} {'Confidence Interval':>25}")
    print("-" * 80)

    for name, values in measures.items():
        if np.all(values == 0):
            print(f"{name:<15} {'N/A (skipped)':>50}")
            continue
        corr, pval = spearmanr(radii, values)
        if bootstrap:
            # Only the interval is displayed; the bootstrap mean is not used
            _, (low, high) = bootstrap_ci(radii, values, n_boot=n_boot)
            print(f"{name:<15} {corr:7.3f} {pval:12.3g} {'':5} [{low:6.3f}, {high:6.3f}]")
        else:
            print(f"{name:<15} {corr:7.3f} {pval:12.3g}")


#### Laplacian eigenmaps

In [None]:
#
# Laplacian embedding
#
def laplacian_embedding(G, dim=2):
    """
    Compute a Laplacian eigenmap embedding of a NetworkX graph.

    The coordinates are eigenvectors of the normalized graph Laplacian
    belonging to its smallest eigenvalues, with the first one (smallest
    eigenvalue) dropped.

    Args:
        G: networkx.Graph
        dim: target embedding dimension
    Returns:
        np.ndarray: shape (n_nodes, d), where d = min(dim, n_nodes - 2)
        because at most n_nodes - 1 eigenpairs are requested
    """
    L = nx.normalized_laplacian_matrix(G)

    # Eigen-decomposition (smallest eigenvalues); eigsh needs k < n_nodes
    k = min(dim + 1, G.number_of_nodes() - 1)
    eigvals, eigvecs = spla.eigsh(L, k=k, which='SM')

    # Order eigenpairs by ascending eigenvalue explicitly instead of relying
    # on the order in which the solver returns them
    order = np.argsort(eigvals)
    eigvecs = eigvecs[:, order]

    # Skip the first eigenvector (the trivial one, eigenvalue ~0, when G is
    # connected)
    return eigvecs[:, 1:dim+1]

In [None]:
# Sample an Erdos-Renyi random graph: 1000 nodes, edge probability 0.05,
# fixed seed so the graph is the same on every run
G = nx.erdos_renyi_graph(n=1000, p=0.05, seed=0)

# Laplacian eigenmap of the graph, 2D for visualization
positions = laplacian_embedding(G, dim=2)

# Print how radial distance in the embedding correlates with node centralities
compute_and_display_correlations(G, positions, bootstrap=True, n_boot=500)

# Draw the embedding
plot_graph_embedding(positions, G, title="Laplacian embedding")

#### UMAP

In [None]:
# Fit a 2-component UMAP (Euclidean metric) on the current `positions` array
# and keep the transformed coordinates
reducer = umap.UMAP(n_components=2, metric='euclidean', verbose=False)
umap_embedding = reducer.fit_transform(positions)

# Print how radial distance in the UMAP output correlates with node centralities
compute_and_display_correlations(G, umap_embedding, bootstrap=True, n_boot=500)

# Draw the embedding
plot_graph_embedding(umap_embedding, G, title="UMAP embedding")

#### TriMAP

In [None]:
# Fit a 2-dimensional TriMAP (Euclidean distance) on the current `positions`
# array and keep the transformed coordinates
reducer = trimap.TRIMAP(n_dims=2, distance='euclidean', verbose=False)
trimap_embedding = reducer.fit_transform(positions)

# Print how radial distance in the TriMAP output correlates with node centralities
compute_and_display_correlations(G, trimap_embedding, bootstrap=True, n_boot=500)

# Draw the embedding
plot_graph_embedding(trimap_embedding, G, title="TriMAP embedding")

#### PaCMAP

In [None]:
# Fit a 2-component PaCMAP (Euclidean distance) on the current `positions`
# array and keep the transformed coordinates
reducer = pacmap.PaCMAP(n_components=2, distance='euclidean', verbose=False)
pacmap_embedding = reducer.fit_transform(positions)

# Print how radial distance in the PaCMAP output correlates with node centralities
compute_and_display_correlations(G, pacmap_embedding, bootstrap=True, n_boot=500)

# Draw the embedding
plot_graph_embedding(pacmap_embedding, G, title="PaCMAP embedding")