In [8]:
import sys
sys.path.append("..")
import torch
from src.data.graph import Graph

#### Graphs
- data/graph_classification/graph_mutag_with_node_num.pt
- data/graph_classification/graph_enzymes_with_node_num.pt

In [9]:
# NOTE(review): the variable is named `mutag_graphs`, but the path below points
# at the ENZYMES file (graph_enzymes_with_node_num.pt). Every later cell that
# uses `mutag_graphs` / `mutag_graphs_train` therefore operates on whatever that
# file contains. Confirm which dataset was intended and rename or re-point.
mutag_graphs = torch.load("../data/graph_classification/graph_enzymes_with_node_num.pt")

In [10]:
# Keep only the "train" entry of the loaded object for the inspection cells below.
mutag_graphs_train = mutag_graphs["train"]

In [11]:
# Display the first element of the training split to inspect its structure.
mutag_graphs_train[0]

{'adj': tensor(indices=tensor([[ 0,  1,  9, 11, 27,  0,  1,  9, 11, 27,  2,  3, 15, 17,
                         18, 27,  2,  3, 15, 17, 18, 27,  4,  5,  6, 15, 17, 26,
                         27,  4,  5,  6,  7, 15, 27,  4,  5,  6, 26, 27,  5,  7,
                         13, 26, 27,  8, 12, 13, 14, 16, 27,  0,  1,  9, 11, 21,
                         25, 27, 10, 18, 19, 23, 27,  0,  1,  9, 11, 21, 25, 27,
                          8, 12, 14, 16, 27,  7,  8, 13, 14, 16, 26, 27,  8, 12,
                         13, 14, 16, 27,  2,  3,  4,  5, 15, 17, 27,  8, 12, 13,
                         14, 16, 27,  2,  3,  4, 15, 17, 27,  2,  3, 10, 18, 27,
                         10, 19, 20, 22, 23, 27, 19, 20, 22, 23, 24, 27,  9, 11,
                         21, 25, 27, 19, 20, 22, 23, 24, 27, 10, 19, 20, 22, 23,
                         27, 20, 22, 24, 25, 27,  9, 11, 21, 24, 25, 27,  4,  6,
                          7, 13, 26, 27,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
                     

In [12]:
len(mutag_graphs_train)  # number of graphs in the "train" entry

360

In [13]:
def get_graphs_stats(graphs):
    """Print summary statistics for a collection of graphs.

    Args:
        graphs: Non-empty sequence of graph dicts. Each dict must provide
            ``"adj"`` (an object supporting ``.coalesce().indices()``, e.g. a
            sparse COO tensor) and ``"node_features"`` (an object whose
            ``shape`` is read as ``(num_nodes, feature_dim)``).

    Prints the number of graphs, the average node and edge counts (rounded to
    two decimals) and the node feature dimension of the first graph.
    Returns ``None``.

    Raises:
        ZeroDivisionError: if ``graphs`` is empty.
    """
    def _get_graph_statistics(graph):
        # One column of the coalesced index matrix per stored adjacency entry,
        # so the column count is the edge count. An edge stored in both
        # directions is counted twice.
        num_edges = graph["adj"].coalesce().indices().shape[1]
        num_nodes = graph["node_features"].shape[0]
        return num_nodes, num_edges

    stats = [_get_graph_statistics(graph) for graph in graphs]
    avg_num_nodes = round(sum(s[0] for s in stats) / len(stats), 2)
    avg_num_edges = round(sum(s[1] for s in stats) / len(stats), 2)
    # The feature dimension is read from the first graph only.
    node_feature_dim = graphs[0]["node_features"].shape[1]
    print(f"Num graphs = {len(graphs)}")
    print(f"Avg. num nodes = {avg_num_nodes}")
    print(f"Avg. num edges = {avg_num_edges}")
    # Fixed label: this line used to be printed as a second "Avg. num edges".
    print(f"Node feature dim = {node_feature_dim}")
    

In [14]:
# Print the summary statistics for the training split selected above.
get_graphs_stats(mutag_graphs_train)

Num graphs = 360
Avg. num nodes = 33.27
Avg. num edges = 221.19
Avg. num edges = 22


In [15]:
import sys
sys.path.append("..")
import torch

def get_graphs_stats(graphs):
    """Summarise a list of graphs: print the figures and return them.

    Args:
        graphs: List of graph dictionaries containing 'adj' and 'node_features'

    Returns:
        Dict with keys 'num_graphs', 'avg_nodes', 'avg_edges' and
        'feature_dim'. The averages are rounded to two decimals and the
        feature dimension is read from the first graph.
    """
    graph_count = len(graphs)

    # Accumulate per-graph node and edge counts in a single pass.
    node_total = 0
    edge_total = 0
    for item in graphs:
        edge_index = item["adj"].coalesce().indices()
        edge_total += len(edge_index.T.tolist())
        node_total += item["node_features"].shape[0]

    summary = {
        'num_graphs': graph_count,
        'avg_nodes': round(node_total / graph_count, 2),
        'avg_edges': round(edge_total / graph_count, 2),
        'feature_dim': graphs[0]["node_features"].shape[1],
    }

    print(f"Number of graphs: {summary['num_graphs']}")
    print(f"Average number of nodes: {summary['avg_nodes']}")
    print(f"Average number of edges: {summary['avg_edges']}")
    print(f"Node feature dimension: {summary['feature_dim']}")
    print("-" * 50)

    return summary

# MUTAG: load the saved object and summarise its "train" entry.
# NOTE(review): torch.load may unpickle arbitrary Python objects (depending on
# the PyTorch version / `weights_only` setting); only load trusted files.
print("MUTAG Dataset Analysis:")
mutag_graphs = torch.load("../data/graph_classification/graph_mutag_with_node_num.pt")
mutag_stats = get_graphs_stats(mutag_graphs["train"])

# ENZYMES: same procedure on the second file.
print("\nENZYMES Dataset Analysis:")
enzymes_graphs = torch.load("../data/graph_classification/graph_enzymes_with_node_num.pt")
enzymes_stats = get_graphs_stats(enzymes_graphs["train"])

# Side-by-side table: a 20-wide left-aligned label column followed by two
# 15-wide right-aligned value columns; averages are shown with two decimals.
print("\nSummary Statistics for Training Split:")
print(60 * "-")
print("{:<20} {:>15} {:>15}".format("Metric", "MUTAG", "ENZYMES"))
print(60 * "-")
print("{:<20} {:>15} {:>15}".format(
    "Number of graphs", mutag_stats["num_graphs"], enzymes_stats["num_graphs"]))
print("{:<20} {:>15.2f} {:>15.2f}".format(
    "Avg. nodes/graph", mutag_stats["avg_nodes"], enzymes_stats["avg_nodes"]))
print("{:<20} {:>15.2f} {:>15.2f}".format(
    "Avg. edges/graph", mutag_stats["avg_edges"], enzymes_stats["avg_edges"]))
print("{:<20} {:>15} {:>15}".format(
    "Node feature dim", mutag_stats["feature_dim"], enzymes_stats["feature_dim"]))
print(60 * "-")

MUTAG Dataset Analysis:
Number of graphs: 141
Average number of nodes: 18.85
Average number of edges: 94.04
Node feature dimension: 8
--------------------------------------------------

ENZYMES Dataset Analysis:
Number of graphs: 360
Average number of nodes: 33.27
Average number of edges: 221.19
Node feature dimension: 22
--------------------------------------------------

Summary Statistics for Training Split:
------------------------------------------------------------
Metric                         MUTAG         ENZYMES
------------------------------------------------------------
Number of graphs                 141             360
Avg. nodes/graph               18.85           33.27
Avg. edges/graph               94.04          221.19
Node feature dim                   8              22
------------------------------------------------------------
