In [15]:
import sys

In [16]:
# Make the parent directory importable so that the `src` package resolves.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate ".." entries on sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [17]:
import torch
from src.data.graph import Graph

In [18]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


#### We will start exploring the graph by instantiating a Graph object. That requires defining an argument dictionary.


In [19]:
# Keyword arguments used to build the Graph object.
args = dict(
    graph="karate",      # dataset name
    basepath="../data",  # directory that holds the graph files
    task="classify",     # the task
    test_frac=0.20,      # fraction of the edges used as the test split
    val_frac=0.20,       # fraction of the edges used as the validation split
    gpu=False,           # no GPU is needed just to explore the graph
)

In [20]:
# Instantiate the Graph by unpacking the argument dictionary as keyword arguments.
graph = Graph(**args)

Loading karate dataset...
Found 78 edges


### Each graph object has three important attributes
1. Node features
2. Node labels
3. The adjacency matrix (sparse coo matrix)

In [21]:
# Unpack the shape of the feature matrix into the node and feature counts,
# then show both as the cell output.
num_nodes, num_features = graph.features.shape
num_nodes, num_features

(34, 34)

In [22]:
# Display the node labels.
# NOTE(review): the recorded output for karate is all zeros — confirm that the
# labels for this dataset are actually being loaded.
graph.labels

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

##### Extract the edge information

In [23]:
# Coalesce the sparse adjacency matrix and turn its index tensor into a plain
# Python list holding one [node1, node2] pair per stored entry.
edges = graph.adj.coalesce().indices().T.tolist()

In [24]:
# Peek at the first four entries. `edges` is a list of [node1, node2]
# elements, each indicating an edge between node1 and node2.
edges[:4]

[[0, 0], [0, 1], [0, 2], [0, 3]]

In [25]:
# Count the stored adjacency entries (one per [node1, node2] pair in `edges`).
# NOTE(review): loading reported "Found 78 edges" while this count is 190;
# 190 = 2 * 78 + 34, which would fit every edge being stored in both
# directions plus one self-loop per node — confirm against the Graph loader.
num_edges = len(edges)
num_edges

190

In [26]:
from collections import defaultdict
def find_degree_per_node(edges, num_nodes=None):
    """Print and return the average, maximum and minimum in-degree.

    Args:
        edges: Iterable of (src, tgt) pairs. Each pair adds one to the
            in-degree of ``tgt``.
        num_nodes: Optional total number of nodes. When given, nodes that
            never appear as a target count as in-degree 0 in the average and
            the minimum. When omitted, only nodes with at least one incoming
            edge are considered.

    Returns:
        Tuple ``(avg_in_degree, max_in_degree, min_in_degree)``. An input
        without any nodes yields ``(0.0, 0, 0)`` instead of raising.

    Raises:
        ValueError: If ``num_nodes`` is smaller than the number of distinct
            target nodes found in ``edges``.
    """
    in_degree = defaultdict(int)
    for _src, tgt in edges:
        in_degree[tgt] += 1

    node_count = len(in_degree) if num_nodes is None else num_nodes
    if node_count < len(in_degree):
        raise ValueError(
            f"num_nodes={num_nodes} is smaller than the {len(in_degree)} "
            "distinct target nodes found in edges"
        )

    degrees = in_degree.values()
    # Guard the division so an empty graph reports zeros instead of crashing.
    avg_in_degree = sum(degrees) / node_count if node_count else 0.0
    max_in_degree = max(degrees, default=0)
    # Nodes without an incoming edge have no entry in `in_degree`; if any such
    # node exists, the true minimum is 0.
    if len(in_degree) < node_count:
        min_in_degree = 0
    else:
        min_in_degree = min(degrees, default=0)

    print(avg_in_degree, max_in_degree, min_in_degree)
    return avg_in_degree, max_in_degree, min_in_degree
    
        

In [27]:
# Prints the average, maximum and minimum in-degree, in that order.
find_degree_per_node(edges)

5.588235294117647 18 2


In [29]:
# Analyze any graph whose files are stored under `basepath` (../data by default).
def analyze_graph(graph_name, basepath="../data"):
    """Load a graph and print its basic size and in-degree statistics.

    Args:
        graph_name: Name of the graph to load, e.g. "karate".
        basepath: Directory that holds the graph files.

    Returns:
        dict with the keys "num_nodes", "num_edges", "num_features",
        "max_in_degree", "min_in_degree" and "avg_in_degree", holding the
        same values that are printed.
    """
    # Local import keeps the function self-contained, as the original did.
    from collections import Counter

    args = {
        "graph": graph_name,  # name of the graph
        "basepath": basepath,  # path to the directory that has the graph files
        "task": "classify",  # the task
        "test_frac": 0.20,  # fraction of the edges to be used as test split
        "val_frac": 0.20,  # fraction of the edges to be used as val split
        "gpu": False  # we don't need a GPU for exploring the graph
    }

    print(f"\nAnalyzing {graph_name.upper()} dataset...")
    graph = Graph(**args)

    # Get basic stats
    num_nodes, num_features = graph.features.shape
    edges = graph.adj.coalesce().indices().T.tolist()
    num_edges = len(edges)

    # Count incoming edges per target node.
    in_degree = Counter(tgt for _src, tgt in edges)

    # A node with no incoming edge never shows up in `in_degree`, so divide by
    # the real node count and report a minimum of 0 when such nodes exist.
    # The guards also keep an edgeless or empty graph from raising.
    node_count = max(num_nodes, len(in_degree))
    avg_in_degree = num_edges / node_count if node_count else 0.0
    max_in_degree = max(in_degree.values(), default=0)
    if len(in_degree) < node_count:
        min_in_degree = 0
    else:
        min_in_degree = min(in_degree.values(), default=0)

    # Print results
    print(f"Statistics for {graph_name.upper()}:")
    print(f"Maximum in-degree: {max_in_degree}")
    print(f"Minimum in-degree: {min_in_degree}")
    print(f"Average in-degree: {avg_in_degree:.2f}")
    print(f"Number of nodes: {num_nodes}")
    print(f"Number of edges: {num_edges}")
    print(f"Node feature dimension: {num_features}")
    print("-" * 50)

    return {
        "num_nodes": num_nodes,
        "num_edges": num_edges,
        "num_features": num_features,
        "max_in_degree": max_in_degree,
        "min_in_degree": min_in_degree,
        "avg_in_degree": avg_in_degree,
    }

# Analyze all three datasets
datasets = ["karate", "cora", "citeseer"]
for dataset in datasets:
    analyze_graph(dataset)


Analyzing KARATE dataset...
Loading karate dataset...
Found 78 edges
Statistics for KARATE:
Maximum in-degree: 18
Minimum in-degree: 2
Average in-degree: 5.59
Number of nodes: 34
Number of edges: 190
Node feature dimension: 34
--------------------------------------------------

Analyzing CORA dataset...
Loading cora dataset...
Found 5429 edges
Statistics for CORA:
Maximum in-degree: 169
Minimum in-degree: 2
Average in-degree: 4.90
Number of nodes: 2708
Number of edges: 13264
Node feature dimension: 1433
--------------------------------------------------

Analyzing CITESEER dataset...
Loading citeseer dataset...
Found 4715 edges
Statistics for CITESEER:
Maximum in-degree: 100
Minimum in-degree: 1
Average in-degree: 3.74
Number of nodes: 3312
Number of edges: 12384
Node feature dimension: 3703
--------------------------------------------------
