In [28]:
import sys
sys.path.append("..")
import torch
from collections import Counter
from src.data.graph import Graph

### Load the graph

In [29]:
# Keyword arguments for ``Graph``; unpacked with ``**args`` when the graph is built.
args = dict(
    graph="cora",  # name of the graph
    basepath="../data",  # directory that has the graph files
    task="link_pred",  # the task
    test_frac=0.20,  # fraction of the edges to be used as test split
    val_frac=0.20,  # fraction of the edges to be used as val split
    gpu=False,  # exploring the graph does not need a GPU
)

In [30]:
# Build the graph object from the settings in `args` (cora, link-prediction task).
graph = Graph(**args)

Loading cora dataset...


INFO:root:Loading edges


Found 5429 edges


#### Explore Edges

In [31]:
# Shape of the tensor holding the positive training edges.
graph.train_edges_positive.shape  # (2, num_edges)

torch.Size([2, 7960])

In [32]:
# Shape of the training label vector. In the recorded run its length is twice the
# number of positive training edges, so it presumably holds one label per positive
# AND per negative training edge -- TODO confirm against Graph.
graph.train_edge_labels.shape # (num_positive_edges + num_negative_edges,) presumably

torch.Size([15920])

In [33]:
# Count every edge label over the train, val and test splits
# (total number of positive and negative edges).
Counter(
    label
    for split_labels in (
        graph.train_edge_labels,
        graph.val_edge_labels,
        graph.test_edge_labels,
    )
    for label in split_labels.tolist()
)

Counter({1: 13264, 0: 13264})

In [34]:
def analyze_dataset(dataset_name, basepath="../data", test_frac=0.20, val_frac=0.20):
    """Load a graph for link prediction and count its edge labels over all splits.

    Args:
        dataset_name: Name of the graph to load (passed to ``Graph`` as ``graph``).
        basepath: Directory that has the graph files.
        test_frac: Fraction of the edges to be used as the test split.
        val_frac: Fraction of the edges to be used as the validation split.

    Returns:
        A ``Counter`` mapping each edge label to how often it occurs in the
        train, validation and test label tensors combined. The printed report
        treats label ``1`` as positive edges and label ``0`` as negative edges.
    """
    print(f"\nAnalyzing {dataset_name.upper()} dataset...")

    graph = Graph(
        graph=dataset_name,
        basepath=basepath,
        task="link_pred",
        test_frac=test_frac,
        val_frac=val_frac,
        gpu=False,  # counting labels does not need a GPU
    )

    # Tally split by split (train + val + test) so the three label lists are
    # never concatenated into one large temporary list.
    all_edge_counts = Counter()
    for split_labels in (
        graph.train_edge_labels,
        graph.val_edge_labels,
        graph.test_edge_labels,
    ):
        all_edge_counts.update(split_labels.tolist())

    print("\nTotal Edge Distribution:")
    print(f"Positive edges: {all_edge_counts[1]}")
    print(f"Negative edges: {all_edge_counts[0]}")

    return all_edge_counts

# Run the edge-label analysis on every benchmark graph
datasets = ["karate", "cora", "citeseer"]

print("Link Prediction Training Data Analysis")
print("=" * 40)

# Maps dataset name -> Counter of edge labels returned by analyze_dataset
results = {}
for dataset in datasets:
    results[dataset] = analyze_dataset(dataset)

# Render the summary table: one shared rule line and one shared row layout
rule = "-" * 60
row_format = "{:<12} {:>15} {:>15}"

print("\nSummary of Training Data:")
print(rule)
print(row_format.format("Dataset", "Positive Edges", "Negative Edges"))
print(rule)
for dataset, counts in results.items():
    # label 1 -> positive column, label 0 -> negative column
    print(row_format.format(dataset.upper(), counts[1], counts[0]))
print(rule)

INFO:root:Loading edges


Link Prediction Training Data Analysis

Analyzing KARATE dataset...
Loading karate dataset...
Found 78 edges

Total Edge Distribution:
Positive edges: 190
Negative edges: 190

Analyzing CORA dataset...
Loading cora dataset...


INFO:root:Loading edges


Found 5429 edges

Total Edge Distribution:
Positive edges: 13264
Negative edges: 13264

Analyzing CITESEER dataset...
Loading citeseer dataset...
Found 4715 edges


INFO:root:Loading edges



Total Edge Distribution:
Positive edges: 12384
Negative edges: 12384

Summary of Training Data:
------------------------------------------------------------
Dataset       Positive Edges  Negative Edges
------------------------------------------------------------
KARATE                   190             190
CORA                   13264           13264
CITESEER               12384           12384
------------------------------------------------------------
