# MAG-10

In [1]:
from collections import Counter
from pathlib import Path

import numpy as np

In [2]:
# Directory holding the raw MAG-10 files read below.
folder = "../../data/raw/mag_10"

# hyperedges.txt: one hyperedge per line, written as tab-separated integer node ids.
hyperedges = []
with open(f'{folder}/hyperedges.txt', 'r') as edge_file:
    for raw_line in edge_file:
        node_ids = [int(token) for token in raw_line.strip().split('\t')]
        hyperedges.append(node_ids)

# hyperedge-label-identities.txt: one label name per line,
# e.g. ['KDD', 'WWW', 'ICML', ...]
hyperedges_labels = []
with open(f'{folder}/hyperedge-label-identities.txt', 'r') as name_file:
    for raw_line in name_file:
        hyperedges_labels.append(raw_line.strip())

# hyperedge-labels.txt: one integer label index per line, e.g. [4, 10, 8, ...];
# entry i is later paired with hyperedges[i].
hyperedges_labels_idx = []
with open(f'{folder}/hyperedge-labels.txt', 'r') as label_file:
    for raw_line in label_file:
        hyperedges_labels_idx.append(int(raw_line.strip()))

In [4]:
# Keep only the hyperedges whose label index is at most MAX_LABEL_IDX, together
# with their label indices, so both lists stay aligned position by position.
MAX_LABEL_IDX = 10  # NOTE(review): presumably the ten MAG-10 labels — confirm
filtered = [
    (idx, e)
    for idx, e in zip(hyperedges_labels_idx, hyperedges)
    if idx <= MAX_LABEL_IDX
]
# Build both lists directly from the kept pairs. Unpacking zip(*filtered) into
# two names raises ValueError when no hyperedge passes the filter.
hyperedges_labels_idx = [idx for idx, _ in filtered]
hyperedges = [e for _, e in filtered]

In [5]:
# Summary statistics of the hypergraph currently held in `hyperedges`.

# Number of times each node occurs across all hyperedges (its degree).
node_counts = Counter(node for edge in hyperedges for node in edge)
num_edges = len(hyperedges)

print("# of edges:", num_edges)
# Every distinct node is a key of node_counts, so its length is the node count.
print("# of nodes:", len(node_counts))

# Node degree: mean and population standard deviation (ddof=0 is numpy's default).
node_deg = np.asarray(list(node_counts.values()))
mean_node_deg = np.mean(node_deg)
std_node_deg = np.std(node_deg)
print(f"node degree: - average: {mean_node_deg:.1f}; sd:{std_node_deg:.1f}")

# Edge degree: number of node entries listed in each hyperedge.
edge_deg = np.array(list(map(len, hyperedges)))
mean_edge_deg = np.mean(edge_deg)
std_edge_deg = np.std(edge_deg)
print(f"edge degree: - average {mean_edge_deg:.1f}; sd: {std_edge_deg:.1f}")

# max() of an empty array raises, so fall back to 0 when there are no edges.
max_edge_deg = 0
if num_edges > 0:
    max_edge_deg = int(edge_deg.max())
print("max edge degree:", max_edge_deg)

# Share of hyperedges that have exactly two entries.
num_deg2 = int(np.count_nonzero(edge_deg == 2))
if num_edges > 0:
    pct_deg2 = (num_deg2 / num_edges) * 100
else:
    pct_deg2 = 0.0
print(f"percent of edges with degree 2: {pct_deg2:.1f}%")

# of edges: 51888
# of nodes: 80198
node degree: - average: 2.3; sd:4.6
edge degree: - average 3.5; sd: 1.6
max edge degree: 25
percent of edges with degree 2: 29.9%


In [8]:
# save as tsv for HORC computation
# One hyperedge per line, node ids separated by tabs.
out_path = Path('./derived_data/hyperedges.tsv')
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, 'w') as f:
    for edge in hyperedges:
        f.write('\t'.join(map(str, edge)) + '\n')