# MADStat

In [5]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Read the raw author-paper table. Going by the preview, each row links an
# author id (idxAu) to a paper id (idxPap) and carries the paper's year and journal.
aupap_csv = "derived_data/AuPapMat.txt"
madstat = pd.read_csv(aupap_csv)
madstat.head()

Unnamed: 0,idxAu,idxPap,year,journal
0,1,80017,2014,Biost
1,2,47012,2001,SMed
2,3,63665,2010,CSTM
3,4,3528,1985,Bcs
4,4,7245,1977,Bka


In [3]:
# Collapse the author-paper table into one row per paper (idxPap):
#   edges   -> list of the author ids (idxAu) on that paper, i.e. one hyperedge;
#              ids are kept in the order their rows appear in `madstat`
#   year    -> first value found for the paper
#   journal -> first value found for the paper
# NOTE(review): taking "first" assumes year/journal are constant within a
# paper -- confirm against the raw data.
# `as_index=False` keeps idxPap as a regular column, so no renaming is needed.
madstat_hg = (
    madstat.groupby("idxPap", as_index=False).agg(
        edges=("idxAu", list),
        year=("year", "first"),
        journal=("journal", "first"),
    )
)
madstat_hg.head()

Unnamed: 0,idxPap,edges,year,journal
0,1,"[16118, 24934, 46252, 46442]",2014,Bcs
1,2,"[17528, 22080, 24306, 25297, 34423, 46604, 46851]",2014,Bcs
2,3,"[5140, 23393, 26434]",2014,Bcs
3,4,"[19477, 22801, 31195, 37481]",2014,Bcs
4,5,"[7154, 10981, 29847, 34337, 46127]",2014,Bcs


In [6]:
# Summary statistics of the hypergraph whose hyperedges are the per-paper
# author lists stored in madstat_hg['edges'].
hyperedges = madstat_hg['edges'].tolist()
num_edges = len(hyperedges)

# Count every appearance of a node across all hyperedges in a single pass.
# The Counter's keys are exactly the distinct nodes, so no separate set is built.
node_counts = Counter(node for edge in hyperedges for node in edge)

# stats
print("# of edges:", num_edges)
print("# of nodes:", len(node_counts))

# Node degree: how many times a node occurs over all hyperedges.
# ddof=0 gives the population standard deviation.
# The size guards mirror the fallbacks used for the max and the percentage
# below: NumPy's mean/std of an empty array is nan plus a RuntimeWarning.
node_deg = np.array(list(node_counts.values()))
mean_node_deg = node_deg.mean() if node_deg.size > 0 else 0.0
std_node_deg  = node_deg.std(ddof=0) if node_deg.size > 0 else 0.0
print(f"node degree: - average: {mean_node_deg:.1f}; sd:{std_node_deg:.1f}")

# Edge degree: number of nodes listed in a hyperedge.
edge_deg = np.array([len(edge) for edge in hyperedges])
mean_edge_deg = edge_deg.mean() if num_edges > 0 else 0.0
std_edge_deg  = edge_deg.std(ddof=0) if num_edges > 0 else 0.0
print(f"edge degree: - average {mean_edge_deg:.1f}; sd: {std_edge_deg:.1f}")

max_edge_deg = int(edge_deg.max()) if num_edges > 0 else 0
print("max edge degree:", max_edge_deg)

# Share of hyperedges that are ordinary pairwise edges (exactly two nodes).
num_deg2 = int((edge_deg == 2).sum())
pct_deg2 = (num_deg2 / num_edges) * 100 if num_edges > 0 else 0.0
print(f"percent of edges with degree 2: {pct_deg2:.1f}%")

# of edges: 83331
# of nodes: 47311
node degree: - average: 3.7; sd:7.8
edge degree: - average 2.1; sd: 1.1
max edge degree: 33
percent of edges with degree 2: 40.8%


In [8]:
# Export the hyperedges for the HORC computation: one hyperedge per line,
# its node ids separated by tabs.
tsv_lines = ('\t'.join(str(node) for node in edge) + '\n' for edge in hyperedges)
with open('./derived_data/hyperedges.tsv', 'w') as tsv_file:
    tsv_file.writelines(tsv_lines)

In [9]:
# Persist the per-paper table (idxPap, edges, year, journal) as a pickle.
# Presumably pickled rather than written as CSV so the list-valued `edges`
# column reloads as Python lists -- confirm with downstream readers.
madstat_pkl = './derived_data/madstat.pkl'
madstat_hg.to_pickle(madstat_pkl)