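# Mus

Load the Mus hypergraph collection from `mus.chg.tsv.gz` (each row is a graph id followed by the node ids of one hyperedge), report summary statistics, and export the data as a pickle and as one TSV file per graph.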

In [5]:
import gzip
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Location of the raw, gzip-compressed, tab-separated hyperedge table.
file_path = "../../data/raw/mus/mus.chg.tsv.gz"

# Parallel accumulators: idx[i] is the graph id of row i, freq[i] holds the
# remaining integer columns of that row (stored below as the 'edges' column).
idx, freq = [], []

# Stream the archive in text mode so the whole file is never held in memory.
with gzip.open(file_path, mode='rt') as handle:
    for row in tqdm(handle, desc="Processing rows"):
        # Every tab-separated field is an integer: the first is the graph id,
        # everything after it belongs to the edge.
        graph_id, *members = (int(token) for token in row.strip().split("\t"))
        idx.append(graph_id)
        freq.append(members)

# One DataFrame row per hyperedge.
mus = pd.DataFrame({'idx': idx, 'edges': freq})
mus.head()


Processing rows: 496792it [00:01, 434721.50it/s]


Unnamed: 0,idx,edges
0,1,"[13, 8, 16, 10]"
1,1,"[17, 7, 13, 9, 21]"
2,1,"[13, 21, 9, 7]"
3,1,"[16, 13, 8, 18, 10]"
4,1,"[8, 18, 13, 10]"


In [3]:
from itertools import chain

def _count_distinct_nodes(edge_lists):
    """Return how many distinct values occur across the given edge lists."""
    return len(set(chain.from_iterable(edge_lists)))


# Collect the edge lists belonging to each graph id ...
edges_by_graph = mus.groupby("idx")["edges"]
# ... count the distinct node ids that appear in them ...
node_counts = edges_by_graph.apply(_count_distinct_nodes)
# ... and expose the result as a two-column frame: idx, num_nodes.
nodes_per_graph = node_counts.rename("num_nodes").reset_index()

In [6]:
# Summary statistics; all standard deviations are population SDs (ddof=0).
print(f"# of graphs: {mus['idx'].nunique()}")

# Flat list of every hyperedge (one list of node ids per row of `mus`).
hyperedges = mus.edges.to_list()

# Edges per graph: number of rows sharing the same graph id.
edge_counts = mus.groupby('idx')['edges'].count()
print(f"average number of edges: {edge_counts.mean():.1f}")
print(f"SD number of edges: {edge_counts.std(ddof=0):.1f}")

# Nodes per graph, taken from the previously computed nodes_per_graph table.
num_nodes = nodes_per_graph['num_nodes']
print(f"average number of nodes: {num_nodes.mean():.1f}")
print(f"SD number of nodes: {num_nodes.std(ddof=0):.1f}")

# Size (number of node ids) of every hyperedge, computed once and reused.
edge_sizes = [len(edge) for edge in hyperedges]

# Largest hyperedge.
max_edge_deg = max(edge_sizes)
print(f"max edge degree: {max_edge_deg}")

# Share of hyperedges that contain exactly two node ids.
pct_deg2 = np.mean([size == 2 for size in edge_sizes]) * 100
print(f"percent of edges with degree 2: {pct_deg2:.1f}%")

# of graphs: 1944
average number of edges: 255.6
SD number of edges: 417.1
average number of nodes: 24.5
SD number of nodes: 6.6
max edge degree: 12
percent of edges with degree 2: 12.3%


## Save

In [7]:
# Persist the full edge table as a pickle.
pickle_path = Path("derived_data/mus.pkl")
# to_pickle does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
pickle_path.parent.mkdir(parents=True, exist_ok=True)
mus.to_pickle(pickle_path)

In [4]:
# save each hypergraph as a tsv file for HORC computation
for idx in mus['idx'].unique():
    mus_data = mus[mus['idx'] == idx]
    file_path = f"derived_data/mus/mus_{idx}.tsv"
    edges = mus_data['edges'].apply(lambda x: '\t'.join(map(str, x)))
    # Write the graph to a .tsv file
    with open(file_path, 'w') as f:
        for edge in edges:
            f.write(f"{edge}\n")