# Stex

In [1]:
import gzip
import os
from itertools import chain

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
file_path = "../../data/raw/stex/stex.chg.tsv.gz"
site_ids, tags = [], []

# Stream the gzipped TSV in text mode. Every row is a tab-separated list of
# integers: the first field is stored as the row's `idx`, all remaining fields
# form that row's `edges` list.
with gzip.open(file_path, 'rt') as f:
    for line in tqdm(f, desc="Processing rows"):
        # Star-unpacking splits the leading id from the (possibly empty) rest.
        site_id, *edge_nodes = (int(field) for field in line.strip().split("\t"))
        site_ids.append(site_id)
        tags.append(edge_nodes)

# One row per input line: `idx` holds the leading id, `edges` the remaining ints.
stex = pd.DataFrame({'idx': site_ids, 'edges': tags})
stex.head()

Processing rows: 6689705it [00:16, 406752.93it/s]


Unnamed: 0,idx,edges
0,1,"[208, 134]"
1,1,"[14, 254, 51, 98]"
2,1,[358]
3,1,"[275, 254, 289]"
4,1,"[79, 44]"


In [3]:
# Which graph ids contain at least one hyperedge with more than 5 nodes?
has_long_edge = stex["edges"].str.len() > 5
stex.loc[has_long_edge, "idx"].unique()

array([80])

In [3]:
# Number of distinct nodes per graph: pool the node ids from all of a graph's
# hyperedges into one set and take its size.
edges_by_graph = stex.groupby("idx")["edges"]
distinct_node_counts = edges_by_graph.apply(
    lambda edge_lists: len({node for edge in edge_lists for node in edge})
)
# Two-column frame: `idx` (graph id) and `num_nodes`.
nodes_per_graph = distinct_node_counts.rename("num_nodes").reset_index()

In [7]:
# Summary statistics over the whole collection (one graph per distinct `idx`).
print(f"# of graphs: {stex['idx'].nunique()}")

hyperedges = stex.edges.to_list()

# Edges per graph: every row of `stex` is one hyperedge, so count rows per idx.
# ddof=0 gives the population standard deviation.
edge_counts = stex.groupby('idx')['edges'].count()
print(f"average number of edges: {edge_counts.mean():.1f}")
print(f"SD number of edges: {edge_counts.std(ddof=0):.1f}")

# Nodes per graph, taken from the precomputed `nodes_per_graph` table.
node_counts = nodes_per_graph['num_nodes']
print(f"average number of nodes: {node_counts.mean():.1f}")
print(f"SD number of nodes: {node_counts.std(ddof=0):.1f}")

# Size (number of nodes) of every hyperedge, computed once and reused below.
edge_sizes = np.fromiter(
    (len(edge) for edge in stex['edges']), dtype=np.int64, count=len(stex)
)

# Largest hyperedge
max_edge_deg = int(edge_sizes.max())
print(f"max edge degree: {max_edge_deg}")

# Share of hyperedges that are ordinary pairwise edges (exactly two nodes)
pct_deg2 = np.mean(edge_sizes == 2) * 100
print(f"percent of edges with degree 2: {pct_deg2:.1f}%")

# of graphs: 355
average number of edges: 18844.2
SD number of edges: 50323.4
average number of nodes: 433.6
SD number of nodes: 745.7
max edge degree: 7
percent of edges with degree 2: 29.4%


# Save

In [13]:
# Persist the parsed hyperedge table as a pickle.
# to_pickle does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs("./derived_data", exist_ok=True)
stex.to_pickle("./derived_data/stex.pkl")

In [3]:
# Save each hypergraph as its own .tsv file for HORC computation:
# one hyperedge per line, node ids separated by tabs.
output_dir = "derived_data/stex"
# open(..., 'w') does not create missing directories; create them up front.
os.makedirs(output_dir, exist_ok=True)

# groupby partitions the frame once instead of re-scanning every row with a
# boolean mask for each graph id. sort=False yields graphs in order of first
# appearance (same order as .unique()), and rows keep their original order
# within each group.
for idx, stex_data in stex.groupby('idx', sort=False):
    # A dedicated name avoids overwriting `file_path`, which holds the raw input path.
    graph_path = f"{output_dir}/stex_{idx}.tsv"
    with open(graph_path, 'w') as f:
        f.writelines('\t'.join(map(str, edge)) + '\n' for edge in stex_data['edges'])