In [1]:
import pandas as pd

## Target - pathway matrix

### Filter by FDR, add hierarchies

In [3]:
def load_reactome_pathways_with_hierarchy(pathways_path, hierarchy_path, fdr_cutoff=0.05):
    """Load an enrichment table, keep significant rows and attach hierarchy info.

    Parameters
    ----------
    pathways_path : str or path-like
        Tab-separated file with a header row. Must contain an ``fdr`` column
        (used for filtering) and an ``ID`` column (used as the join key).
    hierarchy_path : str or path-like
        Header-less tab-separated file with two columns, read as
        ``parentId`` and ``childId``.
    fdr_cutoff : float, default 0.05
        Rows with ``fdr <= fdr_cutoff`` are kept.

    Returns
    -------
    pandas.DataFrame
        The filtered enrichment rows left-joined to the hierarchy table on
        ``ID == childId``; adds the columns ``parentId``, ``childId`` and
        ``hierLevel``.

    Notes
    -----
    * ``hierLevel`` is the number of parent links followed upwards from
      ``childId`` until an ID with no recorded parent is reached. The walk
      stops as soon as the count exceeds 50, so a cyclic hierarchy yields 51.
    * When an ID is listed as a child more than once, only the last listed
      parent is used for the level walk, but the join still produces one
      output row per listing.
    * IDs that never occur as ``childId`` get NaN in ``parentId``, ``childId``
      and ``hierLevel`` because of the left join.
    """
    # Enrichment results, restricted to rows at or below the FDR threshold.
    enrichment = pd.read_csv(pathways_path, sep="\t")
    significant = enrichment.loc[enrichment["fdr"] <= fdr_cutoff].copy()

    # Parent/child relationship table (no header in the file).
    relations = pd.read_csv(
        hierarchy_path, sep="\t", header=None, names=["parentId", "childId"]
    )

    # child -> parent lookup; a later duplicate child overwrites an earlier one.
    parent_of = dict(zip(relations["childId"], relations["parentId"]))

    def depth_of(node):
        """Count the parent links above ``node`` (stops once past 50)."""
        depth = 0
        while node in parent_of:
            parent = parent_of[node]
            if pd.isna(parent):
                break
            node = parent
            depth += 1
            if depth > 50:  # guard against cyclic parent chains
                break
        return depth

    relations["hierLevel"] = relations["childId"].map(depth_of)

    # Left join keeps every significant row, matched or not.
    return pd.merge(
        significant, relations, how="left", left_on="ID", right_on="childId"
    )


In [20]:
# Inputs for the disease-level table (judging by the file name).
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs on the original author's machine — consider a configurable data directory.
hierarchy_path = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/Pathways_hierarchy_relationship.txt"
pathways_path = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_gsea.tsv"

# Keep rows with fdr <= 0.05 and attach parentId / hierLevel.
path_disease = load_reactome_pathways_with_hierarchy(
    pathways_path=pathways_path,
    hierarchy_path=hierarchy_path,
    fdr_cutoff=0.05,
)

In [21]:
# Inputs for the TA-level table (judging by the file name). This rebinds
# `pathways_path` and `hierarchy_path` from the previous cell.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs on the original author's machine — consider a configurable data directory.
hierarchy_path = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/Pathways_hierarchy_relationship.txt"
pathways_path = "/Users/polina/genetics_gsea/data/input/geneset_ta_zscore_gsea.tsv"

# Keep rows with fdr <= 0.05 and attach parentId / hierLevel.
path_ta = load_reactome_pathways_with_hierarchy(
    pathways_path=pathways_path,
    hierarchy_path=hierarchy_path,
    fdr_cutoff=0.05,
)

### Filter only targets from the initial list of genetic hits

In [25]:
# Keep only the columns needed downstream and turn the comma-separated
# "propagated_edge" string into a list per row. `.assign` builds a new frame,
# so nothing is written into a slice of `path_disease` (the previous
# `.loc[:, col] = ...` on an un-copied selection can raise
# SettingWithCopyWarning) and re-running the cell is safe.
path_disease_targets = path_disease.loc[
    :, ["Term", "ID", "propagated_edge", "parentId", "hierLevel"]
].assign(
    propagated_edge=lambda frame: frame["propagated_edge"].str.split(",")
)

# One output row per list element; the source row's index label is repeated.
path_disease_targets_exp = path_disease_targets.explode("propagated_edge")

In [26]:
# Select the columns of interest and split the comma-separated
# "propagated_edge" string into a list per row, producing a new frame that is
# independent of `path_ta`.
path_ta_targets = path_ta.loc[
    :, ["Term", "ID", "propagated_edge", "parentId", "hierLevel"]
].assign(
    propagated_edge=lambda frame: frame["propagated_edge"].str.split(",")
)

# One output row per list element; the source row's index label is repeated.
path_ta_targets_exp = path_ta_targets.explode(column="propagated_edge")

In [29]:
# Display the exploded TA table for a visual check (the notebook renders a
# truncated view). Rows whose ID has no hierarchy entry show empty
# parentId / hierLevel.
path_ta_targets_exp

Unnamed: 0,Term,ID,propagated_edge,parentId,hierLevel
0,Signal Transduction,R-HSA-162582,AAMP,,
0,Signal Transduction,R-HSA-162582,AATF,,
0,Signal Transduction,R-HSA-162582,ABCA1,,
0,Signal Transduction,R-HSA-162582,ABCD3,,
0,Signal Transduction,R-HSA-162582,ABCG1,,
...,...,...,...,...,...
501,Growth hormone receptor signaling,R-HSA-982772,SOCS3,R-HSA-1280215,2.0
501,Growth hormone receptor signaling,R-HSA-982772,STAT1,R-HSA-1280215,2.0
501,Growth hormone receptor signaling,R-HSA-982772,STAT3,R-HSA-1280215,2.0
501,Growth hormone receptor signaling,R-HSA-982772,STAT5A,R-HSA-1280215,2.0


## Pathway embeddings

## Target embeddings