In [3]:
import numpy as np
import plotly.express as px
import pandas as pd

In [4]:
# Annotated Reactome 2025 gene-set table; its "Term" and "ID" columns feed the pathway list.
geneset = pd.read_csv(
    "/Users/polina/genetics_gsea/data/geneset_annotated/geneset_reactome_2025_disease_v2.csv"
)

In [5]:
# One row per distinct (Term, ID) combination found in the gene-set table.
pathways = (
    geneset.loc[:, ["Term", "ID"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .copy()
)

In [6]:
# Split "Term" and "ID" on ";" so both columns hold lists that can be exploded together.
# NOTE: this rebinds `pathways` to its own transformed version, so the cell is presumably
# not safe to run twice in a row without re-running the cell that creates `pathways`.
split_columns = {name: pathways[name].str.split(";") for name in ("Term", "ID")}
pathways = pathways.assign(**split_columns)

# One row per individual (Term, ID) pair, duplicates removed.
pathways_exploded = (
    pathways
    .explode(["Term", "ID"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# Reactome parent/child relationship table: tab-separated, no header row.
# The two columns are labelled parentId / childId (in that order).
hieararchy = pd.read_csv(
    "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/Pathways_hierarchy_relationship.txt",
    sep="\t",
    header=None,
).set_axis(["parentId", "childId"], axis=1)

In [8]:
# Attach each pathway's parent (if any) by matching its ID against the hierarchy's childId.
# A left join keeps pathways without a parent row; their parentId is NaN.
pathway_hieararchy = pathways_exploded.merge(
    hieararchy,
    how="left",
    left_on="ID",
    right_on="childId",
)[["ID", "parentId", "Term"]]

In [9]:
# GSEA results table (tab-separated); the "ID", "nes" and "fdr" columns are used downstream.
gsea_disease = pd.read_csv(
    "/Users/polina/genetics_gsea/data/gsea/from_database/disease_zscore_reactome_2025.tsv",
    sep="\t",
)

In [20]:
# Display the loaded GSEA results table for inspection.
gsea_disease

Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Signal Transduction,R-HSA-162582,0.476955,5.879025,4.126910e-09,0.000007,0.000007,1412,"CDKN2B,APOE,SH2B3,TERT,SMAD3,TCF7L2,ESR1,MYC,I...","AAMP,AATF,ABCA1,ABCD3,ABCG1,ABCG5,ABCG8,ABHD12..."
1,Fatty Acids bound to GPR40 (FFAR1) regulate in...,R-HSA-434316,-0.724533,-4.992017,5.975189e-07,0.001019,0.000214,5,"GNAQ,GNA15,PLCB2,PLCB3","GNA11,GNA14,GNA15,GNAQ,PLCB1,PLCB2,PLCB3"
2,Alpha-defensins,R-HSA-1462054,-0.700975,-4.899360,9.614920e-07,0.001640,0.000214,5,"DEFA4,DEFA1,DEFA5,DEFA3","CD4,DEFA1,DEFA3,DEFA4,DEFA5,DEFA6,PRSS2,PRSS3,env"
3,Adrenoceptors,R-HSA-390696,-0.696625,-4.882216,1.049003e-06,0.001789,0.000214,5,"ADRA1B,ADRA2B,ADRA1D","ADRA1B,ADRA1D,ADRA2A,ADRA2B,ADRA2C,ADRB1,ADRB2..."
4,RNA Pol II CTD phosphorylation and interaction...,R-HSA-167160,-0.608281,-4.862360,1.159944e-06,0.001978,0.000214,9,"GTF2H1,GTF2H5,POLR2B,MNAT1,ERCC3,CDK7,POLR2F","CDK7,ERCC2,ERCC3,GTF2F1,GTF2F2,GTF2H1,GTF2H2,G..."
...,...,...,...,...,...,...,...,...,...,...
1702,Nonsense-Mediated Decay (NMD),R-HSA-927802,0.327343,-0.000000,1.000000e+00,1.000000,1.000000,44,SMG6,"28S rRNA,5.8S rRNA,5S rRNA,CASC3,DCP1A,EIF4A3,..."
1703,Norepinephrine Neurotransmitter Release Cycle,R-HSA-181430,0.295406,-0.000000,1.000000e+00,1.000000,1.000000,11,SLC22A1,"MAOA,PPFIA1,PPFIA2,PPFIA3,PPFIA4,RAB3A,RIMS1,S..."
1704,Nuclear Envelope (NE) Reassembly,R-HSA-2995410,0.156919,-0.000000,1.000000e+00,1.000000,1.000000,32,,"ANKLE2,BANF1,CC2D1B,CCNB1,CCNB2,CDK1,CHMP2A,CH..."
1705,Neuronal System,R-HSA-112316,0.284462,-0.000000,1.000000e+00,1.000000,1.000000,219,"SYN3,ADCY5,CAMK2D,PRKAG2,CHRNA4,ERBB4,SLC22A2,...","ABCC8,ABCC9,ACHE,ACTN2,ADCY1,ADCY2,ADCY3,ADCY4..."


In [10]:
# Add NES / FDR from the GSEA results to every pathway row (left join on ID).
# Both inputs carry a "Term" column, so pandas suffixes them; the left-hand one ("Term_x",
# i.e. the name from the pathway table) is kept and renamed back to "Term".
pathway_hieararchy_gsea = (
    pathway_hieararchy
    .merge(gsea_disease, how="left", left_on="ID", right_on="ID")
    .loc[:, ["ID", "parentId", "Term_x", "nes", "fdr"]]
    .rename(columns={"Term_x": "Term"})
)

# Plot parents that were not enriched in grey

In [60]:
# Keep only pathways with FDR below 0.001.
# Rows whose FDR is NaN (no GSEA result) compare as False and are dropped as well.
is_significant = pathway_hieararchy_gsea["fdr"].lt(0.001)
pathway_hieararchy_gsea_001 = pathway_hieararchy_gsea.loc[is_significant].copy()

In [61]:
# Reload the Reactome hierarchy table (same file as the earlier load in this notebook).
# With header=None the columns come back labelled 0 and 1 until they are renamed.
hieararchy = pd.read_csv(
    "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/Pathways_hierarchy_relationship.txt",
    sep="\t",
    header=None,
)

In [62]:
# Label the positional columns 0 / 1 as parentId / childId.
# If the columns already carry these names, the mapping matches nothing and this is a no-op.
hieararchy = hieararchy.rename(columns={0: "parentId", 1: "childId"})

In [63]:
# Extend the FDR-filtered table with every ancestor pathway that is missing from it, so each
# significant pathway can be followed upwards until no further parent is known.
# Requires `hieararchy` to be loaded with the columns ["parentId", "childId"].
# NOTE: the dict keeps a single parent per child; if a child appears in several rows of the
# hierarchy table, only the parent from its last row is used when climbing.
child_to_parent = dict(zip(hieararchy["childId"], hieararchy["parentId"]))

base_df = pathway_hieararchy_gsea_001.copy()

# IDs that already have a row (initially the filtered pathways, then also added ancestors).
existing_ids = set(base_df["ID"].astype(str))
rows_to_add = []

# Walk up from each row's direct parent; only string IDs are followed (NaN ends the climb).
for current_parent in base_df["parentId"]:
    visited = set()  # stops the climb if the hierarchy loops back on itself
    while isinstance(current_parent, str) and current_parent and current_parent not in visited:
        visited.add(current_parent)
        next_parent = child_to_parent.get(current_parent)
        if current_parent not in existing_ids:
            existing_ids.add(current_parent)
            # Placeholder ancestor row: no name yet, NES 0 and no FDR.
            rows_to_add.append(
                dict(
                    ID=current_parent,
                    parentId=next_parent if isinstance(next_parent, str) else np.nan,
                    Term=np.nan,
                    nes=0,
                    fdr=np.nan,
                )
            )
        current_parent = next_parent

# Only concatenate when at least one ancestor row was collected.
if not rows_to_add:
    pathway_hieararchy_gsea_001_expanded = base_df.copy()
else:
    ancestors_df = pd.DataFrame(
        rows_to_add,
        columns=["ID", "parentId", "Term", "nes", "fdr"],
    ).drop_duplicates()
    pathway_hieararchy_gsea_001_expanded = pd.concat([base_df, ancestors_df], ignore_index=True)

pathway_hieararchy_gsea_001_expanded

Unnamed: 0,ID,parentId,Term,nes,fdr
0,R-HSA-162582,,Signal Transduction,5.879025,0.000007
1,R-HSA-212436,R-HSA-73857,Generic Transcription Pathway,4.456707,0.000382
2,R-HSA-73857,R-HSA-74160,RNA Polymerase II Transcription,4.202785,0.000634
3,R-HSA-74160,,Gene expression (Transcription),4.022728,0.000928
4,R-HSA-1280215,R-HSA-168256,Cytokine Signaling in Immune system,4.523809,0.000357
...,...,...,...,...,...
226,R-HSA-5250941,R-HSA-212165,,0.000000,
227,R-HSA-1461973,R-HSA-6803157,,0.000000,
228,R-HSA-6803157,R-HSA-168249,,0.000000,
229,R-HSA-425397,R-HSA-425407,,0.000000,


In [64]:
# Look up pathway names for rows whose Term is NaN, using the complete pathway list.
# The file is read without a header; only its first two columns (ID, Term) are kept.
complete_path = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/complete_list_of_pathways.txt"
complete_df = pd.read_csv(
    complete_path,
    sep="\t",
    header=None,
    names=["ID", "Term", "extra"],
    dtype=str,
)[["ID", "Term"]]

# Work on the ancestor-expanded table when it exists, otherwise on the filtered table.
try:
    target_df = pathway_hieararchy_gsea_001_expanded.copy()
except NameError:
    target_df = pathway_hieararchy_gsea_001.copy()

# Compare IDs as strings on both sides.
target_df["ID"] = target_df["ID"].astype(str)
complete_df["ID"] = complete_df["ID"].astype(str)

# ID -> Term lookup; existing (non-NaN) Terms are left untouched.
term_map = complete_df.set_index("ID")["Term"].to_dict()
mask_na_term = target_df["Term"].isna()
target_df["Term"] = target_df["Term"].mask(mask_na_term, target_df["ID"].map(term_map))

# Same rows as before (no IDs added), now with names filled in where a match was found.
pathway_hieararchy_gsea_001_full = target_df
pathway_hieararchy_gsea_001_full


Unnamed: 0,ID,parentId,Term,nes,fdr
0,R-HSA-162582,,Signal Transduction,5.879025,0.000007
1,R-HSA-212436,R-HSA-73857,Generic Transcription Pathway,4.456707,0.000382
2,R-HSA-73857,R-HSA-74160,RNA Polymerase II Transcription,4.202785,0.000634
3,R-HSA-74160,,Gene expression (Transcription),4.022728,0.000928
4,R-HSA-1280215,R-HSA-168256,Cytokine Signaling in Immune system,4.523809,0.000357
...,...,...,...,...,...
226,R-HSA-5250941,R-HSA-212165,Negative epigenetic regulation of rRNA expression,0.000000,
227,R-HSA-1461973,R-HSA-6803157,Defensins,0.000000,
228,R-HSA-6803157,R-HSA-168249,Antimicrobial peptides,0.000000,
229,R-HSA-425397,R-HSA-425407,"Transport of vitamins, nucleosides, and relate...",0.000000,


In [65]:
# Merge NES/FDR from gsea_disease into pathway_hieararchy_gsea_001_full by ID
# Keep only rows that exist in pathway_hieararchy_gsea_001_full (left-merge)
# Fill missing NES with 0 and FDR with NaN

def merge_nes_fdr(path_df: pd.DataFrame, gsea_df: pd.DataFrame) -> pd.DataFrame:
    """Left-join GSEA ``nes`` / ``fdr`` values onto a pathway table by ``ID``.

    Only the ``ID``, ``nes`` and ``fdr`` columns of ``gsea_df`` are used (whichever of them
    exist). Every row of ``path_df`` is kept.

    Column naming:
      * If ``path_df`` has no ``nes`` / ``fdr`` columns, the GSEA values arrive as
        ``nes`` / ``fdr``.
      * If ``path_df`` already has them, its own columns keep their names and the GSEA
        values arrive as ``nes_gsea`` / ``fdr_gsea``.

    In both cases missing NES values are replaced with 0 and FDR is cast to float (missing
    FDR stays NaN). Previously this was applied only to ``nes`` / ``fdr``, so on a name
    collision the freshly merged ``nes_gsea`` column kept its NaNs.

    Parameters
    ----------
    path_df : pandas.DataFrame
        Pathway table with an ``ID`` column.
    gsea_df : pandas.DataFrame
        GSEA results with an ``ID`` column and, optionally, ``nes`` and ``fdr``.

    Returns
    -------
    pandas.DataFrame
        ``path_df`` with the GSEA statistics attached; always contains ``nes`` and ``fdr``.
    """
    wanted = ("ID", "nes", "fdr")
    gsea_min = gsea_df[[col for col in wanted if col in gsea_df.columns]].copy()
    out = path_df.merge(gsea_min, on="ID", how="left", suffixes=("", "_gsea"))

    # Guarantee the un-suffixed columns exist even when neither input provided them.
    for stat in ("nes", "fdr"):
        if stat not in out.columns:
            out[stat] = np.nan

    # Normalise both the plain and the suffixed variants, so the merged GSEA values are
    # cleaned regardless of which name they ended up under.
    for nes_col in ("nes", "nes_gsea"):
        if nes_col in out.columns:
            out[nes_col] = out[nes_col].fillna(0)
    for fdr_col in ("fdr", "fdr_gsea"):
        if fdr_col in out.columns:
            out[fdr_col] = out[fdr_col].astype(float)
    return out

# Attach the GSEA statistics to the name-completed pathway table.
pathway_hieararchy_gsea_001_full_nes = merge_nes_fdr(pathway_hieararchy_gsea_001_full, gsea_disease)

# The table already had nes/fdr columns, so the merged GSEA values carry the "_gsea" suffix.
# Select those and rename them back to plain nes/fdr for plotting.
plot = (
    pathway_hieararchy_gsea_001_full_nes
    .loc[:, ["Term", "ID", "nes_gsea", "fdr_gsea", "parentId"]]
    .rename(columns={"nes_gsea": "nes", "fdr_gsea": "fdr"})
)

In [151]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
# import pprint


def plot_sunburst(df, width=1100, height=800, show_labels=True, labels='all'):
    """Draw a sunburst of a pathway hierarchy, coloured by NES.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pathway with the columns ``ID``, ``parentId``, ``Term`` and ``nes``.
        The input is not modified. Rows are prepared as follows:

        * rows repeating an ``ID`` are dropped (first occurrence kept), because sunburst
          ids have to be unique for the hierarchy to be built;
        * a missing ``parentId``, or one that is not an ``ID`` in ``df``, is attached to a
          synthetic ``"ROOT"`` node;
        * a missing ``nes`` is treated as 0 (neutral colour, grey label);
        * a missing ``Term`` falls back to the pathway ``ID``.
    width, height : int
        Figure size in pixels.
    show_labels : bool
        ``False`` hides all sector labels, regardless of ``labels``.
    labels : str
        Which sectors are labelled (case-insensitive):
        ``'none'``/``'no'``/``'off'``/``'false'`` for no labels;
        ``'root'``/``'root_only'``/``'rootonly'``/``'first'``/``'rootlevel'``/``'root_level'``
        for only the sectors directly under the root; anything else labels every sector.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure produced by ``px.sunburst``.
    """
    # Work on a de-duplicated private copy so the caller's frame is never touched.
    df = df.drop_duplicates(subset="ID", keep="first").copy()

    # Attach parentless rows, and rows whose parent is not part of this table, to ROOT.
    df["parentId"] = df["parentId"].fillna("ROOT")
    valid_ids = set(df["ID"])
    is_orphan = ~df["parentId"].isin(valid_ids) & df["parentId"].ne("ROOT")
    df.loc[is_orphan, "parentId"] = "ROOT"

    # Pathways without a GSEA result count as "not enriched": NES 0.
    df["nes"] = df["nes"].astype(float).fillna(0.0)

    # Avoid literal "nan" labels / hover titles when a name could not be resolved.
    df["Term"] = df["Term"].fillna(df["ID"])
    df["wrapped_term"] = df["Term"]

    root = pd.DataFrame([{
        "ID": "ROOT",
        "parentId": "",
        "wrapped_term": "Reactome pathways",
        "Term": "Reactome pathways",
        "nes": 0
    }])
    df = pd.concat([df, root], ignore_index=True)
    is_root = df["ID"].eq("ROOT")

    color_scale = ['#0571b0', '#92c5de', '#f7f7f7', '#f4a582', '#ca0020']

    # Dark text for the root, grey for NES == 0 (not enriched), white on coloured sectors.
    text_colors = [
        '#2F4F4F' if root_row else ('#A9A9A9' if nes_val == 0 else 'white')
        for root_row, nes_val in zip(is_root, df["nes"])
    ]

    # The root sector itself is shown with a blank name.
    df.loc[is_root, "wrapped_term"] = " "

    # Resolve the two label switches into a single mode.
    if not show_labels:
        effective_labels = 'none'
    else:
        lbl = str(labels).strip().lower()
        if lbl in {'none', 'no', 'off', 'false'}:
            effective_labels = 'none'
        elif lbl in {'root', 'root_only', 'rootonly', 'first', 'rootlevel', 'root_level'}:
            effective_labels = 'root_only'
        else:
            effective_labels = 'all'

    if effective_labels == 'none':
        df['label_to_show'] = ''
    elif effective_labels == 'root_only':
        is_top_level = df['parentId'].eq('ROOT') & ~is_root
        df['label_to_show'] = df['wrapped_term'].where(is_top_level, '')
    else:
        df['label_to_show'] = df['wrapped_term'].where(~is_root, '')

    fig = px.sunburst(
        df,
        names="wrapped_term",
        ids="ID",
        parents="parentId",
        values=None,
        color="nes",
        color_continuous_scale=color_scale,
        color_continuous_midpoint=0,
        branchvalues='total',
        width=width,
        height=height
    )

    fig.update_layout(
        margin=dict(t=50, l=50, r=50, b=50),
        coloraxis_colorbar=dict(
            title=dict(text="NES", font=dict(size=16)),
            tickvals=[-5, -2.5, 0, 2.5, 5],
            tickfont=dict(size=14),
            ticks="outside",
            thickness=20,
            len=0.6
        ),
        font=dict(size=14)
    )

    # NOTE(review): the per-sector arrays below (text colours, customdata, text) are given in
    # `df` row order; this assumes the trace built by px.sunburst keeps that order — confirm.
    fig.update_traces(
        insidetextorientation='radial',
        textfont=dict(size=14, family="Arial", color=text_colors),
        hovertemplate='<b>%{customdata[0]}</b><br>NES: %{color:.2f}<extra></extra>',
        customdata=df[["Term"]],
        text=df['label_to_show'],
        textinfo='text',
        texttemplate='<b>%{text}</b>'
    )

    return fig


In [153]:
# Full sunburst of the prepared pathway table, every sector labelled.
fig2 = plot_sunburst(
    plot,
    width=1100,
    height=1100,
    labels='all',
)
fig2.show()

In [68]:
# The target file is the *no labels* variant, but `fig2` is built with labels='all'.
# Render a dedicated label-free figure so the image content matches its file name when the
# notebook is run top to bottom.
fig_no_labels = plot_sunburst(plot, width=1100, height=1100, show_labels=False)
fig_no_labels.write_image(
    "/Users/polina/genetics_gsea/data/sunburst/sunburst_plot_no_labels_v1.png",
    width=1100,
    height=1100,
    scale=2,
)

In [155]:
# Export the labelled figure as an HTML page; include_plotlyjs='cdn' is passed so the
# plotly.js library is not embedded in the file itself.
output_html = "/Users/polina/genetics_gsea/data/sunburst/sunburst_plot_all_labels_v1.html"
fig2.write_html(
    output_html,
    include_plotlyjs='cdn',
)
# Echo the path of the written file as the cell output.
output_html

'/Users/polina/genetics_gsea/data/sunburst/sunburst_plot_all_labels_v1.html'

### Plot local sunbursts

In [None]:
# NOTE(review): `df` is not defined at notebook level in the cells visible here (it is only a
# local name inside `plot_sunburst`), so this not-yet-executed cell will presumably raise
# NameError unless `df` is created elsewhere — confirm which (sub)table should be plotted.
plot_sunburst(df, width=1100, height=800, show_labels=True)