## Run blitzGSEA

In [53]:
import os
import pandas as pd
import blitzgsea as blitz

In [50]:
def load_custom_gmt(path):
    """
    Parse a GMT file into a dict: {term_name: [gene1, gene2, ...], …}
    """
    with open(path, 'r') as f:
        return {
            parts[0]: parts[2:]   # skip description at index 1
            for line in f
            if (parts := line.strip().split('\t')) and len(parts) > 2
        }


def run_gsea_pandas(input_tsv, gmt_file, output_tsv=None, processes=4):
    """
    Reads TSV, renames columns, runs GSEA with custom pathways, saves as TSV.
    
    Parameters
    ----------
    input_tsv : str
        Input file with at least 'symbol' and 'globalScore' columns.
    gmt_file : str
        Path to custom GMT file.
    processes : int
        Number of processes for GSEA.
    output_tsv : str or None
        Custom output filename. If None, defaults to <input_basename>_gsea.tsv
    """
    # Load library
    library_sets = load_custom_gmt(gmt_file)

    # Read input TSV
    df = pd.read_csv(input_tsv, sep="\t", header=0, index_col=None)
    
    # Create the expected format for blitz.gsea: columns [0, 1]
    gsea_df = pd.DataFrame()
    gsea_df[1] = df['symbol']  # gene symbols (matching GMT library)
    gsea_df[0] = pd.to_numeric(df['globalScore'], errors='coerce')  # scores
    
    # Drop rows with NaN scores
    gsea_df = gsea_df.dropna(subset=[0])
    
    print(f"GSEA input shape: {gsea_df.shape}")
    print(f"GSEA input columns: {gsea_df.columns.tolist()}")
    print(f"GSEA input sample:\n{gsea_df.head()}")
    
    # Run GSEA
    res_df = blitz.gsea(gsea_df, library_sets, processes=processes).reset_index(names="Term")

    # Add propagated_edge after GSEA computation is complete
    res_df["propagated_edge"] = res_df["Term"].apply(
        lambda t: ",".join(library_sets.get(t, [])) if library_sets.get(t) else ""
    )

    # Extract ID and clean Term
    term_series = res_df["Term"]
    res_df["ID"] = term_series.str.extract(r"\[([^\]]+)\]", expand=False).fillna("")
    res_df["Term"] = term_series.str.replace(r"\s*\[[^\]]+\]", "", regex=True).str.strip()

    # Ensure leading_edge is a string
    if "leading_edge" in res_df.columns:
        res_df["leading_edge"] = res_df["leading_edge"].apply(
            lambda x: ",".join(map(str, x)) if isinstance(x, (list, tuple)) else str(x)
        )

    # Reorder columns
    first_cols = ["Term", "ID"]
    res_df = res_df[first_cols + [c for c in res_df.columns if c not in first_cols]]

    # Determine output filename
    if output_tsv is None:
        output_tsv = f"{os.path.splitext(input_tsv)[0]}_gsea.tsv"

    # Save output
    res_df.to_csv(output_tsv, sep="\t", index=False)

    print(f"GSEA results saved to {output_tsv}")
    return res_df

### Reactome from database

In [43]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_ta_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/from_database/ta_zscore_reactome_2025.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1         0
0  CDKN2B  8.694865
1     ABO  8.224087
2     FTO  7.753310
3   SH2B3  6.811755
4    APOE  6.340978


OSError: Cannot save file into a non-existent directory: '/Users/polina/genetics_gsea/data/gsea/from_database/from_database'

In [44]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/from_database/disease_zscore_reactome_2025.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/from_database/disease_zscore_reactome_2025.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Signal Transduction,R-HSA-162582,0.476955,5.879025,4.126910e-09,0.000007,0.000007,1412,"CDKN2B,APOE,SH2B3,TERT,SMAD3,TCF7L2,ESR1,MYC,I...","AAMP,AATF,ABCA1,ABCD3,ABCG1,ABCG5,ABCG8,ABHD12..."
1,Fatty Acids bound to GPR40 (FFAR1) regulate in...,R-HSA-434316,-0.724533,-4.992017,5.975189e-07,0.001019,0.000214,5,"GNAQ,GNA15,PLCB2,PLCB3","GNA11,GNA14,GNA15,GNAQ,PLCB1,PLCB2,PLCB3"
2,Alpha-defensins,R-HSA-1462054,-0.700975,-4.899360,9.614920e-07,0.001640,0.000214,5,"DEFA4,DEFA1,DEFA5,DEFA3","CD4,DEFA1,DEFA3,DEFA4,DEFA5,DEFA6,PRSS2,PRSS3,env"
3,Adrenoceptors,R-HSA-390696,-0.696625,-4.882216,1.049003e-06,0.001789,0.000214,5,"ADRA1B,ADRA2B,ADRA1D","ADRA1B,ADRA1D,ADRA2A,ADRA2B,ADRA2C,ADRB1,ADRB2..."
4,RNA Pol II CTD phosphorylation and interaction...,R-HSA-167160,-0.608281,-4.862360,1.159944e-06,0.001978,0.000214,9,"GTF2H1,GTF2H5,POLR2B,MNAT1,ERCC3,CDK7,POLR2F","CDK7,ERCC2,ERCC3,GTF2F1,GTF2F2,GTF2H1,GTF2H2,G..."
...,...,...,...,...,...,...,...,...,...,...
1702,Nonsense-Mediated Decay (NMD),R-HSA-927802,0.327343,-0.000000,1.000000e+00,1.000000,1.000000,44,SMG6,"28S rRNA,5.8S rRNA,5S rRNA,CASC3,DCP1A,EIF4A3,..."
1703,Norepinephrine Neurotransmitter Release Cycle,R-HSA-181430,0.295406,-0.000000,1.000000e+00,1.000000,1.000000,11,SLC22A1,"MAOA,PPFIA1,PPFIA2,PPFIA3,PPFIA4,RAB3A,RIMS1,S..."
1704,Nuclear Envelope (NE) Reassembly,R-HSA-2995410,0.156919,-0.000000,1.000000e+00,1.000000,1.000000,32,,"ANKLE2,BANF1,CC2D1B,CCNB1,CCNB2,CDK1,CHMP2A,CH..."
1705,Neuronal System,R-HSA-112316,0.284462,-0.000000,1.000000e+00,1.000000,1.000000,219,"SYN3,ADCY5,CAMK2D,PRKAG2,CHRNA4,ERBB4,SLC22A2,...","ABCC8,ABCC9,ACHE,ACTN2,ADCY1,ADCY2,ADCY3,ADCY4..."


# From facets

In [57]:
def load_custom_gmt(path):
    """
    Parse a GMT file into a dict: {term_name: [gene1, gene2, ...], …}
    """
    with open(path, 'r') as f:
        return {
            parts[0]: parts[2:]   # skip description at index 1
            for line in f
            if (parts := line.strip().split('\t')) and len(parts) > 2
        }


def run_gsea_pandas_from_facets(input_tsv, gmt_file, output_tsv=None, processes=4):
    """
    Reads TSV, renames columns, runs GSEA with custom pathways, saves as TSV.
    
    Parameters
    ----------
    input_tsv : str
        Input file with at least 'symbol' and 'globalScore' columns.
    gmt_file : str
        Path to custom GMT file.
    processes : int
        Number of processes for GSEA.
    output_tsv : str or None
        Custom output filename. If None, defaults to <input_basename>_gsea.tsv
    """
    # Load library
    library_sets = load_custom_gmt(gmt_file)

    # Read input TSV
    df = pd.read_csv(input_tsv, sep="\t", header=0, index_col=None)
    
    # Create the expected format for blitz.gsea: columns [0, 1]
    gsea_df = pd.DataFrame()
    gsea_df[1] = df['symbol']  # gene symbols (matching GMT library)
    gsea_df[0] = pd.to_numeric(df['globalScore'], errors='coerce')  # scores
    
    # Drop rows with NaN scores
    gsea_df = gsea_df.dropna(subset=[0])
    
    print(f"GSEA input shape: {gsea_df.shape}")
    print(f"GSEA input columns: {gsea_df.columns.tolist()}")
    print(f"GSEA input sample:\n{gsea_df.head()}")
    
    # Run GSEA
    res_df = blitz.gsea(gsea_df, library_sets, processes=processes).reset_index(names="Term")

    # Add propagated_edge after GSEA computation is complete
    res_df["propagated_edge"] = res_df["Term"].apply(
        lambda t: ",".join(library_sets.get(t, [])) if library_sets.get(t) else ""
    )

    # Extract ID from { } and clean Term
    term_series = res_df["Term"]
    res_df["ID"] = term_series.str.extract(r"\{([^}]*)\}", expand=False).fillna("")
    res_df["Term"] = term_series.str.replace(r"\{[^}]*\}", "", regex=True).str.strip()

    # Ensure leading_edge is a string
    if "leading_edge" in res_df.columns:
        res_df["leading_edge"] = res_df["leading_edge"].apply(
            lambda x: ",".join(map(str, x)) if isinstance(x, (list, tuple)) else str(x)
        )

    # Reorder columns
    first_cols = ["Term", "ID"]
    res_df = res_df[first_cols + [c for c in res_df.columns if c not in first_cols]]

    # Determine output filename
    if output_tsv is None:
        output_tsv = f"{os.path.splitext(input_tsv)[0]}_gsea.tsv"

    # Save output
    res_df.to_csv(output_tsv, sep="\t", index=False)

    print(f"GSEA results saved to {output_tsv}")
    return res_df


## Disease pleiotropy score

In [59]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/Reactome.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_facets.tsv"

run_gsea_pandas_from_facets(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_facets.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Fatty Acids bound to GPR40 (FFAR1) regulate in...,R-HSA-434316,-0.724533,-4.992017,5.975189e-07,0.000698,0.000232,5,"GNAQ,GNA15,PLCB2,PLCB3","PLCB3,PLCB1,GNAQ,PLCB2,GNA15,FFAR1,GNA14"
1,Mitochondrial calcium ion transport,R-HSA-8949215,-0.637698,-4.936928,7.936274e-07,0.000927,0.000232,8,"SLC8A3,VDAC1,MICU2,MCU,AKAP1,SLC8B1,VDAC3","MICU1,LETM1,VDAC2,SLC8A3,MICU2,VDAC1,AKAP1,MCU..."
2,Defensins,R-HSA-1461973,-0.573524,-4.882541,1.047277e-06,0.001224,0.000232,12,"DEFA4,DEFB130A,DEFA3,DEFB131A,DEFB134,DEFB112,...","DEFB116,DEFB103A,DEFB1,DEFA1B,DEFB110,DEFA5,DE..."
3,RNA Polymerase III Transcription Initiation Fr...,R-HSA-76071,-0.622846,-4.871134,1.109597e-06,0.001296,0.000232,8,"POLR3K,ZNF143,POU2F1,SNAPC2,POLR2F","SNAPC1,SNAPC3,POLR3E,SNAPC2,POLR3H,POLR3C,POLR..."
4,mRNA Capping,R-HSA-72086,-0.587861,-4.826763,1.387704e-06,0.001621,0.000232,10,"GTF2H1,GTF2H5,POLR2B,MNAT1,CCNH,ERCC3,CDK7,POLR2F","ERCC3,MNAT1,CCNH,GTF2F1,ERCC2,POLR2I,GTF2H2,GT..."
...,...,...,...,...,...,...,...,...,...,...
1164,Separation of Sister Chromatids,R-HSA-2467813,0.338843,-0.000000,1.000000e+00,1.000000,1.000000,70,"PSMA4,SPC24,NUF2,STAG1,SPDL1,ANAPC1,PSMD13,PTT...","PDS5B,PSMB5,ADRM1,SPDL1,PSMD3,KNL1,RPS27,ZW10,..."
1165,Sensory processing of sound by outer hair cell...,R-HSA-9662361,0.262286,-0.000000,1.000000e+00,1.000000,1.000000,23,"MYH9,RIPOR2,SPTBN1,EPB41L1,SLC26A5,TRIOBP","TPRN,TMC2,GRXCR2,TMC1,MYO3A,PJVK,CIB2,TWF2,CAS..."
1166,Sema4D mediated inhibition of cell attachment ...,R-HSA-416550,0.270011,-0.000000,1.000000e+00,1.000000,1.000000,5,,"RAC1,ARHGAP35,RND1,RRAS,RHOA,MET,PLXNB1"
1167,IRAK2 mediated activation of TAK1 complex upon...,R-HSA-975163,0.366307,-0.000000,1.000000e+00,1.000000,1.000000,7,"TAB2,TICAM1","UBB,MAP3K7,LY96,TAB2,TICAM1,TRAF6,TLR4,UBA52,T..."


In [60]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/ChEMBL Target Class.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_chemblTclass.tsv"

run_gsea_pandas_from_facets(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_chemblTclass.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,GABA-A receptor,,-0.654306,-4.427678,0.000010,0.001399,0.000835,6,"GABRB2,GABRG2,GABRA1,GABRB1,GABRP,GABRA2","GABRA6,GABRA3,GABRD,GABRA2,GABRB2,GABRB3,GABRA..."
1,Serine/threonine protein phosphatase,,-0.466958,-4.219841,0.000024,0.003587,0.000835,9,"PPM1G,PPM1A,PPP5C,PPP1CB,AKT1,PPM1D,PPP3CA,PPP2CA","PPM1D,UBLCP1,PPM1G,PPP2CB,PPP5C,PPP1CA,PPP3CB,..."
2,Nucleotide-like receptor (family A GPCR),,-0.463090,-4.162250,0.000032,0.004622,0.000835,8,"P2RY11,P2RY14,P2RY12,P2RY13","P2RY13,ADORA3,P2RY11,ADORA2A,ADORA1,P2RY8,P2RY..."
3,Purine receptor,,-0.510749,-4.148172,0.000034,0.004915,0.000835,5,"P2RY1,P2RY12,P2RY2,P2RY11,P2RY13","P2RY13,P2RY4,P2RY11,P2RY1,P2RY8,P2RY12,P2RY2,P..."
4,Ionotropic glutamate receptor,,-0.432883,-4.106876,0.000040,0.005878,0.000835,10,"GRIK4,CACNG2,GRIK1,GRIK3","GRIA3,GRIN2B,GRIN3B,GRIA2,CACNG8,CACNG2,GRID2,..."
...,...,...,...,...,...,...,...,...,...,...
142,Epigenetic regulator,,0.275472,-0.000000,1.000000,1.000000,1.000000,98,"BRWD1,SMARCA4,KMT2E,ZMYND8,HDAC9,JMJD1C,DOT1L,...","KAT2B,CECR2,FEM1B,DNMT3B,MBTD1,PHLPP1,TAF3,BPT..."
143,Electrochemical transporter,,0.241674,-0.000000,1.000000,1.000000,1.000000,70,"SLC22A5,SLC22A1,SLC22A4,SLC22A3,SLC2A9,SLC22A2...","SLC22A6,SLC25A6,SLC18A3,SLC11A1,SLC6A11,SLC5A1..."
144,STE protein kinase STE7 family,,0.282884,-0.000000,1.000000,1.000000,1.000000,6,"MAP2K4,MAP2K1,MAP2K3,MAP2K6,MAP2K7","MAP2K2,MAP2K1,MAP2K3,CRBN,MAP2K7,MAP2K4,MAP2K6"
145,Protein kinase regulatory subunit,,0.249383,-0.000000,1.000000,1.000000,1.000000,12,PRKAG2,"CDC7,PRKAR2B,PIK3CB,CDK17,CDK12,IKBKG,PRKAR1A,..."


In [61]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/GO:BP.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_gobp.tsv"

run_gsea_pandas_from_facets(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_gobp.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,positive regulation of transcription by RNA po...,GO:0045944,0.573947,5.898197,3.674941e-09,0.000012,0.000012,740,"SLC30A9,ISL1,BMP4,SMAD3,TCF7L2,AHI1,TNFSF11,IR...","SOX4,NFKBIZ,E2F7,PRKD2,CD28,RPS6KA3,RELB,CASK,..."
1,regulation of transcription by RNA polymerase II,GO:0006357,0.549088,5.708157,1.142059e-08,0.000037,0.000019,780,"SMAD3,TCF7L2,PHF2,IRF4,ESR1,ZFPM2,POU5F1B,FOXP...","HES5,FOXS1,ZNF587,MED20,LRRFIP1,MXD3,ZNF644,JA..."
2,positive regulation of DNA-templated transcrip...,GO:0045893,0.619666,5.290644,1.218861e-07,0.000396,0.000080,466,"APOE,BMP4,SMAD3,PHF2,IRF4,CHEK2,ESR1,ZFPM2,MYC...","HES5,ELK1,SOX4,FOS,MAZ,ZNF281,TP73,LEF1,TAF5L,..."
3,subtelomeric heterochromatin formation,GO:0031509,-0.725472,-5.258849,1.449602e-07,0.000471,0.000080,7,"RIF1,HAT1,H3-3A,EZH1,SIRT1","EZH1,SIRT2,ATRX,SIRT1,H3-3A,H3-3B,RIF1,SIRT6,H..."
4,receptor guanylyl cyclase signaling pathway,GO:0007168,-0.787093,-5.236157,1.639547e-07,0.000533,0.000080,5,"NPR1,NPPA,GUCY2C,GNG7","GUCY2F,NHERF4,GUCA1B,NPPA,NPR2,GUCY2D,NPR1,GUC..."
...,...,...,...,...,...,...,...,...,...,...
3247,clathrin-dependent endocytosis,GO:0072583,0.283384,-0.000000,1.000000e+00,1.000000,1.000000,15,"PICALM,SGIP1,SNAP91,DLL1,MAGI2","INPP5F,MAGI2,CLTA,FCHSD2,AP2A1,SNAP91,SCYL2,CL..."
3248,corpus callosum development,GO:0022038,0.247929,-0.000000,1.000000e+00,1.000000,1.000000,11,,"RTN4R,CDK5,RYK,KCNC1,NSUN5,HERC1,PTPRS,RTN4RL1..."
3249,cortical actin cytoskeleton organization,GO:0030866,0.242813,-0.000000,1.000000e+00,1.000000,1.000000,13,AKAP11,"PLEK,LLGL1,EHD2,FMNL3,ROCK2,FMNL2,EPB41L1,EPB4..."
3250,cellular response to potassium ion,GO:0035865,0.325002,-0.000000,1.000000e+00,1.000000,1.000000,11,,"HSF1,NEK7,SLC12A2,CYP11B2,STK39,CYP11B1,ABCC9,..."


In [62]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/GO:CC.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_gocc.tsv"

run_gsea_pandas_from_facets(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_gocc.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,nucleoplasm,GO:0005654,0.414334,10.928300,8.441534e-28,4.904531e-25,,1899,"FTO,ARL14EP,TERT,SMAD3,TCF7L2,EMSY,IRF4,CHEK2,...","SLC35B1,USP13,ORC1,SOX4,MED20,MAPKBP1,E2F7,STY..."
1,chromatin,GO:0000785,0.582169,5.399043,6.699722e-08,3.892463e-05,,640,"ISL1,SMAD3,TCF7L2,IRF4,ESR1,ZFPM2,POU5F1B,FOXP...","HES5,FOXS1,SOX4,MXD3,LBX1,E2F7,RELB,HOXB13,GCM..."
2,inhibitory synapse,GO:0060077,-0.722929,-5.248021,1.537415e-07,8.931985e-05,,7,"IGSF21,GAD1,SLC32A1,IQSEC3,GLRA1,GABRA2","SLC32A1,GIT1,MAF1,GABRA2,GAD1,IQSEC3,DTNB,SYT1..."
3,TRAPP complex,GO:0030008,-0.786715,-5.234690,1.652618e-07,9.601251e-05,,5,"TRAPPC9,TRAPPC10,TRAPPC11,TRAPPC3,TRAPPC4","TRAPPC8,TRAPPC6A,TRAPPC2L,TRAPPC11,TRAPPC2B,TR..."
4,TRAPPII protein complex,GO:1990071,-0.719928,-4.973931,6.560863e-07,3.811136e-04,,5,"TRAPPC9,TRAPPC13,TRAPPC10,TRAPPC3,TRAPPC4","TRAPPC3,TRAPPC10,TRAPPC2,TRAPPC14,TRAPPC9,TRAP..."
...,...,...,...,...,...,...,...,...,...,...
576,cytoplasm,GO:0005737,0.341400,inf,,,,3476,"CDKN2B,FTO,APOE,TERT,SMAD3,PTPN22,IRF4,CHEK2,L...","MTMR11,MAP2,SOX4,MAPKBP1,GNPNAT1,STYX,HAL,PCBP..."
577,cytosol,GO:0005829,0.364897,inf,,,,2528,"CDKN2B,FTO,SH2B3,TERT,SMAD3,PTPN22,IRF4,LPP,ES...","USP13,ORC1,MAP2,LRRFIP1,GNPNAT1,CHODL,CD28,DNA..."
578,membrane,GO:0016020,0.264552,inf,,,,3586,"APOE,SLC39A8,IRF4,LPP,ESR1,CLPTM1L,DCC,IL2RA,V...","MTMR11,IGKV6D-41,GNPNAT1,SLC39A3,CD28,HTR3E,FL..."
579,nucleus,GO:0005634,0.408129,inf,,,,2964,"CDKN2B,FTO,APOE,TERT,SMAD3,TCF7L2,PTPN22,IRF4,...","USP13,ORC1,SOX4,LRRFIP1,CDKL3,CHCHD10,NFKBIZ,L..."


In [63]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/GO:MF.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_gomf.tsv"

run_gsea_pandas_from_facets(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_gomf.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,metal ion binding,GO:0046872,0.382125,6.921980,4.453741e-12,3.745596e-09,3.745596e-09,1791,"FTO,RPH3A,ADCY5,ISL1,TERT,SMAD3,PHF2,CHEK2,LPP...","USP13,ORC1,ZNF610,DNAJA2,TPO,TYROBP,FAN1,CSGAL..."
1,DNA binding,GO:0003677,0.505862,5.372407,7.769241e-08,6.533718e-05,2.305287e-05,1036,"TERT,SMAD3,TCF7L2,IRF4,ESR1,ZFPM2,POU5F1B,FOXP...","ZNF587,ORC1,SOX4,LRRFIP1,NFKBIZ,LBX1,E2F7,ZNF6..."
2,RNA polymerase II cis-regulatory region sequen...,GO:0000978,0.588377,5.268912,1.372346e-07,1.154076e-04,2.305287e-05,558,"ISL1,SMAD3,TCF7L2,IRF4,ESR1,POU5F1B,FOXP1,SATB...","HES5,FOXS1,ZNF587,SOX4,LRRFIP1,MXD3,ZNF644,NFK..."
3,glutathione binding,GO:0043295,-0.794098,-5.263293,1.414976e-07,1.189924e-04,2.305287e-05,5,"GSTM4,MGST2,LANCL1,MMACHC","GSTM4,LANCL1,GSTM1,MGST2,GSTM3,MMACHC,GSTM2,PT..."
4,molecular carrier activity,GO:0140104,-0.788073,-5.239956,1.606152e-07,1.350682e-04,2.305287e-05,5,"GOLGA1,CD14,DISP1","MTRR,RBP4,RBP2,MMADHC,MMAA,STARD7,GOLGA1,SERPI..."
...,...,...,...,...,...,...,...,...,...,...
836,protein kinase C binding,GO:0005080,0.312573,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,31,"HDAC9,HDAC7,IRS1","DACT2,GRK5,RACK1,HDAC7,PICK1,CCDC88A,MARCKS,IR..."
837,protein kinase activator activity,GO:0030295,0.316878,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,24,"INSR,MOB3B,PRKAG2","AJUBA,RPTOR,MOB3A,SPDYA,RPLP1,GPRC5A,NCKAP1L,P..."
838,cell adhesion molecule binding,GO:0050839,0.338404,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,56,"PTPRF,NRXN3,CD226,TENM2,FGA,CXADR,FGG,PTPN11,S...","PCDHA11,PCDHA7,POSTN,PCDHGA4,AFDN,CDHR4,CD1D,M..."
839,clathrin binding,GO:0030276,0.181293,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,21,"PICALM,BIN1,SNAP91,AP4B1","DNAJC6,BIN1,AFTPH,AP1B1,ENTHD1,LDLRAP1,PIK3C2A..."


In [64]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/Subcellular Location.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_subcell.tsv"

run_gsea_pandas_from_facets(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_subcell.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Cytosol,SL-0091,0.340561,inf,0.000000,0.000000,,2345,"CDKN2B,FTO,TACC3,ARL14EP,TERT,TCF7L2,IRF4,LPP,...","MAP2,LRRFIP1,CHODL,LBX1,DNAJA2,HAL,PARP11,PCBP..."
1,Nucleus,SL-0191,0.424045,inf,0.000000,0.000000,,2276,"FTO,TERT,SMAD3,TCF7L2,IRF4,CHEK2,LPP,ESR1,POU5...","MED20,ORC1,SOX4,LRRFIP1,MAPKBP1,NFKBIZ,LBX1,E2..."
2,[Isoform 1]: Membrane,SL-0162,-0.541298,-4.462265,0.000008,0.001127,,7,"STX1B,CMTM8,MUC3A,C3orf33,F3","CMTM8,STX1B,MUC3A,CD300H,EPGN,PAM,C3orf33,F3,S..."
3,Nucleus inner membrane,SL-0179,-0.378960,-3.997301,0.000064,0.008866,,15,"DPY19L2,SPAG4,SUN5,TOR1AIP1,SUN1,ARL6IP6,TERB2...","LRPPRC,MFSD10,TMEM43,EMD,TMEM201,NEMP2,LEMD3,I..."
4,Sarcoplasmic reticulum lumen,SL-0240,-0.444597,-3.894866,0.000098,0.013565,,5,"CALR,HSP90B1,CASQ2,MANF","HRC,HSP90B1,CASQ2,CALU,MANF,CASQ1,CALR"
...,...,...,...,...,...,...,...,...,...,...
134,Lysosome,SL-0158,0.356721,-0.000000,1.000000,1.000000,,57,"MANBA,INSR,CXCR4,CTSH,TNFAIP3,PCSK9,CTSS","CTSK,PIP4K2A,ACP5,LAMTOR5,RAB8A,ZNRF1,ARSG,RAB..."
135,Peroxisome membrane,SL-0203,0.197654,-0.000000,1.000000,1.000000,,16,"TMEM135,PEX14,HMGCR","PEX19,HMGCR,PEX16,ACBD5,PJVK,TMEM135,SLC25A17,..."
136,[Isoform 7]: Cytoplasm,SL-0086,0.366022,-0.000000,1.000000,1.000000,,7,SEMA6D,"CFAP91,FLT1,PDE4A,ENDOV,DST,CLEC7A,EPM2A,NF2,S..."
137,Cytoplasm,SL-0086,0.319696,inf,,,,2548,"CDKN2B,TERT,SMAD3,PTPN22,LPP,ESR1,POU5F1B,MYC,...","POLDIP3,KCNE3,USP13,MAP2,MAPKBP1,LRRFIP1,CDKL3..."


## TA pleiotropy score

In [None]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_2025.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

# Merge and filter by fdr

In [65]:
import os
import pandas as pd

def merge_and_filter_tsv(folder_path, output_file="merged_filtered.xlsx", fdr_threshold=0.001):
    """
    Merges all TSV files from a folder into one DataFrame, adds 'gene_set' column from filename,
    filters by FDR column, and saves both merged + filtered tables into an Excel file.
    """
    all_dataframes = []
    
    for file in os.listdir(folder_path):
        if file.endswith(".tsv"):
            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path, sep="\t")
            
            if "fdr" not in df.columns:
                raise ValueError(f"'fdr' column not found in {file}")
            
            gene_set = file.replace("disease_zscore_", "").replace(".tsv", "")
            df["gene_set"] = gene_set
            all_dataframes.append(df)
    
    if not all_dataframes:
        raise ValueError("⚠️ No TSV files found in the folder.")
    
    # Merge all files (row-wise concat by column names)
    merged_df = pd.concat(all_dataframes, ignore_index=True)
    
    # Filter rows
    filtered_df = merged_df[merged_df["fdr"] <= fdr_threshold]
    
    # Save to Excel with two sheets
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, sheet_name="merged_all", index=False)
        filtered_df.to_excel(writer, sheet_name="filtered", index=False)
    
    print(f"✅ File saved as: {output_file}")
    print(f"   - Sheet 'merged_all': {merged_df.shape[0]} rows")
    print(f"   - Sheet 'filtered': {filtered_df.shape[0]} rows")


In [66]:
folder_path = "/Users/polina/genetics_gsea/data/gsea/from_facets_disease"
output_file = "/Users/polina/genetics_gsea/data/gsea_merged/from_facets_disease/disease_zscore_gsea.xlsx"

merge_and_filter_tsv(folder_path, output_file, fdr_threshold=0.001)

✅ File saved as: /Users/polina/genetics_gsea/data/gsea_merged/from_facets_disease/disease_zscore_gsea.xlsx
   - Sheet 'merged_all': 7836 rows
   - Sheet 'filtered': 383 rows
