## Run blitzGSEA

In [1]:
import os
import pandas as pd
import blitzgsea as blitz

In [8]:
def load_custom_gmt(path):
    """
    Parse a GMT file into a dict: {term_name: [gene1, gene2, ...], …}
    """
    with open(path, 'r') as f:
        return {
            parts[0]: parts[2:]   # skip description at index 1
            for line in f
            if (parts := line.strip().split('\t')) and len(parts) > 2
        }


def run_gsea_pandas(input_tsv, gmt_file, processes=4):
    """
    Reads TSV, renames columns, runs GSEA with custom pathways, saves as TSV.
    """
    # Load library
    library_sets = load_custom_gmt(gmt_file)

    # Read input TSV
    df = pd.read_csv(input_tsv, sep="\t", header=0, index_col=None)
    
    # Create the expected format for blitz.gsea: columns [0, 1]
    # Column 0: scores, Column 1: gene symbols
    gsea_df = pd.DataFrame()
    gsea_df[1] = df['symbol']  # gene symbols (matching GMT library)
    gsea_df[0] = pd.to_numeric(df['globalScore'], errors='coerce')  # scores
    
    # Drop rows with NaN scores
    gsea_df = gsea_df.dropna(subset=[0])
    
    print(f"GSEA input shape: {gsea_df.shape}")
    print(f"GSEA input columns: {gsea_df.columns.tolist()}")
    print(f"GSEA input sample:\n{gsea_df.head()}")
    
    # Run GSEA with the properly formatted dataframe
    res_df = blitz.gsea(gsea_df, library_sets, processes=processes).reset_index(names="Term")

    # Add propagated_edge after GSEA computation is complete
    res_df["propagated_edge"] = res_df["Term"].apply(
        lambda t: ",".join(library_sets.get(t, [])) if library_sets.get(t) else ""
    )

    # Extract ID and clean Term
    term_series = res_df["Term"]
    res_df["ID"] = term_series.str.extract(r"\[([^\]]+)\]", expand=False).fillna("")
    res_df["Term"] = term_series.str.replace(r"\s*\[[^\]]+\]", "", regex=True).str.strip()

    # Ensure leading_edge is a string
    if "leading_edge" in res_df.columns:
        res_df["leading_edge"] = res_df["leading_edge"].apply(
            lambda x: ",".join(map(str, x)) if isinstance(x, (list, tuple)) else str(x)
        )

    # Reorder columns
    first_cols = ["Term", "ID"]
    res_df = res_df[first_cols + [c for c in res_df.columns if c not in first_cols]]

    # Save output
    output_path = f"{os.path.splitext(input_tsv)[0]}_gsea.tsv"
    res_df.to_csv(output_path, sep="\t", index=False)

    print(f"GSEA results saved to {output_path}")
    return res_df

In [9]:
scores = "data/input/geneset_ta_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"

run_gsea_pandas(scores, library, processes=4)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1         0
0  CDKN2B  8.694865
1     ABO  8.224087
2     FTO  7.753310
3   SH2B3  6.811755
4    APOE  6.340978
GSEA results saved to data/input/geneset_ta_zscore_gsea.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Signal Transduction,R-HSA-162582,0.398511,6.495889,8.254439e-11,1.409033e-07,1.409033e-07,1412,"CDKN2B,SH2B3,APOE,SMAD3,TCF7L2,VEGFA,PPARG,NCA...","AAMP,AATF,ABCA1,ABCD3,ABCG1,ABCG5,ABCG8,ABHD12..."
1,Cytokine Signaling in Immune system,R-HSA-1280215,0.510645,5.155147,2.534324e-07,4.325155e-04,1.493192e-04,412,"SH2B3,SMAD3,IFNAR2,NOD2,VEGFA,IRF1,IRF7,FANCA,...","AAAS,ABCE1,ABL2,ADAM17,ADAR,ADRM1,AGER,AIP,AKT..."
2,HIV Transcription Initiation,R-HSA-167161,-0.525742,-4.961420,6.997971e-07,1.193841e-03,1.493192e-04,16,"MNAT1,CDK7,GTF2A2,ERCC3,TAF4,GTF2H5,GTF2H1,GTF...","CDK7,ERCC2,ERCC3,GTF2A1,GTF2A2,GTF2B,GTF2E1,GT..."
3,RNA Polymerase II Transcription Pre-Initiation...,R-HSA-73779,-0.525742,-4.961420,6.997971e-07,1.193841e-03,1.493192e-04,16,"MNAT1,CDK7,GTF2A2,ERCC3,TAF4,GTF2H5,GTF2H1,GTF...","CDK7,ERCC2,ERCC3,GTF2A1,GTF2A2,GTF2B,GTF2E1,GT..."
4,RNA Polymerase II Transcription Initiation And...,R-HSA-76042,-0.525742,-4.961420,6.997971e-07,1.193841e-03,1.493192e-04,16,"MNAT1,CDK7,GTF2A2,ERCC3,TAF4,GTF2H5,GTF2H1,GTF...","CDK7,ERCC2,ERCC3,GTF2A1,GTF2A2,GTF2B,GTF2E1,GT..."
...,...,...,...,...,...,...,...,...,...,...
1702,NRIF signals cell death from the nucleus,R-HSA-205043,0.322152,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,6,,"APH1B,ITGB3BP,MAPK8,NCSTN,NGF,NGFR,PSEN1,PSEN2..."
1703,Neddylation,R-HSA-8951664,0.135931,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,91,"PSMA4,FBXL22,KLHL25,CDKN1A,BTRC,KEAP1,LMO7,SPS...","AMER1,ANKRD9,ASB1,ASB10,ASB11,ASB12,ASB13,ASB1..."
1704,Negative regulation of NMDA receptor-mediated ...,R-HSA-9617324,0.199347,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,13,"DLG3,CAMK2B,CAMK2D,GRIN2B,DLG2,CAMK2G,DLG1,LRR...","CALM1,CAMK1,CAMK2A,CAMK2B,CAMK2D,CAMK2G,CAMK4,..."
1705,Peptide ligand-binding receptors,R-HSA-375276,0.269567,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,78,"MLN,XCR1,CCR1,RXFP2,MC4R,CXCR4,EDNRA,POMC,F2,N...","ACKR2,ACKR3,ACKR4,AGT,AGTR1,AGTR2,ANXA1,APLN,A..."


In [10]:
scores = "data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"

run_gsea_pandas(scores, library, processes=4)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to data/input/geneset_disease_zscore_gsea.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Signal Transduction,R-HSA-162582,0.476955,5.709466,1.133311e-08,0.000019,0.000019,1412,"CDKN2B,APOE,SH2B3,TERT,SMAD3,TCF7L2,ESR1,MYC,I...","AAMP,AATF,ABCA1,ABCD3,ABCG1,ABCG5,ABCG8,ABHD12..."
1,Fatty Acids bound to GPR40 (FFAR1) regulate in...,R-HSA-434316,-0.724533,-5.503425,3.724821e-08,0.000064,0.000021,5,"GNAQ,PLCB3,PLCB2,GNA15","GNA11,GNA14,GNA15,GNAQ,PLCB1,PLCB2,PLCB3"
2,Alpha-defensins,R-HSA-1462054,-0.700975,-5.417470,6.044838e-08,0.000103,0.000021,5,"DEFA4,DEFA1,DEFA5,DEFA3","CD4,DEFA1,DEFA3,DEFA4,DEFA5,DEFA6,PRSS2,PRSS3,env"
3,Adrenoceptors,R-HSA-390696,-0.696625,-5.401593,6.605167e-08,0.000113,0.000021,5,"ADRA1B,ADRA2B,ADRA1D","ADRA1B,ADRA1D,ADRA2A,ADRA2B,ADRA2C,ADRB1,ADRB2..."
4,DAG1 glycosylations,R-HSA-8931838,-0.691286,-5.382105,7.361963e-08,0.000126,0.000021,5,"FKTN,POMGNT1,CRPPA,CHST10","B4GAT1,CHST10,CRPPA,DAG1,FKRP,FKTN,LARGE1,LARG..."
...,...,...,...,...,...,...,...,...,...,...
1702,Nonsense Mediated Decay (NMD) enhanced by the ...,R-HSA-975957,0.327343,-0.000000,1.000000e+00,1.000000,1.000000,44,SMG6,"28S rRNA,5.8S rRNA,5S rRNA,CASC3,DCP1A,EIF4A3,..."
1703,Nonsense-Mediated Decay (NMD),R-HSA-927802,0.327343,-0.000000,1.000000e+00,1.000000,1.000000,44,SMG6,"28S rRNA,5.8S rRNA,5S rRNA,CASC3,DCP1A,EIF4A3,..."
1704,Norepinephrine Neurotransmitter Release Cycle,R-HSA-181430,0.295406,-0.000000,1.000000e+00,1.000000,1.000000,11,SLC22A1,"MAOA,PPFIA1,PPFIA2,PPFIA3,PPFIA4,RAB3A,RIMS1,S..."
1705,Nuclear Envelope Breakdown,R-HSA-2980766,0.316954,-0.000000,1.000000e+00,1.000000,1.000000,27,NEK6,"BANF1,CCNB1,CCNB2,CDK1,CNEP1R1,CTDNEP1,EMD,LEM..."
