## Run blitzGSEA

In [3]:
import os
import pandas as pd
import blitzgsea as blitz

In [12]:
def load_custom_gmt(path):
    """
    Parse a GMT file into a dict: {term_name: [gene1, gene2, ...], …}
    """
    with open(path, 'r') as f:
        return {
            parts[0]: parts[2:]   # skip description at index 1
            for line in f
            if (parts := line.strip().split('\t')) and len(parts) > 2
        }


def run_gsea_pandas(input_tsv, gmt_file, output_tsv=None, processes=4):
    """
    Reads TSV, renames columns, runs GSEA with custom pathways, saves as TSV.
    
    Parameters
    ----------
    input_tsv : str
        Input file with at least 'symbol' and 'globalScore' columns.
    gmt_file : str
        Path to custom GMT file.
    processes : int
        Number of processes for GSEA.
    output_tsv : str or None
        Custom output filename. If None, defaults to <input_basename>_gsea.tsv
    """
    # Load library
    library_sets = load_custom_gmt(gmt_file)

    # Read input TSV
    df = pd.read_csv(input_tsv, sep="\t", header=0, index_col=None)
    
    # Create the expected format for blitz.gsea: columns [0, 1]
    gsea_df = pd.DataFrame()
    gsea_df[1] = df['symbol']  # gene symbols (matching GMT library)
    gsea_df[0] = pd.to_numeric(df['globalScore'], errors='coerce')  # scores
    
    # Drop rows with NaN scores
    gsea_df = gsea_df.dropna(subset=[0])
    
    print(f"GSEA input shape: {gsea_df.shape}")
    print(f"GSEA input columns: {gsea_df.columns.tolist()}")
    print(f"GSEA input sample:\n{gsea_df.head()}")
    
    # Run GSEA
    res_df = blitz.gsea(gsea_df, library_sets, processes=processes).reset_index(names="Term")

    # Add propagated_edge after GSEA computation is complete
    res_df["propagated_edge"] = res_df["Term"].apply(
        lambda t: ",".join(library_sets.get(t, [])) if library_sets.get(t) else ""
    )

    # Extract ID and clean Term
    term_series = res_df["Term"]
    res_df["ID"] = term_series.str.extract(r"\[([^\]]+)\]", expand=False).fillna("")
    res_df["Term"] = term_series.str.replace(r"\s*\[[^\]]+\]", "", regex=True).str.strip()

    # Ensure leading_edge is a string
    if "leading_edge" in res_df.columns:
        res_df["leading_edge"] = res_df["leading_edge"].apply(
            lambda x: ",".join(map(str, x)) if isinstance(x, (list, tuple)) else str(x)
        )

    # Reorder columns
    first_cols = ["Term", "ID"]
    res_df = res_df[first_cols + [c for c in res_df.columns if c not in first_cols]]

    # Determine output filename
    if output_tsv is None:
        output_tsv = f"{os.path.splitext(input_tsv)[0]}_gsea.tsv"

    # Save output
    res_df.to_csv(output_tsv, sep="\t", index=False)

    print(f"GSEA results saved to {output_tsv}")
    return res_df

### Reactome from database

In [14]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_ta_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/ta_zscore_reactome_2025.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1         0
0  CDKN2B  8.694865
1     ABO  8.224087
2     FTO  7.753310
3   SH2B3  6.811755
4    APOE  6.340978
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/ta_zscore_reactome_2025.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Signal Transduction,R-HSA-162582,0.398511,6.473849,9.553729e-11,1.630821e-07,1.630822e-07,1412,"CDKN2B,SH2B3,APOE,SMAD3,TCF7L2,VEGFA,PPARG,NCA...","AAMP,AATF,ABCA1,ABCD3,ABCG1,ABCG5,ABCG8,ABHD12..."
1,Immune System,R-HSA-168256,0.390655,5.198059,2.013799e-07,3.436965e-04,1.157133e-04,973,"SH2B3,ANAPC1,SMAD3,IFNAR2,PTPN22,NOD2,VEGFA,IR...","8,9b,A1BG,AAAS,AAMP,ABCA13,ABCE1,ABI1,ABI2,ABL..."
2,Cytokine Signaling in Immune system,R-HSA-1280215,0.510645,5.196237,2.033626e-07,3.470797e-04,1.157133e-04,412,"SH2B3,SMAD3,IFNAR2,NOD2,VEGFA,IRF1,IRF7,FANCA,...","AAAS,ABCE1,ABL2,ADAM17,ADAR,ADRM1,AGER,AIP,AKT..."
3,G1/S-Specific Transcription,R-HSA-69205,-0.546531,-4.639686,3.489383e-06,5.938684e-03,5.863855e-04,11,"FBXO5,LIN54,TFDP2,CCNE1,CDC25A,CDK1,E2F1","CCNE1,CDC25A,CDC45,CDC6,CDK1,CDT1,DHFR,E2F1,E2..."
4,RNA Polymerase II Transcription Pre-Initiation...,R-HSA-73779,-0.525742,-4.622514,3.791169e-06,6.450642e-03,5.863855e-04,16,"MNAT1,CDK7,GTF2A2,ERCC3,TAF4,GTF2H5,GTF2H1,GTF...","CDK7,ERCC2,ERCC3,GTF2A1,GTF2A2,GTF2B,GTF2E1,GT..."
...,...,...,...,...,...,...,...,...,...,...
1702,NRIF signals cell death from the nucleus,R-HSA-205043,0.322152,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,6,,"APH1B,ITGB3BP,MAPK8,NCSTN,NGF,NGFR,PSEN1,PSEN2..."
1703,Neddylation,R-HSA-8951664,0.135931,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,91,"PSMA4,FBXL22,KLHL25,BTRC,CDKN1A,KEAP1,LMO7,SPS...","AMER1,ANKRD9,ASB1,ASB10,ASB11,ASB12,ASB13,ASB1..."
1704,Negative regulation of MAPK pathway,R-HSA-5675221,0.305555,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,25,"DUSP10,MAPK3,MARK3","BRAF,BRAP,DUSP1,DUSP10,DUSP16,DUSP2,DUSP4,DUSP..."
1705,Metabolism of lipids,R-HSA-556833,0.236886,-0.000000,1.000000e+00,1.000000e+00,1.000000e+00,344,"SBF2,PRXL2B,HSD3B7,FUT2,PPARD,AHR,SREBF1,MED27...","AACS,ABCA1,ABCB11,ABCB4,ABCC1,ABCC3,ABCD1,ABCG..."


In [16]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_2025.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
        1          0
0  CDKN2B  21.634413
1     FTO  18.318823
2    APOE  15.455358
3     ABO  15.153941
4   SH2B3  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_2025.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Signal Transduction,R-HSA-162582,0.476955,5.879025,4.126910e-09,0.000007,0.000007,1412,"CDKN2B,APOE,SH2B3,TERT,SMAD3,TCF7L2,ESR1,MYC,I...","AAMP,AATF,ABCA1,ABCD3,ABCG1,ABCG5,ABCG8,ABHD12..."
1,Fatty Acids bound to GPR40 (FFAR1) regulate in...,R-HSA-434316,-0.724533,-4.992017,5.975189e-07,0.001019,0.000214,5,"GNAQ,GNA15,PLCB2,PLCB3","GNA11,GNA14,GNA15,GNAQ,PLCB1,PLCB2,PLCB3"
2,Alpha-defensins,R-HSA-1462054,-0.700975,-4.899360,9.614920e-07,0.001640,0.000214,5,"DEFA4,DEFA1,DEFA5,DEFA3","CD4,DEFA1,DEFA3,DEFA4,DEFA5,DEFA6,PRSS2,PRSS3,env"
3,Adrenoceptors,R-HSA-390696,-0.696625,-4.882216,1.049003e-06,0.001789,0.000214,5,"ADRA1B,ADRA2B,ADRA1D","ADRA1B,ADRA1D,ADRA2A,ADRA2B,ADRA2C,ADRB1,ADRB2..."
4,RNA Pol II CTD phosphorylation and interaction...,R-HSA-167160,-0.608281,-4.862360,1.159944e-06,0.001978,0.000214,9,"GTF2H1,GTF2H5,POLR2B,MNAT1,ERCC3,CDK7,POLR2F","CDK7,ERCC2,ERCC3,GTF2F1,GTF2F2,GTF2H1,GTF2H2,G..."
...,...,...,...,...,...,...,...,...,...,...
1702,Nonsense-Mediated Decay (NMD),R-HSA-927802,0.327343,-0.000000,1.000000e+00,1.000000,1.000000,44,SMG6,"28S rRNA,5.8S rRNA,5S rRNA,CASC3,DCP1A,EIF4A3,..."
1703,Norepinephrine Neurotransmitter Release Cycle,R-HSA-181430,0.295406,-0.000000,1.000000e+00,1.000000,1.000000,11,SLC22A1,"MAOA,PPFIA1,PPFIA2,PPFIA3,PPFIA4,RAB3A,RIMS1,S..."
1704,Nuclear Envelope (NE) Reassembly,R-HSA-2995410,0.156919,-0.000000,1.000000e+00,1.000000,1.000000,32,,"ANKLE2,BANF1,CC2D1B,CCNB1,CCNB2,CDK1,CHMP2A,CH..."
1705,Neuronal System,R-HSA-112316,0.284462,-0.000000,1.000000e+00,1.000000,1.000000,219,"SYN3,ADCY5,CAMK2D,PRKAG2,CHRNA4,ERBB4,SLC22A2,...","ABCC8,ABCC9,ACHE,ACTN2,ADCY1,ADCY2,ADCY3,ADCY4..."


# From facets

## Disease pleiotropy score

In [18]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_symbol.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/Reactome.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_facets.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
                 1          0
0  ENSG00000147883  21.634413
1  ENSG00000140718  18.318823
2  ENSG00000130203  15.455358
3  ENSG00000175164  15.153941
4  ENSG00000111252  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_facets.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Interleukin-4 and Interleukin-13 signaling,R-HSA-6785807,0.768158,3.233479,0.001223,0.760804,0.288118,79,"ENSG00000137265,ENSG00000096968,ENSG0000013699...","ENSG00000211896,ENSG00000113520,ENSG0000011541..."
1,TGFBR1 KD Mutants in Cancer,R-HSA-3656532,0.938031,2.840540,0.004504,0.994891,0.288118,5,ENSG00000166949,"ENSG00000105329,ENSG00000166949,ENSG0000015707..."
2,SMAD2/3 Phosphorylation Motif Mutants in Cancer,R-HSA-3304356,0.938031,2.840540,0.004504,0.994891,0.288118,5,ENSG00000166949,"ENSG00000105329,ENSG00000166949,ENSG0000015707..."
3,Fatty Acids bound to GPR40 (FFAR1) regulate in...,R-HSA-434316,-0.724533,-2.813248,0.004904,0.996809,0.288118,5,"ENSG00000156052,ENSG00000149782,ENSG0000013784...","ENSG00000149782,ENSG00000182621,ENSG0000015605..."
4,SMAD2/SMAD3:SMAD4 heterotrimer regulates trans...,R-HSA-2173796,0.889258,2.787902,0.005305,0.998007,0.288118,17,"ENSG00000147883,ENSG00000166949,ENSG0000014056...","ENSG00000106366,ENSG00000185591,ENSG0000017031..."
...,...,...,...,...,...,...,...,...,...,...
1164,GSK3B and BTRC:CUL1-mediated-degradation of NF...,R-HSA-9762114,0.213679,-0.000000,1.000000,1.000000,1.000000,19,ENSG00000041357,"ENSG00000136930,ENSG00000100804,ENSG0000010090..."
1165,Cholesterol biosynthesis,R-HSA-191273,0.240707,-0.000000,1.000000,1.000000,1.000000,10,,"ENSG00000072310,ENSG00000205808,ENSG0000006706..."
1166,GPER1 signaling,R-HSA-9634597,0.275218,-0.000000,1.000000,1.000000,1.000000,26,"ENSG00000173175,ENSG00000162104,ENSG0000012128...","ENSG00000114450,ENSG00000115414,ENSG0000015009..."
1167,HSF1-dependent transactivation,R-HSA-3371571,0.178769,-0.000000,1.000000,1.000000,1.000000,14,"ENSG00000058404,ENSG00000105993,ENSG0000014534...","ENSG00000141564,ENSG00000170315,ENSG0000014866..."


In [20]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_symbol.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/ChEMBL Target Class.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_chemblTclass.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
                 1          0
0  ENSG00000147883  21.634413
1  ENSG00000140718  18.318823
2  ENSG00000130203  15.455358
3  ENSG00000175164  15.153941
4  ENSG00000111252  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_chemblTclass.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Transcription factor[],,0.701554,3.511310,0.000446,0.063460,,122,"ENSG00000166949,ENSG00000148737,ENSG0000009183...","ENSG00000126767,ENSG00000151702,ENSG0000017748..."
1,Secreted protein[],,0.687392,3.219750,0.001283,0.171986,,105,"ENSG00000254647,ENSG00000166949,ENSG0000010031...","ENSG00000115457,ENSG00000172156,ENSG0000024024..."
2,Kinase[],,0.388727,2.710176,0.006725,0.629121,,280,"ENSG00000068078,ENSG00000145349,ENSG0000021306...","ENSG00000214102,ENSG00000008128,ENSG0000006536..."
3,Protein Kinase[],,0.396246,2.568979,0.010200,0.778443,,274,"ENSG00000135250,ENSG00000068078,ENSG0000014534...","ENSG00000214102,ENSG00000008128,ENSG0000006536..."
4,GABA-A receptor[],,-0.654306,-2.497018,0.012524,0.843185,,6,"ENSG00000145864,ENSG00000113327,ENSG0000002235...","ENSG00000145863,ENSG00000011677,ENSG0000018773..."
...,...,...,...,...,...,...,...,...,...,...
142,Small molecule receptor (family A GPCR)[],,0.267224,-0.000000,1.000000,1.000000,,44,ENSG00000171522,"ENSG00000139679,ENSG00000282608,ENSG0000016427..."
143,CMGC protein kinase group[],,0.265287,-0.000000,1.000000,1.000000,,29,"ENSG00000135250,ENSG00000102882,ENSG0000011009...","ENSG00000181085,ENSG00000156345,ENSG0000000812..."
144,SLC superfamily of solute carriers[],,0.250613,-0.000000,1.000000,1.000000,,68,"ENSG00000197375,ENSG00000175003,ENSG0000019720...","ENSG00000197901,ENSG00000169100,ENSG0000018771..."
145,Enzyme[],,0.390555,inf,,,,1150,"ENSG00000140718,ENSG00000175164,ENSG0000017317...","ENSG00000231852,ENSG00000127241,ENSG0000012334..."


In [21]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_symbol.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/GO:BP.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_gobp.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
                 1          0
0  ENSG00000147883  21.634413
1  ENSG00000140718  18.318823
2  ENSG00000130203  15.455358
3  ENSG00000175164  15.153941
4  ENSG00000111252  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_gobp.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,cell adhesion,GO:0007155,0.369973,inf,0.000000e+00,0.000000e+00,,372,"ENSG00000145012,ENSG00000073712,ENSG0000014481...","ENSG00000197921,ENSG00000172156,ENSG0000020496..."
1,negative regulation of DNA-templated transcrip...,GO:0045892,0.579957,11.695338,1.346497e-31,4.378809e-28,,335,"ENSG00000125378,ENSG00000148737,ENSG0000014265...","ENSG00000197921,ENSG00000177485,ENSG0000017977..."
2,positive regulation of cell population prolife...,GO:0008284,0.593719,7.607661,2.791006e-14,9.076351e-11,,306,"ENSG00000254647,ENSG00000016082,ENSG0000012537...","ENSG00000124766,ENSG00000103495,ENSG0000019677..."
3,nervous system development,GO:0007399,0.360056,7.226699,4.948765e-13,1.609338e-09,,335,"ENSG00000124766,ENSG00000166833,ENSG0000013867...","ENSG00000137261,ENSG00000204963,ENSG0000003605..."
4,positive regulation of gene expression,GO:0010628,0.625124,6.808540,9.859430e-12,3.206287e-08,,292,"ENSG00000254647,ENSG00000125378,ENSG0000016694...","ENSG00000078081,ENSG00000103495,ENSG0000017856..."
...,...,...,...,...,...,...,...,...,...,...
3247,cell differentiation,GO:0030154,0.430274,inf,,,,572,"ENSG00000016082,ENSG00000125378,ENSG0000016694...","ENSG00000136802,ENSG00000197921,ENSG0000017977..."
3248,lipid metabolic process,GO:0006629,0.420921,inf,,,,400,"ENSG00000162407,ENSG00000107798,ENSG0000013020...","ENSG00000231852,ENSG00000150867,ENSG0000011967..."
3249,negative regulation of transcription by RNA po...,GO:0000122,0.584173,inf,,,,561,"ENSG00000125378,ENSG00000166949,ENSG0000014873...","ENSG00000197921,ENSG00000179772,ENSG0000012476..."
3250,positive regulation of DNA-templated transcrip...,GO:0045893,0.619666,inf,,,,466,"ENSG00000130203,ENSG00000125378,ENSG0000016694...","ENSG00000197921,ENSG00000126767,ENSG0000012476..."


In [22]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_symbol.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/GO:CC.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_gocc.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
                 1          0
0  ENSG00000147883  21.634413
1  ENSG00000140718  18.318823
2  ENSG00000130203  15.455358
3  ENSG00000175164  15.153941
4  ENSG00000111252  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_gocc.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,glutamatergic synapse,GO:0098978,0.296211,7.156418,8.281232e-13,4.811396e-10,,343,"ENSG00000130203,ENSG00000185666,ENSG0000006986...","ENSG00000082458,ENSG00000132589,ENSG0000010629..."
1,centrosome,GO:0005813,0.319900,4.114676,3.877240e-05,2.227536e-02,,319,"ENSG00000013810,ENSG00000054282,ENSG0000021306...","ENSG00000116871,ENSG00000154645,ENSG0000010485..."
2,extracellular matrix,GO:0031012,0.505941,3.891210,9.974554e-05,5.630763e-02,,268,"ENSG00000130203,ENSG00000122194,ENSG0000010031...","ENSG00000197859,ENSG00000063660,ENSG0000012334..."
3,TRAPP complex,GO:0030008,-0.786715,-3.152101,1.621004e-03,6.103739e-01,,5,"ENSG00000167632,ENSG00000160218,ENSG0000016853...","ENSG00000153339,ENSG00000007255,ENSG0000016751..."
4,transcription regulator complex,GO:0005667,0.642434,3.112832,1.853013e-03,6.595882e-01,,152,"ENSG00000124766,ENSG00000166949,ENSG0000009183...","ENSG00000157216,ENSG00000188909,ENSG0000012476..."
...,...,...,...,...,...,...,...,...,...,...
576,nucleus,GO:0005634,0.408129,inf,,,,2964,"ENSG00000147883,ENSG00000140718,ENSG0000013020...","ENSG00000058056,ENSG00000085840,ENSG0000012476..."
577,perinuclear region of cytoplasm,GO:0048471,0.356139,inf,,,,388,"ENSG00000163235,ENSG00000256762,ENSG0000016359...","ENSG00000138190,ENSG00000154645,ENSG0000017393..."
578,plasma membrane,GO:0005886,0.331963,inf,,,,2532,"ENSG00000130203,ENSG00000111252,ENSG0000016436...","ENSG00000078018,ENSG00000211626,ENSG0000012483..."
579,protein-containing complex,GO:0032991,0.425447,inf,,,,380,"ENSG00000089169,ENSG00000142655,ENSG0000009183...","ENSG00000089169,ENSG00000118507,ENSG0000012464..."


In [23]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_symbol.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/GO:MF.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_gomf.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
                 1          0
0  ENSG00000147883  21.634413
1  ENSG00000140718  18.318823
2  ENSG00000130203  15.455358
3  ENSG00000175164  15.153941
4  ENSG00000111252  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_gomf.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,sequence-specific double-stranded DNA binding,GO:1990837,0.570121,inf,0.000000e+00,0.000000e+00,,369,"ENSG00000016082,ENSG00000137265,ENSG0000009183...","ENSG00000187772,ENSG00000197921,ENSG0000012676..."
1,kinase activity,GO:0016301,0.316603,8.736849,2.397035e-18,2.015907e-15,,347,"ENSG00000135250,ENSG00000068078,ENSG0000014534...","ENSG00000214102,ENSG00000008128,ENSG0000015086..."
2,sequence-specific DNA binding,GO:0043565,0.640856,7.164815,7.789173e-13,6.550695e-10,,294,"ENSG00000016082,ENSG00000166949,ENSG0000014873...","ENSG00000177485,ENSG00000023608,ENSG0000017977..."
3,calcium ion binding,GO:0005509,0.307128,6.441450,1.183377e-10,9.952200e-08,,338,"ENSG00000089169,ENSG00000167323,ENSG0000013757...","ENSG00000089169,ENSG00000127241,ENSG0000020496..."
4,"""DNA-binding transcription activator activity,...",GO:0001228,0.638447,5.981109,2.216240e-09,1.863856e-06,,277,"ENSG00000016082,ENSG00000166949,ENSG0000013726...","ENSG00000125285,ENSG00000126767,ENSG0000011851..."
...,...,...,...,...,...,...,...,...,...,...
836,metal ion binding,GO:0046872,0.382125,inf,,,,1791,"ENSG00000140718,ENSG00000089169,ENSG0000017317...","ENSG00000058056,ENSG00000085840,ENSG0000016755..."
837,nucleotide binding,GO:0000166,0.224038,inf,,,,858,"ENSG00000185666,ENSG00000173175,ENSG0000026352...","ENSG00000137177,ENSG00000106392,ENSG0000008832..."
838,protein homodimerization activity,GO:0042803,0.450403,inf,,,,381,"ENSG00000130203,ENSG00000145349,ENSG0000016436...","ENSG00000127241,ENSG00000150867,ENSG0000028870..."
839,transferase activity,GO:0016740,0.278143,inf,,,,893,"ENSG00000140718,ENSG00000164362,ENSG0000021443...","ENSG00000106392,ENSG00000149743,ENSG0000000683..."


In [24]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore_symbol.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/from_facets/Subcellular Location.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_subcell.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)

GSEA input shape: (8285, 2)
GSEA input columns: [1, 0]
GSEA input sample:
                 1          0
0  ENSG00000147883  21.634413
1  ENSG00000140718  18.318823
2  ENSG00000130203  15.455358
3  ENSG00000175164  15.153941
4  ENSG00000111252  12.441185
GSEA results saved to /Users/polina/genetics_gsea/data/gsea/disease_zscore_subcell.tsv


Unnamed: 0,Term,ID,es,nes,pval,sidak,fdr,geneset_size,leading_edge,propagated_edge
0,Nuclear bodies,SL-0494,0.404539,3.266032,0.001091,0.140741,,286,"ENSG00000149177,ENSG00000182568,ENSG0000014873...","ENSG00000167604,ENSG00000221963,ENSG0000013813..."
1,: Secreted,Isoform 5,0.853501,2.522788,0.011643,0.803649,,5,"ENSG00000185499,ENSG00000149294","ENSG00000154639,ENSG00000104938,ENSG0000018549..."
2,: Cell membrane,Isoform 3,0.859308,2.359757,0.018287,0.923112,,9,"ENSG00000120659,ENSG00000066468,ENSG0000016868...","ENSG00000211891,ENSG00000172270,ENSG0000008248..."
3,: Nucleus,Isoform 4,0.881761,2.260973,0.023761,0.964656,,7,"ENSG00000183765,ENSG00000141510","ENSG00000036672,ENSG00000005156,ENSG0000018376..."
4,Melanosome membrane,SL-0160,0.785960,2.170081,0.030001,0.985505,,10,"ENSG00000164175,ENSG00000077498","ENSG00000118242,ENSG00000165240,ENSG0000016417..."
...,...,...,...,...,...,...,...,...,...,...
134,Nucleus,SL-0191,0.424045,inf,,,,2276,"ENSG00000140718,ENSG00000164362,ENSG0000016694...","ENSG00000124641,ENSG00000085840,ENSG0000012476..."
135,Plasma membrane,SL-0039,0.304077,inf,,,,1066,"ENSG00000173175,ENSG00000023171,ENSG0000013424...","ENSG00000135636,ENSG00000172785,ENSG0000007801..."
136,Predicted to be secreted[],,0.498223,inf,,,,412,"ENSG00000163235,ENSG00000130203,ENSG0000014917...","ENSG00000175793,ENSG00000137261,ENSG0000012724..."
137,Secreted,SL-0243,0.422762,inf,,,,758,"ENSG00000254647,ENSG00000130203,ENSG0000017516...","ENSG00000185674,ENSG00000137441,ENSG0000013258..."


## TA pleiotropy score

In [None]:
scores = "/Users/polina/genetics_gsea/data/input/geneset_disease_zscore.tsv"
library = "/Users/polina/genetics_gsea/data/gmt/Reactome_2025/ReactomePathways_merged.gmt"
output_name = "/Users/polina/genetics_gsea/data/gsea/disease_zscore_reactome_2025.tsv"

run_gsea_pandas(scores, library, processes=4, output_tsv=output_name)