In [1]:
!pip3 install blitzgsea

Defaulting to user installation because normal site-packages is not writeable


## Run blitzGSEA

For each parquet directory named "diseaseId=[ID]":
- read each file as a dataframe
- sort by overallScore (descending)
- run `result = blitz.gsea(df, library, processes=4)`
- save every result into a directory named after the library, in parquet partitions named "diseaseId=[ID]" (don't overwrite; add to the existing folder)

Use gcsfs to read and write files.
The library argument can be a list; the libraries are pre-loaded by this code:

    # Determine libraries to process
    if libraries is None:
        libraries = blitz.enrichr.get_libraries()

    # Pre-load all library sets
    library_sets = {lib: blitz.enrichr.get_library(lib) for lib in libraries}


In [17]:
import os
import gcsfs
import pandas as pd
import blitzgsea as blitz
from pyspark.sql import SparkSession

In [18]:
# Build the Spark session that the rest of the notebook refers to as `spark`
spark = (
    SparkSession.builder
    .appName("run_blitzgsea")
    .getOrCreate()
)

25/07/23 11:24:18 INFO SparkEnv: Registering MapOutputTracker
25/07/23 11:24:18 INFO SparkEnv: Registering BlockManagerMaster
25/07/23 11:24:18 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/07/23 11:24:18 INFO SparkEnv: Registering OutputCommitCoordinator


In [16]:
def run_blitzgsea(
    input_base: str,
    output_base: str,
    libraries: list = None,
    processes: int = 4,
    spark_session=None,
):
    """
    Run blitz.gsea on every ``diseaseId=[ID]`` folder found under ``input_base``.

    Each partition is read with Spark, sorted descending by its second column,
    converted to pandas and passed to ``blitz.gsea`` once per library. Every
    result is converted back to Spark and appended as parquet to
    ``<output_base>/<library>/diseaseId=<ID>``.

    Args:
        input_base: Folder (e.g. a ``gs://`` URI) that holds the
            ``diseaseId=[ID]`` subfolders.
        output_base: Root folder under which results are written.
        libraries: Library names passed to ``blitz.enrichr.get_library``.
            ``None`` means every name returned by
            ``blitz.enrichr.get_libraries()``.
        processes: Forwarded unchanged to ``blitz.gsea(..., processes=...)``.
        spark_session: Spark session to use. When ``None`` (default) the
            session is obtained with ``SparkSession.builder.getOrCreate()``,
            so the function no longer depends on a notebook-global ``spark``
            having been defined before it is called.

    Raises:
        RuntimeError: If no ``diseaseId=`` subfolder is listed under
            ``input_base``.

    Note:
        Results are written with mode ``"append"``: running the function again
        over the same inputs adds rows to folders that already hold results.
    """
    # Resolve the session locally instead of reading a global defined in
    # another notebook cell (which breaks when cells run out of order).
    session = (
        spark_session
        if spark_session is not None
        else SparkSession.builder.getOrCreate()
    )
    fs = gcsfs.GCSFileSystem()

    # 1) Load libraries
    if libraries is None:
        libraries = blitz.enrichr.get_libraries()
    library_sets = {lib: blitz.enrichr.get_library(lib) for lib in libraries}

    # 2) Find diseaseId= subfolders
    base = input_base.rstrip('/') + '/'
    partitions = []
    for child in fs.ls(base):
        # A listing entry ending in '/' would give an empty basename and be
        # silently dropped, so normalise before matching.
        child = child.rstrip('/')
        if os.path.basename(child).lower().startswith('diseaseid='):
            partitions.append(child)
    if not partitions:
        raise RuntimeError(f"No diseaseId= partitions found under {base}")

    # 3) Process each partition
    for raw_path in partitions:
        # ensure full GCS URI
        gcs_path = raw_path if raw_path.startswith("gs://") else f"gs://{raw_path}"
        partition = os.path.basename(raw_path)            # e.g. 'diseaseId=EFO_0000094'
        disease_id = partition.split('=', 1)[1]           # => 'EFO_0000094'
        print(f"Reading partition {partition} from {gcs_path}")

        # a) read & sort in Spark
        df = session.read.parquet(gcs_path)
        # Sorted by position (second column), not by name.
        # NOTE(review): presumably this is the score column (overallScore) - confirm.
        df_sorted = df.orderBy(df.columns[1], ascending=False)

        # b) convert to pandas for blitz.gsea
        pdf = df_sorted.toPandas()
        if pdf.empty:
            # Nothing to enrich; skip instead of failing inside blitz.gsea.
            print(f"  • Skipped {partition}: no rows")
            continue

        # c) run GSEA per library and write back via Spark
        for lib_name, lib_set in library_sets.items():
            res_pdf = blitz.gsea(pdf, lib_set, processes=processes)

            # Bring the index into a column named 'Term', whatever the index
            # was called before (unnamed, 'Term', or anything else).
            res_pdf = res_pdf.rename_axis('Term').reset_index()

            if res_pdf.empty:
                # Spark cannot infer a schema from an empty pandas frame.
                print(f"  • Skipped {partition}, library {lib_name}: empty GSEA result")
                continue

            # ensure leading_edge is a flat string
            res_pdf['leading_edge'] = res_pdf['leading_edge'].apply(
                lambda x: ','.join(x) if isinstance(x, (list, tuple)) else str(x)
            )

            # every member of the term's set in this library, comma-joined
            res_pdf['propagated_edge'] = res_pdf['Term'].map(
                lambda term: ",".join(lib_set.get(term, []))
            )

            # put just the ID (not the 'diseaseId=') into the column
            # NOTE(review): this column repeats the value encoded in the
            # 'diseaseId=<ID>' folder name; reading the parent library folder
            # with partition discovery may conflict with it - confirm.
            res_pdf['diseaseId'] = disease_id

            # convert back to Spark and write out
            result_df = session.createDataFrame(res_pdf)
            target_dir = os.path.join(output_base, lib_name, partition)
            result_df.write.mode("append").parquet(target_dir)

            print(f"  • Appended GSEA for {partition}, library {lib_name}")

In [None]:
# Inputs for the non-oncology run: source partitions, result root, libraries
input_base = "gs://ot-team/polina/pathwaganda/input_4_gsea/non_oncology"
output_base = "gs://ot-team/polina/pathwaganda/gsea_run"
libs = [
    "KEGG_2021_Human",
    "Reactome_Pathways_2024",
    "WikiPathways_2024_Human",
    "GO_Biological_Process_2025",
]

run_blitzgsea(input_base, output_base, libraries=libs, processes=4)

Reading partition diseaseId=EFO_0000195 from gs://ot-team/polina/pathwaganda/input_4_gsea/non_oncology/diseaseId=EFO_0000195


                                                                                

  • Appended GSEA for diseaseId=EFO_0000195, library KEGG_2021_Human


                                                                                

  • Appended GSEA for diseaseId=EFO_0000195, library Reactome_Pathways_2024


                                                                                

  • Appended GSEA for diseaseId=EFO_0000195, library WikiPathways_2024_Human


                                                                                

  • Appended GSEA for diseaseId=EFO_0000195, library GO_Biological_Process_2025
Reading partition diseaseId=EFO_0000246 from gs://ot-team/polina/pathwaganda/input_4_gsea/non_oncology/diseaseId=EFO_0000246


                                                                                

  • Appended GSEA for diseaseId=EFO_0000246, library KEGG_2021_Human


                                                                                

  • Appended GSEA for diseaseId=EFO_0000246, library Reactome_Pathways_2024


                                                                                

  • Appended GSEA for diseaseId=EFO_0000246, library WikiPathways_2024_Human


                                                                                

  • Appended GSEA for diseaseId=EFO_0000246, library GO_Biological_Process_2025
Reading partition diseaseId=EFO_0000275 from gs://ot-team/polina/pathwaganda/input_4_gsea/non_oncology/diseaseId=EFO_0000275


In [20]:
# Spot-check one written partition by printing its first rows
(
    spark.read
    .parquet("gs://ot-team/polina/pathwaganda/gsea_run/Reactome_Pathways_2024/diseaseId=EFO_0003825")
    .show()
)

+--------------------+-------------------+------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------+
|                Term|                 es|               nes|                pval|               sidak|                 fdr|geneset_size|        leading_edge|     propagated_edge|  diseaseId|
+--------------------+-------------------+------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------+
|Regulation of TP5...| 0.6044440497518935| 4.963338353635351|6.929171381120369E-7|4.966983646598599E-4|4.354328542606200...|          30|TP53,PPP2R1A,CHD4...|BRPF1,SUPT16H,USP...|EFO_0003825|
| Signal Transduction|0.21447151645705248| 4.853241465681454|1.214596525134226...|8.704871434498735E-4|4.354328542606200...|         259|TP53,PPP2R1A,FBXW...|FNBP1,BAD,ANKFY1,...|EFO_0003825|
|             Disease|0.2491971524862304