In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StringType, StructType, StructField
import requests
from pyspark.sql import functions as F
import os

In [2]:
# Create the Spark session shared by every cell below, or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("ProcessGSEAoutput")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/29 13:12:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/29 13:12:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Preprocess Reactome database

In [3]:
def process_parquet_files(
    input_dir,
    output_dir,
    hierarchy_path="/Users/polina/Pathwaganda/scr/gmt_pathway_files_prep/Reactome/Pathways_hierarchy_relationship.txt",
    write_mode="errorifexists",
):
    """Clean every GSEA result dataset in ``input_dir`` and write it to ``output_dir``.

    Each entry of ``input_dir`` is read as a Parquet dataset with the
    module-level ``spark`` session and then:

    1. gets a ``Link`` column (Reactome detail URL built from ``ID``);
    2. gets a ``Pathway size`` column (number of comma-separated items in
       ``propagated_edge``);
    3. has its result columns renamed to display names
       (``Term`` -> ``Pathway``, ``es`` -> ``ES``, ...);
    4. is left-joined to the pathway hierarchy table on ``ID`` == child
       pathway, and all matching parents are collapsed into one
       comma-separated ``Parent pathway`` string.

    The result is written under ``output_dir`` using the same entry name.

    Args:
        input_dir: Directory whose entries are Parquet files/datasets.
        output_dir: Destination directory; created if it does not exist.
        hierarchy_path: Headerless tab-separated file whose first column is
            the parent pathway ID and second column the child pathway ID.
            Defaults to the location that was previously hard-coded.
        write_mode: Spark save mode for the output (``"errorifexists"``,
            ``"overwrite"``, ``"append"``, ``"ignore"``). The default keeps
            the previous behaviour of failing when the output already exists;
            pass ``"overwrite"`` to make re-runs possible.

    Returns:
        None. Output is written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Skip names starting with "." or "_" (e.g. `.DS_Store`, `_SUCCESS`):
    # they are marker/metadata files, not datasets, and reading one as Parquet
    # would abort the whole run. Sorting makes the processing order stable.
    entries = sorted(
        name for name in os.listdir(input_dir)
        if not name.startswith((".", "_"))
    )
    if not entries:
        return

    # The hierarchy table is identical for every input, so build it once
    # instead of re-reading the file on each loop iteration.
    pathways_hierarchy_df = (
        spark.read.option("delimiter", "\t")
        .csv(hierarchy_path, header=False)
        .withColumnRenamed("_c0", "Parent pathway")
        .withColumnRenamed("_c1", "Child pathway")
    )

    for filename in entries:
        input_file = os.path.join(input_dir, filename)
        output_file = os.path.join(output_dir, filename)

        df = spark.read.parquet(input_file)

        # Step 1: link to the Reactome detail page of the pathway.
        df = df.withColumn(
            "Link",
            F.concat(F.lit("https://reactome.org/content/detail/"), F.col("ID")),
        )

        # Step 2: pathway size = number of comma-separated entries in 'propagated_edge'.
        df = df.withColumn(
            "Pathway size",
            F.size(F.split(F.col("propagated_edge"), ",")),
        )

        # Step 3: rename result columns to their display names.
        df = (
            df.withColumnRenamed("Term", "Pathway")
            .withColumnRenamed("es", "ES")
            .withColumnRenamed("fdr", "FDR")
            .withColumnRenamed("nes", "NES")
            .withColumnRenamed("pval", "p-value")
            .withColumnRenamed("sidak", "Sidak's p-value")
            .withColumnRenamed("geneset_size", "Number of input genes")
            .withColumnRenamed("leading_edge", "Leading edge genes")
        )

        # Step 4: left join so pathways without a known parent are kept.
        joined_df = df.join(
            pathways_hierarchy_df,
            df["ID"] == pathways_hierarchy_df["Child pathway"],
            "left",
        )

        # Step 5: one row per pathway; multiple parents become a
        # comma-separated list (unmatched rows contribute no parent value).
        result_df = joined_df.groupBy(
            "ID", "Link", "Pathway", "ES", "NES", "FDR", "p-value", "Sidak's p-value",
            "Number of input genes", "Leading edge genes", "Pathway size",
        ).agg(
            F.concat_ws(",", F.collect_list("Parent pathway")).alias("Parent pathway")
        )

        # Step 6: persist the cleaned dataset next to its siblings.
        result_df.write.mode(write_mode).parquet(output_file)

In [4]:
# Raw Reactome GSEA results (input) and the destination for the cleaned copies.
input_directory = "/Users/polina/Pathwaganda/data/GSEA_output/Reactome_Pathways_2025_diy"
output_directory = "/Users/polina/Pathwaganda/data/GSEA_output_clean/Reactome_Pathways_2025_diy_v2"

# Run the Reactome clean-up over every entry of the input directory.
process_parquet_files(
    input_dir=input_directory,
    output_dir=output_directory,
)

                                                                                

In [23]:
# Spot-check one cleaned disease partition.
# NOTE(review): this reads from `Reactome_Pathways_2025_diy`, while the
# processing cell above writes to `Reactome_Pathways_2025_diy_v2` — confirm
# which output is meant to be inspected here.
test_path = "/Users/polina/Pathwaganda/data/GSEA_output_clean/Reactome_Pathways_2025_diy/diseaseId=EFO_0000706"
test = spark.read.parquet(test_path)

# Print the first five rows (long values are truncated by default).
test.show(n=5)

+-------------+--------------------+--------------------+------------------+------------------+-------------------+------------------+---------------------+--------------------+------------+--------------+
|           ID|                Link|             Pathway|                ES|               NES|            p-value|   Sidak's p-value|Number of input genes|  Leading edge genes|Pathway size|Parent pathway|
+-------------+--------------------+--------------------+------------------+------------------+-------------------+------------------+---------------------+--------------------+------------+--------------+
|R-HSA-1059683|https://reactome....|Interleukin-6 sig...|0.6045125617474902|1.7049438477122143|0.08820489753627014|0.9999999934076531|                    6|                IL6R|          10| R-HSA-6783589|
| R-HSA-109581|https://reactome....|           Apoptosis|0.4089718529529836|0.8748301035197936| 0.3816663549494259|               1.0|                    9|  DCC,SATB1,TLR4,FAS

25/07/29 01:22:33 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1015890 ms exceeds timeout 120000 ms
25/07/29 01:22:33 WARN SparkContext: Killing executors is not supported by current scheduler.
25/07/29 01:37:50 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

## Preprocess WikiPathways

In [5]:
# Load the WikiPathways GSEA results for one disease partition.
# The path is relative (no leading "/"), unlike the absolute paths used in
# the Reactome cells, so it depends on where the notebook is run from.
input_path = "WikiPathways_2025_diy/diseaseId=EFO_0000094"
df2 = spark.read.parquet(input_path)

In [6]:
# Preview the first five WikiPathways rows with full, untruncated cell values.
df2.show(n=5, truncate=False)

+-----------------------------------------------------+------+-------------------+------------------+---------------------+---------------------+---------------------+------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

25/07/29 16:49:17 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 644059 ms exceeds timeout 120000 ms
25/07/29 16:49:17 WARN SparkContext: Killing executors is not supported by current scheduler.
25/07/29 16:49:19 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

Same here

## Preprocess KEGG_2021_Human

In [10]:
# Load the KEGG 2021 Human GSEA results for one disease partition.
# `input_path` is reassigned here (it was also set in the WikiPathways cell);
# the path is relative, so it depends on where the notebook is run from.
input_path = "KEGG_2021_Human/diseaseId=EFO_0000094"
df3 = spark.read.parquet(input_path)

In [11]:
# Preview the first five KEGG rows with full, untruncated cell values.
df3.show(n=5, truncate=False)

+------------------------------+-------------------+------------------+---------------------+-------------------+--------------------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Same here

## Preprocess GO

In [12]:
# Load the GO Biological Process 2025 GSEA results for one disease partition.
# `input_path` is reassigned here (it was also set in the earlier cells);
# the path is relative, so it depends on where the notebook is run from.
input_path = "GO_Biological_Process_2025/diseaseId=EFO_0000094"
df4 = spark.read.parquet(input_path)

In [13]:
# Preview the first five GO rows with full, untruncated cell values.
df4.show(n=5, truncate=False)

+-------------------------------------------------------------+-------------------+------------------+---------------------+--------------------+--------------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------

Here we can extract terms from brackets