# Save target-pathway lists as spark dfs and filter out non-gene targets

Input folder: Pathwaganda/data/GSEA_output

In [22]:
from pyspark.sql import SparkSession
import os
import shutil
from pyspark.sql.functions import split, explode, collect_list, col, concat_ws

In [33]:
# Notebook-wide Spark session used by the cells below; getOrCreate() reuses an
# already running session if one exists.
spark = SparkSession.builder.getOrCreate()

In [10]:
# Load the GSEA output of a single disease folder to inspect its schema.
df_test = spark.read.parquet("/Users/polina/Pathwaganda/data/GSEA_output_gui/Reactome_Pathways_2025_diy_v2/diseaseId=EFO_0000094")

In [11]:
# Print the first 5 rows without truncating long cell values.
df_test.show(5, truncate=False)

+------------+------------------------------------------------+-------------------------------+-------------------+-------------------+-------------------+---------------------+------------------+---------------------+--------------------------------------------+------------+--------------+
|ID          |Link                                            |Pathway                        |ES                 |NES                |FDR                |p-value              |Sidak's p-value   |Number of input genes|Leading edge genes                          |Pathway size|Parent pathway|
+------------+------------------------------------------------+-------------------------------+-------------------+-------------------+-------------------+---------------------+------------------+---------------------+--------------------------------------------+------------+--------------+
|R-HSA-109581|https://reactome.org/content/detail/R-HSA-109581|Apoptosis                      |0.369056153063541  |1.2967564

# Target-pathway matrix

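Explode the comma-separated targets in `propagated_edge` and build a target-to-pathway mapping: one row per target with the comma-separated list of pathway IDs it belongs to (a sparse form of the boolean target × pathway matrix).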

In [3]:
# Base directories - update these paths as needed.
# t_input_base:  folder passed to process_parquet_files as input_dir
#                (one sub-directory per disease).
# t_output_base: folder passed as output_dir; receives one Parquet dataset
#                per disease sub-directory.
t_input_base = "/Users/polina/Pathwaganda/data/GSEA_output/Reactome_Pathways_2025_diy"
t_output_base = "/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy"

In [4]:
def process_parquet_files(input_dir, output_dir):
    """Build a target -> pathway-ID mapping for every disease folder.

    For each sub-directory of ``input_dir`` that contains Parquet data, the
    comma-separated ``propagated_edge`` column is exploded into one row per
    target symbol, and the pathway ``ID`` values each symbol occurs in are
    collected into a comma-joined string. The result (columns
    ``approvedSymbol`` and ``ID``) is written as Parquet to
    ``output_dir/<disease_folder>``, overwriting any existing output.

    Args:
        input_dir: Directory holding one sub-directory per disease.
        output_dir: Directory that receives one Parquet dataset per disease.

    Note:
        Uses the module-level ``spark`` session and stops it on exit (also
        when an error occurs), so a new session has to be created before
        Spark is used again.
    """
    # Function-scope import: ``trim`` is not among the names imported by the
    # notebook's import cell.
    from pyspark.sql.functions import trim

    try:
        # Loop through all disease subdirectories
        for disease_folder in os.listdir(input_dir):
            disease_input_path = os.path.join(input_dir, disease_folder)
            if not os.path.isdir(disease_input_path):
                continue

            has_parquet = any(
                f.endswith(".parquet") for f in os.listdir(disease_input_path)
            )
            if not has_parquet:
                print(f"No Parquet file found in {disease_input_path}")
                continue

            # Read the whole folder instead of only its first part file, so
            # datasets stored as several part files are not truncated.
            df = spark.read.parquet(disease_input_path)

            processed_df = (
                df.select("ID", "propagated_edge")
                  .withColumn("approvedSymbol", explode(split(col("propagated_edge"), ",")))
                  # Strip the padding left by ", " separators and drop empty
                  # tokens so symbols can be matched exactly downstream.
                  .withColumn("approvedSymbol", trim(col("approvedSymbol")))
                  .filter(col("approvedSymbol") != "")
                  .groupBy("approvedSymbol")
                  .agg(concat_ws(",", collect_list("ID")).alias("ID"))
            )

            # Mirror the disease folder name under the output directory
            disease_output_path = os.path.join(output_dir, disease_folder)
            os.makedirs(disease_output_path, exist_ok=True)

            # Write the result
            processed_df.write.mode("overwrite").parquet(disease_output_path)
            print(f"Saved processed file to: {disease_output_path}")
    finally:
        spark.stop()


In [5]:
# Build the target -> pathway mapping for every disease folder under t_input_base.
process_parquet_files(t_input_base, t_output_base)

                                                                                

Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0000503
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0011015
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0000569
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0003916
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0004533
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0002033
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0017343
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matr

In [9]:
# Spot-check the written mapping for one disease (first 5 rows).
spark.read.parquet("/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0000094").show(5)

+----------------+--------------------+
|  approvedSymbol|                  ID|
+----------------+--------------------+
| complete genome|R-HSA-1643685,R-H...|
|        18S rRNA|       R-HSA-1643685|
|              1B|R-HSA-1643685,R-H...|
|              1C|R-HSA-1643685,R-H...|
|              1a|R-HSA-1643685,R-H...|
+----------------+--------------------+
only showing top 5 rows


# Pathway embeddings

## Hierarchical (Poincare ball model)

In [18]:
import os
import math
import sys
import types
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Ensure gensim can import numpy.strings
sys.modules['numpy.strings'] = types.ModuleType('numpy.strings')

from gensim.models.poincare import PoincareModel
import pandas as pd

In [19]:
def process_hierarchy_folders(input_dir, output_dir, negative=10, epochs=100):
    """Train one Poincaré embedding per folder and save it as Parquet.

    Every sub-directory of ``input_dir`` is read as a Parquet dataset with the
    columns ``ID`` and ``Parent pathway``. Rows without a parent are given
    themselves as parent, the distinct (parent, ID) pairs are used as
    relations for a gensim ``PoincareModel``, and the resulting vectors are
    written to ``output_dir/<folder name>`` with the columns ``ID`` and
    ``dim_0`` .. ``dim_{d-1}``. The dimensionality ``d`` is
    ``max(2, ceil(log2(N)))`` where ``N`` is the number of distinct nodes.

    Args:
        input_dir: Directory holding one Parquet sub-directory per dataset.
        output_dir: Directory that receives one embedding dataset per folder.
        negative: Number of negative samples passed to ``PoincareModel``.
        epochs: Number of training epochs.

    Note:
        The Spark session obtained here is stopped when the function exits.
    """
    # NOTE(review): getOrCreate() returns an already running session if one
    # exists; settings such as spark.driver.memory presumably only take
    # effect when a new session is started - confirm.
    spark = (SparkSession.builder
             .appName("PoincareEmbedding")
             .config("spark.driver.memory", "4g")
             .config("spark.executor.memory", "4g")
             .config("spark.sql.shuffle.partitions", "200")
             .config("spark.sql.parquet.compression.codec", "snappy")
             .config("spark.memory.fraction", "0.8")
             .config("spark.memory.storageFraction", "0.3")
             .getOrCreate())

    # Suppress unnecessary logs
    spark.sparkContext.setLogLevel("ERROR")

    try:
        with os.scandir(input_dir) as entries:
            for entry in entries:
                if not entry.is_dir():
                    continue

                folder = entry.path
                name = entry.name

                # 1) Read Parquet and cache it; keep a handle to the cached
                #    frame so that exactly this frame is released afterwards.
                cached_df = spark.read.parquet(folder).cache()
                try:
                    # 2) Fill null parents with self
                    df = cached_df.withColumn(
                        "Parent pathway",
                        when(col("Parent pathway").isNull(), col("ID"))
                        .otherwise(col("Parent pathway"))
                    )

                    # 3) Count unique nodes (IDs and parents together)
                    N = df.selectExpr("ID as node").union(
                        df.selectExpr("`Parent pathway` as node")
                    ).distinct().count()

                    # log2(0) is undefined and there is nothing to embed
                    if N == 0:
                        print(f"{name} - no pathway nodes found; skipping")
                        continue

                    # 4) Choose dims = ceil(log2(N)), at least 2
                    dims = max(2, math.ceil(math.log2(N)))

                    print(f"{name} - Total distinct nodes N: {N}; Chosen d: {dims}")

                    # 5) Extract the distinct edges into a Python list.
                    # NOTE(review): pairs are ordered (parent, child); the
                    # gensim examples use (child, parent) - confirm the
                    # intended direction.
                    edges = (
                        df.select("Parent pathway", "ID")
                        .dropDuplicates()
                        .localCheckpoint()
                        .toPandas()
                        .values.tolist()
                    )

                    # 6) Train Poincaré model
                    model = PoincareModel(edges, negative=negative, size=dims)
                    model.train(epochs=epochs)

                    # 7) Dump embeddings (one row per node) and convert to Spark
                    emb = [(key, *model.kv[key]) for key in model.kv.index_to_key]
                    pdf = pd.DataFrame(
                        emb, columns=["ID"] + [f"dim_{i}" for i in range(dims)]
                    )

                    sdf = spark.createDataFrame(pdf)
                    sdf.write.mode("overwrite").parquet(os.path.join(output_dir, name))
                finally:
                    # Release the cached input even if this folder failed
                    cached_df.unpersist()

    finally:
        spark.stop()

In [20]:
# Source folder with one GSEA result dataset per disease, and the folder that
# receives one embedding dataset per disease.
input_dir = "/Users/polina/Pathwaganda/data/GSEA_output_gui/Reactome_Pathways_2025_diy_v2"
output_dir = "/Users/polina/Pathwaganda/data/pathway_embeddings/Reactome_Pathways_2025_diy_v2"

# Train the Poincaré embeddings (10 negative samples, 100 epochs).
process_hierarchy_folders(input_dir, output_dir, negative=10, epochs=100)

diseaseId=EFO_0000503 - Total distinct nodes N: 750; Chosen d: 10


                                                                                

diseaseId=EFO_0011015 - Total distinct nodes N: 1063; Chosen d: 11
diseaseId=MONDO_0000569 - Total distinct nodes N: 504; Chosen d: 9
diseaseId=MONDO_0003916 - Total distinct nodes N: 339; Chosen d: 9
diseaseId=EFO_0004533 - Total distinct nodes N: 721; Chosen d: 10
diseaseId=MONDO_0002033 - Total distinct nodes N: 742; Chosen d: 10
diseaseId=MONDO_0017343 - Total distinct nodes N: 709; Chosen d: 10


                                                                                

diseaseId=MONDO_0002691 - Total distinct nodes N: 779; Chosen d: 10
diseaseId=MONDO_0001014 - Total distinct nodes N: 767; Chosen d: 10
diseaseId=EFO_0007987 - Total distinct nodes N: 1489; Chosen d: 11
diseaseId=EFO_0000504 - Total distinct nodes N: 685; Chosen d: 10
diseaseId=EFO_0004730 - Total distinct nodes N: 1186; Chosen d: 11
diseaseId=EFO_0007989 - Total distinct nodes N: 560; Chosen d: 10
diseaseId=MONDO_0002654 - Total distinct nodes N: 1317; Chosen d: 11
diseaseId=EFO_0005423 - Total distinct nodes N: 288; Chosen d: 9
diseaseId=MONDO_0019472 - Total distinct nodes N: 607; Chosen d: 10
diseaseId=EFO_0000707 - Total distinct nodes N: 831; Chosen d: 10
diseaseId=HP_0000152 - Total distinct nodes N: 592; Chosen d: 10
diseaseId=HP_0001877 - Total distinct nodes N: 186; Chosen d: 8
diseaseId=MONDO_0000594 - Total distinct nodes N: 462; Chosen d: 9
diseaseId=HP_0000951 - Total distinct nodes N: 397; Chosen d: 9
diseaseId=EFO_0007911 - Total distinct nodes N: 343; Chosen d: 9
disea

                                                                                

diseaseId=EFO_0004274 - Total distinct nodes N: 274; Chosen d: 9
diseaseId=EFO_0004273 - Total distinct nodes N: 354; Chosen d: 9
diseaseId=EFO_0004617 - Total distinct nodes N: 881; Chosen d: 10
diseaseId=EFO_0021796 - Total distinct nodes N: 367; Chosen d: 9
diseaseId=EFO_0006943 - Total distinct nodes N: 700; Chosen d: 10
diseaseId=EFO_0004842 - Total distinct nodes N: 1210; Chosen d: 11
diseaseId=EFO_0000618 - Total distinct nodes N: 1792; Chosen d: 11
diseaseId=EFO_0003060 - Total distinct nodes N: 796; Chosen d: 10
diseaseId=EFO_0000275 - Total distinct nodes N: 510; Chosen d: 9
diseaseId=EFO_0003863 - Total distinct nodes N: 354; Chosen d: 9
diseaseId=EFO_0003897 - Total distinct nodes N: 764; Chosen d: 10
diseaseId=EFO_0005952 - Total distinct nodes N: 842; Chosen d: 10
diseaseId=MONDO_0002917 - Total distinct nodes N: 462; Chosen d: 9
diseaseId=EFO_0000272 - Total distinct nodes N: 753; Chosen d: 10
diseaseId=EFO_0000616 - Total distinct nodes N: 1699; Chosen d: 11
diseaseId=E

                                                                                

diseaseId=MONDO_0002928 - Total distinct nodes N: 652; Chosen d: 10
diseaseId=EFO_0002916 - Total distinct nodes N: 784; Chosen d: 10
diseaseId=EFO_0007800 - Total distinct nodes N: 707; Chosen d: 10
diseaseId=MONDO_0024476 - Total distinct nodes N: 754; Chosen d: 10
diseaseId=EFO_0002571 - Total distinct nodes N: 667; Chosen d: 10
diseaseId=EFO_0005592 - Total distinct nodes N: 654; Chosen d: 10
diseaseId=MONDO_0003059 - Total distinct nodes N: 756; Chosen d: 10
diseaseId=MONDO_0037254 - Total distinct nodes N: 734; Chosen d: 10


                                                                                

diseaseId=MONDO_0003061 - Total distinct nodes N: 248; Chosen d: 8
diseaseId=EFO_1000941 - Total distinct nodes N: 143; Chosen d: 8
diseaseId=MONDO_0007263 - Total distinct nodes N: 598; Chosen d: 10
diseaseId=EFO_0005105 - Total distinct nodes N: 1825; Chosen d: 11
diseaseId=EFO_0005561 - Total distinct nodes N: 582; Chosen d: 10
diseaseId=EFO_0000228 - Total distinct nodes N: 896; Chosen d: 10
diseaseId=EFO_0000673 - Total distinct nodes N: 749; Chosen d: 10
diseaseId=MONDO_0000621 - Total distinct nodes N: 814; Chosen d: 10
diseaseId=MONDO_0002512 - Total distinct nodes N: 711; Chosen d: 10
diseaseId=EFO_0005134 - Total distinct nodes N: 526; Chosen d: 10
diseaseId=MONDO_0002149 - Total distinct nodes N: 1302; Chosen d: 11
diseaseId=EFO_0003839 - Total distinct nodes N: 447; Chosen d: 9
diseaseId=HP_0002715 - Total distinct nodes N: 528; Chosen d: 10
diseaseId=EFO_0004627 - Total distinct nodes N: 735; Chosen d: 10
diseaseId=EFO_0003865 - Total distinct nodes N: 802; Chosen d: 10
di

                                                                                

diseaseId=EFO_0000232 - Total distinct nodes N: 749; Chosen d: 10
diseaseId=EFO_1002050 - Total distinct nodes N: 195; Chosen d: 8
diseaseId=EFO_0005116 - Total distinct nodes N: 867; Chosen d: 10
diseaseId=EFO_0022196 - Total distinct nodes N: 271; Chosen d: 9
diseaseId=EFO_1000999 - Total distinct nodes N: 1270; Chosen d: 11
diseaseId=MONDO_0001187 - Total distinct nodes N: 770; Chosen d: 10
diseaseId=EFO_0004833 - Total distinct nodes N: 1195; Chosen d: 11
diseaseId=EFO_0000651 - Total distinct nodes N: 1829; Chosen d: 11
diseaseId=EFO_1000532 - Total distinct nodes N: 706; Chosen d: 10
diseaseId=EFO_0004695 - Total distinct nodes N: 185; Chosen d: 8
diseaseId=EFO_0007010 - Total distinct nodes N: 1001; Chosen d: 10
diseaseId=MONDO_0004634 - Total distinct nodes N: 244; Chosen d: 8
diseaseId=MONDO_0021117 - Total distinct nodes N: 858; Chosen d: 10
diseaseId=EFO_0009387 - Total distinct nodes N: 512; Chosen d: 9
diseaseId=EFO_0005771 - Total distinct nodes N: 1124; Chosen d: 11
dise

                                                                                

diseaseId=MONDO_0000653 - Total distinct nodes N: 1051; Chosen d: 11
diseaseId=EFO_0004696 - Total distinct nodes N: 721; Chosen d: 10
diseaseId=MONDO_0005178 - Total distinct nodes N: 713; Chosen d: 10
diseaseId=EFO_0003820 - Total distinct nodes N: 780; Chosen d: 10
diseaseId=MONDO_0005147 - Total distinct nodes N: 215; Chosen d: 8
diseaseId=MONDO_0004095 - Total distinct nodes N: 849; Chosen d: 10
diseaseId=EFO_0000209 - Total distinct nodes N: 614; Chosen d: 10
diseaseId=HP_0012531 - Total distinct nodes N: 174; Chosen d: 8
diseaseId=EFO_0003818 - Total distinct nodes N: 711; Chosen d: 10
diseaseId=MONDO_0021583 - Total distinct nodes N: 811; Chosen d: 10
diseaseId=MONDO_0020663 - Total distinct nodes N: 602; Chosen d: 10
diseaseId=EFO_0004468 - Total distinct nodes N: 1342; Chosen d: 11
diseaseId=MONDO_0023644 - Total distinct nodes N: 726; Chosen d: 10
diseaseId=HP_0001574 - Total distinct nodes N: 547; Chosen d: 10
diseaseId=EFO_0005775 - Total distinct nodes N: 227; Chosen d: 8

In [24]:
# Spot-check the pathway embedding written for one disease (first 5 rows).
spark.read.parquet("/Users/polina/Pathwaganda/data/pathway_embeddings/Reactome_Pathways_2025_diy_v2/diseaseId=EFO_0000094").show(5)

+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|           ID|               dim_0|               dim_1|               dim_2|               dim_3|               dim_4|               dim_5|               dim_6|              dim_7|               dim_8|               dim_9|
+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|  R-HSA-73884|0.014547058809404203|0.028268725834584226|-0.38879342669362515|-0.21606444941898975| 0.16547843443756646| -0.1846101340568833|0.002042940629954195|-0.5107297842501242| 0.20386755682225918| 0.37808384314641375|
|  R-HSA-73856|-0.09932011382566884| 0.08810110308801783| -0.5786949592655122|-0.27040918172764666|-

## Weighted (Jaccard similarity index)

# Target embeddings

## Hierarchical (Poincare coordinates)

For each disease folder, derive target coordinates in hyperbolic space by averaging the Poincaré coordinates of the pathways each target belongs to.

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, col, avg
import os

In [30]:
def create_target_embeddings(input_pathway: str, input_target2pathway: str, output: str) -> None:
    """Average pathway embeddings per target and save them as Parquet.

    Only sub-directories whose name exists in both ``input_pathway`` and
    ``input_target2pathway`` are processed. For each of them the
    comma-separated pathway ``ID`` list of every ``approvedSymbol`` is
    exploded, inner-joined with the pathway embedding on the pathway ID, and
    every ``dim_*`` column is averaged per ``approvedSymbol``. The result is
    written to ``output/<name>``, overwriting any existing output.

    Args:
        input_pathway: Directory with one pathway-embedding dataset per
            sub-directory (columns ``ID`` and ``dim_*``).
        input_target2pathway: Directory with one target-to-pathway dataset
            per sub-directory (columns ``approvedSymbol`` and ``ID``).
        output: Directory that receives one dataset per processed name.

    Note:
        The Spark session obtained here is stopped on every exit path,
        including the "no matching files" return and errors.
    """
    spark = SparkSession.builder.appName("TargetEmbeddingGenerator").getOrCreate()

    try:
        # List the folder names present in both inputs
        pathway_files = {f.name for f in os.scandir(input_pathway) if f.is_dir()}
        target2pathway_files = {f.name for f in os.scandir(input_target2pathway) if f.is_dir()}
        common_files = pathway_files & target2pathway_files

        if not common_files:
            print("No matching files found between input_pathway and input_target2pathway.")
            return

        # Sorted so that the processing order is reproducible between runs
        for file_name in sorted(common_files):
            print(f"Processing: {file_name}")

            # Read pathway embedding
            pathway_df = spark.read.parquet(os.path.join(input_pathway, file_name))

            # Read target-to-pathway mapping
            target_df = spark.read.parquet(os.path.join(input_target2pathway, file_name))

            # One row per (target, pathway) pair
            exploded_target_df = (
                target_df
                .withColumn("pathwayID", explode(split(col("ID"), ",")))
                .select("approvedSymbol", "pathwayID")
            )

            # Attach the embedding of each pathway; the embedding's own ID
            # column duplicates pathwayID and is dropped.
            joined_df = (
                exploded_target_df
                .join(pathway_df, exploded_target_df["pathwayID"] == pathway_df["ID"], "inner")
                .drop(pathway_df["ID"])
            )

            # Compute average for each dimension per target
            embedding_columns = [
                name for name in joined_df.columns if name.startswith("dim_")
            ]
            averaged_df = (
                joined_df
                .groupBy("approvedSymbol")
                .agg(*[avg(c).alias(c) for c in embedding_columns])
            )

            # Write output
            output_path = os.path.join(output, file_name)
            averaged_df.write.mode("overwrite").parquet(output_path)
            print(f"Written to: {output_path}")
    finally:
        spark.stop()

In [None]:
# Inputs: pathway embeddings and target-to-pathway mappings (matched by
# sub-directory name); `output` receives the averaged target embeddings.
# NOTE(review): the pathway embeddings come from the *_v2 folder while the
# target mapping does not - confirm both describe the same GSEA run.
# NOTE(review): the recorded cell output below shows a `target_embeddings/test`
# path, so it appears to stem from a run with a different `output` - confirm.
input_pathway = "/Users/polina/Pathwaganda/data/pathway_embeddings/Reactome_Pathways_2025_diy_v2"
input_target2pathway = "/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy"
output = "/Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy"

create_target_embeddings(input_pathway, input_target2pathway, output)

Processing: diseaseId=EFO_0000094
Written to: /Users/polina/Pathwaganda/data/target_embeddings/test/diseaseId=EFO_0000094


In [34]:
# Spot-check the target embedding of one disease (first 5 rows).
# NOTE(review): this reads from `target_embeddings/test`, not from the `output`
# folder configured in the previous cell - confirm which one is intended.
spark.read.parquet("/Users/polina/Pathwaganda/data/target_embeddings/test/diseaseId=EFO_0000094").show(5)

+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|approvedSymbol|               dim_0|               dim_1|               dim_2|               dim_3|               dim_4|               dim_5|               dim_6|               dim_7|               dim_8|               dim_9|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         AARS1| -0.3951987301936722| 0.24442867066109847|-0.13745470397796025| -0.2473505708239917|  0.4051520395139921| -0.1045244540795841| 0.21620448631638567| -0.0844914145852863| -0.1852327128433665|-0.37561990822242847|
|         ABCC6|0.034605752178775005|-0.26681391315408365| 0.22767806279006286| 0.1925856084

In [3]:
# Target info from the OT platform (release 25.03), used to map gene symbols
# to target ids.
target_path = "gs://open-targets-data-releases/25.03/output/target/"
target = spark.read.parquet(target_path)
# Symbol/id pairs; this variable is reassigned with an aggregated version in
# the following cell.
target_names = target.select("approvedSymbol", "id")

                                                                                

In [4]:
# Aggregate to one row per approvedSymbol: all ids sharing a symbol are joined
# into a single comma-separated `targetId` string.
# NOTE(review): `f` is presumably `pyspark.sql.functions`, imported in a cell
# that is not part of this export - confirm.
target_names = target.select("approvedSymbol", "id").groupBy('approvedSymbol').agg(
    f.concat_ws(',', f.collect_list('id')).alias('targetId')
)

In [5]:
def calculate_propagated_targets_with_terms(
    input_gcs_dir,
    output_gcs_dir,
    folders_to_process,
    target_table
):
    """
    Convert GSEA result CSVs into per-target term tables stored as Parquet.

    Every ``*.csv`` file found in ``<input_gcs_dir>/<folder>`` is read with a
    header row. Files lacking a 'propagated_edge' or 'Term' column are
    skipped with a message. Otherwise the comma-separated 'propagated_edge'
    values are exploded into one 'approvedSymbol' per row, the distinct
    'Term' values of each symbol are collected into an array column 'terms',
    and the result is inner-joined with ``target_table`` on 'approvedSymbol'
    (symbols without a match are dropped). Output goes to
    ``<output_gcs_dir>/<folder>/<csv name without .csv>``.

    Args:
        input_gcs_dir (str): Input GCS directory path (gs://bucket/path/)
        output_gcs_dir (str): Output GCS directory path (gs://bucket/path/)
        folders_to_process (list): Names of the sub-folders to process
        target_table (DataFrame): Spark DataFrame with 'approvedSymbol' and 'targetId'
    """
    in_root = input_gcs_dir.rstrip("/")
    out_root = output_gcs_dir.rstrip("/")
    required_columns = {'propagated_edge', 'Term'}

    for folder_name in folders_to_process:
        # Only the file paths (the keys of the path/content pairs) are needed
        csv_glob = f"{in_root}/{folder_name}/*.csv"
        csv_paths = spark.sparkContext.wholeTextFiles(csv_glob).keys().collect()

        for csv_path in csv_paths:
            frame = spark.read.option("header", True).csv(csv_path)

            if not required_columns <= set(frame.columns):
                print(f"Skipping {csv_path}: missing required columns.")
                continue

            # One row per (term, target symbol) pair
            symbol_rows = frame.withColumn(
                "approvedSymbol", explode(split(col("propagated_edge"), ","))
            )
            symbol_rows = symbol_rows.dropna(subset=["approvedSymbol"])

            # Distinct terms per target symbol
            terms_per_symbol = symbol_rows.groupBy("approvedSymbol").agg(
                collect_set("Term").alias("terms")
            )

            # Keep only symbols known to the target table
            target_lookup = target_table.select("approvedSymbol", "targetId")
            matched = terms_per_symbol.join(
                target_lookup, on="approvedSymbol", how="inner"
            )

            stem = csv_path.split("/")[-1].replace(".csv", "")
            destination = f"{out_root}/{folder_name}/{stem}"

            # Parquet is used because it supports the array column 'terms'
            matched.write.mode("overwrite").parquet(destination)
            print(f"Saved: {destination}")

In [15]:
# Convert the KEGG_2021_Human GSEA CSVs into per-target term tables.
calculate_propagated_targets_with_terms(
    input_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output", 
    output_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark", 
    folders_to_process = ["KEGG_2021_Human"], 
    target_table = target_names
)

                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000095_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000222_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000274_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000183_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000341_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000384_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000403_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000275_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000474_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000519_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000565_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000612_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000637_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000574_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000685_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000702_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000676_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0001073_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0001378_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0002429_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003060_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003144_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003758_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003833_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003869_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0004142_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0009606_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0005952_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_1001901_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0001657_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0002367_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0004975_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0004976_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0004979_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0004985_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0005178_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0005147_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0005180_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0005301_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0005277_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0007915_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0008170_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0008315_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0011719_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000729_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000756_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


In [6]:
# Process the "Reactome_Pathways_2024" folder of the GSEA output bucket.
# The recorded cell output lists one "Saved:" path per result under
# gsea_output_spark/Reactome_Pathways_2024/.
calculate_propagated_targets_with_terms(
    input_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output",
    output_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark",
    folders_to_process=["Reactome_Pathways_2024"],
    target_table=target_names,
)

                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000095_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000222_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000274_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000275_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000341_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000384_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000403_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000474_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000519_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000565_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000612_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000574_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000637_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000183_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000676_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000685_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000702_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000729_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0000756_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0001073_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0002429_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0001378_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0003060_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0003144_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0003758_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0003833_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0003869_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0004142_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0005952_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_0009606_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/EFO_1001901_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0001657_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0002367_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0004975_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0004976_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0004979_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0004985_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0005147_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0005178_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0005180_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0005277_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0005301_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0007915_ge_mm_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0008170_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0008315_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/Reactome_Pathways_2024/MONDO_0011719_ge_mm_som_gsea_Reactome_Pathways_2024_pval0.05


In [8]:
# Process the "WikiPathways_2024_Human" folder of the GSEA output bucket.
# The recorded cell output lists one "Saved:" path per result under
# gsea_output_spark/WikiPathways_2024_Human/.
calculate_propagated_targets_with_terms(
    input_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output",
    output_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark",
    folders_to_process=["WikiPathways_2024_Human"],
    target_table=target_names,
)

                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000095_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000222_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000183_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000274_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000341_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000275_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000384_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000474_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000403_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000519_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000565_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000574_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000612_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000637_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000676_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000685_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000702_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000729_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0001073_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0001378_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0002429_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0003060_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0003144_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0003758_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0003833_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0003869_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0004142_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0005952_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0009606_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_1001231_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_1001901_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0001657_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0002367_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0004975_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0004976_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0004979_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0004985_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0005147_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0005178_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0005180_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0005277_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0005301_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0007915_ge_mm_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0008170_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0008315_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/MONDO_0011719_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/WikiPathways_2024_Human/EFO_0000756_ge_mm_som_gsea_WikiPathways_2024_Human_pval0.05


In [9]:
# Process the "GO_Biological_Process_2023" folder of the GSEA output bucket.
# The recorded cell output lists one "Saved:" path per result under
# gsea_output_spark/GO_Biological_Process_2023/.
calculate_propagated_targets_with_terms(
    input_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output",
    output_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark",
    folders_to_process=["GO_Biological_Process_2023"],
    target_table=target_names,
)

                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000095_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000183_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000222_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000274_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000275_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000341_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000384_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000403_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000474_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000519_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000565_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000574_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000612_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000637_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000676_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000685_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000702_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000729_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0001073_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0001378_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0002429_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0003060_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0003144_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0003758_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0003833_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0004142_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0003869_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0005952_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_1001231_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0009606_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_1001901_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0001657_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0002367_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0004975_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0004976_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0004979_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0004985_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0005147_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0005178_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0005180_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0005277_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0005301_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0007915_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0008170_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0008315_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/MONDO_0011719_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000756_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


In [10]:
# Read back the last dataset reported as saved above to inspect its contents.
spark_df = spark.read.format("parquet").load(
    "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/GO_Biological_Process_2023/EFO_0000756_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05"
)

In [11]:
# Preview the first ten rows without truncating long cell values.
spark_df.show(n=10, truncate=False)

+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|approvedSymbol|terms                                                                                                                                                                                                                                                              |targetId       |
+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|AATK          |[Protein Autophosphorylation (GO:0046777)]                                                               

In [12]:
# Row count of the DataFrame loaded above (displayed as the cell result).
spark_df.count()

8959

# Save target-pathway lists for TensorBoard visualisation

! This code was pasted from IBD_pathway_to_cell from similarity_mvp and has not been run !

Process CSV files in specified folders within a GCS directory and save target-pathway relationships in a format suitable for TensorBoard Embedding Projector. 

In [None]:
def _explode_target_pathway_pairs(df):
    """Return one (Target, Pathway) row per target listed in `propagated_edge`.

    Args:
        df (pd.DataFrame): Frame with a comma-separated `propagated_edge`
            column and a `Term` column.

    Returns:
        pd.DataFrame: Two columns, `Target` and `Pathway`, one row per
        non-empty target token. The input frame is not modified.
    """
    # Drop rows without any targets first, so the string accessor below never
    # sees a column that pandas parsed as all-NaN floats.
    pairs = df[['propagated_edge', 'Term']].dropna(subset=['propagated_edge'])

    # One row per comma-separated token.
    tokens = pairs['propagated_edge'].astype(str).str.split(',')
    pairs = pairs.assign(propagated_edge=tokens).explode('propagated_edge')

    # "A, B" must yield "B", not " B"; trailing or doubled commas must not
    # produce empty targets.
    pairs = pairs.assign(propagated_edge=pairs['propagated_edge'].str.strip())
    pairs = pairs[pairs['propagated_edge'] != '']

    return pairs.rename(columns={"propagated_edge": "Target", "Term": "Pathway"})


def save_target_pathway_tensorboard(input_gcs_dir, output_gcs_dir, folders_to_process):
    """
    Write target-pathway pairs from CSV files on GCS as TSV files.

    For every `.csv` file found directly inside each requested folder, the
    comma-separated `propagated_edge` column is exploded into one row per
    target and written, together with `Term`, as a two-column TSV
    (`Target`, `Pathway`) named `<csv stem>_target_pathway.tsv`.
    Files lacking either column are skipped with a message.

    NOTE: relies on `gcsfs`, `pd` (pandas) and `Path` (pathlib) being
    available in the notebook namespace.

    Args:
        input_gcs_dir (str): Input GCS directory path.
        output_gcs_dir (str): Output GCS directory path.
        folders_to_process (list): List of folder names within the input directory to process.

    Output:
        Saves target-pathway relationships as TSV files in the output GCS directory.
    """
    # Initialize GCS filesystem
    fs = gcsfs.GCSFileSystem()

    # Ensure no trailing slashes in input and output directories
    input_gcs_dir = input_gcs_dir.rstrip("/")
    output_gcs_dir = output_gcs_dir.rstrip("/")

    for folder_name in folders_to_process:
        folder_path = f"{input_gcs_dir}/{folder_name}"
        output_folder_path = f"{output_gcs_dir}/{folder_name}"

        # Ensure output folder exists
        if not fs.exists(output_folder_path):
            fs.mkdirs(output_folder_path)

        # Only the CSV files listed directly in the input folder are processed
        csv_files = [file for file in fs.ls(folder_path) if file.endswith('.csv')]

        for file_path in csv_files:
            # Read the CSV file directly from GCS
            with fs.open(file_path, 'r') as f:
                df = pd.read_csv(f)

            # Both columns are needed to build the (Target, Pathway) pairs
            if 'propagated_edge' not in df.columns or 'Term' not in df.columns:
                print(f"Skipping {file_path}: missing required columns.")
                continue

            embedding_metadata = _explode_target_pathway_pairs(df)

            # Output name is derived from the input file name
            output_file_name = f"{Path(file_path).stem}_target_pathway.tsv"
            output_file_path = f"{output_folder_path}/{output_file_name}"

            # Save directly to GCS
            with fs.open(output_file_path, 'w') as f:
                embedding_metadata.to_csv(f, sep='\t', index=False)

            print(f"Processed and uploaded: {output_file_path}")


In [None]:
# Where the per-library GSEA CSV folders are read from, and where the
# TSV files are written to.
input_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output"
output_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/4tensorboard/jaccard"

# Sub-folders of the input directory to process.
library = ["Reactome_Pathways_2024"]

save_target_pathway_tensorboard(
    input_gcs_dir=input_gcs_dir,
    output_gcs_dir=output_gcs_dir,
    folders_to_process=library,
)