# Save target-pathway lists as spark dfs and filter out non-gene targets

Input folder: Pathwaganda/data/GSEA_output

In [1]:
from pyspark.sql import SparkSession
import os
import shutil
from pyspark.sql.functions import split, explode, collect_list, col, concat_ws

In [2]:
spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/14 14:18:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
df_test = spark.read.parquet("/Users/polina/Pathwaganda/data/GSEA_output/Reactome_Pathways_2025_diy/diseaseId=EFO_0000094")

In [10]:
df_test.show(5, truncate=False)

+--------------------------------------------------------------------------+-------------+-------------------+------------------+---------------------+-------------------+-------------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Filter by FDR, add hierarchies

1. Read all of files in Reactome_Pathways_2025_diy as one parquet file with new column diseaseId
2. Filter by fdr cut off 0.1
3. Add parent pathway into table

In [8]:
from pyspark.sql.functions import input_file_name, regexp_extract, col, lit
from pyspark.sql import Row

def load_reactome_pathways_with_hierarchy(base_dir, hierarchy_path, fdr_cutoff=0.1):

    # Step 1: Read parquet files recursively
    df = spark.read.option("recursiveFileLookup", "true").parquet(base_dir)

    # Step 2: Extract diseaseId from file path
    df = df.withColumn(
        "diseaseId",
        regexp_extract(input_file_name(), r"diseaseId=([^/]+)", 1)
    )

    # Step 3: Filter by FDR cutoff
    df_filtered = df.filter(col("fdr") <= fdr_cutoff)

    # Step 4: Load pathway hierarchy file
    pathways_hierarchy_df = (
        spark.read.option("delimiter", "\t")
        .csv(hierarchy_path, header=False)
        .withColumnRenamed("_c0", "parentId")
        .withColumnRenamed("_c1", "childId")
    )

    # Step 5: Compute hierarchy level in-memory
    hierarchy_pairs = pathways_hierarchy_df.collect()
    parent_map = {row["childId"]: row["parentId"] for row in hierarchy_pairs}

    def get_level(child_id):
        level = 0
        current = child_id
        while current in parent_map and parent_map[current] is not None:
            current = parent_map[current]
            level += 1
            if level > 50:  # safety break for cycles
                break
        return level

    levels_data = [
        Row(parentId=row["parentId"], childId=row["childId"], hierLevel=get_level(row["childId"]))
        for row in hierarchy_pairs
    ]

    pathways_hierarchy_df = spark.createDataFrame(levels_data)

    # Step 6: Merge filtered pathways with hierarchy (now including hierLevel)
    joined_df = df_filtered.join(
        pathways_hierarchy_df,
        df_filtered["ID"] == pathways_hierarchy_df["childId"],
        "left"
    )

    return joined_df


In [9]:
base_dir = "/Users/polina/Pathwaganda/data/GSEA_output/Reactome_Pathways_2025_diy"
hierarchy_path = "/Users/polina/Pathwaganda/data/gmt_pathway_files_prep/Reactome/Pathways_hierarchy_relationship.txt"

df_filtered = load_reactome_pathways_with_hierarchy(base_dir, hierarchy_path)

                                                                                

In [16]:
df_filtered.filter(col("hierLevel") > 8).show(10)



+--------------------+------------+-------------------+-------------------+--------------------+------------------+--------------------+------------+--------------------+--------------------+-------------+------------+------------+---------+
|                Term|          ID|                 es|                nes|                pval|             sidak|                 fdr|geneset_size|        leading_edge|     propagated_edge|    diseaseId|    parentId|     childId|hierLevel|
+--------------------+------------+-------------------+-------------------+--------------------+------------------+--------------------+------------+--------------------+--------------------+-------------+------------+------------+---------+
|Formation of HIV-...|R-HSA-167200|-0.6109253369441215|-2.7815364391841655|0.005410225667489277|0.9999213261776687|0.052358961737590665|          17|POLR2K,POLR2C,POL...|CCNT1,CDK7,CDK9,C...|  EFO_0009676|R-HSA-167246|R-HSA-167200|        9|
|Formation of HIV-...|R-HSA-1672

                                                                                

In [17]:
df_filtered.write.mode("overwrite").parquet("/Users/polina/Pathwaganda/data/GSEA-output_filt_hier")

25/08/14 14:21:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


# Target-pathway matrix

Explode targets from propagated_edge and create boolean matrix TxP.

Hierarchical levels explanation:
- Level 1 - broad propagation: all except higher parent pathways
- Level 2 - medium propagation: except 1st and 2nd higher parent pathways
- Level 3 - specific propagation: except up to 3rd higher parent pathways

In [6]:
# Base directories - update these paths as needed
t_input_base = "/Users/polina/Pathwaganda/data/GSEA_output/Reactome_Pathways_2025_diy"
t_output_base = "/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy"

In [None]:
def process_parquet_files(input_dir, output_dir):

    # Loop through all disease subdirectories
    for disease_folder in os.listdir(input_dir):
        disease_input_path = os.path.join(input_dir, disease_folder)
        if not os.path.isdir(disease_input_path):
            continue

        # Find the first Parquet file in the disease folder
        parquet_files = [f for f in os.listdir(disease_input_path) if f.endswith(".parquet")]
        if not parquet_files:
            print(f"No Parquet file found in {disease_input_path}")
            continue

        input_parquet_path = os.path.join(disease_input_path, parquet_files[0])

        # Read the Parquet file
        df = spark.read.parquet(input_parquet_path)

        # Select and process relevant columns
        processed_df = (
            df.select("ID", "propagated_edge")
              .withColumn("approvedSymbol", explode(split(col("propagated_edge"), ",")))
              .groupBy("approvedSymbol")
              .agg(concat_ws(",", collect_list("ID")).alias("ID"))
        )

        # Create the corresponding output path
        disease_output_path = os.path.join(output_dir, disease_folder)
        os.makedirs(disease_output_path, exist_ok=True)

        output_parquet_path = os.path.join(disease_output_path)

        # Write the result
        processed_df.write.mode("overwrite").parquet(output_parquet_path)
        print(f"Saved processed file to: {output_parquet_path}")


In [5]:
process_parquet_files(t_input_base, t_output_base)

                                                                                

Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0000503
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0011015
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0000569
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0003916
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0004533
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0002033
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=MONDO_0017343
Saved processed file to: /Users/polina/Pathwaganda/data/target-pathway_matr

In [9]:
spark.read.parquet("/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy/diseaseId=EFO_0000094").show(5)

+----------------+--------------------+
|  approvedSymbol|                  ID|
+----------------+--------------------+
| complete genome|R-HSA-1643685,R-H...|
|        18S rRNA|       R-HSA-1643685|
|              1B|R-HSA-1643685,R-H...|
|              1C|R-HSA-1643685,R-H...|
|              1a|R-HSA-1643685,R-H...|
+----------------+--------------------+
only showing top 5 rows


# Pathway embeddings

## Hierarchical (Poincare ball model)

In [18]:
import os
import math
import sys
import types
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

# Ensure gensim can import numpy.strings
sys.modules['numpy.strings'] = types.ModuleType('numpy.strings')

from gensim.models.poincare import PoincareModel
import pandas as pd

In [None]:
def process_hierarchy_folders(input_dir, output_dir, negative=10, epochs=100):
    # Configure Spark with better memory settings
    # spark = (SparkSession.builder
    #          .appName("PoincareEmbedding")
    #          .config("spark.driver.memory", "4g")
    #          .config("spark.executor.memory", "4g")
    #          .config("spark.sql.shuffle.partitions", "200")
    #          .config("spark.sql.parquet.compression.codec", "snappy")
    #          .config("spark.memory.fraction", "0.8")
    #          .config("spark.memory.storageFraction", "0.3")
    #          .getOrCreate())

    # Suppress unnecessary logs
    spark.sparkContext.setLogLevel("ERROR")

    try:
        for entry in os.scandir(input_dir):
            if not entry.is_dir():
                continue
            
            folder = entry.path
            name = entry.name

            # 1) Read Parquet with optimized settings
            df = spark.read.parquet(folder).cache()  # Cache since we use it multiple times

            # 2) Fill null parents with self
            df = df.withColumn(
                "Parent pathway",
                when(col("Parent pathway").isNull(), col("ID")).otherwise(col("Parent pathway"))
            )

            # 3) Count unique nodes - optimized
            N = df.selectExpr("ID as node").union(
                df.selectExpr("`Parent pathway` as node")
            ).distinct().count()

            # 4) Choose dims = ceil(log2(N)), at least 2
            dims = max(2, math.ceil(math.log2(N)))

            print(f"{name} - Total distinct nodes N: {N}; Chosen d: {dims}")

            # 6) Extract edges into Python list - with batch processing for large datasets
            edges = (
                df.select("Parent pathway", "ID")
                .dropDuplicates()
                .localCheckpoint()  # Helps with iterative algorithms
                .toPandas()
                .values.tolist()
            )

            # 7) Train Poincaré model
            model = PoincareModel(edges, negative=negative, size=dims)
            model.train(epochs=epochs)

            # 8) Dump embeddings and convert to Spark
            emb = [(key, *model.kv[key]) for key in model.kv.index_to_key]
            pdf = pd.DataFrame(emb, columns=["ID"] + [f"dim_{i}" for i in range(dims)])
            
            # Write in batches if needed
            sdf = spark.createDataFrame(pdf)
            sdf.write.mode("overwrite").parquet(os.path.join(output_dir, name))

            # Clean up cached data
            df.unpersist()
            
    finally:
        spark.stop()

In [20]:
input_dir = "/Users/polina/Pathwaganda/data/GSEA_output_gui/Reactome_Pathways_2025_diy_v2"
output_dir = "/Users/polina/Pathwaganda/data/pathway_embeddings/Reactome_Pathways_2025_diy_v2"

process_hierarchy_folders(input_dir, output_dir, negative=10, epochs=100)

diseaseId=EFO_0000503 - Total distinct nodes N: 750; Chosen d: 10


                                                                                

diseaseId=EFO_0011015 - Total distinct nodes N: 1063; Chosen d: 11
diseaseId=MONDO_0000569 - Total distinct nodes N: 504; Chosen d: 9
diseaseId=MONDO_0003916 - Total distinct nodes N: 339; Chosen d: 9
diseaseId=EFO_0004533 - Total distinct nodes N: 721; Chosen d: 10
diseaseId=MONDO_0002033 - Total distinct nodes N: 742; Chosen d: 10
diseaseId=MONDO_0017343 - Total distinct nodes N: 709; Chosen d: 10


                                                                                

diseaseId=MONDO_0002691 - Total distinct nodes N: 779; Chosen d: 10
diseaseId=MONDO_0001014 - Total distinct nodes N: 767; Chosen d: 10
diseaseId=EFO_0007987 - Total distinct nodes N: 1489; Chosen d: 11
diseaseId=EFO_0000504 - Total distinct nodes N: 685; Chosen d: 10
diseaseId=EFO_0004730 - Total distinct nodes N: 1186; Chosen d: 11
diseaseId=EFO_0007989 - Total distinct nodes N: 560; Chosen d: 10
diseaseId=MONDO_0002654 - Total distinct nodes N: 1317; Chosen d: 11
diseaseId=EFO_0005423 - Total distinct nodes N: 288; Chosen d: 9
diseaseId=MONDO_0019472 - Total distinct nodes N: 607; Chosen d: 10
diseaseId=EFO_0000707 - Total distinct nodes N: 831; Chosen d: 10
diseaseId=HP_0000152 - Total distinct nodes N: 592; Chosen d: 10
diseaseId=HP_0001877 - Total distinct nodes N: 186; Chosen d: 8
diseaseId=MONDO_0000594 - Total distinct nodes N: 462; Chosen d: 9
diseaseId=HP_0000951 - Total distinct nodes N: 397; Chosen d: 9
diseaseId=EFO_0007911 - Total distinct nodes N: 343; Chosen d: 9
disea

                                                                                

diseaseId=EFO_0004274 - Total distinct nodes N: 274; Chosen d: 9
diseaseId=EFO_0004273 - Total distinct nodes N: 354; Chosen d: 9
diseaseId=EFO_0004617 - Total distinct nodes N: 881; Chosen d: 10
diseaseId=EFO_0021796 - Total distinct nodes N: 367; Chosen d: 9
diseaseId=EFO_0006943 - Total distinct nodes N: 700; Chosen d: 10
diseaseId=EFO_0004842 - Total distinct nodes N: 1210; Chosen d: 11
diseaseId=EFO_0000618 - Total distinct nodes N: 1792; Chosen d: 11
diseaseId=EFO_0003060 - Total distinct nodes N: 796; Chosen d: 10
diseaseId=EFO_0000275 - Total distinct nodes N: 510; Chosen d: 9
diseaseId=EFO_0003863 - Total distinct nodes N: 354; Chosen d: 9
diseaseId=EFO_0003897 - Total distinct nodes N: 764; Chosen d: 10
diseaseId=EFO_0005952 - Total distinct nodes N: 842; Chosen d: 10
diseaseId=MONDO_0002917 - Total distinct nodes N: 462; Chosen d: 9
diseaseId=EFO_0000272 - Total distinct nodes N: 753; Chosen d: 10
diseaseId=EFO_0000616 - Total distinct nodes N: 1699; Chosen d: 11
diseaseId=E

                                                                                

diseaseId=MONDO_0002928 - Total distinct nodes N: 652; Chosen d: 10
diseaseId=EFO_0002916 - Total distinct nodes N: 784; Chosen d: 10
diseaseId=EFO_0007800 - Total distinct nodes N: 707; Chosen d: 10
diseaseId=MONDO_0024476 - Total distinct nodes N: 754; Chosen d: 10
diseaseId=EFO_0002571 - Total distinct nodes N: 667; Chosen d: 10
diseaseId=EFO_0005592 - Total distinct nodes N: 654; Chosen d: 10
diseaseId=MONDO_0003059 - Total distinct nodes N: 756; Chosen d: 10
diseaseId=MONDO_0037254 - Total distinct nodes N: 734; Chosen d: 10


                                                                                

diseaseId=MONDO_0003061 - Total distinct nodes N: 248; Chosen d: 8
diseaseId=EFO_1000941 - Total distinct nodes N: 143; Chosen d: 8
diseaseId=MONDO_0007263 - Total distinct nodes N: 598; Chosen d: 10
diseaseId=EFO_0005105 - Total distinct nodes N: 1825; Chosen d: 11
diseaseId=EFO_0005561 - Total distinct nodes N: 582; Chosen d: 10
diseaseId=EFO_0000228 - Total distinct nodes N: 896; Chosen d: 10
diseaseId=EFO_0000673 - Total distinct nodes N: 749; Chosen d: 10
diseaseId=MONDO_0000621 - Total distinct nodes N: 814; Chosen d: 10
diseaseId=MONDO_0002512 - Total distinct nodes N: 711; Chosen d: 10
diseaseId=EFO_0005134 - Total distinct nodes N: 526; Chosen d: 10
diseaseId=MONDO_0002149 - Total distinct nodes N: 1302; Chosen d: 11
diseaseId=EFO_0003839 - Total distinct nodes N: 447; Chosen d: 9
diseaseId=HP_0002715 - Total distinct nodes N: 528; Chosen d: 10
diseaseId=EFO_0004627 - Total distinct nodes N: 735; Chosen d: 10
diseaseId=EFO_0003865 - Total distinct nodes N: 802; Chosen d: 10
di

                                                                                

diseaseId=EFO_0000232 - Total distinct nodes N: 749; Chosen d: 10
diseaseId=EFO_1002050 - Total distinct nodes N: 195; Chosen d: 8
diseaseId=EFO_0005116 - Total distinct nodes N: 867; Chosen d: 10
diseaseId=EFO_0022196 - Total distinct nodes N: 271; Chosen d: 9
diseaseId=EFO_1000999 - Total distinct nodes N: 1270; Chosen d: 11
diseaseId=MONDO_0001187 - Total distinct nodes N: 770; Chosen d: 10
diseaseId=EFO_0004833 - Total distinct nodes N: 1195; Chosen d: 11
diseaseId=EFO_0000651 - Total distinct nodes N: 1829; Chosen d: 11
diseaseId=EFO_1000532 - Total distinct nodes N: 706; Chosen d: 10
diseaseId=EFO_0004695 - Total distinct nodes N: 185; Chosen d: 8
diseaseId=EFO_0007010 - Total distinct nodes N: 1001; Chosen d: 10
diseaseId=MONDO_0004634 - Total distinct nodes N: 244; Chosen d: 8
diseaseId=MONDO_0021117 - Total distinct nodes N: 858; Chosen d: 10
diseaseId=EFO_0009387 - Total distinct nodes N: 512; Chosen d: 9
diseaseId=EFO_0005771 - Total distinct nodes N: 1124; Chosen d: 11
dise

                                                                                

diseaseId=MONDO_0000653 - Total distinct nodes N: 1051; Chosen d: 11
diseaseId=EFO_0004696 - Total distinct nodes N: 721; Chosen d: 10
diseaseId=MONDO_0005178 - Total distinct nodes N: 713; Chosen d: 10
diseaseId=EFO_0003820 - Total distinct nodes N: 780; Chosen d: 10
diseaseId=MONDO_0005147 - Total distinct nodes N: 215; Chosen d: 8
diseaseId=MONDO_0004095 - Total distinct nodes N: 849; Chosen d: 10
diseaseId=EFO_0000209 - Total distinct nodes N: 614; Chosen d: 10
diseaseId=HP_0012531 - Total distinct nodes N: 174; Chosen d: 8
diseaseId=EFO_0003818 - Total distinct nodes N: 711; Chosen d: 10
diseaseId=MONDO_0021583 - Total distinct nodes N: 811; Chosen d: 10
diseaseId=MONDO_0020663 - Total distinct nodes N: 602; Chosen d: 10
diseaseId=EFO_0004468 - Total distinct nodes N: 1342; Chosen d: 11
diseaseId=MONDO_0023644 - Total distinct nodes N: 726; Chosen d: 10
diseaseId=HP_0001574 - Total distinct nodes N: 547; Chosen d: 10
diseaseId=EFO_0005775 - Total distinct nodes N: 227; Chosen d: 8

In [24]:
spark.read.parquet("/Users/polina/Pathwaganda/data/pathway_embeddings/Reactome_Pathways_2025_diy_v2/diseaseId=EFO_0000094").show(5)

+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|           ID|               dim_0|               dim_1|               dim_2|               dim_3|               dim_4|               dim_5|               dim_6|              dim_7|               dim_8|               dim_9|
+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+
|  R-HSA-73884|0.014547058809404203|0.028268725834584226|-0.38879342669362515|-0.21606444941898975| 0.16547843443756646| -0.1846101340568833|0.002042940629954195|-0.5107297842501242| 0.20386755682225918| 0.37808384314641375|
|  R-HSA-73856|-0.09932011382566884| 0.08810110308801783| -0.5786949592655122|-0.27040918172764666|-

## Weighted (Jaccard similarity index)

# Target embeddings

## Hierarchical (Poincare coordinates)

Based on each file create pathway coordinates in hyperbolic space.

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, col, avg
import os

In [30]:
def create_target_embeddings(input_pathway: str, input_target2pathway: str, output: str):
    spark = SparkSession.builder.appName("TargetEmbeddingGenerator").getOrCreate()
    
    # List common file names in both folders
    pathway_files = {f.name for f in os.scandir(input_pathway) if f.is_dir()}
    target2pathway_files = {f.name for f in os.scandir(input_target2pathway) if f.is_dir()}
    common_files = pathway_files & target2pathway_files
    
    if not common_files:
        print("No matching files found between input_pathway and input_target2pathway.")
        return
    
    for file_name in common_files:
        print(f"Processing: {file_name}")
        
        # Read pathway embedding
        pathway_df = spark.read.parquet(os.path.join(input_pathway, file_name))
        
        # Read target-to-pathway mapping
        target_df = spark.read.parquet(os.path.join(input_target2pathway, file_name))
        
        # Explode pathway IDs
        exploded_target_df = (
            target_df
            .withColumn("pathwayID", explode(split(col("ID"), ",")))
            .select("approvedSymbol", "pathwayID")
        )
        
        # Join with pathway embeddings
        joined_df = (
            exploded_target_df
            .join(pathway_df, exploded_target_df["pathwayID"] == pathway_df["ID"], "inner")
        )

        # Drop duplicated ID columns
        joined_df = joined_df.drop(pathway_df["ID"])

        # Compute average for each dimension per target
        embedding_columns = [col for col in joined_df.columns if col.startswith("dim_")]
        averaged_df = (
            joined_df
            .groupBy("approvedSymbol")
            .agg(*[avg(c).alias(c) for c in embedding_columns])
        )
        
        # Write output
        output_path = os.path.join(output, file_name)
        averaged_df.write.mode("overwrite").parquet(output_path)
        print(f"Written to: {output_path}")

    spark.stop()

In [35]:
input_pathway = "/Users/polina/Pathwaganda/data/pathway_embeddings/Reactome_Pathways_2025_diy_v2"
input_target2pathway = "/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy"
output = "/Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy"

create_target_embeddings(input_pathway, input_target2pathway, output)

Processing: diseaseId=EFO_0000706
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0000706
Processing: diseaseId=EFO_0006335
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0006335
Processing: diseaseId=MONDO_0000591
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0000591
Processing: diseaseId=MONDO_0004390
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0004390
Processing: diseaseId=HP_0032263
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=HP_0032263
Processing: diseaseId=EFO_1000363
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_1000363
Processing: diseaseId=MONDO_0000588
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseas

                                                                                

Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0004541
Processing: diseaseId=MONDO_0003219
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0003219
Processing: diseaseId=MONDO_0002715
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0002715
Processing: diseaseId=EFO_0001061
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0001061
Processing: diseaseId=MONDO_0002229
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0002229
Processing: diseaseId=EFO_0001663
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0001663
Processing: diseaseId=EFO_0009674
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0009674
Processing: di

                                                                                

Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0001933
Processing: diseaseId=EFO_0009546
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0009546
Processing: diseaseId=EFO_0011008
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0011008
Processing: diseaseId=EFO_0004517
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0004517
Processing: diseaseId=MONDO_0021063
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=MONDO_0021063
Processing: diseaseId=EFO_1000350
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_1000350
Processing: diseaseId=HP_0000152
Written to: /Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=HP_0000152
Processing: diseaseId=

In [38]:
spark.read.parquet("/Users/polina/Pathwaganda/data/target_embeddings/test/diseaseId=EFO_0000094").count()

10146

## Prepare target-based metadata files with info about targets per disease

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/05 12:05:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
spark.read.parquet("/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/test/diseaseId=EFO_0000094").show(5)

                                                                                

+----------------+--------------------+
|  approvedSymbol|                  ID|
+----------------+--------------------+
| complete genome|R-HSA-1643685,R-H...|
|        18S rRNA|       R-HSA-1643685|
|              1B|R-HSA-1643685,R-H...|
|              1C|R-HSA-1643685,R-H...|
|              1a|R-HSA-1643685,R-H...|
+----------------+--------------------+
only showing top 5 rows


Lets use target-pathway_matrix_opt folder to start with and parse target info from OT files.

In [None]:
# Take targetId from:

spark.read.parquet("/Users/polina/Pathwaganda/data/target").show(5)

25/08/05 12:08:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------------+--------------+--------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+---------+
|             id|approvedSymbol|       biotype|       transcriptIds| canonicalTranscript|      canonicalExons|     genomicLocation|alternativeGenes|        approvedName|                  go|hallmarks|            synonyms|      symbolSynonyms|        nameSynonyms|functionDescriptions|subcellularLocations|         targetClass|     obsoleteSymbols|       obsoleteNames|          constraint| tep|          proteinIds|             dbXrefs|chemicalProbes|   

In [27]:
# Take genetic evidence scores from gsea_4_inout files:

spark.read.parquet("/Users/polina/Pathwaganda/data/input_4_gsea/diseaseId=EFO_0000094").show(5)

+--------+-------------------+
|       0|                  1|
+--------+-------------------+
|    CUX1| 0.4559480982087158|
|  NPIPB8|0.14562438358841173|
|    ETV5| 0.3039653988058105|
|  NUTM2D| 0.3039653988058105|
|DCAF12L2| 0.3039653988058105|
+--------+-------------------+
only showing top 5 rows


In [None]:
# Take drug info from:

spark.read.parquet("/Users/polina/Pathwaganda/data/known_drug").show(5)

+-----------+---------------+----------+-----+----------+--------------------+--------------------+---------------+--------------+--------------------+-------------+----------------+--------------------+--------------------+--------------+--------------------+--------------------+
|     drugId|       targetId| diseaseId|phase|    status|                urls|           ancestors|          label|approvedSymbol|        approvedName|  targetClass|        prefName|          tradeNames|            synonyms|      drugType|   mechanismOfAction|          targetName|
+-----------+---------------+----------+-----+----------+--------------------+--------------------+---------------+--------------+--------------------+-------------+----------------+--------------------+--------------------+--------------+--------------------+--------------------+
|CHEMBL52440|ENSG00000183454|DOID_10113|  1.0| Completed|[{ClinicalTrials,...|[MONDO_0002428, E...|trypanosomiasis|        GRIN2A|glutamate ionotro...|[Io

### Merge with known_drug from ChEMBL

In [20]:
import os
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, concat_ws, max as spark_max

def merge_files_with_target_drug_info(input_dir: str, 
                                      target_parquet: str, 
                                    #   association_parquet: str, 
                                      known_drug_parquet: str, 
                                      output_dir: str):
    # Start Spark session
    spark = SparkSession.builder.getOrCreate()

    # Read shared dataframes
    target_df = spark.read.parquet(target_parquet).select("approvedSymbol", "id").distinct()
    known_drug_df = spark.read.parquet(known_drug_parquet).select("phase", "targetId", "diseaseId")

    # Prepare target mapping: approvedSymbol -> comma-separated list of ids
    target_agg_df = (
        target_df
        .groupBy("approvedSymbol")
        .agg(concat_ws(",", collect_list("id")).alias("targetId"))
    )

    # Iterate over folders in input_dir
    for folder_name in os.listdir(input_dir):
        folder_path = os.path.join(input_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Expect folder name of format diseaseId=XXX
        match = re.match(r"diseaseId=(.+)", folder_name)
        if not match:
            continue
        disease_id = match.group(1)

        # Read initial file (assuming single parquet in folder)
        initial_file_path = os.path.join(folder_path)
        initial_df = spark.read.parquet(initial_file_path)

        # Join with target mapping
        initial_with_target = (
            initial_df
            .join(target_agg_df, on="approvedSymbol", how="left")
        )

        # Filter known drug data for the current diseaseId
        filtered_known_drug = known_drug_df.filter(col("diseaseId") == disease_id)

        # Join with known_drug to get phase
        final_df = (
            initial_with_target
            .join(filtered_known_drug, on="targetId", how="left")
        )

        # Aggregate max phase for each row
        result_df = (
            final_df
            .groupBy(*initial_df.columns, "targetId")
            .agg(spark_max("phase").alias("maxPhaseChEMBL"))
        )

        # Save the result as a parquet file to output_dir with same folder name
        output_path = os.path.join(output_dir, folder_name)
        result_df.write.mode("overwrite").parquet(output_path)


In [21]:
merge_files_with_target_drug_info(
    input_dir="/Users/polina/Pathwaganda/data/target-pathway_matrix_opt/Reactome_Pathways_2025_diy",
    target_parquet="/Users/polina/Pathwaganda/data/target",
    # association_path="/path/to/association.parquet",
    known_drug_parquet="/Users/polina/Pathwaganda/data/known_drug",
    output_dir="/Users/polina/Pathwaganda/data/target_metadata/known_drug_merge/Reactome_Pathways_2025_diy"
)

                                                                                

In [None]:
spark.read.parquet("/Users/polina/Pathwaganda/diseaseId=EFO_0000094").show(5)

+--------------+--------------------+---------------+--------------+
|approvedSymbol|                  ID|       targetId|maxPhaseChEMBL|
+--------------+--------------------+---------------+--------------+
|        ANGPT1|        R-HSA-109582|ENSG00000154188|          NULL|
|         APOOL|R-HSA-1592230,R-H...|ENSG00000155008|          NULL|
|         CCAR1|R-HSA-72203,R-HSA...|ENSG00000060339|          NULL|
|          CD96|        R-HSA-198933|ENSG00000153283|          NULL|
|         CDH24|R-HSA-9759476,R-H...|ENSG00000139880|          NULL|
+--------------+--------------------+---------------+--------------+
only showing top 5 rows


### Merge with genetic association score from OT platform

In [57]:
import os
from pyspark.sql import SparkSession

def merge_parquet_folders_with_na(spark, folder_1, folder_2, output_dir):
    """
    For each subfolder in folder_1:
    - Read parquet from folder_1 and folder_2 (same subfolder name)
    - Rename columns in folder_2 df: '0' -> 'approvedSymbol', '1' -> 'geneticScore'
    - Left join folder_1 df with folder_2 df on 'approvedSymbol'
    - If folder_2 subfolder missing, write folder_1 df as is
    - Write merged df to output_dir with same subfolder name
    """
    folder_1_subdirs = [name for name in os.listdir(folder_1) 
                        if os.path.isdir(os.path.join(folder_1, name))]

    for subdir in folder_1_subdirs:
        path_1 = os.path.join(folder_1, subdir)
        path_2 = os.path.join(folder_2, subdir)
        output_path = os.path.join(output_dir, subdir)

        df1 = spark.read.parquet(path_1)

        if not os.path.exists(path_2):
            print(f"Folder {subdir} missing in folder_2. Writing original file from folder_1 as is.")
            df1.write.mode("overwrite").parquet(output_path)
            continue

        df2 = spark.read.parquet(path_2)
        df2_renamed = df2.withColumnRenamed("0", "approvedSymbol") \
                         .withColumnRenamed("1", "geneticScore")

        # Left join so unmatched get null for geneticScore
        merged_df = df1.join(df2_renamed.select("approvedSymbol", "geneticScore"), 
                             on="approvedSymbol", how="left")

        merged_df.write.mode("overwrite").parquet(output_path)
        print(f"Merged and written: {output_path}")

In [58]:
spark = SparkSession.builder.appName("MergeParquets").getOrCreate()

merge_parquet_folders_with_na(
    spark,
    folder_1="/Users/polina/Pathwaganda/data/target_metadata/known_drug_merge/Reactome_Pathways_2025_diy",
    folder_2="/Users/polina/Pathwaganda/data/input_4_gsea",
    output_dir="/Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy"
)

Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=EFO_0000503
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=EFO_0011015
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=MONDO_0000569
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=MONDO_0003916
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=EFO_0004533
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=MONDO_0002033
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=MONDO_0017343
Merged and written: /Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=MOND

In [55]:
spark.read.parquet("/Users/polina/Pathwaganda/data/target_metadata/ge_merge/test/diseaseId=MONDO_0045024").filter(col("targetId").isNull()).count()

283

! Need to make synonyms search !

# Prepare file with umap coordinates

## Case 1: user hasn't specified list of genes (show only genetics)

Steps: 
- Take coordinate file and run umap and clustering
- Write coordinates and clusters into correspondent metadata file
- Run gsea to put labels for each pathway (opt)
- Filter out genes without genetic evidence

In [None]:
import pandas as pd
import numpy as np
import os
import umap
import hdbscan
from scipy.spatial.distance import pdist, squareform

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def compute_poincare_distance_matrix(embedding_matrix):
    """Vectorized computation of pairwise Poincaré distances."""
    def poincare_dist(u, v):
        norm_u = np.linalg.norm(u)
        norm_v = np.linalg.norm(v)
        norm_diff = np.linalg.norm(u - v)

        denom = (1 - norm_u ** 2) * (1 - norm_v ** 2)
        if denom <= 0:
            return float('inf')

        argument = 1 + 2 * (norm_diff ** 2) / denom
        return np.arccosh(argument)

    return squareform(pdist(embedding_matrix, metric=poincare_dist))

In [None]:
def perform_umap_clustering_parquet(
    metadata_parquet_dir,
    coordinates_parquet_dir,
    output_dir,
    n_neighbors=10,
    min_dist=0.5,
    min_cluster_size=12,
    umap_dimensions=2
):
    """
    Performs UMAP dimensionality reduction and HDBSCAN clustering using Poincaré distance.
    Aligns metadata and coordinates by 'approvedSymbol', saves final result as TSV.
    """

    # Load metadata and coordinates
    metadata = pd.read_parquet(metadata_parquet_dir).query("geneticScore.notnull()")
    coords_df = pd.read_parquet(coordinates_parquet_dir)

    # Sanity checks
    assert 'approvedSymbol' in metadata.columns, "Metadata must contain 'approvedSymbol'"
    assert coords_df.shape[1] > 1, "Coordinates must have approvedSymbol + at least one dimension"

    # Rename first column to 'approvedSymbol' if needed
    coords_df = coords_df.rename(columns={coords_df.columns[0]: 'approvedSymbol'})

    # Convert coordinate columns to float
    coord_columns = coords_df.columns[1:]
    coords_df[coord_columns] = coords_df[coord_columns].astype(float)

    # Merge metadata and coordinates on approvedSymbol
    merged_df = pd.merge(metadata, coords_df, on='approvedSymbol', how='inner')
    print(f"Merged metadata and coordinates: {merged_df.shape[0]} entries.")

    # Extract embedding matrix (in correct order)
    embedding_matrix = merged_df[coord_columns].values

    def compute_poincare_distance_matrix(embedding_matrix):
    """Vectorized computation of pairwise Poincaré distances."""
    def poincare_dist(u, v):
        norm_u = np.linalg.norm(u)
        norm_v = np.linalg.norm(v)
        norm_diff = np.linalg.norm(u - v)

        denom = (1 - norm_u ** 2) * (1 - norm_v ** 2)
        if denom <= 0:
            return float('inf')

        argument = 1 + 2 * (norm_diff ** 2) / denom
        return np.arccosh(argument)

    return squareform(pdist(embedding_matrix, metric=poincare_dist))

    # Check that all embeddings lie within the unit ball
    norms = np.linalg.norm(embedding_matrix, axis=1)
    if np.any(norms >= 1):
        raise ValueError("Some embeddings lie outside the Poincaré ball (norm >= 1).")

    distance_matrix = compute_poincare_distance_matrix(embedding_matrix)

    # UMAP dimensionality reduction
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=umap_dimensions,
        metric='precomputed',
        random_state=42
    )
    embedding_umap = reducer.fit_transform(distance_matrix)

    # HDBSCAN clustering
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=1,
        metric='precomputed'
    )
    cluster_labels = clusterer.fit_predict(distance_matrix)

    # Add UMAP and cluster results to merged_df
    for dim in range(umap_dimensions):
        merged_df[f'UMAP {dim+1}'] = embedding_umap[:, dim]
    merged_df['cluster'] = cluster_labels

    # Drop original embedding dimensions before saving
    output_df = merged_df.drop(columns=coord_columns)

    # Output directory and file
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'metadata_clusters_poincare_fast_ge.tsv')
    output_df.to_csv(output_file, sep='\t', index=False)


    print(f"✅ Updated metadata with clusters saved to: {output_file}")
    return output_file

In [19]:
perform_umap_clustering_parquet(
    metadata_parquet_dir="/Users/polina/Pathwaganda/data/target_metadata/ge_merge/Reactome_Pathways_2025_diy/diseaseId=EFO_0000094/",
    coordinates_parquet_dir="/Users/polina/Pathwaganda/data/target_embeddings/Reactome_Pathways_2025_diy/diseaseId=EFO_0000094/",
    output_dir="/Users/polina/Pathwaganda/data/umap/test",
    n_neighbors=5,
    min_dist=0.7,
    min_cluster_size=5,
    umap_dimensions=2
)


Merged metadata and coordinates: 528 entries.


  warn("using precomputed metric; inverse_transform will be unavailable")
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(


✅ Updated metadata with clusters saved to: /Users/polina/Pathwaganda/data/umap/test/metadata_clusters_poincare_fast_ge.tsv


'/Users/polina/Pathwaganda/data/umap/test/metadata_clusters_poincare_fast_ge.tsv'