# Save target-pathway lists as Spark DataFrames and filter out non-gene targets

Input folder: ot-team/polina/pathway_propagation_validation_v2/gsea_output

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col, explode, split, collect_set

In [6]:
# Reuse the active Spark session if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [7]:
# Target dataset of the Open Targets Platform (release 25.03); only the gene
# symbol and the target id are kept, to translate symbols into ids.
target_path = "gs://open-targets-data-releases/25.03/output/target/"
target = spark.read.format("parquet").load(target_path)
target_names = target.select(["approvedSymbol", "id"])

                                                                                

In [8]:
# Collapse to one row per approvedSymbol: every `id` sharing a symbol is
# joined into a single comma-separated string exposed as `targetId`.
target_names = (
    target.select("approvedSymbol", "id")
    .groupBy("approvedSymbol")
    .agg(f.concat_ws(",", f.collect_list("id")).alias("targetId"))
)

In [None]:
def calculate_propagated_targets_with_terms(
    input_gcs_dir, 
    output_gcs_dir, 
    folders_to_process, 
    target_table
):
    """
    Turn CSV files of enriched terms into per-file Parquet tables of
    gene symbol -> terms, keeping only symbols present in ``target_table``.

    For every ``*.csv`` directly inside ``{input_gcs_dir}/{folder}``:

    1. the comma-separated ``propagated_edge`` column is exploded into one
       gene symbol per row (surrounding whitespace is trimmed and empty
       tokens are discarded);
    2. the distinct ``Term`` values are collected per symbol into ``terms``;
    3. the result is inner-joined with ``target_table`` on
       ``approvedSymbol``, so symbols without a match are dropped;
    4. the table is written (overwriting) as Parquet to
       ``{output_gcs_dir}/{folder}/{file name without the .csv suffix}``.

    Files lacking either the ``propagated_edge`` or the ``Term`` column are
    skipped with a printed message.

    Args:
        input_gcs_dir (str): Input GCS directory path (gs://bucket/path/)
        output_gcs_dir (str): Output GCS directory path (gs://bucket/path/)
        folders_to_process (list): List of folder names to process
        target_table (DataFrame): Spark DataFrame with 'approvedSymbol' and 'targetId'

    Returns:
        None. Results are written to storage and progress is printed.
    """
    # Local import: only this function needs `trim`.
    from pyspark.sql.functions import trim

    input_gcs_dir = input_gcs_dir.rstrip("/")
    output_gcs_dir = output_gcs_dir.rstrip("/")

    # Loop-invariant projection of the lookup table.
    symbol_to_id = target_table.select("approvedSymbol", "targetId")

    for folder_name in folders_to_process:
        # List the CSV paths through the binaryFile source while selecting
        # only `path`, so the file contents are not loaded just to obtain
        # the names (wholeTextFiles reads every file in full).
        listing = (
            spark.read.format("binaryFile")
            .load(f"{input_gcs_dir}/{folder_name}/*.csv")
            .select("path")
        )
        csv_files = sorted(row["path"] for row in listing.collect())

        for csv_file in csv_files:
            file_name = csv_file.split("/")[-1]
            # Strip only the trailing extension; str.replace would also
            # remove a ".csv" appearing in the middle of the name.
            if file_name.endswith(".csv"):
                csv_name = file_name[:-len(".csv")]
            else:
                csv_name = file_name
            output_folder_path = f"{output_gcs_dir}/{folder_name}/{csv_name}"

            df = spark.read.option("header", True).csv(csv_file)

            if not {'propagated_edge', 'Term'}.issubset(df.columns):
                print(f"Skipping {csv_file}: missing required columns.")
                continue

            # explode() cannot be nested inside another expression, so the
            # trimming happens in a separate step. The final filter drops
            # empty tokens (e.g. from a trailing comma) as well as nulls.
            df_exploded = (
                df.withColumn(
                    "approvedSymbol", explode(split(col("propagated_edge"), ","))
                )
                .withColumn("approvedSymbol", trim(col("approvedSymbol")))
                .filter(col("approvedSymbol") != "")
            )

            target_terms_df = df_exploded.groupBy("approvedSymbol") \
                .agg(collect_set("Term").alias("terms"))

            # Inner join: symbols absent from the lookup table are removed.
            merged_df = target_terms_df.join(
                symbol_to_id,
                on="approvedSymbol",
                how="inner"
            )

            # Write as Parquet (supports array columns)
            merged_df.write.mode("overwrite").parquet(output_folder_path)
            print(f"Saved: {output_folder_path}")

In [None]:
# Convert the CSVs of the KEGG_2021_Human folder into Parquet tables that
# map each matched gene symbol to its terms.
calculate_propagated_targets_with_terms(
    input_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output",
    output_gcs_dir="gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark",
    folders_to_process=["KEGG_2021_Human"],
    target_table=target_names,
)

                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000095_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000222_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000274_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000183_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000341_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000384_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000403_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000275_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000474_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000519_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000565_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000612_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000637_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000574_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000685_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000702_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0000676_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0001073_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0001378_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0002429_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003060_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003144_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003758_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003833_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0003869_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0004142_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0009606_ge_mm_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_0005952_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/EFO_1001901_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0001657_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0002367_ge_mm_som_gsea_KEGG_2021_Human_pval0.05


                                                                                

Saved: gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/KEGG_2021_Human/MONDO_0004975_ge_mm_gsea_KEGG_2021_Human_pval0.05


[Stage 242:>                (0 + 4) / 7][Stage 244:>                (0 + 0) / 1]

In [11]:
# Load one saved Parquet table back for a sanity check.
# NOTE(review): this path points at the `test` folder, not at the
# `KEGG_2021_Human` folder written by the call above — confirm it is the intended input.
spark_df = spark.read.parquet("gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark/test/EFO_0000095_ge_mm_som_gsea_KEGG_2021_Human_pval0.05")

In [13]:
# Preview the first 10 rows without truncating long cell values.
spark_df.show(10, truncate=False)

+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|approvedSymbol|terms                                                                                                                                            |targetId       |
+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------+---------------+
|ABCA1         |[Lipid and atherosclerosis]                                                                                                                      |ENSG00000165029|
|ABCB1         |[Gastric cancer, MicroRNAs in cancer]                                                                                                            |ENSG00000085563|
|ABCC1         |[Sphingolipid signaling pathway, MicroRNAs in cancer]                                    

In [14]:
# Row count of the loaded table.
spark_df.count()

3493

In [None]:
def gene_ids_parser(input_folder, second_df, output_folder):
    """
    Left-join every CSV found under ``input_folder`` with a gene lookup
    table and save the result as Parquet.

    All CSV files below ``input_folder`` (looked up recursively, first line
    treated as header) are read into one DataFrame, tagged with a
    ``file_name`` column holding the last path component of their source
    file, and left-joined with ``second_df`` on the gene symbol. Row counts
    before and after the join are printed; a mismatch means the join
    changed the number of rows.

    Args:
        input_folder (str): Folder containing the input CSV files; the data
            must provide an ``approvedSymbol`` column.
        second_df (DataFrame): Lookup table keyed by ``approvedSymbol_0``.
            A table keyed by ``approvedSymbol`` is accepted as well; its key
            is renamed internally before joining.
        output_folder (str): Destination path of the Parquet output
            (existing data is overwritten).

    Returns:
        None. The joined table is written to ``output_folder``.
    """
    # Read all files in the input folder
    input_files = spark.read.csv(input_folder, recursiveFileLookup=True, header=True)
    df = input_files.withColumn("file_name", f.element_at(f.split(f.input_file_name(), "/"), -1))

    initial_key_column = "approvedSymbol"
    second_key_column = "approvedSymbol_0"

    # Accept lookup tables that still use the plain `approvedSymbol` key:
    # renaming it keeps the join condition unambiguous and avoids failing
    # on a missing `approvedSymbol_0` column.
    if second_key_column not in second_df.columns and initial_key_column in second_df.columns:
        second_df = second_df.withColumnRenamed(initial_key_column, second_key_column)

    # Perform the join operation on the entire DataFrame
    df_genes = df.join(
        second_df,
        df[initial_key_column] == second_df[second_key_column],
        how="left",
    ).drop(second_key_column)

    # Check row counts; each count() launches a Spark job, so run each once.
    initial_count = df.count()
    joined_count = df_genes.count()
    if initial_count != joined_count:
        print(f"Warning: Counts don't match (initial: {initial_count}, joined: {joined_count})")
    else:
        print(f"Counts match: {initial_count}")

    # Write the output to the specified location
    df_genes.write.parquet(output_folder, mode="overwrite")

In [None]:
# Annotate the CSVs under average_cutoffs/jaccard/Reactome_Pathways_2024
# with the lookup table `target_names` and store the result as Parquet.
input_folder = "gs://ot-team/polina/pathway_propagation_validation_v2/average_cutoffs/jaccard/Reactome_Pathways_2024"
output_folder = "gs://ot-team/polina/pathway_propagation_validation_v2/average_cutoffs/gene_spark/jaccard/Reactome_Pathways_2024"
second_df = target_names

gene_ids_parser(
    input_folder=input_folder,
    second_df=second_df,
    output_folder=output_folder,
)

# Save target-pathway lists for TensorBoard visualisation

! This code was pasted from IBD_pathway_to_cell (from similarity_mvp) and has not been run !

Process CSV files in specified folders within a GCS directory and save target-pathway relationships in a format suitable for TensorBoard Embedding Projector. 

In [None]:
def save_target_pathway_tensorboard(input_gcs_dir, output_gcs_dir, folders_to_process):
    """
    Export target-pathway pairs from CSV files on GCS as TSV files.

    For each listed folder, every ``.csv`` file in it is read with pandas,
    its comma-separated ``propagated_edge`` column is exploded to one entry
    per row, and the resulting (``Target``, ``Pathway``) pairs are written
    to ``{output_gcs_dir}/{folder}/{input file stem}_target_pathway.tsv``.
    Files lacking the ``propagated_edge`` or ``Term`` column are skipped
    with a printed message.

    Args:
        input_gcs_dir (str): Input GCS directory path.
        output_gcs_dir (str): Output GCS directory path.
        folders_to_process (list): List of folder names within the input directory to process.

    Output:
        Saves target-pathway relationships as TSV files in the output GCS directory.
    """
    # Local imports keep this cell self-contained: gcsfs, pandas and pathlib
    # are needed only by this function.
    from pathlib import Path

    import gcsfs
    import pandas as pd

    # Initialize GCS filesystem
    fs = gcsfs.GCSFileSystem()

    # Ensure no trailing slashes in input and output directories
    input_gcs_dir = input_gcs_dir.rstrip("/")
    output_gcs_dir = output_gcs_dir.rstrip("/")

    for folder_name in folders_to_process:
        folder_path = f"{input_gcs_dir}/{folder_name}"
        output_folder_path = f"{output_gcs_dir}/{folder_name}"

        # Ensure output folder exists
        if not fs.exists(output_folder_path):
            fs.mkdirs(output_folder_path)

        # List files in the input folder
        files_in_folder = fs.ls(folder_path)
        csv_files = [file for file in files_in_folder if file.endswith('.csv')]

        for file_path in csv_files:
            # Read the CSV file directly from GCS. The handle is not called
            # `f`, to avoid shadowing the notebook's `pyspark.sql.functions as f` alias.
            with fs.open(file_path, 'r') as src:
                df = pd.read_csv(src)

            # Check if 'propagated_edge' and 'Term' exist in the file
            if 'propagated_edge' not in df.columns or 'Term' not in df.columns:
                print(f"Skipping {file_path}: missing required columns.")
                continue

            # Explode the `propagated_edge` column; rows whose value is
            # missing are removed by the dropna.
            df['propagated_edge_exploded'] = df['propagated_edge'].str.split(',')
            df = df.explode('propagated_edge_exploded').dropna(subset=['propagated_edge_exploded'])

            # Prepare data for TensorBoard format
            embedding_metadata = df[['propagated_edge_exploded', 'Term']]
            embedding_metadata = embedding_metadata.rename(
                columns={"propagated_edge_exploded": "Target", "Term": "Pathway"}
            )

            # Save target-pathway relationships as a TSV
            output_file_name = f"{Path(file_path).stem}_target_pathway.tsv"
            output_file_path = f"{output_folder_path}/{output_file_name}"

            # Save directly to GCS
            with fs.open(output_file_path, 'w') as dst:
                embedding_metadata.to_csv(dst, sep='\t', index=False)

            print(f"Processed and uploaded: {output_file_path}")


In [None]:
# Export target-pathway TSVs for the Reactome_Pathways_2024 folder.
input_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output"
output_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/4tensorboard/jaccard"

library = ["Reactome_Pathways_2024"]

save_target_pathway_tensorboard(
    input_gcs_dir=input_gcs_dir,
    output_gcs_dir=output_gcs_dir,
    folders_to_process=library,
)