In [1]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, explode, split, array_distinct, udf
# from pyspark.ml.feature import MinHashLSH
# from pyspark.sql.types import FloatType
# import pyspark.sql.functions as F
# import os
# import numpy as np
# import gcsfs
# from pyspark.ml.linalg import Vectors, VectorUDT
# from pyspark.sql.functions import udf
# from itertools import combinations

# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col, split, explode, collect_set, lit, array, udf
# from pyspark.sql.types import DoubleType
# from itertools import combinations

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, split, collect_list, flatten, 
    array_intersect, array_union, size,
    when, first, input_file_name, regexp_extract, regexp_replace
)
from pyspark.sql.functions import lit
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *

In [2]:
# Obtain the notebook-wide SparkSession: reuses the active session if one
# already exists, otherwise builds a new one with the default configuration.
spark = SparkSession.builder.getOrCreate()

# Similarity matrix for propagation results

For future speed optimisation, use MinHash for the similarity calculation.

## Jaccard

In [3]:
# Session-level Spark SQL settings used for the similarity jobs below:
# a fixed shuffle parallelism plus adaptive query execution with
# partition coalescing enabled.
for conf_key, conf_value in (
    ("spark.sql.shuffle.partitions", "200"),
    ("spark.sql.adaptive.enabled", "true"),
    ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
):
    spark.conf.set(conf_key, conf_value)

In [4]:
from pyspark.sql.functions import *

def _hadoop_fs_and_path(path_str):
    """Return the Hadoop ``FileSystem`` and ``Path`` objects for ``path_str``."""
    sc = spark.sparkContext
    hadoop_path = sc._jvm.org.apache.hadoop.fs.Path(path_str)
    return hadoop_path.getFileSystem(sc._jsc.hadoopConfiguration()), hadoop_path


def _is_visible_file(status):
    """True for a regular file whose name does not start with ``_`` or ``.``."""
    return status.isFile() and not status.getPath().getName().startswith(("_", "."))


def _list_parquet_dirs(glob_pattern):
    """Return the sorted, unique directories holding files matched by ``glob_pattern``.

    Only file-system metadata is queried (Hadoop ``globStatus``), so no file
    contents are read. Hidden files (``_*`` / ``.*``) are ignored. A matched
    directory (e.g. a Spark-written ``name.parquet/`` folder) is kept when it
    directly contains at least one visible file.
    """
    fs, pattern = _hadoop_fs_and_path(glob_pattern)
    statuses = fs.globStatus(pattern)  # None when the path does not exist
    found = set()
    for status in statuses or []:
        if _is_visible_file(status):
            # Parent directory of the matched file.
            found.add(status.getPath().toString().rsplit("/", 1)[0])
        elif status.isDirectory() and any(
            _is_visible_file(child) for child in fs.listStatus(status.getPath())
        ):
            found.add(status.getPath().toString())
    return sorted(found)


def calculate_jaccard_similarity(base_gcs_path, folders_to_process, output_gcs_path,
                                 broadcast_threshold=10000, output_partitions=20):
    """Write pairwise Jaccard similarities between symbols for each parquet directory.

    For every folder in ``folders_to_process``, each directory matching
    ``{base_gcs_path}/{folder}/**/*.parquet`` is read as one dataset. Rows are
    grouped by ``approvedSymbol`` and their ``terms`` arrays are merged. Every
    unordered pair of symbols (A < B) sharing at least one term is written with

        jaccardSimilarity = |terms_A ∩ terms_B| / |terms_A ∪ terms_B|

    Output columns: ``approvedSymbolA``, ``approvedSymbolB``, ``targetIdA``,
    ``targetIdB``, ``jaccardSimilarity``, ``intersectingTerms`` (comma-joined).
    Each result is written (mode ``overwrite``) to ``output_gcs_path`` under the
    same path the input directory has relative to ``base_gcs_path``.

    Args:
        base_gcs_path: Root location of the input folders. The input parquet
            must contain ``approvedSymbol``, ``targetId`` and an array column
            ``terms``.
        folders_to_process: Iterable of folder names under ``base_gcs_path``;
            a single string is treated as one folder.
        output_gcs_path: Root location for the similarity tables.
        broadcast_threshold: Grouped tables with fewer rows than this get a
            broadcast join hint.
        output_partitions: Number of partitions (output files) per table.

    Raises:
        FileNotFoundError: If a folder contains no matching parquet files.
    """
    base_gcs_path = base_gcs_path.rstrip("/")
    output_gcs_path = output_gcs_path.rstrip("/")
    if isinstance(folders_to_process, str):
        # Guard against iterating a bare string character by character.
        folders_to_process = [folders_to_process]

    # Listed paths come back fully qualified, so strip the qualified base.
    base_fs, base_path = _hadoop_fs_and_path(base_gcs_path)
    base_prefix = base_fs.makeQualified(base_path).toString() + "/"

    # Column expressions shared by the join condition and the projection.
    shared_terms = array_intersect(col("a.terms"), col("b.terms"))
    all_terms = array_union(col("a.terms"), col("b.terms"))

    for folder in folders_to_process:
        glob_pattern = f"{base_gcs_path}/{folder}/**/*.parquet"
        parquet_dirs = _list_parquet_dirs(glob_pattern)
        if not parquet_dirs:
            raise FileNotFoundError(f"No parquet files match {glob_pattern}")

        for parquet_dir in parquet_dirs:
            df = spark.read.parquet(parquet_dir)

            # One row per symbol. De-duplicating the merged terms keeps the
            # cached table small and does not change the result, because
            # array_intersect / array_union already drop duplicates.
            # NOTE(review): first("targetId") picks one value per symbol;
            # presumably each symbol maps to a single targetId - confirm.
            cached = df.groupBy("approvedSymbol").agg(
                array_distinct(flatten(collect_list("terms"))).alias("terms"),
                first("targetId").alias("targetId")
            ).cache()

            try:
                # count() materialises the cache; keep `cached` as the handle
                # to unpersist, separate from the (possibly hinted) join input.
                if cached.count() < broadcast_threshold:
                    symbols = cached.hint("broadcast")
                else:
                    symbols = cached

                # Self-join on A < B so each unordered pair appears once.
                pairs = symbols.alias("a").join(
                    symbols.alias("b"),
                    (col("a.approvedSymbol") < col("b.approvedSymbol")) &
                    (size(shared_terms) > 0)
                )

                result = pairs.select(
                    col("a.approvedSymbol").alias("approvedSymbolA"),
                    col("b.approvedSymbol").alias("approvedSymbolB"),
                    col("a.targetId").alias("targetIdA"),
                    col("b.targetId").alias("targetIdB"),
                    (size(shared_terms) / size(all_terms)).alias("jaccardSimilarity"),
                    array_join(shared_terms, ",").alias("intersectingTerms")
                )

                # Recreate the input's relative path under the output root.
                if parquet_dir.startswith(base_prefix):
                    relative_path = parquet_dir[len(base_prefix):]
                else:
                    # Fall back to the plain substitution on the given base.
                    relative_path = parquet_dir.replace(base_gcs_path + "/", "")
                output_path = f"{output_gcs_path}/{relative_path}"
                print(f"Writing: {output_path}")
                result.repartition(output_partitions).write.mode("overwrite").parquet(output_path)
            finally:
                # Release the cache even when the join or write fails.
                cached.unpersist()

~ 20 sec / file

In [5]:
# Input root (GSEA output) and output root (Jaccard similarity tables).
gsea_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output_spark"
output_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark"

# Process only the GO Biological Process 2023 folder.
# Arguments: base path, folders to process, output path.
calculate_jaccard_similarity(gsea_dir, ["GO_Biological_Process_2023"], output_dir)

                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000095_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000183_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000222_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000274_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000275_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000341_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000384_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000403_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000474_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000519_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000565_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000574_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000612_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000637_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000676_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000685_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000702_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000729_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0000756_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0001073_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0001378_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0002429_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0003060_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0003144_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0003758_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0003833_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0003869_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0004142_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0005952_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_0009606_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_1001231_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/EFO_1001901_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0001657_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0002367_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0004975_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0004976_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0004979_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0004985_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0005147_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0005178_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0005180_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0005277_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0005301_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0007915_ge_mm_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0008170_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0008315_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

Writing: gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0011719_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05


                                                                                

In [6]:
# Load the Jaccard similarity table for MONDO_0011719 (GO Biological Process 2023)
# to inspect the results.
df_show = spark.read.parquet("gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark/GO_Biological_Process_2023/MONDO_0011719_ge_mm_som_gsea_GO_Biological_Process_2023_pval0.05")

In [7]:
# Spot-check a single pair: the row where symbol A is CDK4 and symbol B is CDK6.
is_cdk4_cdk6 = (col("approvedSymbolA") == "CDK4") & (col("approvedSymbolB") == "CDK6")
df_show.where(is_cdk4_cdk6).show(5, truncate=False)

                                                                                

+---------------+---------------+---------------+---------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+
|approvedSymbolA|approvedSymbolB|targetIdA      |targetIdB      |jaccardSimilarity|intersectingTerms                                                                                                                 |
+---------------+---------------+---------------+---------------+-----------------+----------------------------------------------------------------------------------------------------------------------------------+
|CDK4           |CDK6           |ENSG00000135446|ENSG00000105810|1.0              |Protein Modification Process (GO:0036211),Protein Phosphorylation (GO:0006468),Regulation Of Fibroblast Proliferation (GO:0048145)|
+---------------+---------------+---------------+---------------+-----------------+---------------------------------------------------------

In [8]:
# Row count of the loaded similarity table.
df_show.count()

                                                                                

3040655