This code was pasted from IBD_pathway_to_cell (from similarity_mvp) and should be rewritten in Spark to process the data more efficiently.

In [2]:
import os

import gcsfs
import numpy as np
import pyspark.sql.functions as F
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, split, array_distinct

In [2]:
# Reuse the already-active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/28 16:18:10 INFO SparkEnv: Registering MapOutputTracker
25/04/28 16:18:10 INFO SparkEnv: Registering BlockManagerMaster
25/04/28 16:18:10 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/04/28 16:18:10 INFO SparkEnv: Registering OutputCommitCoordinator


# Similarity matrix for propagation results

For MVP: 

Features: presence or absence of each target in each pathway, encoded as 0/1.

Distance: Jaccard distance between the targets' pathway sets.

In [4]:
def calculate_jaccard_similarity_spark_minhash(input_gcs_dir, output_gcs_dir, folders_to_process, num_hash_tables=5):
    """
    Compute pairwise Jaccard similarities between targets with Spark MinHashLSH.

    For every folder, all ``*.csv`` files are read, the comma-separated
    ``propagated_edge`` column is exploded into one row per target, and each
    target is represented by the set of ``Term`` values it appears with
    (binary presence/absence). Targets are then self-joined with
    ``MinHashLSH.approxSimilarityJoin`` and the result is written as CSV.

    Notes:
        * Target names are whitespace-trimmed and empty tokens are dropped, so
          ``"A, B,"`` yields the targets ``A`` and ``B``.
        * Rows with a null ``Term`` or null ``propagated_edge`` are ignored;
          MinHash cannot hash a target whose term set is empty.
        * The join is approximate in *coverage*: only pairs that collide in at
          least one hash table are returned, and pairs whose Jaccard distance
          is not below the threshold of 1.0 (i.e. similarity 0) are never
          returned. Missing pairs should therefore not be read as "similarity
          was computed and is 0".
        * Relies on the notebook-level ``spark`` session.

    Args:
        input_gcs_dir (str): Input GCS directory path.
        output_gcs_dir (str): Output GCS directory path.
        folders_to_process (list): Folder names inside ``input_gcs_dir`` to process.
        num_hash_tables (int): Number of MinHash hash tables. More tables find
            more candidate pairs at the cost of runtime.

    Output:
        One CSV dataset per folder under ``output_gcs_dir/<folder>`` with the
        columns ``target1``, ``target2`` and ``similarity`` (``target1 <= target2``).
        Folders lacking the required columns, or without any usable
        target/term pair, are skipped with a printed message.
    """

    input_gcs_dir = input_gcs_dir.rstrip("/")
    output_gcs_dir = output_gcs_dir.rstrip("/")

    for folder_name in folders_to_process:
        folder_path = f"{input_gcs_dir}/{folder_name}"
        output_folder_path = f"{output_gcs_dir}/{folder_name}"

        # Read all CSVs inside the folder
        df = spark.read.option("header", True).csv(f"{folder_path}/*.csv")

        # Check if required columns exist
        expected_cols = {'propagated_edge', 'Term'}
        if not expected_cols.issubset(set(df.columns)):
            print(f"Skipping folder {folder_name}: missing required columns.")
            continue

        # One row per (target, Term). explode() must be its own projection
        # (generators cannot be nested inside other expressions), so trimming
        # happens in a second step. Null terms are dropped up front because
        # they would otherwise leave some targets with an empty term set.
        df_exploded = (
            df.dropna(subset=["propagated_edge", "Term"])
            .withColumn("propagated_edge_exploded", explode(split(col("propagated_edge"), ",")))
            .withColumn("propagated_edge_exploded", F.trim(col("propagated_edge_exploded")))
            .filter(col("propagated_edge_exploded") != "")
        )

        # Group by target and collect associated terms
        target_terms = df_exploded.groupBy("propagated_edge_exploded") \
            .agg(F.collect_set("Term").alias("terms"))

        # Index the terms (MinHash needs fixed-size numeric vectors).
        # Sorting makes the term -> index assignment stable between runs.
        all_terms = sorted(row[0] for row in df_exploded.select("Term").distinct().collect())
        if not all_terms:
            # MinHashLSH.fit cannot infer the input dimension from an empty dataset.
            print(f"Skipping folder {folder_name}: no target/term pairs found.")
            continue

        term_to_index = {term: idx for idx, term in enumerate(all_terms)}
        vocabulary_size = len(term_to_index)

        # Transform a target's term set into a binary SparseVector
        def terms_to_vector(terms):
            indices = sorted(term_to_index[term] for term in terms if term in term_to_index)
            return Vectors.sparse(vocabulary_size, indices, [1.0] * len(indices))

        # The return type must be declared: an untyped udf returns StringType,
        # which MinHashLSH rejects as an input column.
        terms_to_vector_udf = F.udf(terms_to_vector, VectorUDT())

        target_terms = target_terms.withColumn("features", terms_to_vector_udf(col("terms")))

        # Initialize MinHashLSH model
        mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash_tables)
        model = mh.fit(target_terms)

        # Compute pairwise similarities (self-join over all candidate pairs)
        similarities = model.approxSimilarityJoin(target_terms, target_terms, threshold=1.0, distCol="JaccardDistance")

        # Select relevant columns and format
        similarity_df = similarities.select(
            col("datasetA.propagated_edge_exploded").alias("target1"),
            col("datasetB.propagated_edge_exploded").alias("target2"),
            (1 - col("JaccardDistance")).alias("similarity")
        ).filter("target1 <= target2")  # Avoid duplicates (symmetry)

        # Save the similarity results
        similarity_df.write.mode('overwrite').option("header", True).csv(f"{output_folder_path}")

        print(f"Processed (MinHash) and uploaded: {output_folder_path}")

In [1]:
# GCS locations: GSEA results to read, and where the similarity tables are written.
gsea_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output"
output_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard_spark"

# Sub-folders of gsea_dir to process (one per gene-set library).
library = ["KEGG_2021_Human"]

# NOTE: the cell defining calculate_jaccard_similarity_spark_minhash must be run first.
calculate_jaccard_similarity_spark_minhash(
    input_gcs_dir=gsea_dir,
    output_gcs_dir=output_dir,
    folders_to_process=library,
)

NameError: name 'calculate_jaccard_similarity_spark_minhash' is not defined