This code was copied from IBD_pathway_to_cell (from similarity_mvp) and should be rewritten in Spark to process the data more efficiently.

# Similarity matrix for propagation results

For the MVP:

- Features: presence or absence of each target in each pathway (0/1).

- Metric: Jaccard distance between targets, converted to a similarity (1 − distance).

In [None]:
def _jaccard_similarity_from_terms(df):
    """
    Build a target-by-target Jaccard similarity matrix from one results table.

    Each row of ``df`` holds a pathway in ``Term`` and a comma-separated list
    of targets in ``propagated_edge``. Every target is represented by a binary
    presence/absence vector over the pathways, and targets are compared with
    the Jaccard index.

    Args:
        df (pd.DataFrame): Table with 'Term' and 'propagated_edge' columns.

    Returns:
        pd.DataFrame or None: Square similarity matrix indexed and labelled by
        target (in order of first appearance), or None when the table contains
        no usable (Term, target) pair.
    """
    # Rows lacking either the pathway or the target list carry no information.
    pairs = df[['Term', 'propagated_edge']].dropna()
    if pairs.empty:
        return None

    # One row per (Term, target). astype(str) keeps `.str` usable even when
    # pandas inferred a non-string dtype for the column.
    exploded = (
        pairs.assign(target=pairs['propagated_edge'].astype(str).str.split(','))
        .explode('target')
        .reset_index(drop=True)
    )

    # "A, B" must yield "B", not " B"; trailing commas must not create an
    # empty-string target.
    exploded['target'] = exploded['target'].str.strip()
    exploded = exploded[exploded['target'] != '']
    if exploded.empty:
        return None

    unique_targets = exploded['target'].unique()
    unique_terms = exploded['Term'].unique()

    # Binary matrix: rows=targets, columns=terms. crosstab counts the pairs in
    # a single pass; `> 0` collapses duplicate pairs to presence/absence, and
    # reindex restores first-appearance order (crosstab sorts its labels).
    binary_matrix = (
        pd.crosstab(exploded['target'], exploded['Term'])
        .reindex(index=unique_targets, columns=unique_terms)
        .gt(0)
    )

    # pdist returns the condensed Jaccard distances; squareform expands them
    # to a full symmetric matrix with a zero diagonal, so similarity is 1 - d.
    jaccard_distance = pdist(binary_matrix.to_numpy(dtype=bool), metric='jaccard')
    jaccard_similarity = 1 - squareform(jaccard_distance)

    return pd.DataFrame(
        jaccard_similarity,
        index=unique_targets,
        columns=unique_targets,
    )


def calculate_jaccard_similarity(input_gcs_dir, output_gcs_dir, folders_to_process):
    """
    Process CSV files in specified folders within a GCS directory and calculate Jaccard similarity matrices.

    For every ``*.csv`` file directly inside each requested folder, a
    target-by-target Jaccard similarity matrix is computed from the 'Term' and
    'propagated_edge' columns and written to the matching output folder as
    ``<input file stem>_jaccard_sim.csv``.

    Files missing a required column, or containing no usable targets, are
    skipped with a printed message instead of aborting the whole run.

    Args:
        input_gcs_dir (str): Input GCS directory path.
        output_gcs_dir (str): Output GCS directory path.
        folders_to_process (list): List of folder names within the input directory to process.

    Output:
        Saves similarity matrices as CSV files in output GCS directory.
    """
    # Initialize GCS filesystem
    fs = gcsfs.GCSFileSystem()

    # Ensure no trailing slashes in input and output directories
    input_gcs_dir = input_gcs_dir.rstrip("/")
    output_gcs_dir = output_gcs_dir.rstrip("/")

    required_columns = {'propagated_edge', 'Term'}

    for folder_name in folders_to_process:
        folder_path = f"{input_gcs_dir}/{folder_name}"
        output_folder_path = f"{output_gcs_dir}/{folder_name}"

        # Ensure output folder exists
        if not fs.exists(output_folder_path):
            fs.mkdirs(output_folder_path)

        # List files in the input folder
        files_in_folder = fs.ls(folder_path)
        csv_files = [file for file in files_in_folder if file.endswith('.csv')]

        for file_path in csv_files:
            # Read the CSV file directly from GCS
            with fs.open(file_path, 'r') as f:
                df = pd.read_csv(f)

            # Check if 'propagated_edge' and 'Term' exist in the file
            if not required_columns.issubset(df.columns):
                print(f"Skipping {file_path}: missing required columns.")
                continue

            similarity_df = _jaccard_similarity_from_terms(df)
            if similarity_df is None:
                print(f"Skipping {file_path}: no targets found in 'propagated_edge'.")
                continue

            # NOTE: an exponential decay, exp(-x), was previously considered
            # as a transform of the similarity matrix; it is intentionally
            # not applied.

            output_file_name = f"{Path(file_path).stem}_jaccard_sim.csv"
            output_file_path = f"{output_folder_path}/{output_file_name}"

            # Save directly to GCS
            with fs.open(output_file_path, 'w') as f:
                similarity_df.to_csv(f, index=True)

            print(f"Processed and uploaded: {output_file_path}")

In [None]:
# GCS location holding the input CSV files, and the location the Jaccard
# similarity matrices are written to.
gsea_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output"
output_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/similarity_mtx/jaccard"

# Each entry is used as a sub-folder name under both directories above.
library = ["GO_Biological_Process_2023"]

calculate_jaccard_similarity(
    input_gcs_dir=gsea_dir,
    output_gcs_dir=output_dir,
    folders_to_process=library,
)