! This code was pasted from IBD_pathway_to_cell (from similarity_mvp) and has not been run yet !

Process the CSV files in the specified folders of a GCS directory and save the target-pathway relationships as TSV files in a format suitable for the TensorBoard Embedding Projector.

In [None]:
def save_target_pathway_tensorboard(input_gcs_dir, output_gcs_dir, folders_to_process):
    """
    Export target-pathway pairs from CSV files on GCS as TSV files.

    For every folder in ``folders_to_process``, each ``*.csv`` file listed
    directly under ``{input_gcs_dir}/{folder}`` is read. Its comma-separated
    ``propagated_edge`` column is split into one row per target and paired
    with the ``Term`` column. The result is written, with the columns renamed
    to ``Target`` and ``Pathway``, to
    ``{output_gcs_dir}/{folder}/{csv stem}_target_pathway.tsv``.

    Files lacking a ``propagated_edge`` or ``Term`` column are skipped with a
    printed message. Rows with an empty ``propagated_edge`` are dropped, as
    are empty tokens (e.g. from a trailing comma); whitespace around each
    target is stripped.

    Args:
        input_gcs_dir (str): Input GCS directory path.
        output_gcs_dir (str): Output GCS directory path.
        folders_to_process (list): List of folder names within the input directory to process.
            A single folder name given as a plain string is also accepted.

    Output:
        Saves target-pathway relationships as TSV files in the output GCS directory.
        Filesystem and parsing errors are not caught and propagate to the caller.
    """
    # Function-scope imports keep this cell runnable on its own; they are
    # harmless if the notebook already imported the same names elsewhere.
    from pathlib import PurePosixPath

    import gcsfs
    import pandas as pd

    # A bare string would otherwise be iterated character by character.
    if isinstance(folders_to_process, str):
        folders_to_process = [folders_to_process]

    # Initialize GCS filesystem
    fs = gcsfs.GCSFileSystem()

    # Ensure no trailing slashes in input and output directories
    input_gcs_dir = input_gcs_dir.rstrip("/")
    output_gcs_dir = output_gcs_dir.rstrip("/")

    for folder_name in folders_to_process:
        folder_path = f"{input_gcs_dir}/{folder_name}"
        output_folder_path = f"{output_gcs_dir}/{folder_name}"

        # exist_ok=True makes this idempotent and avoids a separate
        # exists() round trip before creating the folder.
        fs.makedirs(output_folder_path, exist_ok=True)

        # detail=False guarantees plain path strings, so .endswith() is safe
        # regardless of the filesystem implementation's default.
        csv_files = [
            path
            for path in fs.ls(folder_path, detail=False)
            if path.endswith(".csv")
        ]

        for file_path in csv_files:
            # Read the CSV file directly from GCS
            with fs.open(file_path, "r") as f:
                df = pd.read_csv(f)

            # Check if 'propagated_edge' and 'Term' exist in the file
            if "propagated_edge" not in df.columns or "Term" not in df.columns:
                print(f"Skipping {file_path}: missing required columns.")
                continue

            # Drop rows without edges first, then cast to str: a column that
            # pandas parsed as non-string (e.g. entirely empty -> float64)
            # would otherwise make the .str accessor raise AttributeError.
            has_edges = df["propagated_edge"].notna()
            embedding_metadata = pd.DataFrame(
                {
                    "Target": df.loc[has_edges, "propagated_edge"]
                    .astype(str)
                    .str.split(","),
                    "Pathway": df.loc[has_edges, "Term"],
                }
            ).explode("Target", ignore_index=True)

            # "A, B" must yield "B", not " B"; empty tokens carry no target.
            embedding_metadata["Target"] = embedding_metadata["Target"].str.strip()
            embedding_metadata = embedding_metadata[embedding_metadata["Target"] != ""]

            # Save target-pathway relationships as a TSV.
            # GCS object names always use "/" separators, hence PurePosixPath.
            output_file_name = f"{PurePosixPath(file_path).stem}_target_pathway.tsv"
            output_file_path = f"{output_folder_path}/{output_file_name}"

            # Save directly to GCS
            with fs.open(output_file_path, "w") as f:
                embedding_metadata.to_csv(f, sep="\t", index=False)

            print(f"Processed and uploaded: {output_file_path}")


In [None]:
# Input and output locations on GCS for the export.
input_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/gsea_output"
output_gcs_dir = "gs://ot-team/polina/pathway_propagation_validation_v2/4tensorboard/jaccard"

# Folder names under input_gcs_dir to process.
library = ["Reactome_Pathways_2024"]

# Run the export; keyword arguments make the role of each value explicit.
save_target_pathway_tensorboard(
    input_gcs_dir=input_gcs_dir,
    output_gcs_dir=output_gcs_dir,
    folders_to_process=library,
)