In [24]:
import pandas as pd
import json
import os

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [25]:
# Path to your .gz file
file_path = 'data/hBreast/transcripts.csv.gz'

# Read the gzipped CSV file into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

KeyboardInterrupt: 

In [None]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, resolutions: list, n_clusters=15):
    
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False)

    hierarchical_cluster = clustering.Hierarchical(clustering.xenium_spot_data, embedding="umap", save_plot=True, num_clusters=n_clusters, include_spatial=True)
    hierarchical_cluster_no_spatial = clustering.Hierarchical(clustering.xenium_spot_data, embedding="umap", save_plot=True, num_clusters=n_clusters, include_spatial=False)
    k_means_cluster = clustering.KMeans(clustering.xenium_spot_data, save_plot=True, K=n_clusters)
    k_means_cluster_no_spatial = clustering.KMeans(clustering.xenium_spot_data, save_plot=True, K=n_clusters, include_spatial=False)
    Leiden_cluster = clustering.Leiden(clustering.xenium_spot_data, resolutions, embedding="umap", save_plot=True)
    Louvain_cluster = clustering.Louvain(clustering.xenium_spot_data, resolutions, embedding="umap", save_plot=True)

    return clustering, Leiden_cluster, Louvain_cluster, hierarchical_cluster, hierarchical_cluster_no_spatial, k_means_cluster, k_means_cluster_no_spatial

In [32]:
import numpy as np
import torch
from scipy.spatial.distance import cdist

def record_results(cluster_dict, results_dir, model_name, filename, spot_size, third_dim, resolution=None, uses_spatial=True):

    dirpath = f"{results_dir}/{model_name}/{str(resolution) + '/' if resolution is not None else ''}clusters/{spot_size}"
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    filepath = f"{dirpath}/{filename}"

    with open(filepath, "w") as f:
        json.dump(cluster_dict[model_name], f, indent=4)

    wss = {}
    current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim].get(
        resolution, 
        cluster_dict[model_name][spot_size][third_dim]
    ))
    current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim].get(
        uses_spatial, 
        cluster_dict[model_name][spot_size][third_dim]
    ))
    cluster_labels = np.unique(current_clustering)

    for label in cluster_labels:
        current_cluster_locations = np.stack(np.where(np.array(current_clustering) == label), axis=1)
        wss[f"Cluster {label}"] = (spot_size ** 2) * np.mean(cdist(current_cluster_locations, current_cluster_locations)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, wss[f"Cluster {label}"])

    wss_dirpath = f"{results_dir}/{model_name}{'/' + str(resolution) if resolution is not None else ''}/wss/{spot_size}/"
    if not os.path.exists(wss_dirpath):
        os.makedirs(wss_dirpath)

    wss_filepath = f"{wss_dirpath}/{filename}_wss.json"
    with open(wss_filepath, "w") as f:
        json.dump(wss, f, indent=4)

In [33]:
cluster_dict = {"Leiden": {}, "Louvain": {}, "Hierarchical": {}, "K-Means": {}}
wss = {"Leiden": {}, "Louvain": {}, "Hierarchical": {}, "K-Means": {}}
results_dir = "results/hBreast"
cluster_results_filename = "clusters_w_plots_7_19.json"

In [34]:
import matplotlib
matplotlib.use('Agg')

resolutions = [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]
for spot_size in [100]:
    for third_dim in [False, True]:
        for K in [6,10]:
            original_data, Leiden_cluster, Louvain_cluster, hierarchical_cluster, hierarchical_cluster_no_spatial, k_means_cluster, k_means_cluster_no_spatial = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, resolutions, n_clusters=K)

            # Leiden
            if "Leiden" not in cluster_dict:
                cluster_dict["Leiden"] = {}
            if spot_size not in cluster_dict["Leiden"]:
                cluster_dict["Leiden"][spot_size] = {}
            cluster_dict["Leiden"][spot_size][third_dim] = {res: clusters.tolist() for res, clusters in Leiden_cluster.items()}
            for resolution in resolutions:
                record_results(cluster_dict, results_dir, "Leiden", cluster_results_filename, spot_size, third_dim, resolution)

            # Louvain
            if "Louvain" not in cluster_dict:
                cluster_dict["Louvain"] = {}
            if spot_size not in cluster_dict["Louvain"]:
                cluster_dict["Louvain"][spot_size] = {}
            cluster_dict["Louvain"][spot_size][third_dim] = {res: clusters.tolist() for res, clusters in Louvain_cluster.items()}
            for resolution in resolutions:
                record_results(cluster_dict, results_dir, "Louvain", cluster_results_filename, spot_size, third_dim, resolution)

            # Hierarchical
            if "Hierarchical" not in cluster_dict:
                cluster_dict["Hierarchical"] = {}
            if spot_size not in cluster_dict["Hierarchical"]:
                cluster_dict["Hierarchical"][spot_size] = {}
            cluster_dict["Hierarchical"][spot_size][third_dim] = {True: hierarchical_cluster.tolist()}
            record_results(cluster_dict, results_dir, "Hierarchical", cluster_results_filename, spot_size, third_dim, uses_spatial=True)

            # Hierarchical
            if "Hierarchical_No_Spatial" not in cluster_dict:
                cluster_dict["Hierarchical_No_Spatial"] = {}
            if spot_size not in cluster_dict["Hierarchical_No_Spatial"]:
                cluster_dict["Hierarchical_No_Spatial"][spot_size] = {}
            cluster_dict["Hierarchical_No_Spatial"][spot_size][third_dim] = {False: hierarchical_cluster_no_spatial.tolist()}
            record_results(cluster_dict, results_dir, "Hierarchical_No_Spatial", cluster_results_filename, spot_size, third_dim, uses_spatial=False)

            # K-Means Spatial
            if "K-Means" not in cluster_dict:
                cluster_dict["K-Means"] = {}
            if spot_size not in cluster_dict["K-Means"]:
                cluster_dict["K-Means"][spot_size] = {}
            cluster_dict["K-Means"][spot_size][third_dim] = {True: k_means_cluster.tolist()}
            record_results(cluster_dict, results_dir, "K-Means", cluster_results_filename, spot_size, third_dim, uses_spatial=True)

            # K-Means No Spatial
            if "K-Means_No_Spatial" not in cluster_dict:
                cluster_dict["K-Means_No_Spatial"] = {}
            if spot_size not in cluster_dict["K-Means_No_Spatial"]:
                cluster_dict["K-Means_No_Spatial"][spot_size] = {}
            cluster_dict["K-Means_No_Spatial"][spot_size][third_dim] = {False: k_means_cluster_no_spatial.tolist()}
            record_results(cluster_dict, results_dir, "K-Means_No_Spatial", cluster_results_filename, spot_size, third_dim, uses_spatial=False)

            print(f"Cluster with spot size {(spot_size, third_dim, K)} completed.")

The size of the spot data is (7312, 541)
         Falling back to preprocessing with `sc.pp.pca` and default params.
POSSIBLE 16 0 16143737.460908651
POSSIBLE 16 1 19515902.173377708
POSSIBLE 16 10 10989632.7094269
POSSIBLE 16 11 14103623.718613043
POSSIBLE 16 12 18931523.477141656
POSSIBLE 16 13 17761589.444851667
POSSIBLE 16 14 4346236.104823701
POSSIBLE 16 15 17858933.51800554
POSSIBLE 16 2 33032002.19353617
POSSIBLE 16 3 13914386.990119565
POSSIBLE 16 4 20407491.41051422
POSSIBLE 16 5 13658463.444781484
POSSIBLE 16 6 12397820.893595042
POSSIBLE 16 7 21817695.65509527
POSSIBLE 16 8 13489577.043256456
POSSIBLE 16 9 11058639.3191991
POSSIBLE 16 0 16143737.460908651
POSSIBLE 16 1 19515902.173377708
POSSIBLE 16 10 10989632.7094269
POSSIBLE 16 11 14103623.718613043
POSSIBLE 16 12 18931523.477141656
POSSIBLE 16 13 17761589.444851667
POSSIBLE 16 14 4346236.104823701
POSSIBLE 16 15 17858933.51800554
POSSIBLE 16 2 33032002.19353617
POSSIBLE 16 3 13914386.990119565
POSSIBLE 16 4 20407491.4105