In [1]:
import pandas as pd
import json
import os

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [2]:
# Path to your .gz file
file_path = 'data/hBreast/transcripts.csv.gz'

# Read the gzipped CSV file into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# drop cells without ids
df_transcripts = df_transcripts[df_transcripts["cell_id"] != -1]

# drop blanks and controls
df_transcripts = df_transcripts[~df_transcripts["feature_name"].str.startswith('BLANK_') & ~df_transcripts["feature_name"].str.startswith('NegControl')]

In [3]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, n_clusters=15):
    
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False)

    k_means_cluster = clustering.KMeans(clustering.xenium_spot_data, save_plot=True, K=n_clusters)
    k_means_cluster_no_spatial = clustering.KMeans(clustering.xenium_spot_data, save_plot=True, K=n_clusters, include_spatial=False)
    return clustering, k_means_cluster, k_means_cluster_no_spatial

In [4]:
import numpy as np
import torch
from scipy.spatial.distance import cdist

def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, K=None, resolution=None, uses_spatial=True):

    if resolution is not None:
        current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim].get(
            resolution, 
            cluster_dict[model_name][spot_size][third_dim]
        ))
    else:
        current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim][uses_spatial].get(
            K, 
            cluster_dict[model_name][spot_size][third_dim][uses_spatial]
        ))
    cluster_labels = np.unique(current_clustering)

    original_data.xenium_spot_data.obs[f"{model_name} cluster"] = np.array(current_clustering)
    dirpath = f"{results_dir}/{model_name}{'/' + (str(resolution) if resolution is not None else str(K))}/clusters/{spot_size}"
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    filepath = f"{dirpath}/{filename}.csv"

    original_data.xenium_spot_data.obs[f"{model_name} cluster"].to_csv(filepath)
    # Extracting row, col, and cluster values from the dataframe
    rows = torch.tensor(original_data.xenium_spot_data.obs["row"].astype(int))
    cols = torch.tensor(original_data.xenium_spot_data.obs["col"].astype(int))
    clusters = torch.tensor(original_data.xenium_spot_data.obs[f"{model_name} cluster"].astype(int))
    cluster_labels = np.unique(clusters)

    num_rows = int(max(rows) - min(rows) + 1)
    num_cols = int(max(cols) - min(cols) + 1)

    cluster_grid = torch.zeros((num_rows, num_cols), dtype=torch.int)

    cluster_grid[rows, cols] = torch.tensor(clusters, dtype=torch.int)
    
    mpd = {}
    for label in cluster_labels:
        current_cluster_locations = torch.stack(torch.where((cluster_grid == label)), axis=1).to(float)
        mpd[f"Cluster {label}"] = spot_size * torch.mean(torch.cdist(current_cluster_locations, current_cluster_locations)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, mpd[f"Cluster {label}"])

    mpd_dirpath = f"{results_dir}/{model_name}{'/' + (str(resolution) if resolution is not None else str(K))}/mpd/{spot_size}/"
    if not os.path.exists(mpd_dirpath):
        os.makedirs(mpd_dirpath)

    mpd_filepath = f"{mpd_dirpath}/{filename}_mpd.json"
    with open(mpd_filepath, "w") as f:
        json.dump(mpd, f, indent=4)

In [5]:
cluster_dict = {"K-Means": {}}
mpd = {"K-Means": {}}
results_dir = "results/hBreast"

In [6]:
import matplotlib
matplotlib.use('Agg')

for spot_size in [50, 75, 100]:
    for third_dim in [False]:
        for K in [17]:
            cluster_results_filename = f"clusters_K={K}"
            original_data, k_means_cluster, k_means_cluster_no_spatial = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, n_clusters=K)
            # K-Means Spatial
            if "K-Means" not in cluster_dict:
                cluster_dict["K-Means"] = {}
            if spot_size not in cluster_dict["K-Means"]:
                cluster_dict["K-Means"][spot_size] = {}
            cluster_dict["K-Means"][spot_size][third_dim] = {True: {K: k_means_cluster.tolist()}}
            record_results(original_data, cluster_dict, results_dir, "K-Means", cluster_results_filename, spot_size, third_dim, K, uses_spatial=True)

            # K-Means No Spatial
            if "K-Means_No_Spatial" not in cluster_dict:
                cluster_dict["K-Means_No_Spatial"] = {}
            if spot_size not in cluster_dict["K-Means_No_Spatial"]:
                cluster_dict["K-Means_No_Spatial"][spot_size] = {}
            cluster_dict["K-Means_No_Spatial"][spot_size][third_dim] = {False: {K: k_means_cluster_no_spatial.tolist()}}
            record_results(original_data, cluster_dict, results_dir, "K-Means_No_Spatial", cluster_results_filename, spot_size, third_dim, K, uses_spatial=False)

            print(f"Cluster with spot size {(spot_size, third_dim, K)} completed.")

The size of the spot data is (23444, 280)
POSSIBLE 17 0 298092.4866778674
POSSIBLE 17 1 139788.90379943798
POSSIBLE 17 2 170343.59633366982
POSSIBLE 17 3 192357.67706572055
POSSIBLE 17 4 191688.8693530675
POSSIBLE 17 5 162439.67673042405
POSSIBLE 17 6 188796.99580911602
POSSIBLE 17 7 206145.26168998092
POSSIBLE 17 8 161153.16263570968
POSSIBLE 17 9 194452.5797990538
POSSIBLE 17 10 154737.89390493985
POSSIBLE 17 11 167923.86171408076
POSSIBLE 17 12 200313.16993622735
POSSIBLE 17 13 169433.6516111459
POSSIBLE 17 14 168146.3093281248
POSSIBLE 17 15 192457.6159052161
POSSIBLE 17 16 140665.16703685417
POSSIBLE 17 0 294263.32243352133
POSSIBLE 17 1 171706.6083144954
POSSIBLE 17 2 156202.68930566628
POSSIBLE 17 3 142403.74748856886
POSSIBLE 17 4 189192.46155686036
POSSIBLE 17 5 176018.50937534572
POSSIBLE 17 6 160748.94556566316
POSSIBLE 17 7 157291.60642481365
POSSIBLE 17 8 195100.62245826842
POSSIBLE 17 9 192284.2677321608
POSSIBLE 17 10 171323.80697852632
POSSIBLE 17 11 194615.89203666305


In [7]:
spot_sizes = [50,75,100]
methods = ["K-Means", "K-Means_No_Spatial"]
in_billions = 1_000_000_000
for method in methods:
    for spot_size in spot_sizes:
        print(spot_size)
        for K in [17]:
            filename = f"results/hBreast/{method}/{K}/mpd/{spot_size}/{cluster_results_filename}_mpd.json"
            if os.path.exists(filename):
                print(spot_size)
                with open(filename, "r") as mpd_dict:
                    current_mpd = json.load(mpd_dict)
                print("Method:", method, "Spot Size", spot_size, "Num Clusters:", len(current_mpd), "Total mpd", sum(current_mpd.values()) / in_billions)

50
50
Method: K-Means Spot Size 50 Num Clusters: 17 Total mpd 0.0030989368793306366
75
75
Method: K-Means Spot Size 75 Num Clusters: 17 Total mpd 0.004518220689853618
100
100
Method: K-Means Spot Size 100 Num Clusters: 17 Total mpd 0.005927142548308676
50
50
Method: K-Means_No_Spatial Spot Size 50 Num Clusters: 17 Total mpd 0.00301922581194194
75
75
Method: K-Means_No_Spatial Spot Size 75 Num Clusters: 17 Total mpd 0.0044231958260115
100
100
Method: K-Means_No_Spatial Spot Size 100 Num Clusters: 17 Total mpd 0.005896710644302065
