In [8]:
import pandas as pd
import json
import os

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [9]:
# Path to your .gz file
file_path = 'data/hBreast/transcripts.csv.gz'

# Read the gzipped CSV file into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# drop cells without ids
df_transcripts = df_transcripts[df_transcripts["cell_id"] != -1]

# drop blanks and controls
df_transcripts = df_transcripts[~df_transcripts["feature_name"].str.startswith('BLANK_') & ~df_transcripts["feature_name"].str.startswith('NegControl')]

In [10]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, n_clusters=15):
    
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False)

    k_means_cluster = clustering.KMeans(clustering.xenium_spot_data, save_plot=True, K=n_clusters)
    k_means_cluster_no_spatial = clustering.KMeans(clustering.xenium_spot_data, save_plot=True, K=n_clusters, include_spatial=False)
    return clustering, k_means_cluster, k_means_cluster_no_spatial

In [11]:
import numpy as np
import torch
from scipy.spatial.distance import cdist

def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, K=None, resolution=None, uses_spatial=True):

    if resolution is not None:
        current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim].get(
            resolution, 
            cluster_dict[model_name][spot_size][third_dim]
        ))
    else:
        current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim][uses_spatial].get(
            K, 
            cluster_dict[model_name][spot_size][third_dim][uses_spatial]
        ))
    cluster_labels = np.unique(current_clustering)

    original_data.xenium_spot_data.obs[f"{model_name} cluster"] = np.array(current_clustering)
    dirpath = f"{results_dir}/{model_name}{'/' + (str(resolution) if resolution is not None else str(K))}/clusters/{spot_size}"
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    filepath = f"{dirpath}/{filename}.csv"

    original_data.xenium_spot_data.obs[f"{model_name} cluster"].to_csv(filepath)
    # Extracting row, col, and cluster values from the dataframe
    rows = torch.tensor(original_data.xenium_spot_data.obs["row"].astype(int))
    cols = torch.tensor(original_data.xenium_spot_data.obs["col"].astype(int))
    clusters = torch.tensor(original_data.xenium_spot_data.obs[f"{model_name} cluster"].astype(int))
    cluster_labels = np.unique(clusters)

    num_rows = int(max(rows) - min(rows) + 1)
    num_cols = int(max(cols) - min(cols) + 1)

    cluster_grid = torch.zeros((num_rows, num_cols), dtype=torch.int)

    cluster_grid[rows, cols] = torch.tensor(clusters, dtype=torch.int)
    
    wss = {}
    for label in cluster_labels:
        current_cluster_locations = torch.stack(torch.where((cluster_grid == label)), axis=1).to(float)
        wss[f"Cluster {label}"] = (spot_size ** 2) * torch.mean(torch.cdist(current_cluster_locations, current_cluster_locations)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, wss[f"Cluster {label}"])

    wss_dirpath = f"{results_dir}/{model_name}{'/' + (str(resolution) if resolution is not None else str(K))}/wss/{spot_size}/"
    if not os.path.exists(wss_dirpath):
        os.makedirs(wss_dirpath)

    wss_filepath = f"{wss_dirpath}/{filename}_wss.json"
    with open(wss_filepath, "w") as f:
        json.dump(wss, f, indent=4)

In [12]:
cluster_dict = {"K-Means": {}}
wss = {"K-Means": {}}
results_dir = "results/hBreast"

In [13]:
import matplotlib
matplotlib.use('Agg')

for spot_size in [50, 75, 100]:
    for third_dim in [False]:
        for K in [17]:
            cluster_results_filename = f"clusters_K={K}"
            original_data, k_means_cluster, k_means_cluster_no_spatial = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, n_clusters=K)
            # K-Means Spatial
            if "K-Means" not in cluster_dict:
                cluster_dict["K-Means"] = {}
            if spot_size not in cluster_dict["K-Means"]:
                cluster_dict["K-Means"][spot_size] = {}
            cluster_dict["K-Means"][spot_size][third_dim] = {True: {K: k_means_cluster.tolist()}}
            record_results(original_data, cluster_dict, results_dir, "K-Means", cluster_results_filename, spot_size, third_dim, K, uses_spatial=True)

            # K-Means No Spatial
            if "K-Means_No_Spatial" not in cluster_dict:
                cluster_dict["K-Means_No_Spatial"] = {}
            if spot_size not in cluster_dict["K-Means_No_Spatial"]:
                cluster_dict["K-Means_No_Spatial"][spot_size] = {}
            cluster_dict["K-Means_No_Spatial"][spot_size][third_dim] = {False: {K: k_means_cluster_no_spatial.tolist()}}
            record_results(original_data, cluster_dict, results_dir, "K-Means_No_Spatial", cluster_results_filename, spot_size, third_dim, K, uses_spatial=False)

            print(f"Cluster with spot size {(spot_size, third_dim, K)} completed.")

The size of the spot data is (23444, 280)
POSSIBLE 17 0 275238.23349812505
POSSIBLE 17 1 201571.67743681872
POSSIBLE 17 2 177446.97401227552
POSSIBLE 17 3 175109.72773690562
POSSIBLE 17 4 145501.39942954326
POSSIBLE 17 5 190081.16209611305
POSSIBLE 17 6 159493.2966810456
POSSIBLE 17 7 199758.80087713612
POSSIBLE 17 8 155660.2513333816
POSSIBLE 17 9 149010.23439164663
POSSIBLE 17 10 173563.6352486485
POSSIBLE 17 11 188771.15494948532
POSSIBLE 17 12 153582.22803976134
POSSIBLE 17 13 173178.1935664512
POSSIBLE 17 14 193421.79625476507
POSSIBLE 17 15 154663.74331381798
POSSIBLE 17 16 159835.56924355874
POSSIBLE 17 0 289532.24908019474
POSSIBLE 17 1 160302.472990233
POSSIBLE 17 2 192106.96398726196
POSSIBLE 17 3 188943.643357102
POSSIBLE 17 4 158152.5542001083
POSSIBLE 17 5 154663.74331381798
POSSIBLE 17 6 204753.1961729318
POSSIBLE 17 7 157074.55945909355
POSSIBLE 17 8 172310.08867643692
POSSIBLE 17 9 199721.28864979156
POSSIBLE 17 10 148578.78680035297
POSSIBLE 17 11 172414.99214342426
PO

In [14]:
spot_sizes = [50,75,100]
methods = ["K-Means", "K-Means_No_Spatial"]
in_billions = 1_000_000_000
for method in methods:
    for spot_size in spot_sizes:
        print(spot_size)
        for K in [17]:
            filename = f"results/hBreast/{method}/{K}/wss/{spot_size}/{cluster_results_filename}_wss.json"
            if os.path.exists(filename):
                print(spot_size)
                with open(filename, "r") as wss_dict:
                    current_wss = json.load(wss_dict)
                print("Method:", method, "Spot Size", spot_size, "Num Clusters:", len(current_wss), "Total WSS", sum(current_wss.values()) / in_billions)

50
50
Method: K-Means Spot Size 50 Num Clusters: 17 Total WSS 0.0030258880781094794
75
75
Method: K-Means Spot Size 75 Num Clusters: 17 Total WSS 0.0044761552301336195
100
100
Method: K-Means Spot Size 100 Num Clusters: 17 Total WSS 0.0056234390157401575
50
50
Method: K-Means_No_Spatial Spot Size 50 Num Clusters: 17 Total WSS 0.0030416505744839473
75
75
Method: K-Means_No_Spatial Spot Size 75 Num Clusters: 17 Total WSS 0.004546685614214051
100
100
Method: K-Means_No_Spatial Spot Size 100 Num Clusters: 17 Total WSS 0.006113354131907866
