In [1]:
import pandas as pd
import json
import os

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [2]:
# Location of the gzip-compressed transcript table
file_path = 'data/hBreast/transcripts.csv.gz'

# Load the compressed CSV into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# Discard rows whose cell_id is -1 (no cell id)
has_cell_id = df_transcripts["cell_id"] != -1
df_transcripts = df_transcripts[has_cell_id]

# Discard blank and negative-control features, identified by their name prefix
feature_names = df_transcripts["feature_name"]
is_blank = feature_names.str.startswith('BLANK_')
is_neg_control = feature_names.str.startswith('NegControl')
df_transcripts = df_transcripts[~is_blank & ~is_neg_control]

In [3]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, n_clusters=15):
    """Build spot data with XeniumCluster and cluster it hierarchically twice.

    Args:
        data: Passed to ``XeniumCluster`` as ``data``.
        dataset_name: Passed to ``XeniumCluster`` as ``dataset_name``.
        current_spot_size: Spot size handed to ``set_spot_size``.
        third_dim: Forwarded to ``create_spot_data`` as ``third_dim``.
        n_clusters: Number of clusters requested from ``Hierarchical``.

    Returns:
        A tuple ``(clustering, spatial_result, non_spatial_result)``: the
        ``XeniumCluster`` instance followed by the results of ``Hierarchical``
        called with ``include_spatial=True`` and then ``include_spatial=False``.
    """
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    # The attribute is re-read on every use in case a step rebinds it.
    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False)

    # Both runs share everything except the include_spatial flag;
    # the spatial run is executed first.
    shared_options = dict(embedding="umap", save_plot=True, num_clusters=n_clusters)
    with_spatial, without_spatial = [
        clustering.Hierarchical(
            clustering.xenium_spot_data,
            include_spatial=spatial_flag,
            **shared_options,
        )
        for spatial_flag in (True, False)
    ]
    return clustering, with_spatial, without_spatial

In [4]:
import numpy as np
import torch
from scipy.spatial.distance import cdist

def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, K=None, resolution=None, uses_spatial=True):
    """Attach one clustering to the spot data and write its labels and per-cluster MPD to disk.

    The labels are looked up in ``cluster_dict``, stored in the
    ``"{model_name} cluster"`` column of ``original_data.xenium_spot_data.obs``,
    and saved as CSV. For every cluster, the mean of the full pairwise
    Euclidean distance matrix between its (row, col) positions (self-distances
    included) is multiplied by ``spot_size`` and saved as JSON.

    Args:
        original_data: Object exposing ``xenium_spot_data.obs``, a table with
            ``"row"`` and ``"col"`` columns that accepts column assignment.
        cluster_dict: Nested mapping of labels. Read as
            ``[model_name][spot_size][third_dim][resolution]`` when
            ``resolution`` is given, otherwise as
            ``[model_name][spot_size][third_dim][uses_spatial][K]``.
        results_dir: Root directory for the output files.
        model_name: Model key in ``cluster_dict``; also used in the column name
            and the output paths.
        filename: Stem of the output files (``.csv`` / ``_mpd.json`` are appended).
        spot_size: Spot size key in ``cluster_dict``; also the factor applied to
            the mean grid distance.
        third_dim: Key in ``cluster_dict``.
        K: Cluster-count key, used when ``resolution`` is None.
        resolution: Resolution key; takes precedence over ``K`` when not None.
        uses_spatial: Key selecting the spatial / non-spatial branch when
            ``resolution`` is None.

    Returns:
        None. Results are written to
        ``{results_dir}/{model_name}/{resolution or K}/clusters/{spot_size}/{filename}.csv``
        and
        ``{results_dir}/{model_name}/{resolution or K}/mpd/{spot_size}/{filename}_mpd.json``.
    """
    # Walk down to the mapping that holds the label lists for this run.
    leaf = cluster_dict[model_name][spot_size][third_dim]
    if resolution is not None:
        run_key = resolution
    else:
        leaf = leaf[uses_spatial]
        run_key = K
    # Unchanged lookup semantics: a missing key falls back to the mapping itself.
    current_clustering = np.array(leaf.get(run_key, leaf))

    obs = original_data.xenium_spot_data.obs
    cluster_column = f"{model_name} cluster"
    obs[cluster_column] = current_clustering

    run_label = str(resolution) if resolution is not None else str(K)
    dirpath = f"{results_dir}/{model_name}/{run_label}/clusters/{spot_size}"
    os.makedirs(dirpath, exist_ok=True)
    obs[cluster_column].to_csv(f"{dirpath}/{filename}.csv")

    # Go through NumPy explicitly so the conversion does not depend on how
    # torch iterates over a pandas Series.
    rows = torch.tensor(obs["row"].astype(int).to_numpy())
    cols = torch.tensor(obs["col"].astype(int).to_numpy())
    clusters = torch.tensor(obs[cluster_column].astype(int).to_numpy())
    coords = torch.stack((rows, cols), dim=1)
    cluster_labels = np.unique(clusters.numpy())

    # Distances are computed from the coordinates themselves instead of from a
    # zero-filled grid: the grid required rows/cols to start at 0 and made a
    # cluster labelled 0 indistinguishable from empty cells.
    mpd = {}
    for label in cluster_labels:
        # torch.unique(dim=0) keeps one entry per occupied (row, col) cell and
        # returns them sorted row-major.
        members = torch.unique(coords[clusters == int(label)], dim=0).to(torch.float64)
        mpd[f"Cluster {label}"] = spot_size * torch.mean(torch.cdist(members, members)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, mpd[f"Cluster {label}"])

    mpd_dirpath = f"{results_dir}/{model_name}/{run_label}/mpd/{spot_size}"
    os.makedirs(mpd_dirpath, exist_ok=True)

    mpd_filepath = f"{mpd_dirpath}/{filename}_mpd.json"
    with open(mpd_filepath, "w") as f:
        json.dump(mpd, f, indent=4)

In [5]:
# Root folder handed to record_results for all hBreast outputs
results_dir = "results/hBreast"

# Nested store of cluster labels; the outermost key is the model name
cluster_dict = dict(Hierarchical={})

# Container keyed by model name
mpd = dict(Hierarchical={})

In [6]:
import matplotlib
# Select the Agg backend before any figures are created
matplotlib.use('Agg')

for spot_size in [50, 75, 100]:
    for third_dim in [False]:
        for K in [17]:
            cluster_results_filename = f"clusters_K={K}"
            original_data, hierarchical_cluster, hierarchical_cluster_no_spatial = run_experiment(
                df_transcripts, "hBreast", spot_size, third_dim, n_clusters=K
            )

            # (model name, uses_spatial flag, labels); the spatial variant is recorded first
            variants = (
                ("Hierarchical", True, hierarchical_cluster),
                ("Hierarchical_No_Spatial", False, hierarchical_cluster_no_spatial),
            )
            for model_name, uses_spatial, labels in variants:
                # setdefault creates only the missing levels, so results stored for
                # other K values under the same spot_size/third_dim are kept rather
                # than replaced.
                (
                    cluster_dict
                    .setdefault(model_name, {})
                    .setdefault(spot_size, {})
                    .setdefault(third_dim, {})
                    .setdefault(uses_spatial, {})
                )[K] = labels.tolist()
                record_results(
                    original_data,
                    cluster_dict,
                    results_dir,
                    model_name,
                    cluster_results_filename,
                    spot_size,
                    third_dim,
                    K,
                    uses_spatial=uses_spatial,
                )

            print(f"Cluster with spot size {(spot_size, third_dim, K)} completed.")

The size of the spot data is (23444, 280)
         Falling back to preprocessing with `sc.pp.pca` and default params.
POSSIBLE 17 1 153864.3839623562
POSSIBLE 17 2 181139.9579898963
POSSIBLE 17 3 174051.86829250678
POSSIBLE 17 4 176394.63673151995
POSSIBLE 17 5 200088.21236640622
POSSIBLE 17 6 101419.24866749704
POSSIBLE 17 7 173969.50045129895
POSSIBLE 17 8 190205.5668843859
POSSIBLE 17 9 187693.10388498253
POSSIBLE 17 10 199404.7335029722
POSSIBLE 17 11 206176.03182383964
POSSIBLE 17 12 182243.58620968836
POSSIBLE 17 13 201306.95050688254
POSSIBLE 17 14 211848.84017789725
POSSIBLE 17 15 167620.2847518643
POSSIBLE 17 16 172352.65105919773
POSSIBLE 17 17 173337.4241952826
POSSIBLE 17 1 201950.5191518618
POSSIBLE 17 2 162090.42835014788
POSSIBLE 17 3 178974.10069722714
POSSIBLE 17 4 157685.24716399235
POSSIBLE 17 5 149022.7198240636
POSSIBLE 17 6 176453.61099130713
POSSIBLE 17 7 166999.77822537054
POSSIBLE 17 8 147984.1857074606
POSSIBLE 17 9 169490.59700276295
POSSIBLE 17 10 183306.938

In [7]:
spot_sizes = [50,75,100]
resolutions = [0.25, 0.5, 0.75, 1.0]
methods = ["Hierarchical", "Hierarchical_No_Spatial"]
# Divisor applied to the summed MPD before printing
in_billions = 1_000_000_000
for method in methods:
    for spot_size in spot_sizes:
        for K in [17]:
            # Build the file stem from this loop's K instead of relying on a
            # variable left over from the experiment cell.
            filename = f"results/hBreast/{method}/{K}/mpd/{spot_size}/clusters_K={K}_mpd.json"
            # Combinations that were never run have no file and are skipped
            if os.path.exists(filename):
                with open(filename, "r") as mpd_file:
                    current_mpd = json.load(mpd_file)
                print("Method:", method, "Spot Size", spot_size, "Num Clusters:", len(current_mpd), "Total mpd", sum(current_mpd.values()) / in_billions)

Method: Hierarchical Spot Size 50 Num Clusters: 17 Total mpd 0.003053116981458475
Method: Hierarchical Spot Size 75 Num Clusters: 17 Total mpd 0.004481702061656659
Method: Hierarchical Spot Size 100 Num Clusters: 17 Total mpd 0.00560841868163628
Method: Hierarchical_No_Spatial Spot Size 50 Num Clusters: 17 Total mpd 0.0029659282313437246
Method: Hierarchical_No_Spatial Spot Size 75 Num Clusters: 17 Total mpd 0.004377434008860294
Method: Hierarchical_No_Spatial Spot Size 100 Num Clusters: 17 Total mpd 0.005796027727486132
