In [14]:
import pandas as pd
import json
import os

import numpy as np
import torch
from scipy.spatial.distance import cdist

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

from mclustpy import mclustpy

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [15]:
# Location of the gzipped transcript table
file_path = 'data/hBreast/transcripts.csv.gz'

# Load the table and filter it in a single chain:
#   1. keep only rows whose cell_id is not -1 (i.e. drop cells without ids)
#   2. drop blank and negative-control features by feature_name prefix
df_transcripts = (
    pd.read_csv(file_path, compression='gzip')
    .loc[lambda t: t["cell_id"] != -1]
    .loc[
        lambda t: ~t["feature_name"].str.startswith('BLANK_')
        & ~t["feature_name"].str.startswith('NegControl')
    ]
)

In [16]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, num_pcs: int, n_clusters=17, model_name: str = "EEE"):
    """Build spot-level data for one configuration and cluster it with mclust.

    The steps, in order: construct a ``XeniumCluster``, set the spot size,
    create the spot data (saved via ``save_data=True``), normalize counts,
    generate the neighborhood graph, run PCA, and finally call
    ``XeniumCluster.mclust``.

    Args:
        data: Transcript table handed to ``XeniumCluster``.
        dataset_name: Dataset label handed to ``XeniumCluster``.
        current_spot_size: Spot size passed to ``set_spot_size``.
        third_dim: Forwarded to ``create_spot_data``.
        num_pcs: Number of principal components, used for both the
            neighborhood graph and the PCA step.
        n_clusters: Number of mixture components, forwarded to mclust as ``G``.
        model_name: mclust model name (default ``"EEE"``).

    Returns:
        Tuple ``(clustering, mclust_cluster)``: the ``XeniumCluster`` instance
        and whatever ``clustering.mclust`` returned.
    """
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False, n_pcs=num_pcs)

    clustering.pca(clustering.xenium_spot_data, num_pcs=num_pcs)

    # Forward the caller's settings; these were previously hard-coded to
    # G=17 / "EEE", which silently ignored n_clusters and model_name.
    mclust_cluster = clustering.mclust(clustering.xenium_spot_data, G=n_clusters, model_name=model_name)

    return clustering, mclust_cluster

In [17]:
def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, num_pcs, K=None, resolution=None, uses_spatial=True):
    """Persist one clustering and a per-cluster spatial-spread summary.

    Looks up the cluster labels in ``cluster_dict``, stores them in
    ``original_data.xenium_spot_data.obs["<model_name> cluster"]``, writes that
    column to ``<results_dir>/<model_name>/<num_pcs>/<setting>/clusters/<spot_size>/<filename>.csv``
    and writes a JSON summary to the sibling ``wss/<spot_size>/<filename>_wss.json``.
    ``<setting>`` is ``resolution`` when it is given, otherwise ``K``.

    For every cluster label the summary value is ``spot_size ** 2`` times the
    mean pairwise Euclidean distance between the (row, col) grid positions of
    that cluster's spots (self-pairs included, as zeros).
    NOTE(review): the key is called "wss" but this is a mean pairwise distance,
    and a distance would scale with spot_size rather than its square; confirm
    the intended metric. The formula is kept unchanged.

    Args:
        original_data: Object exposing ``xenium_spot_data.obs``, a DataFrame
            with ``row`` and ``col`` columns; it is mutated in place.
        cluster_dict: Nested mapping
            ``[model_name][spot_size][third_dim][num_pcs]`` followed by either
            ``[resolution]`` or ``[uses_spatial][K]``.
        results_dir: Root output directory.
        model_name: Clustering method key, also used in the column name.
        filename: Base file name (no extension) for both outputs.
        spot_size: Spot size; used as a dict key, path component and scale.
        third_dim: Dict key selecting the 2D/3D variant.
        num_pcs: Dict key and path component for the number of PCs.
        K: Number of clusters, used when ``resolution`` is None.
        resolution: Resolution key; takes precedence over ``K`` when given.
        uses_spatial: Dict key used only on the ``K`` path.

    Returns:
        None. Results are written to disk and printed.
    """
    # Results are filed under the resolution when given, otherwise under K.
    setting = str(resolution) if resolution is not None else str(K)
    base_dir = f"{results_dir}/{model_name}/{num_pcs}/{setting}"

    dirpath = f"{base_dir}/clusters/{spot_size}"
    os.makedirs(dirpath, exist_ok=True)

    per_pcs = cluster_dict[model_name][spot_size][third_dim][num_pcs]
    if resolution is not None:
        # Same fallback as before: a missing key yields the container itself.
        current_clustering = np.array(per_pcs.get(resolution, per_pcs))
    else:
        per_spatial = per_pcs[uses_spatial]
        current_clustering = np.array(per_spatial.get(K, per_spatial))

    obs = original_data.xenium_spot_data.obs
    cluster_column = f"{model_name} cluster"
    obs[cluster_column] = current_clustering
    obs[cluster_column].to_csv(f"{dirpath}/{filename}.csv")

    # Row, col and cluster values as int64 tensors (usable as indices).
    rows = torch.as_tensor(obs["row"].astype(int).to_numpy()).long()
    cols = torch.as_tensor(obs["col"].astype(int).to_numpy()).long()
    clusters = torch.as_tensor(obs[cluster_column].astype(int).to_numpy()).long()

    wss = {}
    if rows.numel() > 0:
        # Shift indices so the smallest row/col maps to 0. The grid is sized by
        # the index range, so absolute indices would run out of bounds whenever
        # the minimum is above 0. Pairwise distances are unaffected by the shift.
        row_idx = rows - rows.min()
        col_idx = cols - cols.min()
        num_rows = int(row_idx.max()) + 1
        num_cols = int(col_idx.max()) + 1

        cluster_labels = torch.unique(clusters).tolist()

        # Fill empty cells with a value below every real label, so that a
        # cluster labelled 0 does not absorb the unoccupied grid positions.
        empty_value = min(cluster_labels) - 1
        cluster_grid = torch.full((num_rows, num_cols), empty_value, dtype=torch.int)
        cluster_grid[row_idx, col_idx] = clusters.to(torch.int)

        for label in cluster_labels:
            current_cluster_locations = torch.stack(torch.where(cluster_grid == label), dim=1).to(float)
            wss[f"Cluster {label}"] = (spot_size ** 2) * torch.mean(torch.cdist(current_cluster_locations, current_cluster_locations)).item()
            print(f"POSSIBLE {len(cluster_labels)}", label, wss[f"Cluster {label}"])

    wss_dirpath = f"{base_dir}/wss/{spot_size}"
    os.makedirs(wss_dirpath, exist_ok=True)

    with open(f"{wss_dirpath}/{filename}_wss.json", "w") as f:
        json.dump(wss, f, indent=4)

In [18]:
# Root directory under which record_results writes its outputs.
results_dir = "results/hBreast"

# Nested store of cluster labels per method; filled in by the experiment loop.
cluster_dict = dict(mclust={})

# Per-method container kept at notebook level (record_results builds its own
# local dict of the same name and does not read this one).
wss = dict(mclust={})

In [19]:
# Numbers of principal components swept over by the experiment loop.
PC_list = [3, 5, 10, 15, 25]

In [20]:
# Use the non-interactive Agg backend for the whole sweep.
import matplotlib
matplotlib.use('Agg')

# Sweep every (spot size, dimensionality, K, number of PCs) combination,
# cluster with mclust and record the results on disk.
for spot_size in [50, 75, 100]:
    for third_dim in [False]:
        for K in [17]:
            for num_pcs in PC_list:

                cluster_results_filename = f"clusters_K={K}"

                try:

                    original_data, mclust_cluster = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, num_pcs, n_clusters=K)

                    # mclust: create the nested levels on demand and merge the
                    # new labels in. Assigning a fresh {True: {K: ...}} dict here
                    # would discard results stored for other K values at the
                    # same (spot_size, third_dim, num_pcs).
                    per_pcs = (
                        cluster_dict
                        .setdefault("mclust", {})
                        .setdefault(spot_size, {})
                        .setdefault(third_dim, {})
                        .setdefault(num_pcs, {})
                    )
                    per_pcs.setdefault(True, {})[K] = mclust_cluster.tolist()

                    record_results(original_data, cluster_dict, results_dir, "mclust", cluster_results_filename, spot_size, third_dim, num_pcs=num_pcs, K=K, uses_spatial=True)

                except TypeError as err:

                    # Keep the sweep going, but show the real error so a failure
                    # with a different cause is not misattributed to mclust.
                    print(f"Most likely mclust returned a null object for setting: SPOT_SIZE={spot_size}, NUM_PCS={num_pcs}, K={K}")
                    print(f"Underlying error: {err!r}")

The size of the spot data is (23444, 280)
(23444, 3)
fitting ...
POSSIBLE 14 1 166630.87931376175
POSSIBLE 14 2 148785.4008044954
POSSIBLE 14 3 196517.20308761153
POSSIBLE 14 4 189626.81779101398
POSSIBLE 14 5 203472.8927527831
POSSIBLE 14 6 166237.58778851904
POSSIBLE 14 7 184684.08404518818
POSSIBLE 14 9 175584.55789105626
POSSIBLE 14 10 183015.70280896313
POSSIBLE 14 11 186307.23717613472
POSSIBLE 14 12 167888.65590673542
POSSIBLE 14 13 197787.49689506213
POSSIBLE 14 16 170484.0548596655
POSSIBLE 14 17 196624.17717453526
The size of the spot data is (23444, 280)
(23444, 5)
fitting ...
POSSIBLE 14 1 175960.80538837597
POSSIBLE 14 2 179372.601461927
POSSIBLE 14 3 189349.29253076573
POSSIBLE 14 4 161478.53612295733
POSSIBLE 14 5 204373.61326193158
POSSIBLE 14 6 189095.42012114526
POSSIBLE 14 7 158623.38891811136
POSSIBLE 14 9 194210.12400478535
POSSIBLE 14 10 201787.88069538947
POSSIBLE 14 11 140006.09649957294
POSSIBLE 14 12 190345.95766192215
POSSIBLE 14 14 185303.28839891203
POSSIBL