In [1]:
import pandas as pd
import json
import os

import numpy as np
import torch
from scipy.spatial.distance import cdist

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

from mclustpy import mclustpy

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [2]:
# Path to your .gz file
file_path = 'data/hBreast/transcripts.csv.gz'

# Read the gzipped CSV file into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# drop cells without ids
df_transcripts = df_transcripts[df_transcripts["cell_id"] != -1]

# drop blanks and controls
df_transcripts = df_transcripts[~df_transcripts["feature_name"].str.startswith('BLANK_') & ~df_transcripts["feature_name"].str.startswith('NegControl')]

In [3]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, num_pcs: int, n_clusters=17, model_name: str = "EEE"):
    
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False, n_pcs = num_pcs)

    clustering.pca(clustering.xenium_spot_data, num_pcs = num_pcs)

    mclust_cluster = clustering.mclust(clustering.xenium_spot_data, G=n_clusters, model_name = "EEE")

    return clustering, mclust_cluster

In [4]:
def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, num_pcs, K=None, resolution=None, uses_spatial=True):

    dirpath = f"{results_dir}/{model_name}/{num_pcs}/{(str(resolution) if resolution is not None else str(K))}/clusters/{spot_size}"
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    current_clustering = pd.read_csv(f"{dirpath}/{filename}.csv", index_col=0)["mclust cluster"].values

    original_data.xenium_spot_data.obs[f"{model_name} cluster"] = np.array(current_clustering)

    # Extracting row, col, and cluster values from the dataframe
    rows = torch.tensor(original_data.xenium_spot_data.obs["row"].astype(int))
    cols = torch.tensor(original_data.xenium_spot_data.obs["col"].astype(int))
    clusters = torch.tensor(original_data.xenium_spot_data.obs[f"{model_name} cluster"].astype(int))
    cluster_labels = np.unique(clusters)
    num_clusters = len(cluster_labels)

    num_rows = int(max(rows) - min(rows) + 1)
    num_cols = int(max(cols) - min(cols) + 1)

    cluster_grid = torch.zeros((num_rows, num_cols), dtype=torch.int)

    cluster_grid[rows, cols] = torch.tensor(clusters, dtype=torch.int)

    mpd = {}
    for label in cluster_labels:
        current_cluster_locations = torch.stack(torch.where((cluster_grid == label)), axis=1).to(float)
        mpd[f"Cluster {label}"] = spot_size * torch.mean(torch.cdist(current_cluster_locations, current_cluster_locations)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, mpd[f"Cluster {label}"])

    mpd_dirpath = f"{results_dir}/{model_name}/{num_pcs}/{(str(resolution) if resolution is not None else str(K))}/mpd/{spot_size}/"
    if not os.path.exists(mpd_dirpath):
        os.makedirs(mpd_dirpath)

    mpd_filepath = f"{mpd_dirpath}/{filename}_mpd.json"
    with open(mpd_filepath, "w") as f:
        json.dump(mpd, f, indent=4)

In [5]:
cluster_dict = {"mclust": {}}
mpd = {"mclust": {}}
results_dir = "results/hBreast"

In [6]:
PC_list = [3, 5, 10, 15, 25]

In [8]:
import matplotlib
matplotlib.use('Agg')

for spot_size in [50]:
    for third_dim in [False]:
        for K in [17]:
            for num_pcs in PC_list:

                try:

                    cluster_results_filename = f"clusters_K={K}"
                    original_data, mclust_cluster = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, num_pcs, n_clusters=K)

                    # mclust
                    if "mclust" not in cluster_dict:
                        cluster_dict["mclust"] = {}
                    if spot_size not in cluster_dict["mclust"]:
                        cluster_dict["mclust"][spot_size] = {}
                    if third_dim not in cluster_dict["mclust"][spot_size]:
                        cluster_dict["mclust"][spot_size][third_dim] = {}
                    cluster_dict["mclust"][spot_size][third_dim][num_pcs] = {True: {K: mclust_cluster.tolist()}}
                    record_results(original_data, cluster_dict, results_dir, "mclust", cluster_results_filename, spot_size, third_dim, num_pcs=num_pcs, K=K, uses_spatial=True)

                except TypeError as e:

                    print(f"Most likely mclust returned a null object for setting: SPOT_SIZE={spot_size}, NUM_PCS={num_pcs}, K={K}")
                    print(f"Error: {e}")
                
                1/0

The size of the spot data is (23444, 280)
TOTAL NUMBER OF UNIQUE CLUSTERS:  15
POSSIBLE 15 1 167584.19356975707
POSSIBLE 15 2 170559.17810074365
POSSIBLE 15 3 188896.77991364626
POSSIBLE 15 4 176810.2129674784
POSSIBLE 15 5 166646.45761794172
POSSIBLE 15 7 186708.15559302788
POSSIBLE 15 8 164198.0598013789
POSSIBLE 15 9 184210.34394690802
POSSIBLE 15 10 178523.5728404353
POSSIBLE 15 11 199039.67980814233
POSSIBLE 15 12 188437.93426351078
POSSIBLE 15 13 147805.54902816465
POSSIBLE 15 14 201002.29108736364
POSSIBLE 15 15 196285.18984109032
POSSIBLE 15 16 193779.68607315017


ZeroDivisionError: division by zero