In [1]:
import pandas as pd
import json
import os
import torch

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [2]:
# Location of the gzip-compressed transcript table
file_path = 'data/hBreast/transcripts.csv.gz'

# Load the whole table into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# Keep only transcripts that carry a cell id (rows with cell_id == -1 are dropped)
has_cell_id = df_transcripts["cell_id"] != -1
df_transcripts = df_transcripts[has_cell_id]

# Remove blank and negative-control features, identified by their name prefix
is_blank = df_transcripts["feature_name"].str.startswith('BLANK_')
is_neg_control = df_transcripts["feature_name"].str.startswith('NegControl')
df_transcripts = df_transcripts[~is_blank & ~is_neg_control]

In [3]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, init_method: str = "mclust", num_pcs: int = 15, n_clusters=15):
    """Build spot-level data from ``data`` and cluster it with BayesSpace.

    Args:
        data: Transcript table handed to ``XeniumCluster``.
        dataset_name: Dataset label handed to ``XeniumCluster``.
        current_spot_size: Value passed to ``set_spot_size``.
        third_dim: Forwarded to ``create_spot_data``.
        init_method: Forwarded to ``BayesSpace`` as ``init_method``.
        num_pcs: Forwarded to ``BayesSpace`` as ``num_pcs``.
        n_clusters: Forwarded to ``BayesSpace`` as ``K``.

    Returns:
        A ``(XeniumCluster instance, BayesSpace return value)`` tuple.
    """
    xenium = XeniumCluster(data=data, dataset_name=dataset_name)
    xenium.set_spot_size(current_spot_size)
    # save_data=True asks create_spot_data to persist what it builds
    xenium.create_spot_data(third_dim=third_dim, save_data=True)

    spot_shape = xenium.xenium_spot_data.shape
    print(f"The size of the spot data is {spot_shape}")

    # Preprocessing steps applied to the spot data before clustering
    xenium.normalize_counts(xenium.xenium_spot_data)
    xenium.generate_neighborhood_graph(xenium.xenium_spot_data, plot_pcas=False)

    bayesspace_result = xenium.BayesSpace(
        xenium.xenium_spot_data,
        init_method=init_method,
        num_pcs=num_pcs,
        K=n_clusters,
        grid_search=True,
    )

    return xenium, bayesspace_result

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import torch
from scipy.spatial.distance import cdist

def _cluster_mean_pairwise_distances(rows, cols, clusters, spot_size):
    """Return ``{"Cluster <label>": spot_size * mean pairwise distance}`` per cluster.

    Spots are placed on a grid indexed by their row/col offsets from the
    minimum row/col, so coordinates that do not start at zero still fit.
    Distances are Euclidean between grid positions; the mean is taken over the
    full distance matrix, self-distances included.

    Args:
        rows: 1-D integer tensor of spot rows.
        cols: 1-D integer tensor of spot columns.
        clusters: 1-D integer tensor of cluster labels, aligned with rows/cols.
        spot_size: Scale factor applied to every mean distance.
    """
    # Shift to zero-based indices; distances are unaffected by the translation.
    row_idx = rows - rows.min()
    col_idx = cols - cols.min()
    num_rows = int(row_idx.max()) + 1
    num_cols = int(col_idx.max()) + 1

    # Fill empty cells with a value below every real label so that a label of
    # 0 (if one ever occurs) is not confused with "no spot here".
    background = int(clusters.min()) - 1
    cluster_grid = torch.full((num_rows, num_cols), background, dtype=torch.int)
    cluster_grid[row_idx, col_idx] = clusters.to(torch.int)

    cluster_labels = sorted(set(clusters.tolist()))

    mpd = {}
    for label in cluster_labels:
        current_cluster_locations = torch.stack(torch.where(cluster_grid == label), dim=1).to(float)
        mpd[f"Cluster {label}"] = spot_size * torch.mean(torch.cdist(current_cluster_locations, current_cluster_locations)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, mpd[f"Cluster {label}"])
    return mpd


def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, num_pcs, init_method, K=None, resolution=None, uses_spatial=True):
    """Compute and save per-cluster mean pairwise distances for each gamma.

    For every gamma in ``np.linspace(1, 3, 9)`` this looks for
    ``<results_dir>/<model_name>/<num_pcs>/<resolution or K>/clusters/<init_method>/<spot_size>/<gamma>/<filename>.csv``,
    reads its ``"BayesSpace cluster"`` column, stores it in
    ``original_data.xenium_spot_data.obs["<model_name> cluster"]`` and writes the
    per-cluster MPD dictionary to the matching ``.../mpd/.../<filename>_mpd.json``.

    Gammas without a cluster CSV are skipped silently. Any other failure for a
    gamma is printed and that gamma is skipped, so one bad file does not stop
    the remaining gammas.

    Args:
        original_data: Object exposing ``xenium_spot_data.obs`` with ``"row"``
            and ``"col"`` columns; its ``obs`` is modified in place.
        cluster_dict: Unused; kept for interface compatibility.
        results_dir: Root directory of the results tree.
        model_name: Path component and prefix of the ``obs`` column written.
        filename: Base name (no extension) of the cluster CSV / MPD JSON.
        spot_size: Path component and scale factor for the distances.
        third_dim: Unused; kept for interface compatibility.
        num_pcs: Path component.
        init_method: Path component.
        K: Path component used when ``resolution`` is None.
        resolution: Path component used instead of ``K`` when not None.
        uses_spatial: Unused; kept for interface compatibility.
    """
    setting = str(resolution) if resolution is not None else str(K)

    dirpath = f"{results_dir}/{model_name}/{num_pcs}/{setting}/clusters/{init_method}/{spot_size}"
    os.makedirs(dirpath, exist_ok=True)

    for gamma in np.linspace(1, 3, 9):
        gamma_str = f"{gamma:.2f}"
        new_dirpath = os.path.join(dirpath, gamma_str)
        os.makedirs(new_dirpath, exist_ok=True)

        cluster_csv = f"{dirpath}/{gamma_str}/{filename}.csv"
        if not os.path.exists(cluster_csv):
            # No clustering was saved for this gamma; nothing to summarise.
            continue

        try:
            current_clustering = pd.read_csv(cluster_csv, index_col=0)["BayesSpace cluster"].values

            original_data.xenium_spot_data.obs[f"{model_name} cluster"] = current_clustering
            obs = original_data.xenium_spot_data.obs

            # Go through NumPy so the conversion does not depend on the obs index.
            rows = torch.as_tensor(obs["row"].astype(int).to_numpy(), dtype=torch.long)
            cols = torch.as_tensor(obs["col"].astype(int).to_numpy(), dtype=torch.long)
            clusters = torch.as_tensor(obs[f"{model_name} cluster"].astype(int).to_numpy(), dtype=torch.long)

            # Only the MPD summary is produced; the original cluster-grid
            # plotting code was commented out and has been removed.
            mpd = _cluster_mean_pairwise_distances(rows, cols, clusters, spot_size)

            mpd_dirpath = f"{results_dir}/{model_name}/{num_pcs}/{setting}/mpd/{init_method}/{spot_size}/{gamma_str}"
            os.makedirs(mpd_dirpath, exist_ok=True)

            mpd_filename = f"{filename}_mpd.json"
            print(os.path.join(mpd_dirpath, mpd_filename))
            with open(os.path.join(mpd_dirpath, mpd_filename), "w") as f:
                json.dump(mpd, f, indent=4)

        except Exception as err:
            # Best-effort per gamma, but report the failure instead of hiding it.
            print(f"Skipping gamma={gamma_str}: {type(err).__name__}: {err}")
            continue

In [5]:
# Empty per-model dictionaries, keyed by the model name
cluster_dict = dict(BayesSpace={})
mpd = dict(BayesSpace={})
# Root of the results tree used when building output paths
results_dir = "results/hBreast"

In [6]:
# Numbers of principal components to sweep over
PC_list = [
    3,
    5,
    10,
    15,
    25,
]
# Initialisation methods to sweep over (passed on as init_method)
init_methods = [
    "kmeans",
    "mclust",
]

In [7]:
import matplotlib
# Switch matplotlib to the non-interactive Agg backend before running the sweep
matplotlib.use('Agg')

# Every (spot size, third_dim, K, #PCs, init method) combination, in the same
# order as the equivalent nested loops would visit them.
sweep_configs = (
    (spot_size, third_dim, K, num_pcs, init_method)
    for spot_size in [50, 75, 100]
    for third_dim in [False]
    for K in [17]
    for num_pcs in PC_list
    for init_method in init_methods
)

for spot_size, third_dim, K, num_pcs, init_method in sweep_configs:
    cluster_results_filename = f"clusters_K={K}"

    # Cluster this configuration ...
    original_data, BayesSpace_cluster = run_experiment(
        df_transcripts, "hBreast", spot_size, third_dim, init_method, num_pcs, n_clusters=K
    )

    # ... then summarise whatever per-gamma cluster files exist for it.
    record_results(
        original_data,
        cluster_dict,
        results_dir,
        "BayesSpace",
        cluster_results_filename,
        spot_size,
        third_dim,
        num_pcs,
        init_method,
        K,
        uses_spatial=True,
    )

The size of the spot data is (23444, 280)


Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, ro

50 3 17

Neighbors were identified for 23441 out of 23444 spots.
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Neighbors were identified for 23441 out of 23444 spots.
Fitting model...
You created a large dataset with compression and chunking.
The chunk size is equal to the dataset dimensions.
If you want to read subsets of the dataset, you should testsmaller chunk sizes to improve read times.
You created a large dataset with compression and chunking.
The chunk size is equal to the dataset dimensions.
If you want to read subsets of the dataset, you should testsmaller chunk sizes to improve read times.
Calculating labels using iterations 100 through 1000.


    [1]  6  6  6  6  7  6  7  6  6  6  6  6  6  6  6  6  2  6  6  7  6  6  7  6
   [25]  6  6  6  6  6  6  6  6  2  7  7  6  7  7  7  6  6  6  6  6  6  6  6  6
   [49]  6  7 11  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  7  6  6
   [73]  6  6  6  6  7  7  6  6  6  7  7  7  7  7  7  6  6  6  7  6  6  6  6  7
   [97]  6  6  6  7  7  6  6  6  6  6  6  7  6  6  6  6  6  6  6  6  6  6  6  6
  [121]  6  6  7  6  6  6  6  7  6  6  6  6  6  6  6  7  6  6  6  6  6  6  6  7
  [145]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  7  6  6  7  2  6  6  6  6  6
  [169]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
  [193]  6  7  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  7  6
  [217]  6  6  6  7  6  6  6  6  6  6  6  6  6  7  7  3 15 13 13 13 11  6  7  6
  [241]  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  7  7  6  6
  [265]  6  6  7  6  6  6  6  6  6  6  6  6  6  6  6  6  6 16  6  7  6  6  6  6
  [289]  6  6  6  6  6  6  6  6  6  6  6

ZeroDivisionError: division by zero

In [None]:
spot_sizes = [50, 75, 100]
in_billions = 1_000_000_000  # divisor applied to the summed MPD before printing
method="BayesSpace"

# Visit every saved configuration and print the summed MPD wherever a JSON
# summary file exists; configurations without a file are skipped.
for spot_size in spot_sizes:
    for K in [17]:
        cluster_results_filename = f"clusters_K={K}"
        for init_method in init_methods:
            for num_pcs in PC_list:
                for gamma in np.linspace(1, 3, 9):
                    gamma_str = f"{gamma:.2f}"
                    filename = f"results/hBreast/{method}/{num_pcs}/{K}/mpd/{init_method}/{spot_size}/{gamma_str}/{cluster_results_filename}_mpd.json"
                    if not os.path.exists(filename):
                        continue

                    with open(filename, "r") as mpd_file:
                        current_mpd = json.load(mpd_file)

                    total_mpd = sum(current_mpd.values()) / in_billions
                    print("Method:", method, "Spot Size", spot_size, "Num Clusters:", len(current_mpd), "Num PCs", num_pcs, "\u03B3", f": {gamma_str}", "Initial Method:", init_method, "Total mpd:", total_mpd)