In [1]:
import pandas as pd
import json
import os

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [2]:
# Location of the gzip-compressed transcripts table
file_path = 'data/hBreast/transcripts.csv.gz'

# Load the table, then filter it in a single chain:
#   1. keep only transcripts that carry a cell id (-1 marks "no cell id")
#   2. discard blank and negative-control features
df_transcripts = (
    pd.read_csv(file_path, compression='gzip')
    .loc[lambda table: table["cell_id"] != -1]
    .loc[
        lambda table: ~table["feature_name"].str.startswith('BLANK_')
        & ~table["feature_name"].str.startswith('NegControl')
    ]
)

In [3]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, n_clusters=15):
    """Run the spot-creation, preprocessing and BayesSpace steps on ``data``.

    A ``XeniumCluster`` is built for ``data``, its spot size is set, spot data
    is created (and saved), counts are normalized, the neighborhood graph is
    generated without PCA plots, and finally ``BayesSpace`` is run.

    Args:
        data: Input handed to ``XeniumCluster`` as ``data``.
        dataset_name: Name handed to ``XeniumCluster`` as ``dataset_name``.
        current_spot_size: Value passed to ``set_spot_size``.
        third_dim: Forwarded to ``create_spot_data``.
        n_clusters: Passed to ``BayesSpace`` as ``K``.

    Returns:
        Tuple ``(xenium_cluster_object, bayesspace_result)``.
    """
    xenium = XeniumCluster(data=data, dataset_name=dataset_name)
    xenium.set_spot_size(current_spot_size)
    xenium.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {xenium.xenium_spot_data.shape}")

    # The spot data attribute is re-read before every step on purpose: a step
    # may replace it, so no reference is cached across calls.
    xenium.normalize_counts(xenium.xenium_spot_data)
    xenium.generate_neighborhood_graph(xenium.xenium_spot_data, plot_pcas=False)

    bayesspace_labels = xenium.BayesSpace(xenium.xenium_spot_data, K=n_clusters)
    return xenium, bayesspace_labels

In [4]:
import numpy as np
import torch
from scipy.spatial.distance import cdist

def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, K=None, resolution=None, uses_spatial=True):
    """Save a model's cluster assignments and a per-cluster spatial spread score.

    Two JSON files are written below
    ``{results_dir}/{model_name}/{resolution if given else K}/``:

    * ``clusters/{spot_size}/{filename}.json`` holds all of
      ``cluster_dict[model_name]``.
    * ``wss/{spot_size}/{filename}_wss.json`` maps ``"Cluster <label>"`` to
      ``spot_size ** 2`` times the mean of the pairwise Euclidean distance
      matrix (zero self-distances included) between the grid cells that
      carry that label.

    The selected labels are also stored in
    ``original_data.xenium_spot_data.obs[f"{model_name} cluster"]``.

    Args:
        original_data: Object exposing ``xenium_spot_data.obs`` with ``"row"``
            and ``"col"`` columns that can be cast to int.
        cluster_dict: Nested mapping. With ``resolution`` the labels are read
            from ``[model_name][spot_size][third_dim][resolution]``; otherwise
            from ``[model_name][spot_size][third_dim][uses_spatial][K]``.
        results_dir: Root directory for the output files.
        model_name: Key into ``cluster_dict`` and name of the output folder.
        filename: Base name (no extension) of the output files.
        spot_size: Key into ``cluster_dict``; also scales the score.
        third_dim: Key into ``cluster_dict``.
        K: Cluster-count key, used when ``resolution`` is None.
        resolution: Resolution key; takes precedence over ``K`` when given.
        uses_spatial: Key used only on the ``K`` lookup path.

    Raises:
        KeyError: If the requested ``K``/``resolution`` has no stored labels.
    """
    # Results are grouped by resolution when one is given, otherwise by K.
    variant = str(resolution) if resolution is not None else str(K)
    model_dir = f"{results_dir}/{model_name}/{variant}"

    clusters_dir = f"{model_dir}/clusters/{spot_size}"
    os.makedirs(clusters_dir, exist_ok=True)
    with open(f"{clusters_dir}/{filename}.json", "w") as f:
        json.dump(cluster_dict[model_name], f, indent=4)

    if resolution is not None:
        assignments = cluster_dict[model_name][spot_size][third_dim]
        key = resolution
    else:
        assignments = cluster_dict[model_name][spot_size][third_dim][uses_spatial]
        key = K
    if key not in assignments:
        # Previously the whole mapping was used as a fallback, which only
        # produced an obscure failure further down.
        raise KeyError(
            f"No clustering stored for {model_name!r} with key {key!r} "
            f"(spot_size={spot_size!r}, third_dim={third_dim!r})"
        )
    current_clustering = np.array(assignments[key])

    obs = original_data.xenium_spot_data.obs
    cluster_column = f"{model_name} cluster"
    obs[cluster_column] = current_clustering

    # Extract row, col and cluster values from the observation table.
    label_values = obs[cluster_column].astype(int).to_numpy()
    rows = torch.tensor(obs["row"].astype(int).to_numpy(), dtype=torch.long)
    cols = torch.tensor(obs["col"].astype(int).to_numpy(), dtype=torch.long)
    clusters = torch.tensor(label_values, dtype=torch.long)
    cluster_labels = np.unique(label_values).tolist()

    # Shift coordinates so the smallest row/col maps to index 0; the grid is
    # sized from the coordinate range, so raw coordinates would overflow it
    # whenever the minimum is above zero.
    row_idx = rows - rows.min()
    col_idx = cols - cols.min()
    num_rows = int(row_idx.max()) + 1
    num_cols = int(col_idx.max()) + 1

    # Fill empty cells with a value below every real label so they are never
    # counted as members of a cluster (a zero fill would merge them into
    # cluster 0 for models with 0-based labels).
    background = int(clusters.min()) - 1
    cluster_grid = torch.full((num_rows, num_cols), background, dtype=torch.long)
    cluster_grid[row_idx, col_idx] = clusters

    wss = {}
    for label in cluster_labels:
        current_cluster_locations = torch.stack(torch.where(cluster_grid == label), dim=1).to(torch.float64)
        pairwise = torch.cdist(current_cluster_locations, current_cluster_locations)
        wss[f"Cluster {label}"] = (spot_size ** 2) * torch.mean(pairwise).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, wss[f"Cluster {label}"])

    wss_dir = f"{model_dir}/wss/{spot_size}"
    os.makedirs(wss_dir, exist_ok=True)
    with open(f"{wss_dir}/{filename}_wss.json", "w") as f:
        json.dump(wss, f, indent=4)

In [5]:
# Root folder for everything this notebook writes for the hBreast dataset
results_dir = "results/hBreast"

# Per-model result containers, keyed by model name
cluster_dict = dict(BayesSpace={})
wss = dict(BayesSpace={})

In [6]:
import matplotlib
matplotlib.use('Agg')

# Run BayesSpace for every (spot size, third_dim, K) combination and record the results.
for spot_size in [100]:
    for third_dim in [False]:
        for K in [16]:
            cluster_results_filename = f"clusters_K={K}"
            original_data, BayesSpace_cluster = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, n_clusters=K)

            # BayesSpace
            # Create each nesting level only when it is missing, so labels from
            # earlier K values for the same (spot_size, third_dim) are kept
            # instead of being overwritten. The `True` level is the
            # `uses_spatial` key that record_results looks up.
            labels_by_k = (
                cluster_dict.setdefault("BayesSpace", {})
                .setdefault(spot_size, {})
                .setdefault(third_dim, {})
                .setdefault(True, {})
            )
            labels_by_k[K] = BayesSpace_cluster.tolist()
            record_results(original_data, cluster_dict, results_dir, "BayesSpace", cluster_results_filename, spot_size, third_dim, K, uses_spatial=True)

The size of the spot data is (6138, 280)


Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, ro

100 16

Neighbors were identified for 6137 out of 6138 spots.
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Fitting model...
Neighbors were identified for 6137 out of 6138 spots.
Fitting model...
You created a large dataset with compression and chunking.
The chunk size is equal to the dataset dimensions.
If you want to read subsets of the dataset, you should testsmaller chunk sizes to improve read times.
You created a large dataset with compression and chunking.
The chunk size is equal to the dataset dimensions.
If you want to read subsets of the dataset, you should testsmaller chunk sizes to improve read times.
Calculating labels using iterations 100 through 1000.


   [1] 12 12 12 14  9  9 14 14 14 14 14 14  9  9 14 14 14 12 12 12 12 12 12 12
  [25] 12 12 12 12 12 14 14 14  9  9  9 14 14 14 12 14 14 14 14 14  9  9  9 14
  [49] 14 14 14 14 14 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 14 14
  [73] 14 14 14 14 14 14 14  9 14 14 14 14 14  1  1  1  1 14 12 14 14 12 12 14
  [97] 14 14  9 14 14 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 14 14 12
 [121] 12 14 14 14  9  9  9 14 14 14 14 14  9  9 16 16 14 12 12 12 12 12 14  9
 [145]  1  1  9  9 14 14 14 14 14 14 14 14 12 12 12 12 12 12 12 12 12 12 14 14
 [169] 14 12 12 14 14 14 14 14 14 14 14 14 14 14 14 12 14 14 14 12 12 12 12 12
 [193] 14  9  9  9  1  1  1 16 14 14  9 14 14 14 14 12 12 12 12 12 12 12 12 14
 [217] 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 12 12
 [241] 12 14 14 14 14  9  1  9  1  9  9  9  1  1  1  9 12 12 12 12 12 14 14 12
 [265] 12 14 14 14 14 14 14 14 12 14 14 14 14 14  1  9 14 14 14 14 14 14 14 16
 [289] 14 14 14 14 14 14 14 14 14 14 12 14 14  9  1 

In [7]:
# Summarise the stored per-cluster WSS scores for each spot size and K.
spot_sizes = [50, 75, 100]
resolutions = [0.25, 0.5, 0.75, 1.0]
in_billions = 1_000_000_000
method="BayesSpace"
for spot_size in spot_sizes:
    for K in [17]:
        # Build the file name from this loop's K rather than from a variable
        # left over from the experiment cell; otherwise the K in the folder
        # and the K in the file name can disagree and no file is ever found.
        wss_basename = f"clusters_K={K}_wss.json"
        filename = f"results/hBreast/{method}/{K}/wss/{spot_size}/{wss_basename}"
        if os.path.exists(filename):
            with open(filename, "r") as wss_dict:
                current_wss = json.load(wss_dict)
            print("Method:", method, "Spot Size", spot_size, "Num Clusters:", len(current_wss), "Total WSS", sum(current_wss.values()) / in_billions)
        else:
            # Report missing results instead of skipping them silently.
            print("No WSS results found at", filename)