In [1]:
import pandas as pd
import json
import os

import warnings 
warnings.filterwarnings("ignore")
from importlib import reload

# this ensures that I can update the class without losing my variables in my notebook
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster
from utils.metrics import *

In [2]:
# Path to your .gz file
file_path = 'data/hBreast/transcripts.csv.gz'

# Read the gzipped CSV file into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# drop cells without ids
df_transcripts = df_transcripts[df_transcripts["cell_id"] != -1]

# drop blanks and controls
df_transcripts = df_transcripts[~df_transcripts["feature_name"].str.startswith('BLANK_') & ~df_transcripts["feature_name"].str.startswith('NegControl')]

In [3]:
def run_experiment(data, dataset_name: str, current_spot_size: int, third_dim: bool, resolutions: list, n_clusters=15):
    
    clustering = XeniumCluster(data=data, dataset_name=dataset_name)
    clustering.set_spot_size(current_spot_size)
    clustering.create_spot_data(third_dim=third_dim, save_data=True)

    print(f"The size of the spot data is {clustering.xenium_spot_data.shape}")

    clustering.normalize_counts(clustering.xenium_spot_data)
    clustering.generate_neighborhood_graph(clustering.xenium_spot_data, plot_pcas=False)

    Louvain_cluster = clustering.Louvain(clustering.xenium_spot_data, resolutions, embedding="umap", save_plot=True)

    return clustering, Louvain_cluster

In [4]:
import numpy as np
import torch
from scipy.spatial.distance import cdist

def record_results(original_data, cluster_dict, results_dir, model_name, filename, spot_size, third_dim, K=None, resolution=None, uses_spatial=True):

    if resolution is not None:
        current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim].get(
            resolution, 
            cluster_dict[model_name][spot_size][third_dim]
        ))
    else:
        current_clustering = np.array(cluster_dict[model_name][spot_size][third_dim][uses_spatial].get(
            K, 
            cluster_dict[model_name][spot_size][third_dim][uses_spatial]
        ))
    cluster_labels = np.unique(current_clustering)

    original_data.xenium_spot_data.obs[f"{model_name} cluster"] = np.array(current_clustering)
    dirpath = f"{results_dir}/{model_name}{'/' + (str(resolution) if resolution is not None else str(K))}/clusters/{spot_size}"
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

    filepath = f"{dirpath}/{filename}{resolution}.csv"

    original_data.xenium_spot_data.obs[f"{model_name} cluster"].to_csv(filepath)
    # Extracting row, col, and cluster values from the dataframe
    rows = torch.tensor(original_data.xenium_spot_data.obs["row"].astype(int))
    cols = torch.tensor(original_data.xenium_spot_data.obs["col"].astype(int))
    clusters = torch.tensor(original_data.xenium_spot_data.obs[f"{model_name} cluster"].astype(int))
    cluster_labels = np.unique(clusters)

    num_rows = int(max(rows) - min(rows) + 1)
    num_cols = int(max(cols) - min(cols) + 1)

    cluster_grid = torch.zeros((num_rows, num_cols), dtype=torch.int)

    cluster_grid[rows, cols] = torch.tensor(clusters, dtype=torch.int)

    mpd = {}
    for label in cluster_labels:
        current_cluster_locations = torch.stack(torch.where((cluster_grid == label)), axis=1).to(float)
        mpd[f"Cluster {label}"] = spot_size * torch.mean(torch.cdist(current_cluster_locations, current_cluster_locations)).item()
        print(f"POSSIBLE {len(cluster_labels)}", label, mpd[f"Cluster {label}"])

    mpd_dirpath = f"{results_dir}/{model_name}{'/' + (str(resolution) if resolution is not None else str(K))}/mpd/{spot_size}/"
    if not os.path.exists(mpd_dirpath):
        os.makedirs(mpd_dirpath)

    mpd_filepath = f"{mpd_dirpath}/{filename}_mpd.json"
    with open(mpd_filepath, "w") as f:
        json.dump(mpd, f, indent=4)

In [5]:
cluster_dict = {"Louvain": {}}
mpd = {"Louvain": {}}
results_dir = "results/hBreast"

In [6]:
import matplotlib
matplotlib.use('Agg')

resolutions = [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]
for spot_size in [50, 75, 100]:
    for third_dim in [False]:
        cluster_results_filename = f"clusters_RES="
        original_data, Louvain_cluster = run_experiment(df_transcripts, "hBreast", spot_size, third_dim, resolutions)

        # Louvain
        if "Louvain" not in cluster_dict:
            cluster_dict["Louvain"] = {}
        if spot_size not in cluster_dict["Louvain"]:
            cluster_dict["Louvain"][spot_size] = {}
        cluster_dict["Louvain"][spot_size][third_dim] = {res: clusters.tolist() for res, clusters in Louvain_cluster.items()}
        for resolution in resolutions:
            record_results(original_data, cluster_dict, results_dir, "Louvain", cluster_results_filename, spot_size, third_dim, resolution=resolution)

        print(f"Cluster with spot size {(spot_size, third_dim)} completed.")

The size of the spot data is (23444, 280)
POSSIBLE 2 0 261017.80772173512
POSSIBLE 2 1 160734.8526459699
POSSIBLE 3 0 240357.36295093584
POSSIBLE 3 1 201995.58076278176
POSSIBLE 3 2 189424.7471147104
POSSIBLE 9 0 254490.79972637983
POSSIBLE 9 1 186882.7849516292
POSSIBLE 9 2 186200.3150703561
POSSIBLE 9 3 173462.71602708515
POSSIBLE 9 4 145953.4384422753
POSSIBLE 9 5 206091.79921459884
POSSIBLE 9 6 188992.71791122737
POSSIBLE 9 7 203895.57766747614
POSSIBLE 9 8 157576.8770039418
POSSIBLE 14 0 272559.29814572487
POSSIBLE 14 1 188433.9784164845
POSSIBLE 14 2 166376.6518897933
POSSIBLE 14 3 198318.09327592602
POSSIBLE 14 4 143968.9276799286
POSSIBLE 14 5 126724.4399763697
POSSIBLE 14 6 204269.88553291827
POSSIBLE 14 7 188147.67420688624
POSSIBLE 14 8 78737.12559087006
POSSIBLE 14 9 182760.9717694038
POSSIBLE 14 10 131413.45649811553
POSSIBLE 14 11 62897.95331150993
POSSIBLE 14 12 199897.73454614793
POSSIBLE 14 13 153572.38697596715
POSSIBLE 16 0 279507.72895037
POSSIBLE 16 1 141075.827383

In [7]:
spot_sizes = [50,75,100]
resolutions = [0.25, 0.5, 0.75, 1.0]
in_billions = 1_000_000_000
method = "Louvain"
for spot_size in spot_sizes:
    for K in [17]:
        for resolution in resolutions:
            filename = f"results/hBreast/{method}/{resolution}/mpd/{spot_size}/{cluster_results_filename}_mpd.json"
            if os.path.exists(filename):
                with open(filename, "r") as mpd_dict:
                    current_mpd = json.load(mpd_dict)
                print("Method:", method, "Spot Size", spot_size, "Resolution", resolution, "Num Clusters:", len(current_mpd), "Total mpd", sum(current_mpd.values()) / in_billions)

Method: Louvain Spot Size 50 Resolution 0.25 Num Clusters: 3 Total mpd 0.000631777690828428
Method: Louvain Spot Size 50 Resolution 0.5 Num Clusters: 9 Total mpd 0.0017035470260149696
Method: Louvain Spot Size 50 Resolution 0.75 Num Clusters: 14 Total mpd 0.002298078577816046
Method: Louvain Spot Size 50 Resolution 1.0 Num Clusters: 16 Total mpd 0.0026801153332916493
Method: Louvain Spot Size 75 Resolution 0.25 Num Clusters: 5 Total mpd 0.0014023262065739341
Method: Louvain Spot Size 75 Resolution 0.5 Num Clusters: 8 Total mpd 0.0021461803880797868
Method: Louvain Spot Size 75 Resolution 0.75 Num Clusters: 13 Total mpd 0.003106687617721076
Method: Louvain Spot Size 75 Resolution 1.0 Num Clusters: 18 Total mpd 0.004289346584895316
Method: Louvain Spot Size 100 Resolution 0.25 Num Clusters: 4 Total mpd 0.0015748356096873027
Method: Louvain Spot Size 100 Resolution 0.5 Num Clusters: 8 Total mpd 0.002856924003864341
Method: Louvain Spot Size 100 Resolution 0.75 Num Clusters: 10 Total mpd 0