In [1]:
import json
import jsonlines
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from sklearn.metrics import silhouette_score

from importlib import reload
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster

from scipy.sparse import csr_matrix

# Variables Setup

In [2]:
dataset_name = "hBreast"
models = ["BayXenSmooth", "Leiden", "Louvain", "K-Means", "K-Means_No_Spatial", "Hierarchical", "Hierarchical_No_Spatial", "BayesSpace"]
resolutions = [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0]
spot_sizes = [100, 75, 50]
K_values = [17]

In [3]:
# BayXenSmooth Hyperparameters
BayXenSmooth_PCs = 25
KMeansInit = True
neighborhood_sizes = [1,2,3,4,5]
sample_for_assignment = False
concentraion_amp = 1.0
agg = "sum"

# Load Data

In [4]:
# Path to your .gz file
file_path = f'data/{dataset_name}/transcripts.csv.gz'

# Read the gzipped CSV file into a DataFrame
df_transcripts = pd.read_csv(file_path, compression='gzip')

# drop cells without ids
df_transcripts = df_transcripts[df_transcripts["cell_id"] != -1]

# drop blanks and controls
df_transcripts = df_transcripts[~df_transcripts["feature_name"].str.startswith('BLANK_') & ~df_transcripts["feature_name"].str.startswith('NegControl')]

# Other Metric Implementations

- Variation Index (TODO) If we want to compare competing methods clustering with our clustering.


In [5]:
def morans_i_cluster_similarity(clustering, locations, clusters):
    print("Starting Moran's I Calculation.")
    moran_clusters = ad.AnnData(locations)
    sc.pp.neighbors(moran_clusters, n_pcs=0, n_neighbors=100)
    print("Neighbors calculated.")

    cluster_labels = clusters.values
    # Calculate Moran's I for the binary presence of each cluster
    unique_clusters = np.unique(cluster_labels)
    morans_i_results = {}
    for cluster in unique_clusters:
        cluster_indicator = (cluster_labels == cluster).astype(int)
        morans_i = sc.metrics.morans_i(moran_clusters, vals=cluster_indicator)
        morans_i_results[cluster] = morans_i

    print("Done!")
    return np.mean(list(morans_i_results.values()))

In [6]:
def gearys_c_cluster_similarity(clustering, locations, clusters):
    print("Starting Gearys's C Calculation.")
    gearys_clusters = ad.AnnData(locations)
    sc.pp.neighbors(gearys_clusters, n_pcs=0, n_neighbors=100)
    print("Neighbors calculated.")

    cluster_labels = clusters.values
    # Calculate Gearys C for the binary presence of each cluster
    unique_clusters = np.unique(cluster_labels)
    gearys_c_results = {}
    for cluster in unique_clusters:
        cluster_indicator = (cluster_labels == cluster).astype(int)
        gearys_c = sc.metrics.gearys_c(gearys_clusters, vals=cluster_indicator)
        gearys_c_results[cluster] = gearys_c

    print("Done!")
    return np.mean(list(gearys_c_results.values()))

In [7]:
import jsonlines

# Example dictionary
data = {
    "key1": "value1",
    "key2": "value2",
    "key3": "value3"
}

# Writing each key-value pair as a separate row in a jsonl file
with jsonlines.open('output.jsonl', mode='w') as writer:
    for key, value in data.items():
        writer.write({key: value})

In [17]:
def save_results(results, dataset_name, method, metric_name, spot_size, resolution=None, K=17, neighborhood_size=1):
    if resolution is not None:
        directory = f"results/{dataset_name}/{method}/{resolution}/{metric_name}/{spot_size}"
    elif method == "BayXenSmooth":
        directory = f"results/{dataset_name}/{method}/clusters/PCA/{BayXenSmooth_PCs}/KMEANSINIT=True/NEIGHBORSIZE={neighborhood_size}/NUMCLUSTERS={K}/SPATIALINIT=True/SAMPLEFORASSIGNMENT={sample_for_assignment}/SPATIALNORM=1.0/SPATIALPRIORMULT={concentraion_amp}/SPOTSIZE={spot_size}/AGG={agg}"
    else:
        directory = f"results/{dataset_name}/{method}/{K}/{metric_name}/{spot_size}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with jsonlines.open(f"{directory}/{metric_name}.jsonl", mode='w') as writer:
        try:
            for key, value in results.items():
                writer.write({key: value})
        except AttributeError: # b/c it's not a dictionary so .items() fails
            writer.write(results)

# Calculate the Silhouette Score (and other metrics of note.)

In [19]:
for spot_size in spot_sizes:
    clustering = XeniumCluster(data=df_transcripts, dataset_name=dataset_name)
    clustering.set_spot_size(spot_size)
    clustering.create_spot_data(third_dim=False, save_data=True)
    locations = clustering.xenium_spot_data.obs[["row", "col"]]
    for model in models:
        for K in K_values:
            if model in ["Leiden", "Louvain"]:
                for resolution in resolutions:
                    clusters = pd.read_csv(f"results/{dataset_name}/{model}/{resolution}/clusters/{spot_size}/clusters_RES={resolution}.csv")[f"{model} cluster"]
                    save_results(silhouette_score(locations, clusters), dataset_name, model, "silhouette_score", spot_size, resolution=resolution)
                    save_results(morans_i_cluster_similarity(clustering, locations, clusters), dataset_name, model, "morans_i", spot_size, resolution=resolution)
                    save_results(gearys_c_cluster_similarity(clustering, locations, clusters), dataset_name, model, "gearys_c", spot_size, resolution=resolution)
            elif model == "BayXenSmooth":
                min_expressions_per_spot = 10
                clustering.xenium_spot_data = clustering.xenium_spot_data[clustering.xenium_spot_data.X.sum(axis=1) > min_expressions_per_spot]
                for neighborhood_size in neighborhood_sizes:
                    clusters = pd.read_csv(f"results/{dataset_name}/{model}/clusters/PCA/{BayXenSmooth_PCs}/KMEANSINIT=True/NEIGHBORSIZE={neighborhood_size}/NUMCLUSTERS={K}/SPATIALINIT=True/SAMPLEFORASSIGNMENT={sample_for_assignment}/SPATIALNORM=1.0/SPATIALPRIORMULT={concentraion_amp}/SPOTSIZE={spot_size}/AGG={agg}/clusters_K={K}.csv")[f"{model} cluster"]
                    save_results(silhouette_score(locations, clusters), dataset_name, model, "silhouette_score", spot_size, K=K)
                    save_results(morans_i_cluster_similarity(clustering, locations, clusters), dataset_name, model, "morans_i", spot_size, K=K)
                    save_results(gearys_c_cluster_similarity(clustering, locations, clusters), dataset_name, model, "gearys_c", spot_size, K=K)
            else:
                clusters = pd.read_csv(f"results/{dataset_name}/{model}/{K}/clusters/{spot_size}/clusters_K={K}.csv")[f"{model} cluster"]
                save_results(silhouette_score(locations, clusters), dataset_name, model, "silhouette_score", spot_size, K=K)
                save_results(morans_i_cluster_similarity(clustering, locations, clusters), dataset_name, model, "morans_i", spot_size, K=K)
                save_results(gearys_c_cluster_similarity(clustering, locations, clusters), dataset_name, model, "gearys_c", spot_size, K=K)



ValueError: Found input variables with inconsistent numbers of samples: [6138, 6066]

# Marker Gene Autocorrelation

In [None]:
def gene_morans_i(clustering, locations, clusters):
    print("Starting Moran's I Calculation.")
    moran_clusters = ad.AnnData(locations)
    sc.pp.neighbors(moran_clusters, n_pcs=0, n_neighbors=100)
    print("Neighbors calculated.")

    # Create a binary adjacency matrix indicating if points are in the same cluster
    cluster_labels = clusters.values
    same_cluster = (cluster_labels[:, None] == cluster_labels).astype(int)
    print(moran_clusters.obsp["connectivities"].shape, same_cluster.shape)
    moran_clusters.obsp["connectivities"] = moran_clusters.obsp["connectivities"].multiply(csr_matrix(same_cluster))
    print("Connectivities formed.")

    # Calculate Moran's I for the genes
    morans_i = sc.metrics.morans_i(moran_clusters, vals=clustering.xenium_spot_data.X.T)

    morans_i_dict = dict(zip(clustering.xenium_spot_data.var.index, morans_i))

    return morans_i_dict

In [None]:
def gene_gearys_c(clustering, locations, clusters):
    print("Starting Geary's C Calculation.")
    gearys_clusters = ad.AnnData(locations)
    sc.pp.neighbors(gearys_clusters, n_pcs=0, n_neighbors=100)
    print("Neighbors calculated.")

    # Create a binary adjacency matrix indicating if points are in the same cluster
    cluster_labels = clusters.values
    same_cluster = (cluster_labels[:, None] == cluster_labels).astype(int)
    gearys_clusters.obsp["connectivities"] = gearys_clusters.obsp["connectivities"].multiply(csr_matrix(same_cluster))
    print("Connectivities formed.")

    # Calculate Geary's C for the genes
    gearys_c= sc.metrics.gearys_c(gearys_clusters, vals=clustering.xenium_spot_data.X.T)

    gearys_c_dict = dict(zip(clustering.xenium_spot_data.var.index, gearys_c))

    return gearys_c_dict

In [None]:
models = ["BayXenSmooth"]
neighborhood_sizes = [1]

In [None]:
for spot_size in spot_sizes:
    clustering = XeniumCluster(data=df_transcripts, dataset_name=dataset_name)
    clustering.set_spot_size(spot_size)
    clustering.create_spot_data(third_dim=False, save_data=True)
    locations = clustering.xenium_spot_data.obs[["row", "col"]]
    for model in models:
        for K in K_values:
            if model in ["Leiden", "Louvain"]:
                for resolution in resolutions:
                    clusters = pd.read_csv(f"results/{dataset_name}/{model}/{resolution}/clusters/{spot_size}/clusters_RES={resolution}.csv")[f"{model} cluster"]
                    save_results(gene_morans_i(clustering, locations, clusters), dataset_name, model, "morans_i_by_gene", spot_size, resolution=resolution)
                    save_results(gene_gearys_c(clustering, locations, clusters), dataset_name, model, "gearys_c_by_gene", spot_size, resolution=resolution)
            elif model == "BayXenSmooth":
                for neighborhood_size in neighborhood_sizes:
                    clusters = pd.read_csv(f"results/{dataset_name}/{model}/clusters/PCA/{BayXenSmooth_PCs}/KMEANSINIT=True/NEIGHBORSIZE={neighborhood_size}/NUMCLUSTERS={K}/SPATIALINIT=True/SAMPLEFORASSIGNMENT={sample_for_assignment}/SPATIALNORM=1.0/SPATIALPRIORMULT={concentraion_amp}/SPOTSIZE={spot_size}/AGG={agg}/clusters_K={K}.csv")[f"{model} cluster"]
                    save_results(gene_morans_i(clustering, locations, clusters), dataset_name, model, "morans_i_by_gene", spot_size, K=K, neighborhood_size=neighborhood_size)
                    save_results(gene_gearys_c(clustering, locations, clusters), dataset_name, model, "gearys_c_by_gene", spot_size, K=K, neighborhood_size=neighborhood_size)
            else:
                clusters = pd.read_csv(f"results/{dataset_name}/{model}/{K}/clusters/{spot_size}/clusters_K={K}.csv")[f"{model} cluster"]
                save_results(gene_morans_i(clustering, locations, clusters), dataset_name, model, "morans_i_by_gene", spot_size, K=K)
                save_results(gene_gearys_c(clustering, locations, clusters), dataset_name, model, "gearys_c_by_gene", spot_size, K=K)



Starting Moran's I Calculation.
Neighbors calculated.
(6066, 6066) (6066, 6066)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(6066, 6066) (6066, 6066)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(6066, 6066) (6066, 6066)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(6066, 6066) (6066, 6066)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(6066, 6066) (6066, 6066)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.




Starting Moran's I Calculation.
Neighbors calculated.
(10564, 10564) (10564, 10564)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(10564, 10564) (10564, 10564)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(10564, 10564) (10564, 10564)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(10564, 10564) (10564, 10564)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(10564, 10564) (10564, 10564)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.




Starting Moran's I Calculation.
Neighbors calculated.
(22643, 22643) (22643, 22643)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(22643, 22643) (22643, 22643)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(22643, 22643) (22643, 22643)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(22643, 22643) (22643, 22643)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
Starting Moran's I Calculation.




Neighbors calculated.
(22643, 22643) (22643, 22643)
Connectivities formed.
Starting Geary's C Calculation.




Neighbors calculated.
Connectivities formed.
