In [366]:
import json
import jsonlines
import os
import re
import torch
import warnings
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from scipy.stats import norm

from importlib import reload
import xenium_cluster
reload(xenium_cluster)
from xenium_cluster import XeniumCluster

from scipy.sparse import csr_matrix

# Variables Setup

In [367]:
# Dataset name: selects the transcripts file under data/ and is passed to XeniumCluster.
dataset_name = "hBreast"
# Clustering methods whose saved cluster assignments are scored by the metric loop below.
models = ["Leiden", "Louvain", "K-Means", "Hierarchical_No_Spatial", "mclust"]
# Only referenced by the commented-out legacy loop in the part of the notebook shown here.
resolutions = [0.75]
# Each value is passed to XeniumCluster.set_spot_size, one pass per value.
spot_sizes = [100, 75, 50]
# Cluster counts; each K selects result files named clusters_K={K}.csv.
K_values = [17]

In [350]:
# BayXenSmooth Hyperparameters
# NOTE(review): in the cells shown here, most of these names are either unused or only
# appear in the commented-out legacy loop; only `num_neighboring_spots` and `kernels`
# are read by live code (the metric loops).
BayXenSmooth_PCs = [3, 5, 10, 25]
BayesSpace_PCs = [3, 5, 10, 15, 25]
neighborhood_sizes = [1,2,3,4,5]
sample_for_assignment = False
concentration_amp = 1.0
spatial_norms = [0.05, 0.1]
aggs = ["sum", "mean", "weighted"]
# Neighbour counts used when building the spatial graph for Moran's I / Geary's C.
num_neighboring_spots = [50, 100, 200, 400]
# Graph weighting schemes passed as `kernel=` to the metric functions.
kernels = ['umap', 'naive_distance']

# Load Data

In [351]:
# Location of the gzipped transcripts table for the selected dataset
file_path = f'data/{dataset_name}/transcripts.csv.gz'

# Load the whole table into memory
df_transcripts = pd.read_csv(file_path, compression='gzip')

# Keep only transcripts that carry a cell id (-1 marks "no cell")
has_cell_id = df_transcripts["cell_id"] != -1
df_transcripts = df_transcripts[has_cell_id]

# Remove blank and negative-control features
feature_names = df_transcripts["feature_name"]
is_blank = feature_names.str.startswith('BLANK_')
is_neg_control = feature_names.str.startswith('NegControl')
df_transcripts = df_transcripts[~is_blank & ~is_neg_control]

# Other Metric Implementations

- Variation Index (TODO): implement this if we want to compare the clusterings produced by competing methods with our own clustering.


In [368]:
def morans_i_cluster_similarity(clustering, locations, clusters, num_neighbors=100, kernel='umap', p=1, print_output=False):
    """Average Moran's I of the per-cluster membership indicators.

    A neighbour graph is built over ``locations``. For every distinct label in
    ``clusters`` a 0/1 membership vector is scored with ``sc.metrics.morans_i``
    on that graph, and the per-cluster scores are averaged.

    Parameters
    ----------
    clustering : unused; kept so existing call sites keep working.
    locations : spot coordinates, handed to ``ad.AnnData`` as the data matrix.
    clusters : object exposing ``.values`` with one label per row of ``locations``.
    num_neighbors : number of neighbours per spot used to build the graph.
    kernel : 'umap' or 'gauss' (graph from ``sc.pp.neighbors``), or
        'naive_distance' (weights ``1 / (1 + d) ** (1 / p)`` over the
        ``num_neighbors`` nearest spots).
    p : exponent parameter of the 'naive_distance' weights.
    print_output : unused.

    Returns
    -------
    Mean of the per-cluster Moran's I values.

    Raises
    ------
    ValueError
        If ``kernel`` is not one of the supported names.
    """
    moran_clusters = ad.AnnData(locations)
    cluster_labels = clusters.values

    if kernel in ('umap', 'gauss'):
        sc.pp.neighbors(moran_clusters, n_neighbors=num_neighbors, use_rep='X', n_pcs=0, method=kernel)
    elif kernel == 'naive_distance':
        nbrs = NearestNeighbors(n_neighbors=num_neighbors).fit(moran_clusters.X)
        distances, indices = nbrs.kneighbors(moran_clusters.X)
        num_spots = moran_clusters.shape[0]
        weights = 1 / ((1 + distances) ** (1 / p))
        # Build the sparse graph in one shot; assigning into an empty csr_matrix
        # with fancy indexing is slow and triggers SparseEfficiencyWarning.
        row_index = np.repeat(np.arange(num_spots), indices.shape[1])
        moran_clusters.obsp["connectivities"] = csr_matrix(
            (weights.ravel(), (row_index, indices.ravel())),
            shape=(num_spots, num_spots),
        )
    else:
        # Without this, an unknown kernel surfaced later as an opaque KeyError.
        raise ValueError(f"Unsupported kernel '{kernel}'; expected 'umap', 'gauss' or 'naive_distance'.")

    # Calculate Moran's I for the binary presence of each cluster
    graph = moran_clusters.obsp["connectivities"]
    morans_i_results = {}
    for cluster in np.unique(cluster_labels):
        cluster_indicator = (cluster_labels == cluster).astype(int)
        morans_i_results[cluster] = sc.metrics.morans_i(graph, vals=cluster_indicator)

    return np.mean(list(morans_i_results.values()))

In [369]:
def gearys_c_cluster_similarity(clustering, locations, clusters, num_neighbors=100, kernel='umap', p=1, print_output=False):
    """Average Geary's C of the per-cluster membership indicators.

    A neighbour graph is built over ``locations``. For every distinct label in
    ``clusters`` a 0/1 membership vector is scored with ``sc.metrics.gearys_c``
    on that graph, and the per-cluster scores are averaged.

    Parameters
    ----------
    clustering : unused; kept so existing call sites keep working.
    locations : spot coordinates, handed to ``ad.AnnData`` as the data matrix.
    clusters : object exposing ``.values`` with one label per row of ``locations``.
    num_neighbors : number of neighbours per spot used to build the graph.
    kernel : 'umap' or 'gauss' (graph from ``sc.pp.neighbors``), or
        'naive_distance' (weights ``1 / (1 + d) ** (1 / p)`` over the
        ``num_neighbors`` nearest spots).
    p : exponent parameter of the 'naive_distance' weights.
    print_output : unused.

    Returns
    -------
    Mean of the per-cluster Geary's C values.

    Raises
    ------
    ValueError
        If ``kernel`` is not one of the supported names.
    """
    gearys_clusters = ad.AnnData(locations)
    cluster_labels = clusters.values

    if kernel in ('umap', 'gauss'):
        sc.pp.neighbors(gearys_clusters, n_neighbors=num_neighbors, use_rep='X', n_pcs=0, method=kernel)
    elif kernel == 'naive_distance':
        nbrs = NearestNeighbors(n_neighbors=num_neighbors).fit(gearys_clusters.X)
        distances, indices = nbrs.kneighbors(gearys_clusters.X)
        num_spots = gearys_clusters.shape[0]
        weights = 1 / ((1 + distances) ** (1 / p))
        # Build the sparse graph in one shot; assigning into an empty csr_matrix
        # with fancy indexing is slow and triggers SparseEfficiencyWarning.
        row_index = np.repeat(np.arange(num_spots), indices.shape[1])
        gearys_clusters.obsp["connectivities"] = csr_matrix(
            (weights.ravel(), (row_index, indices.ravel())),
            shape=(num_spots, num_spots),
        )
    else:
        # Without this, an unknown kernel surfaced later as an opaque lookup error.
        raise ValueError(f"Unsupported kernel '{kernel}'; expected 'umap', 'gauss' or 'naive_distance'.")

    # Calculate Geary's C for the binary presence of each cluster.
    # The graph is passed explicitly, as the Moran's I counterpart does.
    graph = gearys_clusters.obsp["connectivities"]
    gearys_c_results = {}
    for cluster in np.unique(cluster_labels):
        cluster_indicator = (cluster_labels == cluster).astype(int)
        gearys_c_results[cluster] = sc.metrics.gearys_c(graph, vals=cluster_indicator)

    return np.mean(list(gearys_c_results.values()))

In [370]:
def save_results(results, directory, metric_name, specification=None):
    """Write ``results`` to ``<directory>[/<specification>]/<metric_name>.jsonl``.

    Parameters
    ----------
    results : either an object with ``.items()`` (written as one
        ``{key: value}`` record per line) or any other value (written as a
        single record).
    directory : base output directory; created if missing.
    metric_name : file stem of the output file.
    specification : optional sub-path appended to ``directory``.

    The file is opened in write mode, so an existing file is overwritten.
    """
    full_path = os.path.join(directory, f"{specification}") if specification else directory

    # Create the directory if it doesn't exist
    os.makedirs(full_path, exist_ok=True)

    with jsonlines.open(os.path.join(full_path, f"{metric_name}.jsonl"), mode='w') as writer:
        # Decide up front instead of catching AttributeError around the whole
        # loop: an AttributeError raised mid-write used to be mistaken for
        # "not a dict" and appended the full object after the partial records.
        if hasattr(results, "items"):
            for key, value in results.items():
                writer.write({key: value})
        else:
            writer.write(results)

# Calculate the Silhouette Score (and other metrics of note.)

In [378]:
from IPython.utils import io

# Score every saved clustering: silhouette score once per cluster file, and
# Moran's I / Geary's C once per (kernel, neighbour count) combination.
# capture_output() swallows everything printed inside the block.
with io.capture_output():
    for spot_size in spot_sizes:
        print(spot_size)
        clustering = XeniumCluster(data=df_transcripts, dataset_name=dataset_name)
        clustering.set_spot_size(spot_size)
        clustering.create_spot_data(third_dim=False, save_data=True)
        locations = clustering.xenium_spot_data.obs[["row", "col"]]
        for K in K_values:
            for model in models:
                results_dir = f"results/{dataset_name}/{model}"
                # BayXenSmooth encodes the spot size as "SPOTSIZE=<size>" in its
                # result path; the other methods use a bare "<size>" directory.
                spot_dir_marker = f"/SPOTSIZE={spot_size}/" if model == "BayXenSmooth" else f"/{spot_size}/"
                for root, dirs, files in os.walk(results_dir):
                    for file in files:
                        is_cluster_file = file == f"clusters_K={K}.csv" or (file.endswith(".csv") and "clusters_RES" in file)
                        cluster_path = os.path.join(root, file)
                        if not (is_cluster_file and spot_dir_marker in cluster_path):
                            continue
                        # Read the labels once per file; they do not depend on the
                        # kernel or the neighbour count.
                        clusters = pd.read_csv(cluster_path)[f"{model} cluster"]
                        # The silhouette score is graph-independent, so compute it once.
                        save_results(silhouette_score(locations, clusters), root, "silhouette_score")
                        for neighboring_spots in num_neighboring_spots:
                            for kernel in kernels:
                                specification = f"{kernel}/{neighboring_spots}"
                                save_results(morans_i_cluster_similarity(clustering, locations, clusters, num_neighbors=neighboring_spots, kernel=kernel), root, "morans_i", specification=specification)
                                save_results(gearys_c_cluster_similarity(clustering, locations, clusters, num_neighbors=neighboring_spots, kernel=kernel), root, "gearys_c", specification=specification)

In [None]:
# for spot_size in spot_sizes:
#     clustering = XeniumCluster(data=df_transcripts, dataset_name=dataset_name)
#     clustering.set_spot_size(spot_size)
#     clustering.create_spot_data(third_dim=False, save_data=True)
#     locations = clustering.xenium_spot_data.obs[["row", "col"]]
#     for model in models:
#         for K in K_values:
#             if model in ["Leiden", "Louvain"]:
#                 for resolution in resolutions:
#                     clusters = pd.read_csv(f"results/{dataset_name}/{model}/{resolution}/clusters/{spot_size}/clusters_RES={resolution}.csv")[f"{model} cluster"]
#                     save_results(silhouette_score(locations, clusters), dataset_name, model, "silhouette_score", spot_size, resolution=resolution)
#                     save_results(morans_i_cluster_similarity(clustering, locations, clusters), dataset_name, model, "morans_i", spot_size, resolution=resolution)
#                     save_results(gearys_c_cluster_similarity(clustering, locations, clusters), dataset_name, model, "gearys_c", spot_size, resolution=resolution)
#             elif model == "BayXenSmooth":
#                 min_expressions_per_spot = 10
#                 clustering.xenium_spot_data = clustering.xenium_spot_data[clustering.xenium_spot_data.X.sum(axis=1) > min_expressions_per_spot]
#                 for neighborhood_size in neighborhood_sizes:
#                     clusters = pd.read_csv(f"results/{dataset_name}/{model}/clusters/PCA/{BayXenSmooth_PCs}/KMEANSINIT=True/NEIGHBORSIZE={neighborhood_size}/NUMCLUSTERS={K}/SPATIALINIT=True/SAMPLEFORASSIGNMENT={sample_for_assignment}/SPATIALNORM={spatial_norm}/SPATIALPRIORMULT={concentration_amp}/SPOTSIZE={spot_size}/AGG={agg}/clusters_K={K}.csv")[f"{model} cluster"]
#                     save_results(silhouette_score(locations, clusters), dataset_name, model, "silhouette_score", spot_size, K=K)
#                     save_results(morans_i_cluster_similarity(clustering, locations, clusters), dataset_name, model, "morans_i", spot_size, K=K, sample_for_assignment=sample_for_assignment)
#                     save_results(gearys_c_cluster_similarity(clustering, locations, clusters), dataset_name, model, "gearys_c", spot_size, K=K, sample_for_assignment=sample_for_assignment)
#             else:
#                 clusters = pd.read_csv(f"results/{dataset_name}/{model}/{K}/clusters/{spot_size}/clusters_K={K}.csv")[f"{model} cluster"]
#                 save_results(silhouette_score(locations, clusters), dataset_name, model, "silhouette_score", spot_size, K=K)
#                 save_results(morans_i_cluster_similarity(clustering, locations, clusters), dataset_name, model, "morans_i", spot_size, K=K)
#                 save_results(gearys_c_cluster_similarity(clustering, locations, clusters), dataset_name, model, "gearys_c", spot_size, K=K)

# Marker Gene Autocorrelation

In [104]:
# Genes that the per-gene Moran's I cells below report on and rank individually.
MARKER_GENES = ["BANK1", "CEACAM6", "FASN", "FGL2", "IL7R", "KRT6B", "POSTN", "TCIM"]

In [207]:
def gene_morans_i(clustering, moran_clusters, clusters, num_neighbors=100, kernel='umap', p=1, marker_genes=MARKER_GENES, print_output=False):
    """Per-gene Moran's I on a neighbour graph restricted to same-cluster edges.

    Parameters
    ----------
    clustering : object whose ``xenium_spot_data`` provides the expression
        matrix ``X`` and the gene names in ``var.index``.
    moran_clusters : AnnData over the spot locations. It is modified in place:
        ``obsp["adjacency"]`` is always written, and ``obsp["connectivities"]``
        is written for the graph-based kernels.
    clusters : object exposing ``.values`` with one cluster label per spot.
    num_neighbors : neighbours per spot for the graph-based kernels.
    kernel : 'umap' / 'gauss' (graph from ``sc.pp.neighbors``),
        'naive_distance' (weights ``1 / (1 + d) ** (1 / p)`` over the nearest
        spots) or 'basic' (every same-cluster pair weighted 1). Any other
        value warns and falls back to 'basic'.
    p : exponent parameter of the 'naive_distance' weights.
    marker_genes : genes whose values are printed when ``print_output`` is set.
    print_output : print the edge count and the marker-gene values.

    Returns
    -------
    dict mapping gene name to its Moran's I.
    """
    cluster_labels = clusters.values

    def keep_within_cluster_edges(graph):
        # Same entries as graph.multiply(<dense 0/1 same-cluster matrix>), but
        # filtered edge by edge so the n x n indicator is never materialised.
        edges = graph.tocoo()
        same_label = cluster_labels[edges.row] == cluster_labels[edges.col]
        return csr_matrix(
            (edges.data[same_label].astype(np.float64), (edges.row[same_label], edges.col[same_label])),
            shape=edges.shape,
        )

    if kernel in ('umap', 'gauss'):
        sc.pp.neighbors(moran_clusters, n_neighbors=num_neighbors, use_rep='X', n_pcs=0, method=kernel)
        moran_clusters.obsp["adjacency"] = keep_within_cluster_edges(moran_clusters.obsp["connectivities"])
    elif kernel == 'naive_distance':
        nbrs = NearestNeighbors(n_neighbors=num_neighbors).fit(moran_clusters.X)
        distances, indices = nbrs.kneighbors(moran_clusters.X)
        num_spots = moran_clusters.shape[0]
        weights = 1 / ((1 + distances) ** (1 / p))
        # Construct the graph directly rather than assigning into an empty
        # csr_matrix, which is slow and triggers SparseEfficiencyWarning.
        row_index = np.repeat(np.arange(num_spots), indices.shape[1])
        connectivities = csr_matrix(
            (weights.ravel(), (row_index, indices.ravel())),
            shape=(num_spots, num_spots),
        )
        moran_clusters.obsp["connectivities"] = connectivities
        moran_clusters.obsp["adjacency"] = keep_within_cluster_edges(connectivities)
    else:
        if kernel != 'basic':
            warnings.warn(f"Kernel '{kernel}' not implemented. Using 'basic' kernel instead. We recommend 'umap'.", UserWarning)
        # The 'basic' adjacency is the full same-cluster indicator, so the
        # dense n x n comparison is unavoidable here.
        same_cluster = (cluster_labels[:, None] == cluster_labels).astype(int)
        moran_clusters.obsp["adjacency"] = csr_matrix(same_cluster)

    # Calculate Moran's I for the genes
    morans_i = sc.metrics.morans_i(moran_clusters.obsp["adjacency"], vals=clustering.xenium_spot_data.X.T)
    morans_i_dict = dict(zip(clustering.xenium_spot_data.var.index, morans_i))

    # Print the number of non-zero adjacencies
    if print_output:
        num_nonzero = moran_clusters.obsp["adjacency"].nnz
        print(f"Number of non-zero adjacencies: {num_nonzero}")
        for gene in marker_genes:
            print(num_neighbors, gene, morans_i_dict[gene])

    return morans_i_dict

In [208]:
def gene_gearys_c(clustering, gearys_clusters, clusters, num_neighbors=100):
    """Per-gene Geary's C on the neighbour graph restricted to same-cluster edges.

    Parameters
    ----------
    clustering : object whose ``xenium_spot_data`` provides the expression
        matrix ``X`` and the gene names in ``var.index``.
    gearys_clusters : AnnData that must already hold a sparse
        ``obsp["connectivities"]`` graph; ``obsp["adjacency"]`` is written in place.
    clusters : object exposing ``.values`` with one cluster label per spot.
    num_neighbors : unused; the graph is taken as given.

    Returns
    -------
    dict mapping gene name to its Geary's C.
    """
    cluster_labels = clusters.values

    # Keep only edges whose endpoints share a cluster label. This yields the
    # same entries as multiplying by a dense 0/1 same-cluster matrix, without
    # allocating that n x n matrix.
    edges = gearys_clusters.obsp["connectivities"].tocoo()
    same_label = cluster_labels[edges.row] == cluster_labels[edges.col]
    gearys_clusters.obsp["adjacency"] = csr_matrix(
        (edges.data[same_label].astype(np.float64), (edges.row[same_label], edges.col[same_label])),
        shape=edges.shape,
    )
    print("Connectivities formed.")

    # Calculate Geary's C for the genes
    gearys_c = sc.metrics.gearys_c(gearys_clusters.obsp["adjacency"], vals=clustering.xenium_spot_data.X.T)

    return dict(zip(clustering.xenium_spot_data.var.index, gearys_c))

In [299]:
# Settings for the per-gene Moran's I run below.
# NOTE: this rebinds `models`, `num_neighboring_spots`, `kernels` and `spot_sizes`
# from the setup cells, so any cell executed afterwards sees these values instead.
models = ["BayesSpace", "Leiden", "Louvain", "K-Means", "Hierarchical_No_Spatial", "mclust"]
num_neighboring_spots = [50, 100, 200, 400]
kernels = ['umap']
spot_sizes = [50]
# Cluster count: selects clusters_K={K}.csv files and is compared with the MPD dict size in later cells.
K = 17

In [None]:
# Compute and save per-gene Moran's I for every saved clustering of each model.
for spot_size in spot_sizes:
    clustering = XeniumCluster(data=df_transcripts, dataset_name=dataset_name)
    clustering.set_spot_size(spot_size)
    clustering.create_spot_data(third_dim=False, save_data=True)
    locations = clustering.xenium_spot_data.obs[["row", "col"]]
    # Result paths name the spot size either "<size>" or "SPOTSIZE=<size>".
    # Matching whole path components avoids substring hits such as spot size
    # 75 matching the resolution directory "0.75".
    spot_dir_names = {str(spot_size), f"SPOTSIZE={spot_size}"}
    for model in models:
        print(model)

        moran_clusters = ad.AnnData(locations)
        gearys_clusters = ad.AnnData(locations)

        # Define the directory where the results are stored
        results_dir = f"results/{dataset_name}/{model}"

        # Loop through all subdirectories in the results directory
        for root, dirs, files in os.walk(results_dir):
            if spot_dir_names.isdisjoint(os.path.normpath(root).split(os.sep)):
                continue
            for file in files:
                # Only cluster-assignment files: clusters_K=<K>.csv or clusters_RES*.csv
                if file == f"clusters_K={K}.csv" or (file.endswith(".csv") and "clusters_RES" in file):
                    # Read the labels once per file rather than once per setting.
                    clusters = pd.read_csv(os.path.join(root, file))[f"{model} cluster"]
                    for neighboring_spots in num_neighboring_spots:
                        for kernel in kernels:
                            save_results(gene_morans_i(clustering, moran_clusters, clusters, num_neighbors=neighboring_spots, kernel=kernel, print_output=False), root, "morans_i_by_gene", specification=f"{kernel}/{neighboring_spots}")

# Specific Method Marker Gene Ranking

In [250]:
# Root of the saved results that are scanned for per-gene Moran's I files
results_dir = "results/hBreast/Leiden"
morans_i_dict = {}

# Best (lowest) rank seen so far for each marker gene, plus the file it came from
marker_gene_ranking_dict = {}
for gene in MARKER_GENES:
    marker_gene_ranking_dict[gene] = {"rank": float('inf')}

for root, dirs, files in os.walk(results_dir):
    for file in files:
        # Only the per-gene Moran's I files are of interest
        if file != "morans_i_by_gene.jsonl":
            continue
        used_runs = ["Leiden/0.75", "Louvain/1.0"]

        file_path = os.path.join(root, file)
        # Leiden/Louvain results count only for the selected resolutions
        is_graph_method = any(x in file_path for x in ["Leiden", "Louvain"])
        is_used_run = any(x in file_path for x in used_runs)
        if is_graph_method and not is_used_run:
            continue

        with open(file_path, 'r') as f:
            # One {gene: value} record per line
            data = list(jsonlines.Reader(f))
            # Highest Moran's I first, so position + 1 is the gene's rank
            data.sort(key=lambda x: next(iter(x.values())), reverse=True)
            for i, data_point in enumerate(data):
                gene, morans_i_val = next(iter(data_point.items()))
                if gene not in MARKER_GENES:
                    continue
                best_so_far = marker_gene_ranking_dict[gene]
                if best_so_far["rank"] > (i + 1):
                    best_so_far["rank"] = (i + 1)
                    best_so_far["filepath"] = file_path

In [251]:
marker_gene_ranking_dict

{'BANK1': {'rank': 3,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl'},
 'CEACAM6': {'rank': 206,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/400/morans_i_by_gene.jsonl'},
 'FASN': {'rank': 14,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/400/morans_i_by_gene.jsonl'},
 'FGL2': {'rank': 145,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/100/morans_i_by_gene.jsonl'},
 'IL7R': {'rank': 14,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl'},
 'KRT6B': {'rank': 30,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl'},
 'POSTN': {'rank': 123,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/400/morans_i_by_gene.jsonl'},
 'TCIM': {'rank': 71,
  'filepath': 'results/hBreast/Leiden/0.75/clusters/50/umap/100/morans_i_by_gene.jsonl'}}

# BayXenSmooth Best Moran's I

In [None]:
# Define the directory where the results are stored
results_dir = "results/hBreast/BayXenSmooth"

# Matches the "<kernel>/<num_neighbors>" tail of a metric directory.
kernel_dir_pattern = r"(?:umap|naive_distance|gauss|basic)/\d+"

# For every marker gene, find the spot-size-50 run with the highest Moran's I
# and record (value, file path, MPD summary of that run).
morans_i_dict = {}
for gene_name in MARKER_GENES:
    current_max_expression = 0
    current_max_filepath = None  # stays None when no file beats the initial 0
    current_best_mpd = float('inf')
    for root, dirs, files in os.walk(results_dir):
        for file in files:
            # Only per-gene Moran's I files are relevant
            if file != "morans_i_by_gene.jsonl":
                continue
            file_path = os.path.join(root, file)
            # Filter on the path before parsing the file
            if "SPOTSIZE=50" not in file_path:  # and "K-Means" in file_path:
                continue
            with open(file_path, 'r') as f:
                data = [line for line in jsonlines.Reader(f)]
            # A file may lack this gene; skip it instead of raising IndexError
            gene_value = next((x[gene_name] for x in data if gene_name in x), None)
            if gene_value is None or not gene_value > current_max_expression:
                continue
            current_max_filepath = file_path
            current_max_expression = gene_value
            # Forget the MPD of the previous best run so that a missing MPD
            # file cannot leave a value that belongs to a different run.
            current_best_mpd = float('inf')
            mpd_dir = re.sub(kernel_dir_pattern, "", root).replace("clusters", "mpd")
            try:
                mpd_file = next((name for name in os.listdir(mpd_dir) if name.endswith(".json")), None)
            except FileNotFoundError:
                # os.listdir is what raises here, so no file name is available yet
                print(mpd_dir, "has missing MPD.")
                continue
            if mpd_file:
                with open(os.path.join(mpd_dir, mpd_file)) as mpd_handle:
                    mpd_dict = json.load(mpd_handle)
                # NOTE(review): another cell in this notebook divides by 1_000_000 — confirm the intended scale.
                if len(mpd_dict) == K:
                    current_best_mpd = sum(mpd_dict.values()) / 1_000
                else:
                    current_best_mpd = (sum(mpd_dict.values()) / 1_000, f"(but K* = {len(mpd_dict)})")
    morans_i_dict[gene_name] = current_max_expression, current_max_filepath, current_best_mpd

In [379]:
# Define the directory where the results are stored
results_dir = "results/hBreast"

# For each model and spot size, report the run with the highest cluster-level Moran's I.
models = ["BayXenSmooth", "BayesSpace", "Leiden", "Louvain", "Hierarchical_No_Spatial", "K-Means", "mclust"]
# Leiden/Louvain results are only considered for these resolutions.
used_runs = ["Leiden/0.75", "Louvain/1.0"]
for model in models:
    new_results_dir = os.path.join(results_dir, model)
    morans_i_dict = {}
    for spot_size in [50, 75, 100]:
        current_best_MI = float('-inf')
        current_best_filepath = ""
        spot_dir_names = (f"{spot_size}", f"SPOTSIZE={spot_size}")
        for root, dirs, files in os.walk(new_results_dir):
            for file in files:
                # Only cluster-level Moran's I files are relevant
                if file != "morans_i.jsonl":
                    continue
                file_path = os.path.join(root, file)
                if any(x in file_path for x in ["Leiden", "Louvain"]) and not any(x in file_path for x in used_runs):
                    continue
                # Drop the last two components (neighbour-count directory and
                # file name) so a neighbour count is not mistaken for a spot size.
                path_parts = file_path.split('/')[:-2]
                if not any(name in path_parts for name in spot_dir_names):
                    continue
                # Parse the file only once the path is known to be relevant
                with open(file_path, 'r') as f:
                    morans_i = next(iter(jsonlines.Reader(f)), None)
                # An empty file yields None and is skipped
                if morans_i is not None and morans_i > current_best_MI:
                    current_best_MI = morans_i
                    current_best_filepath = file_path

        morans_i_dict[spot_size] = current_best_MI, current_best_filepath
        print(morans_i_dict)

{50: (0.7146477812460335, 'results/hBreast/BayXenSmooth/clusters/PCA/25/INIT=K-Means/NEIGHBORSIZE=4/NUMCLUSTERS=17/SAMPLEFORASSIGNMENT=False/SPATIALPRIORMULT=DIRECT/SPOTSIZE=50/AGG=mean/MU_PRIOR=1.0/SIGMA_PRIOR=0.25/LOGITS_PRIOR=0.1/LEARN_GLOBAL_VARS=True/umap/50/morans_i.jsonl')}
{50: (0.7146477812460335, 'results/hBreast/BayXenSmooth/clusters/PCA/25/INIT=K-Means/NEIGHBORSIZE=4/NUMCLUSTERS=17/SAMPLEFORASSIGNMENT=False/SPATIALPRIORMULT=DIRECT/SPOTSIZE=50/AGG=mean/MU_PRIOR=1.0/SIGMA_PRIOR=0.25/LOGITS_PRIOR=0.1/LEARN_GLOBAL_VARS=True/umap/50/morans_i.jsonl'), 75: (-inf, '')}
{50: (0.7146477812460335, 'results/hBreast/BayXenSmooth/clusters/PCA/25/INIT=K-Means/NEIGHBORSIZE=4/NUMCLUSTERS=17/SAMPLEFORASSIGNMENT=False/SPATIALPRIORMULT=DIRECT/SPOTSIZE=50/AGG=mean/MU_PRIOR=1.0/SIGMA_PRIOR=0.25/LOGITS_PRIOR=0.1/LEARN_GLOBAL_VARS=True/umap/50/morans_i.jsonl'), 75: (-inf, ''), 100: (-inf, '')}
{50: (0.5345209505978771, 'results/hBreast/BayesSpace/25/17/clusters/kmeans/50/3.00/umap/50/morans_i.json

In [293]:
morans_i_dict

{50: (0.7367223964612759,
  'results/hBreast/BayXenSmooth/clusters/PCA/25/INIT=Leiden/NEIGHBORSIZE=4/NUMCLUSTERS=17/SAMPLEFORASSIGNMENT=False/SPATIALPRIORMULT=DIRECT/SPOTSIZE=50/AGG=mean/MU_PRIOR=0.1/SIGMA_PRIOR=0.25/LOGITS_PRIOR=0.1/LEARN_GLOBAL_VARS=True/umap/50/morans_i.jsonl'),
 75: (-inf,
  'results/hBreast/BayXenSmooth/clusters/PCA/25/INIT=Leiden/NEIGHBORSIZE=4/NUMCLUSTERS=17/SAMPLEFORASSIGNMENT=False/SPATIALPRIORMULT=DIRECT/SPOTSIZE=50/AGG=mean/MU_PRIOR=0.1/SIGMA_PRIOR=0.25/LOGITS_PRIOR=0.1/LEARN_GLOBAL_VARS=True/umap/50/morans_i.jsonl'),
 100: (-inf,
  'results/hBreast/BayXenSmooth/clusters/PCA/25/INIT=Leiden/NEIGHBORSIZE=4/NUMCLUSTERS=17/SAMPLEFORASSIGNMENT=False/SPATIALPRIORMULT=DIRECT/SPOTSIZE=50/AGG=mean/MU_PRIOR=0.1/SIGMA_PRIOR=0.25/LOGITS_PRIOR=0.1/LEARN_GLOBAL_VARS=True/umap/50/morans_i.jsonl')}

# Best Moran's I

In [None]:
import os
import json

MARKER_GENES = ["BANK1", "CEACAM6", "FASN", "FGL2", "IL7R", "KRT6B", "POSTN", "TCIM"]

# Define the directory where the results are stored
results_dir = "results/hBreast"

# Runs excluded from the comparison
unused_runs = ["Leiden/2.0", "Leiden/1.5", "Louvain/2.0", "Louvain/1.5"]
# Matches the "<kernel>/<num_neighbors>" tail of a metric directory.
kernel_dir_pattern = r"(?:umap|naive_distance|gauss|basic)/\d+"
# Directory names that denote spot size 50
spot_dir_names = {"50", "SPOTSIZE=50"}

# For every marker gene, find the spot-size-50 run with the highest Moran's I
# and record (value, file path, MPD summary of that run).
morans_i_dict = {}
for gene_name in MARKER_GENES:
    current_max_expression = 0
    current_max_filepath = None  # stays None when no file beats the initial 0
    current_best_mpd = float('inf')
    for root, dirs, files in os.walk(results_dir):
        for file in files:
            # Only per-gene Moran's I files are relevant
            if file != "morans_i_by_gene.jsonl":
                continue
            file_path = os.path.join(root, file)
            # Match the spot size on whole path components, ignoring the last
            # one (the neighbour count): a plain "50/" substring test also
            # accepted ".../umap/50/" for any spot size.
            if spot_dir_names.isdisjoint(os.path.normpath(root).split(os.sep)[:-1]):
                continue
            if any(x in file_path for x in unused_runs):
                continue
            with open(file_path, 'r') as f:
                data = [line for line in jsonlines.Reader(f)]
            # A file may lack this gene; skip it instead of raising IndexError
            gene_value = next((x[gene_name] for x in data if gene_name in x), None)
            if gene_value is None or not gene_value > current_max_expression:
                continue
            current_max_filepath = file_path
            current_max_expression = gene_value
            # Forget the MPD of the previous best run so it cannot be reported
            # for a run whose own MPD file is missing.
            current_best_mpd = float('inf')
            mpd_dir = re.sub(kernel_dir_pattern, "", root).replace("clusters", "mpd")
            try:
                mpd_file = next((name for name in os.listdir(mpd_dir) if name.endswith(".json")), None)
            except FileNotFoundError:
                print(mpd_dir, "has missing MPD.")
                continue
            if mpd_file:
                with open(os.path.join(mpd_dir, mpd_file)) as mpd_handle:
                    mpd_dict = json.load(mpd_handle)
                # NOTE(review): other cells in this notebook divide by 1_000 — confirm the intended scale.
                if len(mpd_dict) == K:
                    current_best_mpd = sum(mpd_dict.values()) / 1_000_000
                else:
                    current_best_mpd = (sum(mpd_dict.values()) / 1_000_000, f"(but K* = {len(mpd_dict)})")
    morans_i_dict[gene_name] = current_max_expression, current_max_filepath, current_best_mpd

In [None]:
morans_i_dict

# Specific Methods Comparison

In [212]:
import os
import json

# Define the directory where the results are stored
# results_dirs = ["results/hBreast/BayXenSmooth", "results/hBreast/BayesSpace"]
results_dirs = ["results/hBreast/Leiden"]

# Leiden/Louvain results are only considered for these resolutions.
used_runs = ["Leiden/0.75", "Louvain/1.0"]
# Matches the "<kernel>/<num_neighbors>" tail of a metric directory.
kernel_dir_pattern = r"(?:umap|naive_distance|gauss|basic)/\d+"
# Directory names that denote spot size 50
spot_dir_names = {"50", "SPOTSIZE=50"}

# For every marker gene, find the spot-size-50 run (across the selected result
# directories) with the highest Moran's I and record (value, file path, MPD summary).
morans_i_dict = {}
for gene_name in MARKER_GENES:
    current_max_expression = 0
    current_max_filepath = None  # stays None when no file beats the initial 0
    current_best_mpd = float('inf')
    for method_dir in results_dirs:
        for root, dirs, files in os.walk(method_dir):
            for file in files:
                # Only per-gene Moran's I files are relevant
                if file != "morans_i_by_gene.jsonl":
                    continue
                file_path = os.path.join(root, file)
                # Match the spot size on whole path components, ignoring the
                # last one (the neighbour count): a plain "50/" substring test
                # also accepted ".../umap/50/" for any spot size.
                if spot_dir_names.isdisjoint(os.path.normpath(root).split(os.sep)[:-1]):
                    continue
                if any(x in file_path for x in ["Leiden", "Louvain"]) and not any(x in file_path for x in used_runs):
                    continue
                with open(file_path, 'r') as f:
                    data = [line for line in jsonlines.Reader(f)]
                # A file may lack this gene; skip it instead of raising IndexError
                gene_value = next((x[gene_name] for x in data if gene_name in x), None)
                if gene_value is None or not gene_value > current_max_expression:
                    continue
                current_max_filepath = file_path
                current_max_expression = gene_value
                # Forget the MPD of the previous best run so it cannot be
                # reported for a run whose own MPD file is missing.
                current_best_mpd = float('inf')
                mpd_dir = re.sub(kernel_dir_pattern, "", root).replace("clusters", "mpd")
                try:
                    mpd_file = next((name for name in os.listdir(mpd_dir) if name.endswith(".json")), None)
                except FileNotFoundError:
                    print(mpd_dir, "has missing MPD.")
                    continue
                if mpd_file:
                    with open(os.path.join(mpd_dir, mpd_file)) as mpd_handle:
                        mpd_dict = json.load(mpd_handle)
                    if len(mpd_dict) == K:
                        current_best_mpd = sum(mpd_dict.values()) / 1_000
                    else:
                        current_best_mpd = (sum(mpd_dict.values()) / 1_000, f"(but K* = {len(mpd_dict)})")
    # Record the result once per gene, after all directories have been scanned
    morans_i_dict[gene_name] = current_max_expression, current_max_filepath, current_best_mpd

In [213]:
morans_i_dict

{'BANK1': (1.4739417834190855,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl',
  55.93160193262027),
 'CEACAM6': (0.34395234742791236,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl',
  55.93160193262027),
 'FASN': (0.9982649300206032,
  'results/hBreast/Leiden/0.75/clusters/50/umap/400/morans_i_by_gene.jsonl',
  55.93160193262027),
 'FGL2': (0.5810988489304812,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl',
  55.93160193262027),
 'IL7R': (1.0562251422009445,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl',
  55.93160193262027),
 'KRT6B': (0.9815267718202886,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl',
  55.93160193262027),
 'POSTN': (0.6224454538593952,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gene.jsonl',
  55.93160193262027),
 'TCIM': (0.8467029807415433,
  'results/hBreast/Leiden/0.75/clusters/50/umap/50/morans_i_by_gen

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): in the cells shown here, `rows` and `bank1_data` are only assigned
# further down (expression-plot cell), so this fails on a fresh top-to-bottom run.
rows.shape, bank1_data.shape

In [None]:
import matplotlib.pyplot as plt
import torch

# Gene whose per-spot expression is drawn on the spot grid
gene = "BANK1"

# Column of the expression matrix that holds this gene
bank1_index = clustering.xenium_spot_data.var.index.get_loc(gene)

# Extract the per-spot values for the gene
bank1_data = torch.tensor(clustering.xenium_spot_data.X[:, bank1_index])

rows = clustering.xenium_spot_data.obs["row"].astype(int)
columns = clustering.xenium_spot_data.obs["col"].astype(int)

num_rows = max(rows) + 1
num_cols = max(columns) + 1

# Grid positions without a spot stay at 0
marker_grid = torch.zeros(num_rows, num_cols, dtype=float)

marker_grid[rows, columns] = bank1_data

# Plot the data
plt.figure(figsize=(10, 6))
plt.imshow(marker_grid, origin='lower')  # origin='lower' puts grid row 0 at the bottom
plt.title(f'Expression of {gene}')
# imshow draws the first array axis (row) vertically and the second (col)
# horizontally, so the x-axis is the column index and the y-axis the row index.
plt.xlabel('Column')
plt.ylabel('Row')
plt.show()

In [None]:
# Create a heatmap
# NOTE(review): `.var["BANK1"]` selects a *column* of the var table, whereas the
# expression-plot cell looks genes up through `.var.index`; presumably this lookup
# does not return per-spot expression values — confirm the intended input.
# NOTE(review): the title and axis labels describe a connectivities matrix rather
# than gene expression — confirm.
plt.figure(figsize=(8, 6))
sns.heatmap(clustering.xenium_spot_data.var["BANK1"], cmap='Blues')
plt.title('Connectivities Heatmap')
plt.xlabel('Spot Index')
plt.ylabel('Spot Index')
plt.show()

In [None]:
# Number of leading spots shown along each axis of the heatmap
spots_per_side = 100

print("Starting Moran's I Calculation.")
moran_clusters = ad.AnnData(locations)
sc.pp.neighbors(moran_clusters, n_pcs=0, n_neighbors=100)
print("Neighbors calculated.")

# Dense copy of the top-left corner of the neighbour graph, for plotting only
corner = slice(None, spots_per_side)
connectivities_submatrix = moran_clusters.obsp["connectivities"][corner, corner].toarray()
# connectivities_submatrix = same_cluster[:spots_per_side, :spots_per_side]

# Draw the corner of the graph as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(connectivities_submatrix, cmap='Blues', ax=ax)
ax.set_title('Connectivities Heatmap')
ax.set_xlabel('Spot Index')
ax.set_ylabel('Spot Index')
plt.show()

# 0/1 matrix: entry (i, j) is 1 when spots i and j carry the same cluster label
cluster_labels = clusters.values
same_cluster = (cluster_labels.reshape(-1, 1) == cluster_labels).astype(int)
full_graph = moran_clusters.obsp["connectivities"]
print(full_graph.shape, same_cluster.shape)
# Zero out every edge that crosses a cluster boundary
moran_clusters.obsp["connectivities"] = full_graph.multiply(csr_matrix(same_cluster))
print("Connectivities formed.")

# Per-gene Moran's I on the within-cluster graph, keyed by gene name
morans_i = sc.metrics.morans_i(moran_clusters, vals=clustering.xenium_spot_data.X.T)

morans_i_dict = dict(zip(clustering.xenium_spot_data.var.index, morans_i))

In [None]:
# Number of leading spots shown along each axis of the heatmap
spots_per_side = 100

print("Starting Moran's I Calculation.")
moran_clusters = ad.AnnData(locations)
sc.pp.neighbors(moran_clusters, n_pcs=0, n_neighbors=100)
print("Neighbors calculated.")

# Dense copy of the top-left corner of the neighbour graph, for plotting only
corner = slice(None, spots_per_side)
connectivities_submatrix = moran_clusters.obsp["connectivities"][corner, corner].toarray()
# connectivities_submatrix = same_cluster[:spots_per_side, :spots_per_side]

# Draw the corner of the graph as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(connectivities_submatrix, cmap='Blues', ax=ax)
ax.set_title('Connectivities Heatmap')
ax.set_xlabel('Spot Index')
ax.set_ylabel('Spot Index')
plt.show()

# 0/1 matrix: entry (i, j) is 1 when spots i and j carry the same cluster label
cluster_labels = clusters.values
same_cluster = (cluster_labels.reshape(-1, 1) == cluster_labels).astype(int)
full_graph = moran_clusters.obsp["connectivities"]
print(full_graph.shape, same_cluster.shape)
# Zero out every edge that crosses a cluster boundary
moran_clusters.obsp["connectivities"] = full_graph.multiply(csr_matrix(same_cluster))
print("Connectivities formed.")

# Per-gene Moran's I on the within-cluster graph, keyed by gene name
morans_i = sc.metrics.morans_i(moran_clusters, vals=clustering.xenium_spot_data.X.T)

morans_i_dict = dict(zip(clustering.xenium_spot_data.var.index, morans_i))

In [None]:
# Dense copy of the leading spots_per_side x spots_per_side corner of the graph
corner = slice(None, spots_per_side)
connectivities_submatrix = moran_clusters.obsp["connectivities"][corner, corner].toarray()
# connectivities_submatrix = same_cluster[:spots_per_side, :spots_per_side]

# Draw the corner as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(connectivities_submatrix, cmap='Blues', ax=ax)
ax.set_title('Connectivities Heatmap')
ax.set_xlabel('Spot Index')
ax.set_ylabel('Spot Index')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of leading spots shown along each axis
spots_per_side = 100

# Leading spots_per_side x spots_per_side corner of the same-cluster indicator
# connectivities_submatrix = moran_clusters.obsp["connectivities"][:spots_per_side, :spots_per_side].A
corner = slice(None, spots_per_side)
connectivities_submatrix = same_cluster[corner, corner]

# Draw the corner as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(connectivities_submatrix, cmap='Blues', ax=ax)
ax.set_title('Connectivities Heatmap')
ax.set_xlabel('Spot Index')
ax.set_ylabel('Spot Index')
plt.show()

# Sanity Check of Clusters and Moran's I Values

In [None]:
import torch
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

num_clusters = K

rows = locations["row"].astype(int)
columns = locations["col"].astype(int)

num_rows = max(rows) + 1
num_cols = max(columns) + 1

# Grid value 0 means "no spot"; cluster labels are shifted up by one.
cluster_grid = torch.zeros((num_rows, num_cols), dtype=torch.long)

cluster_grid[rows, columns] = torch.tensor(clusters) + 1

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in Matplotlib 3.9.
colors = plt.get_cmap('viridis', num_clusters + 1)

# Prepend white so that grid value 0 (no spot) is drawn as background.
colormap_colors = np.vstack(([[1, 1, 1, 1]], colors(np.linspace(0, 1, num_clusters))))
colormap = ListedColormap(colormap_colors)

plt.figure(figsize=(6, 6))
plt.imshow(cluster_grid, cmap=colormap, interpolation='nearest', origin='lower')
plt.colorbar(ticks=range(num_clusters + 1), label='Cluster Values')
plt.title('Prior Cluster Assignment with BayXenSmooth')

In [None]:
import os
import numpy as np

# Root directory that is scanned for BayesSpace cluster files
search_directory = 'results/hBreast/BayesSpace'

# List to store the paths of all .csv files
csv_files = []

# Print each cluster file together with its number of distinct labels
for root, dirs, files in os.walk(search_directory):
    for file in files:
        if not file.endswith('.csv'):
            continue
        csv_path = os.path.join(root, file)
        clusters = pd.read_csv(csv_path)["BayesSpace cluster"]
        num_distinct = len(np.unique(clusters.values))
        print(csv_path, num_distinct)