In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import squidpy as sq
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the AnnData object from disk and report its dimensions together with
# the distinct values found in the 'sample' column of `obs`.
adata = sc.read_h5ad('xenium_kidney_preprocessed.h5ad')

print(
    f"Full data shape: {adata.shape}",
    f"Samples: {adata.obs['sample'].unique()}",
    sep="\n",
)

In [None]:
## BayesSpace
# Try to set up the rpy2 bridge and load the R packages used for BayesSpace.
# BAYESSPACE_AVAILABLE records whether every import below succeeded.
try:
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    pandas2ri.activate()

    base = importr('base')
    bayesspace = importr('BayesSpace')
    singlecellexp = importr('SingleCellExperiment')
except Exception as exc:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and surface the reason so a broken R setup is diagnosable.
    print("BayesSpace not available, using alternative clustering")
    print(f"  reason: {type(exc).__name__}: {exc}")
    BAYESSPACE_AVAILABLE = False
else:
    print("BayesSpace loaded successfully")
    BAYESSPACE_AVAILABLE = True

In [None]:
## Graph computation

# Build one spatial kNN graph per sample and combine them into block-structured
# matrices on the full object, so cells from different samples are never linked.
# The blocks are assembled as sparse matrices: a dense (n_obs x n_obs) array and
# an element-wise Python copy loop are both quadratic in the number of cells.
from scipy import sparse

graph_keys = ('spatial_connectivities', 'spatial_distances')
graph_parts = {key: {'rows': [], 'cols': [], 'vals': []} for key in graph_keys}

for sample in adata.obs['sample'].unique():
    sample_mask = (adata.obs['sample'] == sample).to_numpy()
    sample_indices = np.flatnonzero(sample_mask)  # positions of this sample in `adata`
    sample_data = adata[sample_mask].copy()

    sq.gr.spatial_neighbors(sample_data, coord_type='generic', n_neighs=6)

    for key in graph_keys:
        # Translate the sample-local (row, col) positions to positions in `adata`.
        block = sparse.coo_matrix(sample_data.obsp[key])
        graph_parts[key]['rows'].append(sample_indices[block.row])
        graph_parts[key]['cols'].append(sample_indices[block.col])
        graph_parts[key]['vals'].append(block.data)

for key in graph_keys:
    parts = graph_parts[key]
    if parts['vals']:
        rows = np.concatenate(parts['rows'])
        cols = np.concatenate(parts['cols'])
        vals = np.concatenate(parts['vals'])
    else:
        # No samples at all: store an empty graph of the right shape.
        rows = cols = np.array([], dtype=int)
        vals = np.array([], dtype=float)
    adata.obsp[key] = sparse.csr_matrix((vals, (rows, cols)),
                                        shape=(adata.n_obs, adata.n_obs))

print("Spatial neighborhood graph constructed")

In [None]:
def run_bayesspace_clustering(adata_sample, n_clusters=7, n_hvg=2000, seed=42):
    """Cluster one sample with the R package BayesSpace through rpy2.

    The expression matrix (`adata_sample.X`) and the 2-D coordinates in
    `adata_sample.obsm['spatial']` are copied into the R global environment,
    wrapped in a SingleCellExperiment, and passed through
    `spatialPreprocess` -> `qTune` -> `spatialCluster`.

    Parameters
    ----------
    adata_sample : AnnData-like
        Must provide `.X`, `.obs.index`, `.var.index` and `.obsm['spatial']`
        (two columns). `.X` is handed to R as the `logcounts` assay, so it is
        assumed to be log-normalised already -- TODO confirm against the
        preprocessing notebook.
    n_clusters : int
        Number of clusters `q` requested from `spatialCluster`.
    n_hvg : int
        Number of highly variable genes passed to `spatialPreprocess`.
    seed : int or None
        Seed for R's random number generator, set before the MCMC so that
        repeated runs give the same labels. `None` leaves R's RNG untouched.

    Returns
    -------
    pandas.Categorical
        One cluster label per observation, in the order of `adata_sample.obs`.

    Raises
    ------
    RuntimeError
        If the rpy2 bridge was not imported in the setup cell.
    """
    if 'robjects' not in globals() or 'pandas2ri' not in globals():
        raise RuntimeError(
            "rpy2 is not loaded; BayesSpace clustering cannot run "
            "(see the BayesSpace setup cell)."
        )

    coords_df = pd.DataFrame(adata_sample.obsm['spatial'], columns=['x', 'y'])
    coords_df.index = adata_sample.obs.index

    expr_df = pd.DataFrame(adata_sample.X.toarray() if hasattr(adata_sample.X, 'toarray') else adata_sample.X,
                          index=adata_sample.obs.index,
                          columns=adata_sample.var.index)

    robjects.globalenv['coords_r'] = pandas2ri.py2rpy(coords_df)
    robjects.globalenv['expr_r'] = pandas2ri.py2rpy(expr_df)
    robjects.globalenv['n_clusters'] = n_clusters

    seed_line = "" if seed is None else f"set.seed({int(seed)})"

    # NOTE(review): platform="Visium" presumably makes BayesSpace derive
    # neighbours from a spot lattice; colData here only carries x/y columns --
    # confirm this is appropriate for the data being clustered.
    # NOTE(review): the object returned by qTune is only reassigned to `sce`;
    # nothing in this function reads the tuning output. It is kept so the
    # pipeline steps are unchanged.
    # log.normalize=FALSE: only a `logcounts` assay is supplied, so
    # spatialPreprocess must not try to re-normalise from raw counts.
    r_script = f"""
    library(BayesSpace)
    library(SingleCellExperiment)

    {seed_line}

    sce <- SingleCellExperiment(assays=list(logcounts=t(expr_r)),
                               colData=coords_r)

    sce <- spatialPreprocess(sce, platform="Visium", n.PCs=50, n.HVGs={n_hvg},
                             log.normalize=FALSE)
    sce <- qTune(sce, qs=seq(2, 10), platform="Visium", d=50)
    sce <- spatialCluster(sce, q={n_clusters}, platform="Visium", d=50,
                         init.method="mclust", model="t", gamma=2,
                         nrep=1000, burn.in=100)

    clusters <- sce$spatial.cluster
    """

    robjects.r(r_script)
    clusters = robjects.r['clusters']

    return pd.Categorical(list(clusters))

In [None]:
# Run BayesSpace independently on every sample and write the labels back to
# `adata.obs['bayesspace_domains']`.
bayesspace_results = {}

for sample in adata.obs['sample'].unique():
    print(f"Running BayesSpace on {sample}...")
    sample_mask = adata.obs['sample'] == sample
    sample_data = adata[sample_mask].copy()

    clusters = run_bayesspace_clustering(sample_data, n_clusters=8)
    bayesspace_results[sample] = clusters

    # Store plain string labels, as is done for 'nichepca_domains'. Writing a
    # Categorical into a partially filled new column leaves the resulting
    # dtype up to pandas' upcasting rules, which differs between samples whose
    # category sets are not identical.
    adata.obs.loc[sample_mask, 'bayesspace_domains'] = np.asarray(clusters).astype(str)

print("BayesSpace clustering completed")
print("Domain counts per sample:")
for sample in adata.obs['sample'].unique():
    sample_counts = adata.obs.loc[adata.obs['sample'] == sample, 'bayesspace_domains'].value_counts()
    print(f"{sample}: {dict(sample_counts)}")

## nichePCA

In [None]:
def compute_nichepca(adata_sample, n_components=10, neighborhood_size=50):
    """Embed cells by PCA of their distance-weighted neighbourhood expression.

    For every observation the `neighborhood_size` nearest observations in
    `adata_sample.obsm['spatial']` (the observation itself included) are
    averaged with inverse-distance weights, and PCA is fitted on the resulting
    (n_obs x n_vars) matrix of niche profiles.

    Parameters
    ----------
    adata_sample : AnnData-like
        Must provide `.X` (dense or scipy-sparse), `.n_obs` and
        `.obsm['spatial']`.
    n_components : int
        Number of principal components to keep.
    neighborhood_size : int
        Number of nearest neighbours per cell; capped at `n_obs` so small
        samples do not make the neighbour query fail.

    Returns
    -------
    niche_pca : numpy.ndarray
        PCA coordinates, one row per observation.
    pca : sklearn.decomposition.PCA
        The fitted PCA model.
    """
    from scipy import sparse
    from sklearn.neighbors import NearestNeighbors

    coords = np.asarray(adata_sample.obsm['spatial'])
    n_obs = adata_sample.n_obs
    k = min(neighborhood_size, n_obs)

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(coords)
    distances, indices = nbrs.kneighbors(coords)

    # Every cell is its own nearest neighbour at distance 0. A plain 1/(d + eps)
    # weight would give it ~1e8 times the weight of any real neighbour, so the
    # "niche" profile would just be the cell's own expression. Floor each row's
    # distances at its smallest positive distance: the cell (and any exact
    # duplicates) then weighs as much as its closest distinct neighbour.
    positive = np.where(distances > 0, distances, np.inf)
    floor = positive.min(axis=1, keepdims=True)
    floor[~np.isfinite(floor)] = 1.0  # rows with no positive distance at all
    weights = 1.0 / np.maximum(distances, floor)
    weights /= weights.sum(axis=1, keepdims=True)

    # Row i of the sparse weight matrix holds the k weights of cell i, so one
    # matrix product yields all weighted averages without densifying `.X` first.
    indptr = np.arange(0, n_obs * k + 1, k)
    weight_matrix = sparse.csr_matrix((weights.ravel(), indices.ravel(), indptr),
                                      shape=(n_obs, n_obs))
    niche_profiles = weight_matrix @ adata_sample.X
    if sparse.issparse(niche_profiles):
        niche_profiles = niche_profiles.toarray()
    niche_profiles = np.asarray(niche_profiles, dtype=float)

    pca = PCA(n_components=n_components, random_state=42)
    niche_pca = pca.fit_transform(niche_profiles)

    return niche_pca, pca

def run_nichepca_clustering(adata_sample, n_components=10, n_clusters=8, neighborhood_size=50):
    """Compute the nichePCA embedding of a sample and cluster it with k-means.

    Parameters
    ----------
    adata_sample : AnnData-like
        Passed unchanged to `compute_nichepca`.
    n_components : int
        Number of principal components of the niche profiles.
    n_clusters : int
        Number of k-means clusters.
    neighborhood_size : int
        Number of nearest neighbours forming each cell's niche; forwarded to
        `compute_nichepca` (previously fixed at that function's default).

    Returns
    -------
    clusters : numpy.ndarray
        Integer k-means label per observation.
    niche_pca : numpy.ndarray
        The embedding the labels were computed on.
    pca_model : sklearn.decomposition.PCA
        The fitted PCA model.
    """
    niche_pca, pca_model = compute_nichepca(
        adata_sample,
        n_components=n_components,
        neighborhood_size=neighborhood_size,
    )

    # Fixed random_state plus several initialisations keeps labels repeatable.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
    clusters = kmeans.fit_predict(niche_pca)

    return clusters, niche_pca, pca_model

In [None]:
# Per-sample nichePCA: keep the raw labels and embeddings in dictionaries keyed
# by sample and store the labels (as strings) in `adata.obs['nichepca_domains']`.
nichepca_results, nichepca_embeddings = {}, {}

for sample in adata.obs['sample'].unique():
    print(f"Running nichePCA on {sample}...")
    sample_mask = adata.obs['sample'] == sample
    sample_data = adata[sample_mask].copy()

    clusters, niche_embedding, pca_model = run_nichepca_clustering(
        sample_data,
        n_components=15,
        n_clusters=8,
    )

    nichepca_results[sample], nichepca_embeddings[sample] = clusters, niche_embedding
    adata.obs.loc[sample_mask, 'nichepca_domains'] = clusters.astype(str)

print("nichePCA clustering completed")
print("Domain counts per sample:")
for sample in adata.obs['sample'].unique():
    rows_of_sample = adata.obs[adata.obs['sample'] == sample]
    sample_counts = rows_of_sample['nichepca_domains'].value_counts()
    print(f"{sample}: {dict(sample_counts)}")

## Evaluation

In [None]:
def calculate_spatial_coherence(adata_sample, cluster_key, max_pairs_per_chunk=5_000_000):
    """Ratio of between-cluster to within-cluster spatial distance, averaged over clusters.

    For each cluster with at least three members, the mean Euclidean distance
    from its members to all non-members is divided by the mean pairwise
    distance among its members (plus 1e-8). The returned value is the mean of
    these ratios.

    Parameters
    ----------
    adata_sample : AnnData-like
        Must provide `.obsm['spatial']` (coordinates, one row per observation)
        and `.obs[cluster_key]` (cluster labels).
    cluster_key : str
        Column of `.obs` holding the cluster labels.
    max_pairs_per_chunk : int
        Upper bound on the number of pairwise distances held in memory at once.
        Distances are accumulated as running sums over row chunks instead of
        being collected in full, which keeps memory bounded on large samples.

    Returns
    -------
    float
        Mean coherence ratio, or 0 when no cluster qualifies (fewer than three
        members, or no observations outside the cluster).
    """
    from scipy.spatial.distance import cdist

    coords = np.asarray(adata_sample.obsm['spatial'], dtype=float)
    clusters = adata_sample.obs[cluster_key]

    # Rows per chunk such that chunk_rows * n_obs stays within the budget.
    chunk_rows = max(1, int(max_pairs_per_chunk) // max(len(coords), 1))

    coherence_scores = []

    for cluster_id in clusters.unique():
        cluster_mask = np.asarray(clusters == cluster_id)
        n_members = int(cluster_mask.sum())
        if n_members < 3:
            continue

        cluster_coords = coords[cluster_mask]
        other_coords = coords[~cluster_mask]
        if len(other_coords) == 0:
            continue

        intra_sum = 0.0
        inter_sum = 0.0
        for start in range(0, n_members, chunk_rows):
            chunk = cluster_coords[start:start + chunk_rows]
            intra_sum += cdist(chunk, cluster_coords).sum()
            inter_sum += cdist(chunk, other_coords).sum()

        # The full member-by-member matrix counts every unordered pair twice and
        # has a zero diagonal, so dividing by n*(n-1) gives the mean over pairs.
        mean_intra_distance = intra_sum / (n_members * (n_members - 1))
        mean_inter_distance = inter_sum / (n_members * len(other_coords))

        coherence_scores.append(mean_inter_distance / (mean_intra_distance + 1e-8))

    return np.mean(coherence_scores) if coherence_scores else 0

def calculate_silhouette_spatial(adata_sample, cluster_key):
    """Silhouette score of the cluster labels in spatial-coordinate space.

    Reads the coordinates from `adata_sample.obsm['spatial']` and the labels
    from `adata_sample.obs[cluster_key]`. Returns 0 when fewer than two distinct
    labels are present; otherwise the labels are converted to integer category
    codes and passed to `silhouette_score` together with the coordinates.
    """
    positions = adata_sample.obsm['spatial']
    domain_labels = adata_sample.obs[cluster_key]

    if len(domain_labels.unique()) >= 2:
        label_codes = pd.Categorical(domain_labels).codes
        return silhouette_score(positions, label_codes)

    return 0

In [None]:
# Score both domain assignments on every sample and collect the metrics in a
# table indexed by (sample, method).
evaluation_results = {}

for sample in adata.obs['sample'].unique():
    sample_mask = adata.obs['sample'] == sample
    sample_data = adata[sample_mask].copy()

    results = {}
    for method, cluster_key in (('BayesSpace', 'bayesspace_domains'),
                                ('nichePCA', 'nichepca_domains')):
        results[method] = {
            'spatial_coherence': calculate_spatial_coherence(sample_data, cluster_key),
            'silhouette_spatial': calculate_silhouette_spatial(sample_data, cluster_key),
            'n_clusters': len(sample_data.obs[cluster_key].unique()),
        }

    evaluation_results[sample] = results

# Flatten the nested {sample: {method: metrics}} mapping into one column per
# (sample, method) pair, then transpose so each pair becomes a row.
metrics_by_pair = {}
for sample, sample_results in evaluation_results.items():
    for method, metrics in sample_results.items():
        metrics_by_pair[(sample, method)] = metrics
evaluation_df = pd.DataFrame(metrics_by_pair).T

print("Evaluation Results:")
print(evaluation_df.round(3))

In [None]:
## BayesSpace
# One scatter panel per sample, coloured by BayesSpace domain. The number of
# panels follows the number of samples instead of being fixed at two, so the
# cell does not raise IndexError when there are more than two samples.
plot_samples = adata.obs['sample'].unique()
fig, axes = plt.subplots(1, len(plot_samples),
                         figsize=(8 * len(plot_samples), 8), squeeze=False)
axes = axes.ravel()  # squeeze=False always returns a 2-D array, even for one panel

for i, sample in enumerate(plot_samples):
    sample_mask = adata.obs['sample'] == sample
    sample_data = adata[sample_mask]
    coords = sample_data.obsm['spatial']
    clusters = sample_data.obs['bayesspace_domains']

    unique_clusters = clusters.unique()
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))

    for j, cluster in enumerate(unique_clusters):
        # Plain boolean array so the mask can be used for positional indexing.
        cluster_mask = (clusters == cluster).to_numpy()
        axes[i].scatter(coords[cluster_mask, 0], coords[cluster_mask, 1],
                       c=[colors[j]], s=1, alpha=0.7, label=f'Domain {cluster}')

    axes[i].set_title(f'BayesSpace Domains - {sample}')
    axes[i].set_xlabel('X coordinate')
    axes[i].set_ylabel('Y coordinate')
    axes[i].axis('equal')
    axes[i].legend(bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=5)

plt.tight_layout()
plt.show()

In [None]:
# One scatter panel per sample, coloured by nichePCA domain. The number of
# panels follows the number of samples instead of being fixed at two, so the
# cell does not raise IndexError when there are more than two samples.
plot_samples = adata.obs['sample'].unique()
fig, axes = plt.subplots(1, len(plot_samples),
                         figsize=(8 * len(plot_samples), 8), squeeze=False)
axes = axes.ravel()  # squeeze=False always returns a 2-D array, even for one panel

for i, sample in enumerate(plot_samples):
    sample_mask = adata.obs['sample'] == sample
    sample_data = adata[sample_mask]
    coords = sample_data.obsm['spatial']
    clusters = sample_data.obs['nichepca_domains']

    unique_clusters = clusters.unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_clusters)))

    for j, cluster in enumerate(unique_clusters):
        # Plain boolean array so the mask can be used for positional indexing.
        cluster_mask = (clusters == cluster).to_numpy()
        axes[i].scatter(coords[cluster_mask, 0], coords[cluster_mask, 1],
                       c=[colors[j]], s=1, alpha=0.7, label=f'Domain {cluster}')

    axes[i].set_title(f'nichePCA Domains - {sample}')
    axes[i].set_xlabel('X coordinate')
    axes[i].set_ylabel('Y coordinate')
    axes[i].axis('equal')
    axes[i].legend(bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=5)

plt.tight_layout()
plt.show()

In [None]:
# Persist the annotated AnnData object, then pickle the evaluation table and
# the per-sample clustering outputs, and finally print per-sample label counts.
adata.write('xenium_kidney_with_domains.h5ad')

results_summary = dict(
    evaluation_metrics=evaluation_df,
    bayesspace_results=bayesspace_results,
    nichepca_results=nichepca_results,
    nichepca_embeddings=nichepca_embeddings,
)

import pickle
with open('spatial_domain_results.pkl', 'wb') as f:
    pickle.dump(results_summary, f)

print("Results saved successfully")
print("\nFinal domain counts:")
for heading, domain_column in (("BayesSpace:", 'bayesspace_domains'),
                               ("\nnichePCA:", 'nichepca_domains')):
    print(heading)
    print(adata.obs.groupby('sample')[domain_column].value_counts())

## Tasks

1. Implement nichePCA with cell type labels rather than gene expression. A similar approach is [MENDER](https://www.nature.com/articles/s41467-023-44367-9).

2. Assign compartment names (e.g. glomerular, tubules) based on the gene expression and annotated domains.

3. Align domains across samples. For nichePCA, a batch correction algorithm can be used on aggregated gene expression.

4. (optional) Implement multi-resolution spatial domains, e.g. with hierarchical clustering or [SCALE](https://www.biorxiv.org/content/10.1101/2025.05.21.653987v1).