In [None]:
import os
# import plotly.express as px
import scanpy as sc
import scimap as sm
import seaborn as sns
import anndata as ad
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
%matplotlib inline
import warnings
from skimage.io import imread
warnings.filterwarnings("ignore")

In [None]:
# Root directory holding the CODEX inputs and outputs used throughout the notebook.
# Set the PANC_CODEX_DIR environment variable to run on another machine without
# editing this cell; the original location remains the default.
base_dir = os.environ.get("PANC_CODEX_DIR", "/home/smith6jt/panc_CODEX")

In [None]:
# Read the AnnData object for this analysis from <base_dir>.
input_h5ad = os.path.join(base_dir, 'CODEX_panc_scvi_BioCov.h5ad')
adata = ad.read_h5ad(input_h5ad)

In [None]:
# Sanity check: print "min max" for each stored layer, then for adata.X.
for layer_name in ('clr_normalized', 'counts', 'raw_mfi', 'scaled'):
    layer_values = adata.layers[layer_name]
    print(layer_values.min(), layer_values.max())
print(adata.X.min(), adata.X.max())

In [None]:
# Display the AnnData summary for the object loaded above.
adata

In [None]:
# List the variable names stored in adata.var_names.
adata.var_names

In [None]:
# Rebuild `adata` so that X holds the raw MFI matrix, keeping only a curated set
# of obs columns.
#
# The existing layers and obsm entries are carried over on purpose: later cells
# read adata.obsm["X_scVI"] and adata.layers["clr_normalized"] / ["scaled"], and
# keeping layers["raw_mfi"] makes this cell safe to re-run (it would otherwise
# raise KeyError on the second execution).
obs_columns_to_keep = [
    'imageid', 'Object ID', 'Object type', 'Classification', 'Parent',
    'X_centroid', 'Y_centroid',
    'Nucleus Area', 'Nucleus Length', 'Nucleus Circularity', 'Nucleus Solidity',
    'Nucleus: Max diameter µm', 'Nucleus: Min diameter µm',
    'Cell Area', 'Cell Length', 'Cell Circularity', 'Cell Solidity',
    'Cell: Max diameter µm', 'Cell: Min diameter µm',
    'Dist to Closest Peri-Islet', 'Dist to Closest Tissue', 'Dist to Closest Capillary',
    'Dist to Closest Lymphatic', 'Dist to Closest Nerve', 'Dist to Closest Islet',
    'Donor Status', 'Age', 'Gender', 'GADA', 'ZnT8A', 'IA2A', 'mIAA', 'None',
    'n_genes_by_counts', 'total_counts', 'n_genes', '_scvi_batch', '_scvi_labels', 'donor_id',
]

raw = adata.layers['raw_mfi']
adata = ad.AnnData(
    X=raw.copy(),
    obs=adata.obs[obs_columns_to_keep].copy(),
    var=adata.var.copy(),
    obsm={name: values.copy() for name, values in adata.obsm.items()},
    layers={name: values.copy() for name, values in adata.layers.items()},
)
# Full marker panel, stored in uns under "all_markers".
adata.uns["all_markers"] = ['DAPI','CD31', 'CD8a', 'CD3e', 'SMA', 'Ki67','CD4', 'CD34', 'HLADR', 'PDPN', 'panCK',
       'ECAD', 'CD163', 'SST', 'ColIV', 'VIM', 'CD20', 'LGALS3', 'B3TUBB',
       'GCG', 'KRT14', 'GAP43', 'CD35', 'CHGA', 'PGP9.5', 'INS', 'CD44',
       'CD45', 'NaKATPase', 'BCatenin', 'CD68', 'BActin', 'CK19', 'epCAM', 'KRT8-18']

In [None]:
# Snapshot the current matrix in adata.raw before transforming; the gating cell
# below reads it through layer='raw'.
adata.raw = adata
# Apply scimap's log1p helper; its return value replaces `adata`.
# NOTE(review): this cell is not idempotent (adata = f(adata)) — re-run the
# rebuild cell above first if it needs to be executed again.
adata = sm.pp.log1p(adata)

In [None]:
# OME-TIFF image passed to the napariGater call below.
image_path = os.path.join(base_dir, 'local_images/Aab_6450.ome.tiff')

In [None]:
# Turn on IPython's Qt event-loop integration ahead of the napariGater call below.
%gui qt

In [None]:
# Launch scimap's napariGater on the image with the cells from `adata`.
# flip_y=True is an experiment to check whether the centroid overlay aligns with the image.
%gui qt
sm.pl.napariGater(
    image_path,
    adata,
    centroid_units='microns',  # centroids are stored in µm (per the original author)
    flip_y=True,               # experimental: flip Y to test coordinate alignment
    verbose=True,
    point_size=15,
    layer='raw',               # read intensities from adata.raw (set in an earlier cell)
    log=True
)

In [None]:
# Report the (min, max) of the matrix stored in adata.raw.
adata.raw.X.min(), adata.raw.X.max()

In [None]:
# Leiden clustering through scimap at resolution 0.4; the heatmap cell below
# groups by the resulting 'leiden' labels.
# NOTE(review): use_raw=False / log=False presumably cluster on adata.X as-is — confirm
# against the installed scimap version.
adata = sm.tl.cluster(adata, method='leiden', resolution=0.4, use_raw=False, log=False)

In [None]:
# Heatmap grouped by the 'leiden' cluster labels, with showPrevalence enabled.
# Options tried earlier and left disabled: standardScale='column', vmin=-2.5, vmax=2.5.
sm.pl.heatmap(adata, groupBy='leiden', layer=None,
              figsize=(15,12), showPrevalence=True)

In [None]:
# Candidate cell-type labels and the marker(s) used to score each one.
# Insertion order is preserved because downstream code iterates this mapping.
_marker_panel = [
    # Islet endocrine
    ('Beta cells', ['INS']),
    ('Alpha cells', ['GCG']),
    ('Delta cells', ['SST']),
    # Exocrine
    ('Acinar cells', ['BCatenin', 'ECAD']),
    ('Ductal cells', ['BActin', 'CK19']),
    # Immune
    ('T cells CD4+', ['CD4', 'CD3e']),
    ('T cells CD8+', ['CD8a', 'CD3e']),
    ('CD4+', ['CD4']),
    ('B cells', ['CD20']),
    ('Macrophages', ['CD68']),
    ('APCs', ['HLADR']),
    # Vessels and stroma
    ('Endothelial', ['CD31']),
    ('Vasculature', ['CD34']),
    ('Lymphatic', ['PDPN']),
    ('Fibroblasts', ['ColIV', 'VIM']),
    ('Smooth Muscle', ['SMA']),
    # Nerve
    ('Neural', ['B3TUBB']),
]
cell_type_markers = dict(_marker_panel)

## 2. Cluster Quality Checks

In [None]:
# Gather every obs column holding a Leiden clustering ("leiden_res_<resolution>").
cluster_keys = [col for col in adata.obs.columns if col.startswith('leiden_res_')]
if len(cluster_keys) == 0:
    raise ValueError("No Leiden clustering columns found in adata.obs")

# One long-format frame per resolution: cluster label, cell count, share of cells.
cluster_summaries = []
for res_key in sorted(cluster_keys):
    sizes = adata.obs[res_key].value_counts().sort_index()
    shares = sizes / sizes.sum() * 100
    per_resolution_frame = pd.DataFrame(
        {
            "resolution": res_key.replace("leiden_res_", ""),
            "cluster": sizes.index.astype(str),
            "n_cells": sizes.values,
            "pct_cells": shares.values,
        }
    )
    cluster_summaries.append(per_resolution_frame)

summary_df = pd.concat(cluster_summaries, ignore_index=True)

# Per-resolution overview: number of clusters and the spread of cluster sizes.
by_resolution = summary_df.groupby("resolution")
summarized = pd.DataFrame(
    {
        "clusters": by_resolution["cluster"].nunique(),
        "min_cells": by_resolution["n_cells"].min(),
        "median_cells": by_resolution["n_cells"].median(),
        "max_cells": by_resolution["n_cells"].max(),
    }
)

summarized

In [None]:
# Distribution of cluster sizes (% of cells) for each clustering resolution.
# Drawn with seaborn/matplotlib: the plotly import at the top of the notebook is
# commented out, so the previous `px.box` call raised NameError on a fresh kernel.
mpl.rcParams.update({
    'font.size': 14,        # base font
    'axes.titlesize': 18,   # subplot titles
    'axes.labelsize': 16,   # x/y labels
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'figure.titlesize': 20, # suptitle
})

# Keep resolutions in the order they appear in summary_df.
resolution_order = list(summary_df["resolution"].unique())

fig, ax = plt.subplots(figsize=(max(8, 1.6 * len(resolution_order)), 6))
sns.boxplot(
    data=summary_df,
    x="resolution",
    y="pct_cells",
    order=resolution_order,
    showfliers=False,  # every point is drawn by the strip plot below
    ax=ax,
)
sns.stripplot(
    data=summary_df,
    x="resolution",
    y="pct_cells",
    order=resolution_order,
    color="black",
    size=4,
    alpha=0.5,
    ax=ax,
)
ax.set_xlabel("resolution")
ax.set_ylabel("Cluster size (%)")
ax.set_title("Cluster size distribution per resolution")
plt.show()

In [None]:
# Compare how cells from different images mix in (top) a 2-D PCA of adata.X and
# (bottom) a 2-D PCA of the scVI latent representation in adata.obsm["X_scVI"].
from sklearn.decomposition import PCA

mpl.rcParams.update({
    'font.size': 14,        # base font
    'axes.titlesize': 18,   # subplot titles
    'axes.labelsize': 16,   # x/y labels
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'legend.fontsize': 14,
    'figure.titlesize': 20, # suptitle
})

if "X_scVI" not in adata.obsm:
    raise KeyError('adata.obsm["X_scVI"] is required for the scVI panel but is missing.')

# One integer code per image. The same colormap and normalisation are used for
# both scatters and for the legend, so legend colours match the points.
image_cat = adata.obs['imageid'].astype('category').cat.remove_unused_categories()
image_codes = image_cat.cat.codes.to_numpy()
donors = list(image_cat.cat.categories)
set1 = plt.get_cmap('Set1')
color_norm = mpl.colors.Normalize(vmin=0, vmax=max(len(donors) - 1, 1))

# Constrained layout handles spacing; tight_layout() must not be called on top of it.
fig, axes = plt.subplots(2, 1, figsize=(16, 16), layout="constrained")

# PCA of the expression matrix (densified first in case X is sparse).
expression = adata.X.toarray() if hasattr(adata.X, "toarray") else np.asarray(adata.X)
pca_viz = PCA(n_components=2, random_state=0)
pca_coords = pca_viz.fit_transform(expression)

axes[0].scatter(pca_coords[:, 0], pca_coords[:, 1],
                c=image_codes, s=1, alpha=0.4, cmap=set1, norm=color_norm)
axes[0].set_title('PCA Space')
axes[0].set_xlabel('PC1')
axes[0].set_ylabel('PC2')

# PCA of the scVI latent space. The axes are principal components of the latent
# vectors, not individual latent dimensions, and are labelled accordingly.
pca_scvi = PCA(n_components=2, random_state=0)
scvi_coords = pca_scvi.fit_transform(np.asarray(adata.obsm["X_scVI"]))

scatter = axes[1].scatter(scvi_coords[:, 0], scvi_coords[:, 1],
                          c=image_codes, s=1, alpha=0.4, cmap=set1, norm=color_norm)
axes[1].set_title('scVI Latent Space')
axes[1].set_xlabel('scVI latent PC1')
axes[1].set_ylabel('scVI latent PC2')

# Legend: one entry per image, labelled with that image's own id.
handles = [plt.Line2D([0], [0], marker='o', color='w',
                      markerfacecolor=set1(color_norm(i)),
                      markersize=8, label=str(donor))
           for i, donor in enumerate(donors)]
axes[0].legend(handles=handles, title='imageid', loc='center left',
               bbox_to_anchor=(1.01, 0.5), frameon=False)

plt.show()


## 3. Draft Cell Type Annotation

In [None]:
# Draft annotation: score every cluster of the chosen resolution against each
# entry of `cell_type_markers` (mean scaled expression of that entry's markers)
# and keep the best-scoring label per cluster.
annotation_resolution = "leiden_res_1.50"

if annotation_resolution not in adata.obs:
    raise KeyError(f"Resolution {annotation_resolution} not present. Available: {sorted(cluster_keys)}")

# Despite its name, `cluster_means` holds the differential-expression table (if
# one has been computed); it is kept only for downstream review.
cluster_means = None
if "rank_genes_groups" in adata.uns:
    try:
        cluster_means = sc.get.rank_genes_groups_df(adata, key="rank_genes_groups")
    except (KeyError, ValueError, TypeError):
        cluster_means = None

# Use the stored z-scored layer when present, otherwise compute and cache it.
scaled = adata.layers.get("scaled", None)
if scaled is None:
    scaled = sc.pp.scale(adata, zero_center=True, copy=True).X
    adata.layers["scaled"] = scaled

# pandas cannot build a labelled frame from a scipy sparse matrix, so densify.
scaled_dense = scaled.toarray() if hasattr(scaled, "toarray") else np.asarray(scaled)
scaled_df = pd.DataFrame(
    scaled_dense,
    index=adata.obs.index,
    columns=adata.var_names,
)

# observed=True: a categorical cluster column may carry categories with no cells.
# Those would produce all-NaN profiles, and an all-NaN profile would be handed an
# arbitrary label (the first dictionary key) by the max() below.
cluster_profiles = (
    scaled_df.join(adata.obs[[annotation_resolution]])
    .groupby(annotation_resolution, observed=True)
    .mean()
    .sort_index()
)

annotation_table = []
for cluster, profile in cluster_profiles.iterrows():
    marker_scores = {}
    for label, markers in cell_type_markers.items():
        # Score only with markers that exist in this panel.
        present = [m for m in markers if m in profile.index]
        if not present:
            continue
        marker_scores[label] = profile[present].mean()
    # Ties resolve to the label listed first in cell_type_markers.
    top_marker = max(marker_scores, key=marker_scores.get) if marker_scores else "Unknown"
    annotation_table.append(
        {
            "cluster": cluster,
            "top_marker": top_marker,
            "score": marker_scores.get(top_marker, np.nan),
            "marker_scores": marker_scores,
        }
    )

annotation_df = pd.DataFrame(annotation_table).sort_values("cluster").reset_index(drop=True)
annotation_df

In [None]:
# Translate each cell's cluster into its draft label, then list the most common labels.
cluster_to_label = annotation_df.set_index("cluster")["top_marker"]
adata.obs["draft_cell_type"] = adata.obs[annotation_resolution].map(cluster_to_label)

draft_label_counts = adata.obs["draft_cell_type"].value_counts()
draft_label_counts.sort_values(ascending=False).head(20)

In [None]:
# Rebuild the neighbor graph on the scVI representation with a small
# neighborhood (n_neighbors=8) to emphasise local structure.
print("Recomputing neighbor graph with fewer neighbors...")
sc.pp.neighbors(
    adata,
    use_rep='X_scVI',   # use adata.obsm['X_scVI'] as the representation
    n_neighbors=8,      # reduced from 30 -> more local structure
    n_pcs=None,
    metric='cosine'
)
print("✓ Neighbor graph recomputed with n_neighbors=8")

In [None]:
# Recompute UMAP with parameters optimized for better separation
print("Computing UMAP with optimized parameters for CODEX data...")
sc.tl.umap(
    adata,
    min_dist=0.01,        # Tighter packing (default 0.5)
    spread=2.5,          # Larger spread (default 1.0)
    n_components=2
)

In [None]:
# Leiden clustering at resolution 0.50 (igraph flavor), stored in
# adata.obs["leiden_res_0.50_v3"].
sc.tl.leiden(adata, key_added="leiden_res_0.50_v3", resolution=0.50, flavor="igraph")

In [None]:
# Persist the current state of `adata` under base_dir.
adata.write_h5ad(os.path.join(base_dir, 'CODEX_panc_scvi_2025Nov12.h5ad'))

In [None]:
# 2x2 UMAP overview. Panels are filled row by row: clustering, donor,
# draft cell type, donor status.
sc.set_figure_params(dpi=100, dpi_save=300, fontsize=26)
fig, axes = plt.subplots(2, 2, figsize=(25,18))
plt.rcParams['legend.markerscale'] = 4.0

overview_panels = ['leiden_res_1.00', 'donor_id', 'draft_cell_type', 'Donor Status']
for panel_ax, obs_key in zip(axes.ravel(), overview_panels):
    sc.pl.umap(
        adata,
        color=obs_key,
        ax=panel_ax,
        show=False,
        frameon=False,
        size=1,
        alpha=0.4,
        wspace=0.4,
        legend_loc='right margin',
    )

plt.tight_layout()
plt.show()

## 3.5 Donor-Specific Views

In [None]:
# Cell counts per (donor status, cluster), plus each cluster's share within its status.
status_key = "Donor Status"
if status_key not in adata.obs:
    raise ValueError(f"{status_key} column not found in adata.obs")

cells_per_status_cluster = adata.obs.groupby([status_key, annotation_resolution]).size()
status_cluster_counts = cells_per_status_cluster.rename("n_cells").reset_index()

n_cells_by_status = status_cluster_counts.groupby(status_key)["n_cells"]
status_cluster_counts["pct_within_status"] = n_cells_by_status.transform(
    lambda x: x / x.sum() * 100
)

# Largest clusters first within each status.
status_cluster_counts.sort_values([status_key, "n_cells"], ascending=[True, False])

In [None]:
# One UMAP figure per donor status, coloured by cluster and by draft cell type.
try:
    # Categorical column: use its declared category order.
    status_categories = list(adata.obs[status_key].cat.categories)
except AttributeError:
    # Plain column: fall back to the sorted unique values.
    status_categories = sorted(adata.obs[status_key].dropna().unique())

status_panel_keys = [annotation_resolution, "draft_cell_type"]

for status in status_categories:
    subset = adata[adata.obs[status_key] == status].copy()
    if subset.n_obs == 0:
        continue
    sc.pl.umap(
        subset,
        color=status_panel_keys,
        # One title per panel. A single string only titles the first panel, and
        # scanpy then falls back to the bare colour key for the second.
        title=[f"{status_key}: {status} ({panel_key})" for panel_key in status_panel_keys],
        frameon=False,
        wspace=0.4,
        legend_loc="right margin",
        size=3,
        show=False,
    )
    plt.show()

In [None]:
# UMAP coloured by draft cell type, one facet per donor status.
# Drawn with matplotlib: the plotly import at the top of the notebook is commented
# out, so the previous `px.scatter` call raised NameError on a fresh kernel.
umap_df = (
    pd.DataFrame(
        adata.obsm["X_umap"],
        columns=["UMAP1", "UMAP2"],
        index=adata.obs.index,
    )
    .join(
        adata.obs[[status_key, annotation_resolution, "draft_cell_type"]]
    )
    .dropna(subset=[status_key])
)

# Facets follow status_categories; statuses without any cell are skipped.
facet_statuses = [s for s in status_categories if (umap_df[status_key] == s).any()]
if not facet_statuses:
    raise ValueError(f"No cells with a known '{status_key}' to plot.")

# One fixed colour per draft cell type, shared by every facet and the legend.
facet_cell_types = sorted(umap_df["draft_cell_type"].dropna().astype(str).unique())
facet_colors = dict(zip(facet_cell_types, sns.color_palette("tab20", len(facet_cell_types))))

fig, facet_axes = plt.subplots(
    1, len(facet_statuses),
    figsize=(6 * len(facet_statuses), 6.5),
    sharex=True, sharey=True, squeeze=False,
)
for facet_ax, facet_status in zip(facet_axes[0], facet_statuses):
    facet_cells = umap_df[umap_df[status_key] == facet_status]
    facet_labels = facet_cells["draft_cell_type"].astype(str)
    for cell_type in facet_cell_types:
        points = facet_cells[facet_labels == cell_type]
        facet_ax.scatter(
            points["UMAP1"], points["UMAP2"],
            s=1, alpha=0.4, linewidths=0,
            color=facet_colors[cell_type],
            rasterized=True,  # keeps saved vector output small for large cell counts
        )
    facet_ax.set_title(f"{status_key}={facet_status}")
    facet_ax.set_xlabel("UMAP1")
facet_axes[0][0].set_ylabel("UMAP2")

legend_handles = [
    plt.Line2D([0], [0], marker='o', linestyle='', markersize=8,
               markerfacecolor=facet_colors[cell_type], markeredgewidth=0, label=cell_type)
    for cell_type in facet_cell_types
]
fig.legend(handles=legend_handles, title="draft_cell_type",
           loc="center left", bbox_to_anchor=(1.0, 0.5), frameon=False)
fig.suptitle("UMAP colored by draft cell type, faceted by donor status")
fig.tight_layout()
plt.show()

### Cluster Distribution Boxplots by Donor Status

Prepare the per-donor cluster percentages used to compare how clusters are distributed across the three donor status groups (ND, Aab+, T1D):

In [None]:
# Per-donor cluster composition: how many cells of each cluster every donor
# contributes, and what share of that donor's cells this represents.
donor_id_col = 'donor_id'

cells_per_donor_cluster = adata.obs.groupby([donor_id_col, annotation_resolution], sort=False).size()
cluster_donor_counts = cells_per_donor_cluster.reset_index(name='n_cells')

# Share of each cluster within its donor (percent).
n_cells_by_donor = cluster_donor_counts.groupby(donor_id_col)['n_cells']
cluster_donor_counts['pct_within_donor'] = n_cells_by_donor.transform(lambda x: x / x.sum() * 100)

# Look up each donor's status from adata.obs (first row seen per donor).
one_row_per_donor = adata.obs.drop_duplicates(subset=[donor_id_col])
donor_status_map = one_row_per_donor.set_index(donor_id_col)['Donor Status']
cluster_donor_counts['Donor Status'] = cluster_donor_counts[donor_id_col].map(donor_status_map)

# Reference totals and a short textual summary.
donor_totals = adata.obs['Donor Status'].value_counts()
print("Total cells per donor status:")
for status, count in donor_totals.items():
    print(f"  {status}: {count:,} cells")

n_clusters_at_resolution = adata.obs[annotation_resolution].nunique()
n_donors = adata.obs[donor_id_col].nunique()
print(f"\nClusters in {annotation_resolution}: {n_clusters_at_resolution}")
print(f"Donors: {n_donors}")
print("\nTop 5 clusters by overall size:")
print(adata.obs[annotation_resolution].value_counts().head())
print("\nData prepared for plotting:")
cluster_donor_counts.head()

### Identify Disease-Associated Clusters

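Find clusters that are enriched or depleted in specific donor status groups, ranked by the fold change between the highest and lowest mean per-donor percentage (descriptive only; no significance test is applied in this step):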

In [None]:
# Rank clusters by how unevenly their mean per-donor share is spread across the
# donor statuses. This is descriptive only: fold change = highest / lowest mean
# share; no significance test is run in this cell.
status_levels = ['ND', 'Aab+', 'T1D']
enrichment_analysis = []

# Mean per-donor percentage for each (cluster, donor status).
# observed=True and dropna() keep status/cluster combinations without data out of
# the table, so they fall back to 0 below instead of carrying NaN into max()/min().
cluster_summary = (
    cluster_donor_counts
    .groupby([annotation_resolution, 'Donor Status'], observed=True)['pct_within_donor']
    .mean()
    .dropna()
    .reset_index()
)

# Cells without a cluster label are skipped (int(NaN) would raise).
cluster_labels = sorted(adata.obs[annotation_resolution].dropna().unique(), key=lambda x: int(x))

for cluster in cluster_labels:
    cluster_data = cluster_summary[cluster_summary[annotation_resolution] == cluster]

    # Mean percentage per status; a status with no data for this cluster counts as 0.
    pct_dict = dict(zip(cluster_data['Donor Status'], cluster_data['pct_within_donor']))
    pct_by_status = {level: float(pct_dict.get(level, 0)) for level in status_levels}
    nd_pct = pct_by_status['ND']
    aab_pct = pct_by_status['Aab+']
    t1d_pct = pct_by_status['T1D']

    max_pct = max(pct_by_status.values())
    min_pct = min(pct_by_status.values())

    if min_pct > 0:
        fold_change = max_pct / min_pct
    elif max_pct > 0:
        fold_change = float('inf')  # absent from at least one status
    else:
        fold_change = 1.0  # absent everywhere

    # Status with the highest share; ties resolve in the order ND, Aab+, T1D.
    enriched_in = max(status_levels, key=pct_by_status.get)

    enrichment_analysis.append({
        'cluster': cluster,
        'ND_%': nd_pct,
        'Aab+_%': aab_pct,
        'T1D_%': t1d_pct,
        'fold_change': fold_change,
        'enriched_in': enriched_in
    })

enrichment_df = pd.DataFrame(enrichment_analysis)

# Strongest status association first.
enrichment_df_sorted = enrichment_df.sort_values('fold_change', ascending=False)

print("Top 10 clusters with strongest donor status association:")
print("=" * 80)
print(enrichment_df_sorted.head(10).to_string(index=False))

print("\n\nClusters enriched in each donor status (fold change > 2):")
print("=" * 80)
for status in status_levels:
    enriched_clusters = enrichment_df[
        (enrichment_df['enriched_in'] == status) &
        (enrichment_df['fold_change'] > 2.0)
    ].sort_values('fold_change', ascending=False)

    print(f"\n{status} enriched clusters:")
    if len(enriched_clusters) > 0:
        for _, row in enriched_clusters.iterrows():
            print(f"  Cluster {row['cluster']}: {row[f'{status}_%']:.2f}% (fold change: {row['fold_change']:.2f}x)")
    else:
        print("  None with fold change > 2")

## 3.6 Multipanel Cluster Boxplots (Matplotlib)

This section recreates the multipanel boxplots without Plotly. Each subplot is an independent plot with y-min fixed at 0 and an individually determined y-max for that cluster's data. Subplots use consistent chart areas via Matplotlib's layout management.

In [None]:
# Multipanel cluster boxplots (Matplotlib) — no Plotly
#
# One subplot per cluster showing, for every donor, the percentage of that donor's
# cells that fall in the cluster, grouped by donor status. Pairwise Mann-Whitney U
# tests are annotated when p < 0.05.
# NOTE(review): the p-values are not corrected for multiple comparisons across
# clusters and status pairs — treat the brackets as exploratory.
import math
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from matplotlib.patches import Patch

# Ensure required objects/keys exist
status_col = 'Donor Status'
donor_id_col = 'donor_id'

# Prefer the resolution chosen earlier in the notebook, otherwise fall back
try:
    ann_key = annotation_resolution
except NameError:
    ann_key = 'leiden_res_1.00'

if ann_key not in adata.obs.columns:
    raise KeyError(f"Annotation key '{ann_key}' not found in adata.obs. Available: {list(adata.obs.columns)}")
if donor_id_col not in adata.obs.columns:
    raise KeyError(f"'{donor_id_col}' not found in adata.obs")
if status_col not in adata.obs.columns:
    raise KeyError(f"'{status_col}' not found in adata.obs")

# Build source dataframe from adata.obs.
# Missing values are dropped BEFORE the string cast: casting first would turn NaN
# into the literal 'nan', which survives dropna() and then breaks cluster sorting.
obs_df = adata.obs[[donor_id_col, status_col, ann_key]].copy()
obs_df = obs_df.dropna(subset=[donor_id_col, status_col, ann_key])
obs_df[ann_key] = obs_df[ann_key].astype(str)

# Group to compute per-donor percentages per cluster
cluster_donor_counts = (
    obs_df.groupby([donor_id_col, ann_key], sort=False)
    .size()
    .reset_index(name='n_cells')
)
cluster_donor_counts['pct_within_donor'] = (
    cluster_donor_counts.groupby(donor_id_col)['n_cells']
    .transform(lambda x: x / x.sum() * 100)
)

# Attach donor status by donor_id
status_map = obs_df.drop_duplicates(subset=[donor_id_col]).set_index(donor_id_col)[status_col]
cluster_donor_counts[status_col] = cluster_donor_counts[donor_id_col].map(status_map)


def _cluster_sort_key(label):
    """Sort numeric cluster labels numerically; any other label sorts after them, alphabetically."""
    try:
        return (0, int(label), "")
    except ValueError:
        return (1, 0, str(label))


# Determine cluster and status ordering
all_clusters = sorted(obs_df[ann_key].unique(), key=_cluster_sort_key)
status_present = list(pd.Series(obs_df[status_col].dropna().astype(str).unique()))
preferred_order = ['ND', 'Aab+', 'T1D']
status_order = [s for s in preferred_order if s in status_present] + [s for s in status_present if s not in preferred_order]

# Colors for statuses
color_map = {'ND': '#2ca02c', 'Aab+': '#1f77b4', 'T1D': '#d62728'}
palette = [color_map.get(s, '#7f7f7f') for s in status_order]

# Marker name mapping from annotation_df if available. Keys are normalised to str
# because the lookup below uses str(cluster).
try:
    marker_map = {
        str(cluster_label): str(marker_label)
        for cluster_label, marker_label in annotation_df.set_index('cluster')['top_marker'].items()
    }
except (NameError, KeyError):
    # annotation_df not defined in this session, or lacking the expected columns.
    marker_map = {}

# Grid layout
n_clusters = len(all_clusters)
if n_clusters == 0:
    raise ValueError(f"No clusters found in '{ann_key}'.")

ncols = 5
nrows = math.ceil(n_clusters / ncols)

# Larger figure to accommodate legend & title above plots
fig_width = ncols * 4.2
fig_height = nrows * 4.0
fig, axes = plt.subplots(
    nrows, ncols,
    figsize=(fig_width, fig_height),
    constrained_layout=False,  # manual spacing adjustments
    sharex=False,
    sharey=False,
    squeeze=False,             # always a 2-D array, so flattening is uniform
)

# Adjust top padding to reserve space for legend + title
fig.subplots_adjust(top=0.82, hspace=0.45, wspace=0.30)

# Flatten axes
axes = axes.ravel()

# Predefine comparisons and x-positions
comparisons = [('ND', 'Aab+'), ('Aab+', 'T1D'), ('ND', 'T1D')]
status_pos = {s: i for i, s in enumerate(status_order)}

# Plot each cluster
for idx, cluster in enumerate(all_clusters):
    ax = axes[idx]
    sub = cluster_donor_counts[cluster_donor_counts[ann_key] == cluster].copy()

    if len(sub) == 0:
        ax.set_visible(False)
        continue

    # Boxplot
    sns.boxplot(
        data=sub,
        x=status_col,
        y='pct_within_donor',
        order=status_order,
        palette=palette,
        ax=ax,
        showfliers=False,
        width=0.6,
        linewidth=1,
        boxprops=dict(alpha=0.9),
    )
    # Points (one per donor)
    sns.stripplot(
        data=sub,
        x=status_col,
        y='pct_within_donor',
        order=status_order,
        ax=ax,
        color='black',
        size=10,
        jitter=0.25,
        alpha=0.45,
        linewidth=0,
    )

    # Cluster marker name
    marker_name = marker_map.get(str(cluster), 'Unknown')
    ax.set_title(f"Cluster {cluster}: {marker_name}", fontsize=14, pad=6)
    ax.set_xlabel("")
    ax.tick_params(axis='x', labelbottom=False)
    ax.tick_params(axis='y', labelsize=12)
    ax.grid(axis='y', alpha=0.2)

    # Annotation placement: brackets start just above the data and stack upward
    y_data_max = float(sub['pct_within_donor'].max()) if sub['pct_within_donor'].size else 0.0
    y_data_max = max(0.0, y_data_max)
    base = y_data_max * 1.05
    step = max(1.0, y_data_max * 0.12)
    annot_count = 0
    max_annot_y = 0.0

    for g1, g2 in comparisons:
        if g1 not in status_pos or g2 not in status_pos:
            continue
        d1 = sub.loc[sub[status_col] == g1, 'pct_within_donor'].dropna().values
        d2 = sub.loc[sub[status_col] == g2, 'pct_within_donor'].dropna().values
        if d1.size == 0 or d2.size == 0:
            continue
        try:
            stat, p_val = mannwhitneyu(d1, d2, alternative='two-sided')
        except ValueError:
            # Test could not be computed for these samples: treat as not significant.
            p_val = 1.0
        if p_val < 0.05:
            y = base + annot_count * step
            x1, x2 = status_pos[g1], status_pos[g2]
            ax.plot([x1, x2], [y, y], color='black', lw=1)
            cap = step * 0.08
            ax.plot([x1, x1], [y - cap, y + cap], color='black', lw=1)
            ax.plot([x2, x2], [y - cap, y + cap], color='black', lw=1)
            p_text = f"p={p_val:.3f}" if p_val >= 0.001 else "p<0.001"
            ax.text((x1 + x2) / 2, y + cap * 1.2, p_text, ha='center', va='bottom', fontsize=11)
            annot_count += 1
            max_annot_y = max(max_annot_y, y + cap * 1.8)

    # Y-limits: start at 0, leave room for the data and any brackets
    ymax_from_data = y_data_max * 1.10
    ymax_from_ann = max_annot_y * 1.08 if max_annot_y > 0 else 0
    final_y_max = max(1.0, ymax_from_data, ymax_from_ann)
    ax.set_ylim(0, final_y_max)

    # Y label only on first column
    if idx % ncols == 0:
        ax.set_ylabel('Percentage of Cells (%)', fontsize=13)
    else:
        ax.set_ylabel('')

# Hide unused axes
for j in range(n_clusters, nrows * ncols):
    axes[j].set_visible(False)

# Global legend positioned above all plots
handles = [Patch(facecolor=color_map.get(s, '#7f7f7f'), edgecolor='black', label=s) for s in status_order]
fig.legend(
    handles=handles,
    labels=status_order,
    title='Donor Status',
    loc='upper center',
    bbox_to_anchor=(0.5, 0.96),  # inside the reserved top padding, below the suptitle
    ncol=len(status_order),
    frameon=False,
    fontsize=12,
    title_fontsize=12,
)

# Suptitle at the very top of the figure
fig.suptitle(
    f"Cluster Percentage per Donor Status — {ann_key}",
    fontsize=18,
    y=0.995,
)

# Save figure (best effort: a failed save must not lose the rendered plot)
try:
    save_dir = os.path.join(base_dir, 'outputs') if 'base_dir' in globals() else os.path.join('.', 'outputs')
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, f"cluster_boxplots_{ann_key}_matplotlib.png")
    fig.savefig(out_path, dpi=300)
    print(f"Saved figure to: {out_path}")
except Exception as e:
    print(f"Warning: could not save figure: {e}")

plt.show()
print("\n✓ Updated multipanel boxplots: larger canvas, legend & title repositioned, marker names added to titles.")

## Visualization

In [None]:
# Wilcoxon rank-sum marker ranking for each 'leiden_res_1.00' cluster.
sc.tl.rank_genes_groups(adata, groupby="leiden_res_1.00", method="wilcoxon")

In [None]:
# Display the AnnData summary again to inspect what the previous cells added.
adata

In [None]:
# Store a zero-centered scaled copy of X in layers["scaled"] (overwrites any
# existing "scaled" layer), then compute the cluster dendrogram for 'leiden_res_1.00'.
adata.layers["scaled"] = sc.pp.scale(adata, zero_center=True, copy=True).X
sc.tl.dendrogram(adata, groupby='leiden_res_1.00')

In [None]:
# Dot plot of the cell-type marker panel per 'leiden_res_1.00' cluster, reading
# values from the "clr_normalized" layer and rescaling each marker (standard_scale='var').
# Options tried earlier and left disabled: vmin=-1, vmax=1, color_map="RdBu_r".
sc.set_figure_params(dpi=60, dpi_save=300, fontsize=14)
sc.pl.dotplot(
    adata,
    var_names=cell_type_markers,
    groupby='leiden_res_1.00',
    standard_scale='var',
    figsize=(10, 8),
    layer="clr_normalized"
)

In [None]:
# Smallest value in the 'Dist to Closest Islet' obs column.
adata.obs['Dist to Closest Islet'].min()

In [None]:
# Figure defaults for the UMAP panels below: on-screen dpi 60, saved dpi 300,
# font size 18, and legend markers drawn at 3x size.
sc.set_figure_params(dpi=60, dpi_save=300, fontsize=18)
plt.rcParams['legend.markerscale'] = 3.0

In [None]:
# Side by side: distance to the closest islet (left) and INS signal from the
# "clr_normalized" layer (right) on the same UMAP.
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
ax_distance, ax_insulin = axes

shared_umap_kwargs = dict(show=False, cmap='magma', frameon=False, size=5)
sc.pl.umap(
    adata,
    color=['Dist to Closest Islet'],
    ax=ax_distance,
    title='Cell Centroid Distance to Islet',
    vmin=0,
    **shared_umap_kwargs,
)
sc.pl.umap(
    adata,
    color=['INS'],
    ax=ax_insulin,
    title='Insulin',
    layer='clr_normalized',
    **shared_umap_kwargs,
)

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid of UMAPs coloured by donor metadata. Panels are filled row by row.
fig, axes = plt.subplots(2, 3, figsize=(24, 16))
plt.rcParams['legend.markerscale'] = 4.0

# (obs column, panel title) — 'Gender' is displayed under the title 'Sex'.
metadata_panels = [
    ('Age', 'Age'),
    ('Gender', 'Sex'),
    ('ZnT8A', 'ZnT8A'),
    ('IA2A', 'IA2A'),
    ('GADA', 'GADA'),
    ('mIAA', 'mIAA'),
]
for panel_ax, (obs_key, panel_title) in zip(axes.flat, metadata_panels):
    sc.pl.umap(
        adata,
        color=obs_key,
        ax=panel_ax,
        show=False,
        title=panel_title,
        frameon=False,
        size=1,
        alpha=0.4,
        palette='Set1',
    )

plt.tight_layout()
plt.show()

In [None]:
# Heatmaps per Donor Status, grouped by clusters
from pathlib import Path
import numpy as np
import scanpy as sc

# Ensure the grouping column exists
assert 'Donor Status' in adata.obs.columns, "Missing 'Donor Status' in adata.obs"

# Discover cluster labels: use the first known column name that exists,
# otherwise compute a Leiden clustering below.
candidate_cluster_keys = ['leiden', 'louvain', 'clusters', 'cluster', 'celltype', 'cell_type', 'Cluster', 'seurat_clusters']
cluster_key = next((key for key in candidate_cluster_keys if key in adata.obs.columns), None)

if cluster_key is None:
    # Lightweight clustering if none present
    n_comps = min(50, adata.n_vars)
    if 'X_pca' not in adata.obsm:
        sc.pp.pca(adata, n_comps=n_comps, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=15, n_pcs=min(30, adata.obsm['X_pca'].shape[1]))
    sc.tl.leiden(adata, key_added='leiden', resolution=1.0)
    cluster_key = 'leiden'

# Select features to show in heatmaps. Precedence: a `markers` variable already
# defined in the notebook, then `markers_of_interest`, then up to 30 highly
# variable features, then the first 30 features.
if 'markers' not in globals():
    if 'markers_of_interest' in globals():
        markers = list(markers_of_interest)
    elif 'highly_variable' in adata.var.columns and adata.var['highly_variable'].any():
        markers = list(adata.var_names[adata.var['highly_variable']])[:30]
    else:
        # Fallback: first 30 features (edit this to your marker list)
        markers = list(adata.var_names[:30])

print(f"Using {len(markers)} features for heatmaps (override by setting `markers`).")

# Output directory for figures
figdir = Path.cwd() / "outputs" / "heatmaps_by_donor"
figdir.mkdir(parents=True, exist_ok=True)
sc.settings.figdir = str(figdir)


def _def_sanitize(s):
    """Return str(s) with every character other than alphanumerics, '-' and '_' replaced by '_'."""
    return "".join(c if (c.isalnum() or c in ('-', '_')) else '_' for c in str(s))


# Get donor groups
groups = [g for g in adata.obs['Donor Status'].dropna().unique().tolist() if str(g) != 'nan']
print(f"Found groups: {groups}")

# Generate one heatmap per donor group.
# The subset is deliberately NOT named `ad`: that name is the `anndata` module
# alias imported at the top of the notebook, and rebinding it would break any
# later `ad.read_h5ad(...)` / `ad.AnnData(...)` call in the same kernel.
for g in groups:
    donor_adata = adata[adata.obs['Donor Status'] == g].copy()
    if donor_adata.n_obs == 0:
        continue
    sc.pl.heatmap(
        donor_adata,
        var_names=markers,
        groupby=cluster_key,
        standard_scale='var',
        cmap='magma',
        swap_axes=True,
        dendrogram=True,
        use_raw=donor_adata.raw is not None,
        show=True,
        save=f"_donor_{_def_sanitize(g)}.png",
    )
    print(f"Saved: {figdir}/heatmap_donor_{_def_sanitize(g)}.png")
