In [None]:
import os
# import tempfile
import scanpy as sc
# import scvi
import seaborn as sns
# import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root directory holding the input .h5ad file and the "outputs" folder.
# The default is the original cluster path; set the KINTSUGI_NOTEBOOK_DIR
# environment variable to run the notebook from another machine or checkout
# without editing this cell.
base_dir = os.environ.get("KINTSUGI_NOTEBOOK_DIR") or "/blue/clive/smith6jt/KINTSUGI/notebooks"

In [None]:
import anndata as ad

# Read the AnnData object stored under `base_dir`.
input_path = os.path.join(base_dir, 'CODEX_panc_scvi.h5ad')
adata = ad.read_h5ad(input_path)

In [None]:
# Show the AnnData summary as the cell output.
adata

In [None]:
# Scale a copy of the object (zero-centered) and keep only its matrix as the
# "scaled" layer, so `adata.X` itself is left untouched.
_scaled_copy = sc.pp.scale(adata, zero_center=True, copy=True)
adata.layers["scaled"] = _scaled_copy.X
del _scaled_copy

In [None]:
# NOTE(review): `primary_markers` is not defined in any cell visible in this
# notebook, so on a fresh kernel this cell raises NameError unless the name is
# defined elsewhere — confirm, or plot `cell_type_markers` once it is defined.
# Dot plot of the "scaled" layer grouped by the leiden_res_1.00 clusters, with
# the colour scale limited to [-2, 2].
sc.pl.dotplot(adata, primary_markers, groupby='leiden_res_1.00',
              figsize=(12, 6),  layer="scaled", cmap='RdBu_r', vmin=-2, vmax=2)
plt.show()

In [None]:
# Candidate label -> marker names used for dot plots and draft annotation.
# Insertion order matters: it sets the plotting order and breaks score ties.
cell_type_markers = {
    # --- Endocrine cells (islet cells) ---
    "Beta cells": ["INS"],
    "Alpha cells": ["GCG"],
    "Delta cells": ["SST"],
    # --- Exocrine cells ---
    "Acinar cells": ["BActin", "ECAD"],
    "Ductal cells": ["CK19"],
    # --- Immune cells: T cells ---
    "T cells CD8+": ["CD8a", "CD3e"],
    "T cells CD4+": ["CD4", "CD3e"],
    # --- Immune cells: B cells and myeloid ---
    "B cells": ["CD20"],
    "Macrophages": ["CD68", "CD163"],
    "Antigen Presenting Cells": ["HLADR"],
    # --- Stromal cells ---
    "Endothelial cells": ["CD31", "CD34"],
    "Fibroblasts": ["VIM", "ColIV"],
    "Pericytes": ["SMA", "VIM", "CD44"],
    # --- Neural cells ---
    "Neurons": ["PGP9.5", "B3TUBB"],
}

In [None]:
# Dot plot of the "scaled" layer for every marker group, one row per
# leiden_res_2.00 cluster, with the colour scale limited to [-2, 2].
dotplot_style = dict(figsize=(12, 6), layer="scaled", cmap='RdBu_r', vmin=-2, vmax=2)
sc.pl.dotplot(adata, cell_type_markers, groupby='leiden_res_2.00', **dotplot_style)
plt.show()

## 2. Cluster Quality Checks

In [None]:
# Every obs column produced by a Leiden run at some resolution.
cluster_keys = [col for col in adata.obs.columns if col.startswith('leiden_res_')]
if len(cluster_keys) == 0:
    raise ValueError("No Leiden clustering columns found in adata.obs")

# One long-format frame per resolution: a row per cluster with its absolute
# and relative size.
cluster_summaries = []
for res_key in sorted(cluster_keys):
    sizes = adata.obs[res_key].value_counts().sort_index()
    shares = sizes / sizes.sum() * 100
    per_resolution = pd.DataFrame(
        {
            "resolution": res_key.replace("leiden_res_", ""),
            "cluster": sizes.index.astype(str),
            "n_cells": sizes.values,
            "pct_cells": shares.values,
        }
    )
    cluster_summaries.append(per_resolution)

summary_df = pd.concat(cluster_summaries, ignore_index=True)

# Per-resolution overview: number of clusters and the spread of their sizes.
size_aggregations = {
    "clusters": ("cluster", "nunique"),
    "min_cells": ("n_cells", "min"),
    "median_cells": ("n_cells", "median"),
    "max_cells": ("n_cells", "max"),
}
summarized = summary_df.groupby("resolution").agg(**size_aggregations)

summarized

In [None]:
import plotly.express as px

# Box plot of relative cluster sizes, one box per resolution, with every
# cluster drawn as an individual point.
box_options = dict(
    x="resolution",
    y="pct_cells",
    points="all",
    color="resolution",
    labels={"pct_cells": "Cluster size (%)"},
    title="Cluster size distribution per resolution",
)
fig = px.box(summary_df, **box_options)
fig.show()

## 3. Draft Cell Type Annotation

In [None]:
# Leiden resolution whose clusters receive a draft label.
annotation_resolution = "leiden_res_1.50"

if annotation_resolution not in adata.obs:
    raise KeyError(f"Resolution {annotation_resolution} not present. Available: {sorted(cluster_keys)}")

# Optional differential-expression table for downstream review. Despite its
# name, `cluster_means` holds the rank_genes_groups results (the name is kept
# for compatibility). `group=None` requests all groups; omitting `group`
# raised a TypeError that was silently swallowed, so this was always None.
cluster_means = None
if "rank_genes_groups" in adata.uns:
    try:
        cluster_means = sc.get.rank_genes_groups_df(adata, group=None, key="rank_genes_groups")
    except (KeyError, ValueError, TypeError) as err:
        # Best effort only: report the problem and continue without the table.
        print(f"Could not tabulate rank_genes_groups, continuing without it: {err!r}")
        cluster_means = None

# Reuse the scaled layer when present, otherwise compute and store it.
scaled = adata.layers.get("scaled", None)
if scaled is None:
    scaled = sc.pp.scale(adata, zero_center=True, copy=True).X
    adata.layers["scaled"] = scaled
# pandas needs a dense array; densify in case the stored layer is sparse.
if hasattr(scaled, "toarray"):
    scaled = scaled.toarray()

scaled_df = pd.DataFrame(
    scaled,
    index=adata.obs.index,
    columns=adata.var_names,
)

# Mean scaled value of every marker per cluster. `observed=True` drops
# categories that have no cells, which would otherwise yield all-NaN rows.
cluster_profiles = (
    scaled_df.join(adata.obs[[annotation_resolution]])
    .groupby(annotation_resolution, observed=True)
    .mean()
    .sort_index()
)

# Score each candidate label as the mean of its markers that exist in the
# data, then keep the best-scoring label per cluster.
annotation_table = []
for cluster, profile in cluster_profiles.iterrows():
    marker_scores = {}
    for label, markers in cell_type_markers.items():
        present = [m for m in markers if m in profile.index]
        if not present:
            continue
        marker_scores[label] = profile[present].mean()
    # Comparisons against NaN are always False, so NaN scores are excluded
    # before taking the maximum; with nothing left the cluster is "Unknown".
    finite_scores = {label: score for label, score in marker_scores.items() if pd.notna(score)}
    top_marker = max(finite_scores, key=finite_scores.get) if finite_scores else "Unknown"
    annotation_table.append(
        {
            "cluster": cluster,
            "top_marker": top_marker,
            "score": marker_scores.get(top_marker, np.nan),
            "marker_scores": marker_scores,
        }
    )

annotation_df = pd.DataFrame(annotation_table).sort_values("cluster").reset_index(drop=True)
annotation_df

In [None]:
# Cluster id -> draft label lookup taken from the annotation table.
cluster_to_label = annotation_df.set_index("cluster")["top_marker"]
adata.obs["draft_cell_type"] = adata.obs[annotation_resolution].map(cluster_to_label)

# Ten most frequent draft labels.
label_counts = adata.obs["draft_cell_type"].value_counts()
label_counts.sort_values(ascending=False).head(10)

In [None]:
# Three UMAP panels side by side: clusters, draft labels and donor status,
# with legends drawn on top of the data.
overview_colors = [annotation_resolution, "draft_cell_type", "Donor Status"]
sc.pl.umap(adata, color=overview_colors, frameon=False, wspace=0.4,
           legend_loc="on data", size=3, show=False)
plt.show()

## 3.5 Donor-Specific Views

In [None]:
status_key = "Donor Status"
if status_key not in adata.obs:
    raise ValueError(f"{status_key} column not found in adata.obs")

# Number of cells for every (donor status, cluster) pair.
status_cluster_counts = (
    adata.obs.groupby([status_key, annotation_resolution])
    .size()
    .reset_index(name="n_cells")
)

# Share of each cluster within its donor status, in percent.
cells_per_status = status_cluster_counts.groupby(status_key)["n_cells"].transform("sum")
status_cluster_counts["pct_within_status"] = (
    status_cluster_counts["n_cells"] / cells_per_status * 100
)

# Largest clusters first within each status.
status_cluster_counts.sort_values([status_key, "n_cells"], ascending=[True, False])

In [None]:
# Donor statuses in category order when the column is categorical, otherwise
# the sorted unique non-missing values.
try:
    status_categories = list(adata.obs[status_key].cat.categories)
except AttributeError:
    status_categories = sorted(adata.obs[status_key].dropna().unique())

# One figure per donor status with two panels: clusters and draft labels.
panel_keys = [annotation_resolution, "draft_cell_type"]
for status in status_categories:
    status_mask = adata.obs[status_key] == status
    # Skip statuses without cells before paying for a copy of the subset.
    if not status_mask.any():
        continue
    subset = adata[status_mask].copy()
    sc.pl.umap(
        subset,
        color=panel_keys,
        # One title per panel: a single string only titles the first panel,
        # which then no longer says which variable it shows.
        title=[f"{status_key}: {status} | {panel_key}" for panel_key in panel_keys],
        frameon=False,
        wspace=0.4,
        legend_loc="right margin",
        size=3,
        show=False,
    )
    plt.show()

In [None]:
# UMAP coordinates joined with the annotation columns; cells without a donor
# status are dropped because they cannot be assigned to a facet.
umap_coords = pd.DataFrame(
    adata.obsm["X_umap"],
    columns=["UMAP1", "UMAP2"],
    index=adata.obs.index,
)
facet_columns = adata.obs[[status_key, annotation_resolution, "draft_cell_type"]]
umap_df = umap_coords.join(facet_columns).dropna(subset=[status_key])

# One facet per donor status, in the same order as the per-status figures.
fig = px.scatter(
    umap_df,
    x="UMAP1",
    y="UMAP2",
    color="draft_cell_type",
    facet_col=status_key,
    category_orders={status_key: list(status_categories)},
    opacity=0.7,
    title="UMAP colored by draft cell type, faceted by donor status",
    height=450,
)
fig.update_traces(marker={"size": 4, "line": {"width": 0}})
fig.show()

## 4. Save Updated AnnData

In [None]:
# Write the annotated object to <base_dir>/outputs, creating the folder if needed.
save_dir = os.path.join(base_dir, "outputs")
output_path = os.path.join(save_dir, "CODEX_panc_scvi_draft_annotations.h5ad")
os.makedirs(save_dir, exist_ok=True)

adata.write_h5ad(output_path, compression="gzip")
print(f"Saved annotated object -> {output_path}")