# Cell Type Phenotyping and Annotation

This notebook performs cell type annotation based on marker protein expression:
- Marker protein analysis per cluster
- Cell type scoring using predefined markers
- Automated annotation
- scVI-based advanced clustering
- Manual curation support

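**Input:** Preprocessed AnnData from 01_preprocessing.ipynb (must already contain `leiden` cluster labels in `.obs` and a UMAP embedding, both of which are used below)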

**Output:** Annotated AnnData with cell type assignments

In [None]:
# Import libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import scanpy as sc
import squidpy as sq
import scvi
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every Python warning for the rest of the session. This keeps cell
# output readable but also hides library deprecation notices.
warnings.filterwarnings('ignore')

# scanpy logging level and default figure appearance
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white')
# seaborn grid style for the matplotlib figures drawn below
sns.set_style('whitegrid')

# Record the library versions next to the results
print(f"Scanpy version: {sc.__version__}")
print(f"scVI version: {scvi.__version__}")

## 1. Load Preprocessed Data

In [None]:
# Input/output locations, relative to the notebook's directory
DATA_DIR = Path("../data/processed")
FIGURES_DIR = Path("../figures/02_phenotyping")
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# scanpy's `save=` argument writes into `sc.settings.figdir` (default
# "./figures/"), not into FIGURES_DIR. Point it at FIGURES_DIR so figures saved
# through scanpy land next to the ones saved with matplotlib's savefig.
sc.settings.figdir = FIGURES_DIR

# Sample name (prefix of every file read or written by this notebook)
SAMPLE_NAME = "phenocycler_sample_01"

# Load preprocessed data; the cluster count below requires a `leiden` column
# in `adata.obs`.
adata = sc.read_h5ad(DATA_DIR / f"{SAMPLE_NAME}_preprocessed.h5ad")
print(f"Loaded: {adata.shape[0]} cells, {adata.shape[1]} markers")
print(f"Clusters: {adata.obs['leiden'].nunique()}")

## 2. Define Cell Type Markers

In [None]:
# Marker panel: cell type -> proteins expected to be expressed by that type.
# Customize these for your tissue and available markers.
marker_proteins = {
    'T cells': ['CD3', 'CD8', 'CD4'],
    'B cells': ['CD19', 'CD20'],
    'NK cells': ['CD56', 'CD16'],
    'Macrophages': ['CD68', 'CD163'],
    'Dendritic cells': ['CD11c', 'HLA-DR'],
    'Neutrophils': ['CD15', 'CD66b'],
    'Epithelial': ['PanCK', 'EpCAM'],
    'Endothelial': ['CD31', 'CD34'],
    'Fibroblasts': ['Vimentin', 'aSMA'],
    'Proliferating': ['Ki67', 'PCNA']
}

# Restrict every panel to the proteins measured in this dataset ...
measured_by_type = {
    celltype: [protein for protein in panel if protein in adata.var_names]
    for celltype, panel in marker_proteins.items()
}
# ... and drop cell types for which none of the markers were measured.
available_markers = {
    celltype: measured
    for celltype, measured in measured_by_type.items()
    if measured
}

print("Available markers:")
for celltype, measured in available_markers.items():
    print(f"  {celltype}: {measured}")

## 3. Calculate Marker Proteins per Cluster

In [None]:
# Rank markers for every Leiden cluster (Wilcoxon rank-sum test)
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

# Show (and save) the ten top-ranked markers of each cluster
sc.pl.rank_genes_groups(adata, n_genes=10, sharey=False, save='_per_cluster.png')

# Flatten the ranking into a table with "<cluster>_names" and
# "<cluster>_scores" columns, one pair per cluster.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
marker_columns = {}
for cluster in groups:
    for field in ('names', 'scores'):
        marker_columns[f"{cluster}_{field}"] = result[field][cluster]
top_markers_df = pd.DataFrame(marker_columns)

cluster_markers_csv = DATA_DIR / f"{SAMPLE_NAME}_cluster_markers.csv"
top_markers_df.to_csv(cluster_markers_csv)
print("Cluster markers saved")

## 4. Cell Type Scoring

In [None]:
# Score every cell for every cell type; each score is stored in
# `adata.obs` under "<cell type>_score".
score_cols = []
for celltype, markers in available_markers.items():
    score_name = f'{celltype}_score'
    sc.tl.score_genes(adata, markers, score_name=score_name)
    score_cols.append(score_name)

# Show the first six scores on the UMAP (three panels per row)
if score_cols:
    sc.pl.umap(
        adata,
        color=score_cols[:6],
        ncols=3,
        cmap='viridis',
        save='_celltype_scores.png',
    )

## 5. Automated Cell Type Assignment

In [None]:
# Assign every cell the type with the highest marker score.
#
# Writes two columns to `adata.obs`:
#   celltype            - name of the best-scoring cell type ('Unknown' when no
#                         marker of the panel is present in the dataset)
#   celltype_confidence - margin between the best and second-best score; NaN
#                         when fewer than two cell types were scored, because
#                         there is no runner-up to compare against
# Both columns are always created so that the plotting and export cells further
# down do not fail on a missing column.
celltype_names = list(available_markers.keys())
score_cols = [f'{ct}_score' for ct in celltype_names]
if score_cols:
    score_matrix = adata.obs[score_cols].values
    celltype_idx = np.argmax(score_matrix, axis=1)
    adata.obs['celltype'] = [celltype_names[i] for i in celltype_idx]

    if score_matrix.shape[1] >= 2:
        # Row-wise ascending sort: last column is the top score, the one
        # before it is the runner-up.
        sorted_scores = np.sort(score_matrix, axis=1)
        adata.obs['celltype_confidence'] = sorted_scores[:, -1] - sorted_scores[:, -2]
    else:
        # A single candidate type: indexing [:, -2] would raise IndexError.
        adata.obs['celltype_confidence'] = np.nan

    print("Cell type distribution:")
    print(adata.obs['celltype'].value_counts())
else:
    print("No markers available for automatic assignment")
    adata.obs['celltype'] = 'Unknown'
    adata.obs['celltype_confidence'] = np.nan

## 6. Visualize Cell Type Assignments

In [None]:
# Side-by-side UMAPs: assigned cell type (left) and assignment confidence (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_celltype, ax_confidence = axes
sc.pl.umap(adata, color='celltype', ax=ax_celltype, show=False)
sc.pl.umap(adata, color='celltype_confidence', ax=ax_confidence, show=False, cmap='viridis')
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'celltype_umap.png', dpi=300, bbox_inches='tight')
plt.show()

# Bar chart of the number of cells assigned to each cell type
fig, ax = plt.subplots(figsize=(10, 6))
celltype_counts = adata.obs['celltype'].value_counts()
celltype_counts.plot.bar(ax=ax)
ax.set(ylabel='Number of cells', title='Cell Type Distribution')
# Slanted, right-aligned tick labels keep long cell type names readable
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'celltype_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. scVI-based Clustering (Optional)

In [None]:
# Train an scVI model and cluster the cells in its latent space.
#
# Results written to `adata`:
#   obsm['X_scvi']       - scVI latent representation
#   obs['leiden_scvi']   - Leiden clusters computed on that representation
#   obsm['X_umap_scvi']  - UMAP computed on that representation
# The neighbour graph is stored under its own key and the UMAP that was loaded
# with the data is restored afterwards, so the existing `leiden` labels and the
# earlier UMAP figures stay consistent with `adata` when this cell is (re-)run.
#
# NOTE(review): `layer=None` makes scVI train on `adata.X`. scVI is a count
# model; if preprocessing left normalised/scaled intensities in `adata.X`,
# point `layer` at a layer holding the raw values instead - confirm against
# 01_preprocessing.ipynb.

# Fix the random seed so training is reproducible
SCVI_SEED = 0
scvi.settings.seed = SCVI_SEED

scvi.model.SCVI.setup_anndata(adata, layer=None)
vae = scvi.model.SCVI(adata, n_latent=20, n_layers=2)
vae.train(max_epochs=100, early_stopping=True)

# Get latent representation
adata.obsm['X_scvi'] = vae.get_latent_representation()

# Cluster on the scVI latent space without overwriting the default graph
SCVI_NEIGHBORS_KEY = 'neighbors_scvi'
sc.pp.neighbors(adata, use_rep='X_scvi', key_added=SCVI_NEIGHBORS_KEY)
sc.tl.leiden(adata, key_added='leiden_scvi', resolution=1.0,
             neighbors_key=SCVI_NEIGHBORS_KEY)

# `sc.tl.umap` always writes to obsm['X_umap']; keep a copy of the embedding
# that is already there so it can be put back once the scVI UMAP is plotted.
umap_before_scvi = adata.obsm['X_umap'].copy() if 'X_umap' in adata.obsm.keys() else None
try:
    sc.tl.umap(adata, neighbors_key=SCVI_NEIGHBORS_KEY)

    # Plot scVI clustering (on the scVI-based UMAP)
    sc.pl.umap(adata, color=['leiden_scvi', 'celltype'],
               save='_scvi_clustering.png')

    adata.obsm['X_umap_scvi'] = adata.obsm['X_umap'].copy()
finally:
    if umap_before_scvi is not None:
        adata.obsm['X_umap'] = umap_before_scvi

## 8. Save Annotated Data

In [None]:
# Write the full annotated AnnData object
output_file = DATA_DIR / f"{SAMPLE_NAME}_annotated.h5ad"
adata.write_h5ad(output_file)
print(f"Annotated data saved to: {output_file}")

# Write a per-cell table (cell type, confidence, Leiden cluster) as CSV
# into the same directory
export_columns = ['celltype', 'celltype_confidence', 'leiden']
celltype_df = adata.obs[export_columns]
celltype_csv = output_file.with_name(f"{SAMPLE_NAME}_celltypes.csv")
celltype_df.to_csv(celltype_csv)
print("Cell type assignments exported")

## Next Steps

Proceed to:
- **03_spatial_analysis.ipynb** for spatial analysis
- **04_group_comparisons.ipynb** for group comparisons