# T-cell Integration: Pancreas Spatial + pLN scRNAseq

This notebook performs T-cell specific integration to:
1. **Annotate T-cell subtypes** in pLN scRNAseq data
2. **Match** pLN RNA T-cells to Pancreas protein T-cells
3. **Transfer** subtype labels to spatial coordinates
4. **Enable** gene expression inference for spatial T-cells

## Rationale

- **Pancreas protein**: 8,167 CD3e+ T-cells (from 476k "immune" cells)
- **pLN RNA**: ~4,900 CD3E+ T-cells (from 10k CD45+ cells)
- **Ratio**: ~1.7:1 protein:RNA cells (excellent for MaxFuse)
- **Cross-tissue matching is valid**: T-cell subtypes have consistent marker profiles

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.preprocessing import StandardScaler
import os
import warnings
warnings.filterwarnings('ignore')

# Set scanpy settings
# These are global scanpy options: they apply to every scanpy call and figure
# produced later in the notebook.
sc.settings.verbosity = 2
sc.settings.set_figure_params(dpi=100, facecolor='white')

# Marker that the import/configuration cell finished.
print("Libraries loaded")

Changed working directory to: /home/smith6jt/maxfuse/notebooks


NameError: name 'sc' is not defined

In [None]:
# Load the AnnData objects written by 1_preprocessing.ipynb.
# All relative paths in this notebook assume the working directory is notebooks/.
# NOTE(review): the fallback below is a machine-specific absolute path; on any
# other checkout it is skipped and the FileNotFoundError further down fires.
relative_results_missing = not os.path.exists('../results/1_preprocessing')
if relative_results_missing and os.path.exists('/home/smith6jt/maxfuse/results/1_preprocessing'):
    os.chdir('/home/smith6jt/maxfuse/notebooks')
    print(f"Changed working directory to: {os.getcwd()}")

# Stop early with an actionable message if the inputs are still not reachable.
preprocess_dir = '../results/1_preprocessing'
if not os.path.exists(preprocess_dir):
    raise FileNotFoundError(f"Preprocessing results not found at {preprocess_dir}. Run 1_preprocessing.ipynb first.")

# Read the three inputs in a fixed order: RNA, log-normalized RNA, protein.
rna_full, rna_lognorm, protein_full = (
    sc.read_h5ad(f'{preprocess_dir}/{file_name}')
    for file_name in ('rna_adata.h5ad', 'rna_adata_lognorm.h5ad', 'protein_adata.h5ad')
)

banner = "=" * 60
print(banner)
print("LOADED DATA")
print(banner)
for label, dataset in (("RNA", rna_full), ("\nProtein", protein_full)):
    print(f"{label}: {dataset.shape}")
    print(f"  Tissues: {dataset.obs['Tissue'].value_counts().to_dict()}")

---
## Phase 1: Data Preparation

Filter both datasets to T-cells only.

In [None]:
# Load preprocessed data
# NOTE(review): this repeats the load done by the preceding cell (same three
# files, same variable names) but without that cell's path check. On a full run
# it only re-reads the files; consider deleting one of the two cells.
rna_full, rna_lognorm, protein_full = [
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        '../results/1_preprocessing/rna_adata.h5ad',
        '../results/1_preprocessing/rna_adata_lognorm.h5ad',
        '../results/1_preprocessing/protein_adata.h5ad',
    )
]

banner = "=" * 60
print(banner)
print("LOADED DATA")
print(banner)
for label, dataset in (("RNA", rna_full), ("\nProtein", protein_full)):
    print(f"{label}: {dataset.shape}")
    print(f"  Tissues: {dataset.obs['Tissue'].value_counts().to_dict()}")

In [None]:
# Filter RNA to T-cells from pLN
# Use CD3E expression > 0 after log normalization

print("=" * 60)
print("FILTERING RNA TO T-CELLS")
print("=" * 60)

# The tissue mask is read from rna_full and the CD3E mask from rna_lognorm, and
# the two are combined by position. That is only valid when both objects hold
# the same cells in the same order, so fail loudly instead of silently
# selecting the wrong cells.
if not rna_full.obs_names.equals(rna_lognorm.obs_names):
    raise ValueError(
        "rna_full and rna_lognorm do not contain the same cells in the same order; "
        "their masks cannot be combined positionally."
    )

# Get CD3E expression from log-normalized data as a flat 1-D array
cd3e_expr = rna_lognorm[:, 'CD3E'].X
if sparse.issparse(cd3e_expr):
    cd3e_expr = cd3e_expr.toarray().flatten()
else:
    cd3e_expr = cd3e_expr.flatten()

# Filter criteria: pLN tissue AND CD3E > 0
is_pln = rna_full.obs['Tissue'].values == 'pLN'
is_tcell = cd3e_expr > 0

print(f"pLN cells: {is_pln.sum():,}")
print(f"CD3E+ cells: {is_tcell.sum():,}")
print(f"pLN AND CD3E+: {(is_pln & is_tcell).sum():,}")

# Apply the same boolean mask to both RNA objects so they stay row-aligned
tcell_mask_rna = is_pln & is_tcell
rna_tcells = rna_full[tcell_mask_rna].copy()
rna_tcells_lognorm = rna_lognorm[tcell_mask_rna].copy()

print(f"\nFiltered RNA T-cells: {rna_tcells.shape}")

In [None]:
# Filter Protein to T-cells from Pancreas
# A cell counts as a T-cell when its 'Classification' label contains 'CD3e'

print("=" * 60)
print("FILTERING PROTEIN TO T-CELLS")
print("=" * 60)

# Build the two criteria and their intersection once
is_pancreas = protein_full.obs['Tissue'].values == 'Pancreas'
has_cd3e = protein_full.obs['Classification'].str.contains('CD3e', na=False)
tcell_mask_prot = is_pancreas & has_cd3e

print(f"Pancreas cells: {is_pancreas.sum():,}")
print(f"CD3e+ classified: {has_cd3e.sum():,}")
print(f"Pancreas AND CD3e+: {tcell_mask_prot.sum():,}")

# Per-label breakdown of the selected cells
print("\nPancreas CD3e+ classifications:")
pancreas_cd3e = protein_full[tcell_mask_prot]
class_counts = pancreas_cd3e.obs['Classification'].value_counts()
for cls, n in zip(class_counts.index, class_counts.values):
    print(f"  {cls}: {n:,}")

# Materialize the selection as an independent AnnData
protein_tcells = protein_full[tcell_mask_prot].copy()

print(f"\nFiltered Protein T-cells: {protein_tcells.shape}")

In [None]:
# Report the sizes of the two filtered datasets and their protein:RNA ratio.
# NOTE(review): the closing "Excellent ratio" line is printed unconditionally;
# it is not derived from the ratio computed here.
rule = "=" * 60
n_rna_cells, n_prot_cells = rna_tcells.n_obs, protein_tcells.n_obs
print(rule)
print("T-CELL DATASET SUMMARY")
print(rule)
print(f"RNA T-cells (pLN):      {n_rna_cells:,}")
print(f"Protein T-cells (Panc): {n_prot_cells:,}")
print(f"Cell ratio:             {n_prot_cells / n_rna_cells:.2f}:1")
print("\n✓ Excellent ratio for MaxFuse integration!")

---
## Phase 2: T-cell Subtype Annotation (RNA)

Cluster pLN T-cells and annotate based on canonical markers.

In [None]:
# Prepare RNA T-cells for clustering
# Clustering runs on a scaled copy of the log-normalized T-cell matrix.

print("=" * 60)
print("PREPARING RNA T-CELLS FOR CLUSTERING")
print("=" * 60)

# Work with log-normalized data
adata = rna_tcells_lognorm.copy()

# Find highly variable genes.
# flavor='seurat_v3' models raw counts, so it must not be fed the log-normalized
# matrix. Run it on the count matrix when rna_tcells really holds integer counts
# for exactly these cells and genes; otherwise fall back to flavor='seurat',
# which expects the log-normalized values held in `adata`.
counts_X = rna_tcells.X
count_values = counts_X.data if sparse.issparse(counts_X) else np.asarray(counts_X)
counts_usable = (
    rna_tcells.obs_names.equals(adata.obs_names)
    and rna_tcells.var_names.equals(adata.var_names)
    and bool(np.all(np.mod(count_values, 1) == 0))
)
if counts_usable:
    hvg_source = rna_tcells.copy()  # copy so rna_tcells.var is left untouched
    sc.pp.highly_variable_genes(hvg_source, n_top_genes=2000, flavor='seurat_v3')
    # Carry the HVG results over to the object that is clustered below.
    for hvg_col in ('highly_variable', 'highly_variable_rank', 'means',
                    'variances', 'variances_norm'):
        if hvg_col in hvg_source.var:
            adata.var[hvg_col] = hvg_source.var[hvg_col]
else:
    # NOTE(review): some scanpy releases reportedly index uns['log1p']['base']
    # directly and that key can be lost in an h5ad round-trip; make sure it exists.
    if 'log1p' in adata.uns:
        adata.uns['log1p'].setdefault('base', None)
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor='seurat')
print(f"Highly variable genes: {adata.var['highly_variable'].sum()}")

# Scale (z-score each gene, clipped at 10). From here on adata.X holds scaled
# values, not log-normalized expression.
sc.pp.scale(adata, max_value=10)

# PCA
sc.tl.pca(adata, n_comps=50)

# Neighbors and UMAP
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata)

print("PCA and UMAP computed")

In [None]:
# Cluster T-cells
# Run Leiden at several resolutions so the granularity can be compared

print("=" * 60)
print("CLUSTERING T-CELLS")
print("=" * 60)

leiden_resolutions = [0.3, 0.5, 0.8, 1.0]
for resolution in leiden_resolutions:
    obs_key = f'leiden_{resolution}'
    sc.tl.leiden(adata, resolution=resolution, key_added=obs_key)
    print(f"Resolution {resolution}: {adata.obs[obs_key].nunique()} clusters")

# 'leiden' is the working label used by every downstream cell; it currently
# aliases the resolution-0.5 result (change here to adjust).
adata.obs['leiden'] = adata.obs['leiden_0.5']
print(f"\nUsing resolution 0.5: {adata.obs['leiden'].nunique()} clusters")

In [None]:
# Side-by-side UMAPs: the working clustering and the finer 0.8 resolution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

umap_panels = [
    ('leiden', 'T-cell Clusters (Leiden)'),
    ('leiden_0.8', 'Higher Resolution (0.8)'),
]
for panel_ax, (obs_key, panel_title) in zip(axes, umap_panels):
    sc.pl.umap(adata, color=obs_key, ax=panel_ax, show=False, title=panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Define T-cell marker genes for annotation
# Maps gene symbol -> short human-readable role. The role strings are reused as
# plot titles, and the key order sets the marker order in the plots below.
tcell_markers = {
    # Lineage
    'CD4': 'CD4+ lineage',
    'CD8A': 'CD8+ lineage',
    'CD8B': 'CD8+ lineage',
    
    # Treg
    'FOXP3': 'Treg',
    'IL2RA': 'Treg (CD25)',
    'CTLA4': 'Treg/exhausted',
    
    # Naive/memory
    'CCR7': 'Naive/central memory',
    'SELL': 'Naive (CD62L)',
    'TCF7': 'Naive/stem-like',
    'LEF1': 'Naive',
    'IL7R': 'Memory (CD127)',
    
    # Effector
    'GZMB': 'Effector/cytotoxic',
    'GZMA': 'Effector',
    'PRF1': 'Effector (perforin)',
    'IFNG': 'Effector (IFNg)',
    'GNLY': 'Effector (granulysin)',
    
    # Exhaustion
    'PDCD1': 'Exhausted (PD-1)',
    'LAG3': 'Exhausted',
    'HAVCR2': 'Exhausted (TIM-3)',
    'TOX': 'Exhausted (TOX)',
    'TIGIT': 'Exhausted',
    'ENTPD1': 'Exhausted (CD39)',
    
    # Activation/proliferation
    'MKI67': 'Proliferating',
    'ICOS': 'Activated',
    'CD38': 'Activated',
    
    # Tissue residency
    'CD69': 'Tissue-resident',
    'ITGAE': 'Trm (CD103)',
}

# Filter to markers present in data (dictionary order is preserved)
available_markers = [m for m in tcell_markers.keys() if m in adata.var_names]
print(f"Available markers: {len(available_markers)}/{len(tcell_markers)}")

In [None]:
# UMAP panels for a short list of lineage/state markers, titled with the role
# text from tcell_markers (falls back to the gene symbol)
candidate_markers = ['CD4', 'CD8A', 'FOXP3', 'TCF7', 'GZMB', 'PDCD1', 'TOX', 'MKI67']
key_markers = [gene for gene in candidate_markers if gene in adata.var_names]
panel_titles = [tcell_markers.get(gene, gene) for gene in key_markers]

sc.pl.umap(adata, color=key_markers, ncols=4, cmap='viridis', title=panel_titles)

In [None]:
# Marker expression per cluster (dotplot)
# adata.X was z-scored by sc.pp.scale in the clustering-prep cell, so a dotplot
# drawn from it would compute its "fraction of cells expressing" (values > 0)
# on zero-centred data. Plot the log-normalized values for the same cells
# instead, labelled with the clusters found on `adata`.
marker_adata = rna_tcells_lognorm[adata.obs_names, available_markers].copy()
marker_adata.obs['leiden'] = adata.obs['leiden']

sc.pl.dotplot(marker_adata, var_names=available_markers, groupby='leiden',
              use_raw=False,  # plot marker_adata.X, never a stored .raw
              standard_scale='var', figsize=(16, 6),
              title='T-cell Marker Expression by Cluster')

In [None]:
# Compute mean expression per cluster for annotation
print("=" * 60)
print("CLUSTER MARKER PROFILES")
print("=" * 60)

# Use unscaled, log-normalized expression for the cluster profiles.
# adata.X was z-scored by sc.pp.scale, so its per-cluster means are centred on 0
# and are not comparable with absolute expression cutoffs. Pull the same cells
# (by name, so row order matches adata) from the log-normalized object instead.
marker_matrix = rna_tcells_lognorm[adata.obs_names, available_markers].X
if sparse.issparse(marker_matrix):
    marker_matrix = marker_matrix.toarray()

expr_df = pd.DataFrame(
    np.asarray(marker_matrix),
    index=adata.obs_names,
    columns=available_markers
)
expr_df['cluster'] = adata.obs['leiden'].values

# Mean expression per cluster (rows = clusters, columns = markers)
cluster_means = expr_df.groupby('cluster').mean()

# Key lineage markers, restricted to those actually present
lineage_markers = ['CD4', 'CD8A', 'FOXP3', 'GZMB', 'PDCD1', 'TOX', 'TCF7', 'CCR7']
lineage_markers = [m for m in lineage_markers if m in cluster_means.columns]

print("\nMean expression of key markers per cluster:")
print(cluster_means[lineage_markers].round(2).to_string())

In [None]:
# Semi-automated annotation based on marker expression
# This provides initial labels that should be reviewed
# (labels are assigned per cluster from the cluster_means table, not per cell)

print("=" * 60)
print("AUTOMATED T-CELL SUBTYPE ANNOTATION")
print("=" * 60)

def annotate_tcell_cluster(row, thresholds=None):
    """Assign a T-cell subtype label to one cluster from its marker profile.

    Parameters
    ----------
    row : mapping
        Object with a dict-style ``.get(key, default)`` (a pandas Series or a
        plain dict) holding one expression value per marker gene. Markers that
        are absent are treated as 0.
    thresholds : dict, optional
        Per-marker cutoffs that override the defaults below. A marker counts as
        "high" when its value is strictly greater than its cutoff.

    Returns
    -------
    str
        ``'CD4_Treg'``, or ``'<lineage>_<state>'`` where lineage is one of
        CD4 / CD8 / DP / DN and state is the first rule that matches, in this
        order: Proliferating, Exhausted_Progenitor / Exhausted_Terminal /
        Exhausted, Effector, Naive, Memory, Unspecified.
    """
    # Default cutoffs (adjust based on your data)
    cutoffs = {
        'CD4': 0.5,
        'CD8A': 0.5,
        'FOXP3': 0.3,
        'GZMB': 0.3,
        'PDCD1': 0.3,
        'TOX': 0.5,
        'TCF7': 0.5,
        'CCR7': 0.5,
        'MKI67': 0.1,
    }
    if thresholds:
        cutoffs.update(thresholds)

    high = {marker: row.get(marker, 0) > cutoff for marker, cutoff in cutoffs.items()}

    # Lineage from CD4 / CD8A
    if high['CD4'] and high['CD8A']:
        lineage = 'DP'  # Double positive
    elif high['CD4']:
        lineage = 'CD4'
    elif high['CD8A']:
        lineage = 'CD8'
    else:
        lineage = 'DN'  # Double negative or low

    # Functional state: rules are ordered, first match wins.
    # Treg is only called on a CD4 lineage; FOXP3-high clusters of any other
    # lineage fall through to the remaining rules.
    if high['FOXP3'] and lineage == 'CD4':
        return 'CD4_Treg'

    if high['MKI67']:
        return f'{lineage}_Proliferating'

    if high['PDCD1']:
        if not high['TOX']:
            return f'{lineage}_Exhausted'
        if high['TCF7']:
            return f'{lineage}_Exhausted_Progenitor'
        return f'{lineage}_Exhausted_Terminal'

    if high['GZMB']:
        return f'{lineage}_Effector'

    if high['TCF7']:
        # CCR7 distinguishes naive from memory among TCF7-high clusters
        return f'{lineage}_Naive' if high['CCR7'] else f'{lineage}_Memory'

    return f'{lineage}_Unspecified'

# Label every cluster from its row of cluster_means
cluster_annotations = {
    cluster_id: annotate_tcell_cluster(cluster_means.loc[cluster_id])
    for cluster_id in cluster_means.index
}
for cluster_id, subtype_label in cluster_annotations.items():
    print(f"Cluster {cluster_id}: {subtype_label}")

# Broadcast the per-cluster label to the cells of that cluster
adata.obs['tcell_subtype'] = adata.obs['leiden'].map(cluster_annotations)

print("\nSubtype counts:")
print(adata.obs['tcell_subtype'].value_counts())

In [None]:
# UMAPs of the automated subtype labels next to the clusters they derive from
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

subtype_panels = [
    ('tcell_subtype', 'T-cell Subtypes (Automated)'),
    ('leiden', 'Leiden Clusters'),
]
for panel_ax, (obs_key, panel_title) in zip(axes, subtype_panels):
    sc.pl.umap(adata, color=obs_key, ax=panel_ax, show=False, title=panel_title)

plt.tight_layout()
plt.show()

In [None]:
# MANUAL ANNOTATION CELL
# Review the automated annotations above and adjust as needed
# Uncomment and modify the dictionary below to override annotations.
# Keys are Leiden cluster ids as strings. Note that .map() with a partial
# dictionary yields NaN for every cluster that is not listed, so either list
# all clusters or merge with the automated labels first, e.g.
# {**cluster_annotations, **manual_annotations}.

# manual_annotations = {
#     '0': 'CD8_Effector',
#     '1': 'CD4_Memory',
#     '2': 'CD8_Exhausted',
#     # ... add more as needed
# }
# adata.obs['tcell_subtype'] = adata.obs['leiden'].map(manual_annotations)

print("Review annotations above. Modify this cell if manual corrections needed.")
print("\nCurrent subtype distribution:")
print(adata.obs['tcell_subtype'].value_counts())

In [None]:
# Copy cluster ids, subtype labels and the UMAP embedding from the clustering
# object onto both RNA T-cell objects (obs assignment aligns on cell names)
for rna_target in (rna_tcells, rna_tcells_lognorm):
    for obs_column in ('leiden', 'tcell_subtype'):
        rna_target.obs[obs_column] = adata.obs[obs_column]
    rna_target.obsm['X_umap'] = adata.obsm['X_umap']

# Only rna_tcells additionally receives the PCA embedding
rna_tcells.obsm['X_pca'] = adata.obsm['X_pca']

print("Annotations transferred to RNA objects")

---
## Phase 3: MaxFuse Integration

Match pLN RNA T-cells to Pancreas Protein T-cells using T-cell specific markers.

In [None]:
# T-cell specific feature correspondence between the two modalities.
# Each entry is (RNA gene symbol, protein marker name).

tcell_correspondence = [
    ('CD3E', 'CD3e'),
    ('CD4', 'CD4'),
    ('CD8A', 'CD8'),
    ('FOXP3', 'FOXP3'),
    ('PDCD1', 'PD-1'),
    ('LAG3', 'LAG3'),
    ('TOX', 'TOX'),
    ('TCF7', 'TCF-1'),
    ('ICOS', 'ICOS'),
    ('GZMB', 'Granzyme B'),
    ('ENTPD1', 'CD39'),
    ('MKI67', 'Ki67'),
    ('CD38', 'CD38'),
    ('CD44', 'CD44'),
]

# Keep only pairs whose gene AND marker exist in the respective dataset
valid_correspondence = []
for feature_pair in tcell_correspondence:
    rna_gene, prot_marker = feature_pair
    rna_ok = rna_gene in rna_tcells.var_names
    prot_ok = prot_marker in protein_tcells.var_names
    if not (rna_ok and prot_ok):
        print(f"✗ {rna_gene} ({rna_ok}) → {prot_marker} ({prot_ok})")
        continue
    valid_correspondence.append(feature_pair)
    print(f"✓ {rna_gene} → {prot_marker}")

print(f"\nValid shared features: {len(valid_correspondence)}")
# Two-column string array: column 0 = RNA genes, column 1 = protein markers
correspondence_array = np.array(valid_correspondence)

In [None]:
# Split the correspondence table into its RNA and protein columns
shared_rna_genes = correspondence_array[:, 0]
shared_prot_markers = correspondence_array[:, 1]

# Subset each modality to its shared features; column i of one object
# corresponds to column i of the other
rna_shared_adata = rna_tcells_lognorm[:, shared_rna_genes].copy()
protein_shared_adata = protein_tcells[:, shared_prot_markers].copy()

print("Shared features:")
for modality_label, shared_adata in (("RNA", rna_shared_adata), ("Protein", protein_shared_adata)):
    print(f"  {modality_label}: {shared_adata.shape}")

In [None]:
# Normalize shared features
# Both modalities are z-scored per feature and clipped to [-5, 5]:
#   RNA     - starting from the log-normalized values
#   Protein - starting from the values stored in protein_tcells.X

print("=" * 60)
print("NORMALIZING SHARED FEATURES")
print("=" * 60)

# RNA: densify if needed, then z-score and clip
rna_shared_raw = rna_shared_adata.X.copy()
rna_shared_raw = rna_shared_raw.toarray() if sparse.issparse(rna_shared_raw) else rna_shared_raw

scaler_rna = StandardScaler()
rna_shared = np.clip(scaler_rna.fit_transform(rna_shared_raw), -5, 5).astype(np.float32)

print(f"RNA shared: mean={rna_shared.mean():.4f}, std={rna_shared.std():.4f}")

# Protein: same treatment with its own scaler
protein_shared_raw = protein_shared_adata.X.copy()
protein_shared_raw = protein_shared_raw.toarray() if sparse.issparse(protein_shared_raw) else protein_shared_raw

scaler_prot = StandardScaler()
protein_shared = np.clip(scaler_prot.fit_transform(protein_shared_raw), -5, 5).astype(np.float32)

print(f"Protein shared: mean={protein_shared.mean():.4f}, std={protein_shared.std():.4f}")

In [None]:
# Prepare active features
# RNA: the highly variable genes selected during T-cell clustering
# Protein: identical to the shared protein features

print("=" * 60)
print("PREPARING ACTIVE FEATURES")
print("=" * 60)

# Names of the genes flagged as highly variable on the clustering object
hvg_mask = adata.var['highly_variable']
rna_hvg_names = adata.var_names[hvg_mask]

# Take those genes from the log-normalized data, densify, z-score and clip
rna_active_adata = rna_tcells_lognorm[:, rna_hvg_names].copy()
rna_active_raw = rna_active_adata.X.copy()
rna_active_raw = rna_active_raw.toarray() if sparse.issparse(rna_active_raw) else rna_active_raw

scaler_rna_active = StandardScaler()
rna_active = np.clip(scaler_rna_active.fit_transform(rna_active_raw), -5, 5).astype(np.float32)

print(f"RNA active (HVGs): {rna_active.shape}")

# Independent copy so later edits to one array cannot affect the other
protein_active = protein_shared.copy()
print(f"Protein active: {protein_active.shape}")

In [None]:
# Sanity-check the four input matrices before handing them to MaxFuse
print("=" * 60)
print("DIMENSION VALIDATION")
print("=" * 60)
for array_label, checked_array in (
    ("RNA shared:", rna_shared),
    ("RNA active:", rna_active),
    ("Protein shared:", protein_shared),
    ("Protein active:", protein_active),
):
    print(f"{array_label:<15} {checked_array.shape}")

# Rows must agree within a modality; shared columns must agree across modalities
assert rna_shared.shape[0] == rna_active.shape[0], "RNA cell count mismatch"
assert protein_shared.shape[0] == protein_active.shape[0], "Protein cell count mismatch"
assert rna_shared.shape[1] == protein_shared.shape[1], "Shared feature count mismatch"

# Sizes reused by the following cells
n_rna, n_shared = rna_shared.shape
n_prot = protein_shared.shape[0]

print("\n✓ All dimensions valid")
print(f"  {n_rna:,} RNA cells, {n_prot:,} protein cells")
print(f"  {n_shared} shared features")
print(f"  Ratio: {n_prot/n_rna:.2f}:1")

In [None]:
# Initialize MaxFuse
from maxfuse import Fusor

print("=" * 60)
print("INITIALIZING MAXFUSE")
print("=" * 60)

# Modality 1 is RNA and modality 2 is protein throughout the rest of the
# notebook: every later "...1"/"...2" keyword and matching index follows this
# order. Shared arrays have matching columns; active arrays are per-modality.
fusor = Fusor(
    shared_arr1=rna_shared,
    shared_arr2=protein_shared,
    active_arr1=rna_active,
    active_arr2=protein_active,
    method='centroid_shrinkage'
)

print("Fusor initialized")

In [None]:
# Calculate batching parameters
# With ratio ~1.7:1, we can use standard parameters

ratio = n_prot / n_rna
matching_ratio = max(10, int(ratio) + 5)  # floor of 10 applies here: int(1.7) + 5 = 6
max_outward = min(8000, n_rna)  # equals n_rna (a single batch) whenever n_rna <= 8000

print(f"Batching parameters:")
print(f"  matching_ratio: {matching_ratio}")
print(f"  max_outward_size: {max_outward}")

In [None]:
# Split into batches
# Uses the max_outward / matching_ratio values computed in the previous cell.
fusor.split_into_batches(
    max_outward_size=max_outward,
    matching_ratio=matching_ratio,
    metacell_size=1,  # Disable metacells for small dataset
    verbose=True
)

In [None]:
# Construct graphs
# Parameters for ~14 shared features

n_rna_features = rna_active.shape[1]
n_prot_features = protein_active.shape[1]

# SVD ranks must stay below the number of features in each active matrix.
svd_comp1 = min(50, n_rna_features - 1)
svd_comp2 = min(12, n_prot_features - 1)  # at most 12, and never more than n_prot_features - 1

print(f"Graph construction SVD: RNA={svd_comp1}, Protein={svd_comp2}")

# Arguments suffixed 1 refer to RNA, 2 to protein (order given to Fusor).
fusor.construct_graphs(
    n_neighbors1=15,
    n_neighbors2=15,
    svd_components1=svd_comp1,
    svd_components2=svd_comp2,
    resolution1=1.0,
    resolution2=1.0,
    randomize=False,
    verbose=True
)

In [None]:
# Find initial pivots
# The same SVD rank is used for both modalities' shared features; it is capped
# at 10 and must stay below the number of shared features.
svd_shared = min(10, n_shared - 1)

print(f"Initial pivot SVD: {svd_shared}")

fusor.find_initial_pivots(
    svd_components1=svd_shared,
    svd_components2=svd_shared,
    wt1=0.7,
    wt2=0.7,
    verbose=True
)

In [None]:
# Refine pivots with CCA
# CCA components should be conservative for small shared feature set
cca_components = min(8, n_shared - 1)
# Protein SVD rank before CCA; same formula as svd_comp2 in the graph cell.
svd_cca_prot = min(12, n_prot_features - 1)

print(f"CCA refinement: {cca_components} components")
print(f"SVD before CCA: RNA={svd_comp1}, Protein={svd_cca_prot}")

# filter_prop=0.0 keeps every pivot here; filtering is done in a later cell
# once the score distribution has been inspected.
fusor.refine_pivots(
    wt1=0.3,
    wt2=0.3,
    svd_components1=svd_comp1,
    svd_components2=svd_cca_prot,
    cca_components=cca_components,
    n_iters=1,
    filter_prop=0.0,
    verbose=True
)

In [None]:
# Analyze pivot score distribution
from sklearn.mixture import GaussianMixture

# NOTE(review): this reads a private Fusor attribute and only its first element
# (presumably the first batch) — confirm it covers all pivots if more than one
# batch is ever used.
pivot_scores = fusor._refined_scores[0]

# Two-component Gaussian mixture as a bimodality probe
gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(pivot_scores.reshape(-1, 1))

# The component with the larger mean is treated as the "bad" mode.
# NOTE(review): this assumes a higher score means a worse match — confirm.
means = gmm.means_.flatten()
weights = gmm.weights_
bad_mode_idx = np.argmax(means)
bad_mode_fraction = weights[bad_mode_idx]

rule = "=" * 60
print(rule)
print("PIVOT SCORE ANALYSIS")
print(rule)
print(f"Total pivots: {len(pivot_scores)}")
print(f"Score range: [{pivot_scores.min():.3f}, {pivot_scores.max():.3f}]")
print(f"Mean: {pivot_scores.mean():.3f}, Median: {np.median(pivot_scores):.3f}")
print("\nGMM modes:")
for mode_number, (mode_mean, mode_weight) in enumerate(zip(means, weights), start=1):
    print(f"  Mode {mode_number}: mean={mode_mean:.3f}, weight={mode_weight:.1%}")
print(f"\nBad mode fraction: {bad_mode_fraction:.1%}")

# Histogram with one dashed line per fitted mode
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(pivot_scores, bins=50, density=True, alpha=0.7, edgecolor='white')
for mode_number, (mode_mean, line_color) in enumerate(zip(means, ('green', 'red')), start=1):
    ax.axvline(mode_mean, color=line_color, linestyle='--',
               label=f'Mode {mode_number}: {mode_mean:.2f}')
ax.set_xlabel('Match Score (lower = better)')
ax.set_ylabel('Density')
ax.set_title('Pivot Score Distribution')
ax.legend()
plt.show()

In [None]:
# Filter bad pivots
# Drop the estimated "bad mode" share plus a 2-point margin, but never more
# than 20% of pivots.
pivot_filter_prop = min(0.2, bad_mode_fraction + 0.02)
print(f"Filtering {pivot_filter_prop:.1%} of pivots")

fusor.filter_bad_matches(
    filter_prop=pivot_filter_prop,
    verbose=True
)

In [None]:
# Propagate to all cells
# Reuses the SVD ranks chosen for graph construction (RNA) and CCA refinement
# (protein).
fusor.propagate(
    svd_components1=svd_comp1,
    svd_components2=svd_cca_prot,
    wt1=0.7,
    wt2=0.7,
    verbose=True
)

In [None]:
# Get final matching (RNA-centric)
# full_matching is indexed as [RNA indices, protein indices, scores]
full_matching = fusor.get_matching(order=(1, 2), target='full_data')

n_matched_rna = len(np.unique(full_matching[0]))
n_matched_prot = len(np.unique(full_matching[1]))
final_scores = full_matching[2]

rule = "=" * 60
print(rule)
print("FINAL MATCHING RESULTS")
print(rule)
print(f"Total matches: {len(full_matching[0]):,}")
print(f"Unique RNA cells: {n_matched_rna:,} / {n_rna:,}")
print(f"Unique Protein cells: {n_matched_prot:,} / {n_prot:,}")
print(f"RNA coverage: {100 * n_matched_rna / n_rna:.1f}%")
print("\nScore statistics:")
print(f"  Mean: {np.mean(final_scores):.3f}")
print(f"  Median: {np.median(final_scores):.3f}")
print(f"  Range: [{np.min(final_scores):.3f}, {np.max(final_scores):.3f}]")

---
## Phase 4: Label Transfer & Expression Inference

Transfer T-cell subtype labels to spatial cells and enable gene expression queries.

In [None]:
# Build lookup tables from the flat matching:
#   prot_to_rna: protein index -> (RNA index, score) of its single kept match
#   rna_to_prot: RNA index     -> list of (protein index, score), all matches

print("=" * 60)
print("BUILDING PROTEIN → RNA MAPPING")
print("=" * 60)

rna_indices = full_matching[0]
prot_indices = full_matching[1]
scores = full_matching[2]

# One pass fills both tables.
# NOTE(review): for a protein cell with several RNA matches the LOWEST score is
# kept, i.e. lower is assumed to be better. Confirm this against the score
# convention of fusor.get_matching before relying on the kept match.
prot_to_rna = {}
rna_to_prot = {}
for rna_idx, prot_idx, score in zip(rna_indices, prot_indices, scores):
    current_best = prot_to_rna.get(prot_idx)
    if current_best is None or score < current_best[1]:
        prot_to_rna[prot_idx] = (rna_idx, score)
    rna_to_prot.setdefault(rna_idx, []).append((prot_idx, score))

print(f"Protein cells with RNA match: {len(prot_to_rna):,} / {n_prot:,}")
print(f"RNA cells with protein match: {len(rna_to_prot):,} / {n_rna:,}")

In [None]:
# Give every protein cell the subtype of its matched RNA cell

print("=" * 60)
print("TRANSFERRING SUBTYPE LABELS")
print("=" * 60)

# Subtype of each RNA T-cell, addressed by position
rna_subtypes = rna_tcells.obs['tcell_subtype'].values

# Per-protein-cell outputs, pre-filled with "no match" sentinels
protein_subtypes = np.full(n_prot, 'Unmatched', dtype=object)
protein_match_scores = np.full(n_prot, np.nan)
protein_rna_match = np.full(n_prot, -1, dtype=int)

for prot_idx in prot_to_rna:
    rna_idx, score = prot_to_rna[prot_idx]
    protein_subtypes[prot_idx] = rna_subtypes[rna_idx]
    protein_match_scores[prot_idx] = score
    protein_rna_match[prot_idx] = rna_idx

# Store label, score and matched RNA position on the protein AnnData
for obs_column, column_values in (
    ('tcell_subtype', protein_subtypes),
    ('match_score', protein_match_scores),
    ('rna_match_idx', protein_rna_match),
):
    protein_tcells.obs[obs_column] = column_values

print("\nProtein T-cell subtype distribution:")
print(protein_tcells.obs['tcell_subtype'].value_counts())

In [None]:
# Create expression lookup function

def get_inferred_expression(protein_idx, gene, return_all_matches=False):
    """
    Get inferred gene expression for a protein cell based on its RNA match.

    Reads the notebook-level objects ``rna_tcells_lognorm``, ``prot_to_rna``
    and, when ``return_all_matches`` is True, ``rna_indices`` / ``prot_indices``.

    Parameters:
    -----------
    protein_idx : int
        Index of the protein cell
    gene : str
        Gene name to query
    return_all_matches : bool
        If True, return expression from all matched RNA cells
        (in the order the matches appear in the matching)

    Returns:
    --------
    float or array :
        Default: expression in the single RNA cell stored in ``prot_to_rna``,
        or NaN when the protein cell has no match.
        With ``return_all_matches=True``: 1-D float array with one value per
        matched RNA cell (empty when the protein cell has no match).

    Raises:
    -------
    ValueError
        If ``gene`` is not present in the RNA data.
    """
    if gene not in rna_tcells_lognorm.var_names:
        raise ValueError(f"Gene {gene} not found in RNA data")

    if return_all_matches:
        # prot_to_rna keeps only one RNA cell per protein cell, so go back to
        # the flat matching to recover every RNA cell paired with this one.
        matched_rna = np.asarray(rna_indices)[np.asarray(prot_indices) == protein_idx]
        if matched_rna.size == 0:
            return np.array([], dtype=float)
        all_expr = rna_tcells_lognorm[matched_rna, gene].X
        if sparse.issparse(all_expr):
            all_expr = all_expr.toarray()
        return np.asarray(all_expr, dtype=float).ravel()

    if protein_idx not in prot_to_rna:
        return np.nan

    rna_idx, _score = prot_to_rna[protein_idx]

    expr = rna_tcells_lognorm[rna_idx, gene].X
    if sparse.issparse(expr):
        expr = expr.toarray()

    return float(expr.flatten()[0])

def get_spatial_expression(gene, score_threshold=None):
    """
    Inferred expression of one gene for every protein cell.

    Each protein cell takes the log-normalized value of the RNA cell recorded
    for it in ``prot_to_rna``.

    Parameters:
    -----------
    gene : str
        Gene name to query
    score_threshold : float, optional
        When given, matches whose score is greater than this value are skipped

    Returns:
    --------
    array : length ``n_prot``; NaN for unmatched or skipped protein cells
    """
    if gene not in rna_tcells_lognorm.var_names:
        raise ValueError(f"Gene {gene} not found in RNA data")

    # Flat vector of the gene's expression across all RNA T-cells
    gene_column = rna_tcells_lognorm[:, gene].X
    if sparse.issparse(gene_column):
        gene_column = gene_column.toarray()
    gene_expr = gene_column.flatten()

    spatial_expr = np.full(n_prot, np.nan)
    keep_every_match = score_threshold is None

    for prot_idx, (rna_idx, score) in prot_to_rna.items():
        if keep_every_match or not (score > score_threshold):
            spatial_expr[prot_idx] = gene_expr[rna_idx]

    return spatial_expr

# List the helpers defined in this cell and their call signatures.
print("Expression lookup functions created:")
print("  get_inferred_expression(protein_idx, gene)")
print("  get_spatial_expression(gene, score_threshold=None)")

In [None]:
# Smoke-test the inference helpers on a few cytokine/effector genes
test_genes = ['IFNG', 'IL2', 'TNF', 'GZMB', 'PRF1', 'FOXP3']

print("=" * 60)
print("EXPRESSION INFERENCE TEST")
print("=" * 60)

for gene in test_genes:
    if gene not in rna_tcells_lognorm.var_names:
        print(f"{gene}: NOT FOUND in RNA")
        continue
    spatial_expr = get_spatial_expression(gene)
    # Cells with a value (not NaN) and their mean inferred expression
    n_valid = np.count_nonzero(~np.isnan(spatial_expr))
    mean_expr = np.nanmean(spatial_expr)
    print(f"{gene}: {n_valid:,} cells with data, mean={mean_expr:.3f}")

---
## Phase 5: Spatial Visualization

Visualize T-cell subtypes and inferred expression in spatial context.

In [None]:
# Get spatial coordinates
# Check what coordinate columns are available

print("Available columns in protein_tcells.obs:")
print(protein_tcells.obs.columns.tolist())

# Try common coordinate column names, in priority order
coord_cols = [('X', 'Y'), ('x', 'y'), ('X_centroid', 'Y_centroid'), 
              ('centroid_x', 'centroid_y'), ('Centroid X', 'Centroid Y')]

# Reset both outputs first so a re-run can never pick up coordinates left over
# from an earlier execution; downstream cells test `x_coords is not None`.
x_coords, y_coords = None, None

x_col, y_col = None, None
for xc, yc in coord_cols:
    if xc in protein_tcells.obs.columns and yc in protein_tcells.obs.columns:
        x_col, y_col = xc, yc
        break

if x_col is not None:
    print(f"\nUsing coordinates: {x_col}, {y_col}")
    x_coords = protein_tcells.obs[x_col].values
    y_coords = protein_tcells.obs[y_col].values
else:
    print("\nNo coordinate columns found. Checking obsm...")
    if 'spatial' in protein_tcells.obsm:
        # np.asarray so positional slicing also works if obsm holds a DataFrame
        coords = np.asarray(protein_tcells.obsm['spatial'])
        x_coords = coords[:, 0]
        y_coords = coords[:, 1]
        print(f"Using obsm['spatial']")
    else:
        print("No spatial coordinates found!")

In [None]:
# Spatial scatter of the transferred T-cell subtypes (one colour per subtype)
if x_coords is None:
    print("Cannot create spatial plot - no coordinates available")
else:
    fig, ax = plt.subplots(figsize=(14, 10))

    # Evenly spaced tab20 colours, assigned in order of first appearance
    subtype_labels = protein_tcells.obs['tcell_subtype']
    subtypes = subtype_labels.unique()
    colors = plt.cm.tab20(np.linspace(0, 1, len(subtypes)))
    subtype_colors = dict(zip(subtypes, colors))

    for subtype, subtype_color in subtype_colors.items():
        if subtype == 'Unmatched':
            continue  # unmatched cells are left out of the figure
        mask = subtype_labels == subtype
        ax.scatter(x_coords[mask], y_coords[mask],
                   c=[subtype_color],
                   s=5, alpha=0.7, label=f"{subtype} ({mask.sum()})")

    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_title('T-cell Subtypes in Pancreas (Spatial)')
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=3)
    ax.set_aspect('equal')
    ax.invert_yaxis()  # Match image orientation

    plt.tight_layout()
    plt.show()

In [None]:
# Spatial plot of inferred gene expression
if x_coords is not None:
    # Select genes to visualize (only those present in the RNA data)
    genes_to_plot = ['IFNG', 'GZMB', 'TOX', 'TCF7']
    genes_to_plot = [g for g in genes_to_plot if g in rna_tcells_lognorm.var_names]

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    axes = axes.flatten()

    for ax, gene in zip(axes, genes_to_plot):
        expr = get_spatial_expression(gene)

        # Only plot cells with valid expression
        valid = ~np.isnan(expr)

        # Do NOT name this handle `sc`: that would overwrite the scanpy module
        # alias and break every scanpy call made after this cell has run.
        points = ax.scatter(x_coords[valid], y_coords[valid],
                            c=expr[valid], cmap='viridis', s=5, alpha=0.7)
        plt.colorbar(points, ax=ax, label='Expression')
        ax.set_title(f'{gene} (inferred)')
        ax.set_aspect('equal')
        ax.invert_yaxis()

    # Hide panels that received no gene (fewer than four genes available)
    for unused_ax in axes[len(genes_to_plot):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

---
## Save Results

In [None]:
# Save results
import os
import pickle
import json
from datetime import datetime

results_dir = '../results/4_tcell_integration'
os.makedirs(results_dir, exist_ok=True)

# Save annotated RNA T-cells
rna_tcells.write_h5ad(f'{results_dir}/rna_tcells_annotated.h5ad')
print(f"Saved: rna_tcells_annotated.h5ad")

# Save protein T-cells with transferred labels
protein_tcells.write_h5ad(f'{results_dir}/protein_tcells_labeled.h5ad')
print(f"Saved: protein_tcells_labeled.h5ad")

# Save matching data (flat matching plus the two lookup tables)
matching_data = {
    'rna_indices': full_matching[0],
    'protein_indices': full_matching[1],
    'scores': full_matching[2],
    'prot_to_rna': prot_to_rna,
    'rna_to_prot': rna_to_prot,
}
with open(f'{results_dir}/tcell_matching.pkl', 'wb') as f:
    pickle.dump(matching_data, f)
print(f"Saved: tcell_matching.pkl")

# Save parameters.
# json cannot serialize numpy scalar types, and several of these values can be
# numpy-typed (array-derived counts, string-array elements), so convert
# everything to built-in int/float/str before dumping.
subtype_counts = protein_tcells.obs['tcell_subtype'].value_counts()
params = {
    'timestamp': datetime.now().isoformat(),
    'n_rna_tcells': int(n_rna),
    'n_protein_tcells': int(n_prot),
    'n_shared_features': int(n_shared),
    'shared_features': [str(gene) for gene in shared_rna_genes],
    'n_matched_pairs': int(len(full_matching[0])),
    'rna_coverage': float(len(rna_to_prot) / n_rna),
    'protein_coverage': float(len(prot_to_rna) / n_prot),
    'subtype_counts': {str(subtype): int(count) for subtype, count in subtype_counts.items()},
}
with open(f'{results_dir}/tcell_integration_params.json', 'w') as f:
    json.dump(params, f, indent=2)
print(f"Saved: tcell_integration_params.json")

print(f"\n✓ All results saved to {results_dir}")

In [None]:
# Final recap of dataset sizes, matching coverage and transferred subtypes
wide_rule = "=" * 70
rna_coverage_pct = 100 * len(rna_to_prot) / n_rna
protein_coverage_pct = 100 * len(prot_to_rna) / n_prot

print(wide_rule)
print("T-CELL INTEGRATION SUMMARY")
print(wide_rule)
print(f"\nRNA T-cells (pLN): {n_rna:,}")
print(f"Protein T-cells (Pancreas): {n_prot:,}")
print(f"Shared features: {n_shared}")
print("\nMatching results:")
print(f"  Total matches: {len(full_matching[0]):,}")
print(f"  RNA coverage: {rna_coverage_pct:.1f}%")
print(f"  Protein coverage: {protein_coverage_pct:.1f}%")
print("\nT-cell subtypes transferred to Pancreas:")
for subtype, count in protein_tcells.obs['tcell_subtype'].value_counts().items():
    if subtype == 'Unmatched':
        continue  # unmatched cells are not a transferred subtype
    print(f"  {subtype}: {count:,}")
print(f"\n✓ Expression inference enabled for {rna_tcells_lognorm.n_vars:,} genes")
print(wide_rule)