# Rebuild Trajectory from Fixed Pipeline

**Purpose**: Regenerate `data/adata_ins_root.h5ad` from the validated fixed pipeline,
replacing the problematic version that mixed morphology features and used weak parameters.

**Pipeline**:
1. Load canonical single-cell data (`CODEX_scvi_BioCov_phenotyped_newDuctal.h5ad`)
2. Aggregate to islet-level using `fixed_islet_aggregation.py` (core only, min_cells=20)
3. Compute neighbors using scVI latent space (n=15, cosine)
4. Run PAGA for topology, then UMAP (init_pos='paga')
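5. Compute DPT, rooted at the ND islet with the highest INS expression
6. Validate: INS and GCG correlation with pseudotime, donor-status ordering, per-quintile donor dominance, donor 6533 inclusion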
7. Save validated `data/adata_ins_root.h5ad`

**Key differences from old version**:
- `.X` contains only protein expression (31 vars), not mixed with morphology (47 vars)
- Uses `X_scVI_mean` for neighbors (batch-corrected)
- n_neighbors=15 (standard) vs old n_neighbors=5
- PAGA-initialized UMAP for better topology
- Includes phenotype proportions in `.obsm`

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import stats
import warnings
# Silence FutureWarning messages so they do not clutter the notebook output.
warnings.filterwarnings('ignore', category=FutureWarning)

# Scanpy global settings: logging verbosity and default figure parameters.
sc.settings.verbosity = 2
sc.settings.set_figure_params(dpi=100, frameon=False)

# Add islet_analysis to path for fixed_islet_aggregation
# NOTE(review): the relative path assumes the working directory is a sibling
# of islet_analysis/ — confirm before running from elsewhere.
sys.path.insert(0, os.path.join('..', 'islet_analysis'))
# NOTE(review): only create_islet_dataset_fixed is called in the cells visible
# here; compute_neighbors_and_umap and compute_trajectory appear unused.
from fixed_islet_aggregation import create_islet_dataset_fixed, compute_neighbors_and_umap, compute_trajectory

## 1. Load canonical single-cell data

In [None]:
# Read the canonical single-cell AnnData and print a short summary of it.
sc_path = os.path.join(
    '..',
    'single_cell_analysis',
    'CODEX_scvi_BioCov_phenotyped_newDuctal.h5ad',
)
print(f'Loading: {sc_path}')
adata_sc = sc.read_h5ad(sc_path)
print(f'Shape: {adata_sc.shape}')

# Cells per donor status, then the number of distinct donor images.
status_counts = dict(adata_sc.obs["Donor Status"].value_counts())
print(f'Donor status: {status_counts}')
n_donors = adata_sc.obs["imageid"].nunique()
print(f'Unique donors: {n_donors}')

## 2. Check existing islets_core_fixed.h5ad

If it already exists and contains donor 6533, `X_scVI_mean`, and `phenotype_proportions`, we can use it directly.
Otherwise, regenerate it from the single-cell data.

In [None]:
# Decide whether the cached islet-level file can be reused. `regenerate` is
# read by the next cell; `adata` is only bound here when the cache is usable.
fixed_path = os.path.join('..', 'islet_analysis', 'islets_core_fixed.h5ad')
regenerate = not os.path.exists(fixed_path)

if regenerate:
    print(f'{fixed_path} not found — will generate from single-cell data')
else:
    adata_fixed = sc.read_h5ad(fixed_path)
    print(f'Existing islets_core_fixed.h5ad: {adata_fixed.shape}')
    print(f'Variables: {list(adata_fixed.var_names)}')
    print(f'Obsm keys: {list(adata_fixed.obsm.keys())}')
    print(f'Donor status: {dict(adata_fixed.obs["donor_status"].value_counts())}')

    # Requirement 1: at least one imageid mentions donor 6533.
    cached_donors = adata_fixed.obs['imageid'].unique()
    has_6533 = any('6533' in str(donor) for donor in cached_donors)
    print(f'Has 6533: {has_6533}')

    # Requirement 2: per-islet scVI latent means are stored.
    has_scvi = 'X_scVI_mean' in adata_fixed.obsm
    print(f'Has X_scVI_mean: {has_scvi}')

    # Requirement 3: phenotype proportions are stored.
    has_pheno = 'phenotype_proportions' in adata_fixed.obsm
    print(f'Has phenotype_proportions: {has_pheno}')

    if has_6533 and has_scvi and has_pheno:
        print('\nExisting file has all required properties')
        adata = adata_fixed
    else:
        print('\nMissing required data — will regenerate from single-cell data')
        regenerate = True

In [None]:
# Either reuse the cached islet dataset or rebuild it from the single-cell data
# and refresh the cache on disk.
if not regenerate:
    print(f'Using existing {fixed_path} ({adata.n_obs} islets)')
else:
    print('Regenerating islet-level dataset from single-cell data...')
    print('Parameters: region=islet_only, min_cells=20')
    aggregation_kwargs = {
        'region': 'islet_only',
        'min_cells': 20,
        # Don't require peri-islet for trajectory
        'require_paired': False,
    }
    adata = create_islet_dataset_fixed(adata_sc, **aggregation_kwargs)
    print(f'\nGenerated: {adata.shape}')

    # Persist the rebuilt dataset so later runs can reuse it.
    adata.write_h5ad(fixed_path)
    print(f'Saved: {fixed_path}')

## 3. Verify dataset properties

In [None]:
# Print an overview of the islet-level AnnData before building the trajectory.
islet_obs = adata.obs
print(f'Dataset shape: {adata.shape}')
print(f'\nVariables (should be proteins only):')
print(list(adata.var_names))
print(f'\nObs columns: {list(islet_obs.columns)}')
print(f'\nObsm keys: {list(adata.obsm.keys())}')

# Islet counts per donor status, then per donor image.
print(f'\nDonor status:')
print(islet_obs['donor_status'].value_counts())
print(f'\nDonors (imageids):')
print(islet_obs['imageid'].value_counts())

# Collect every imageid whose string form mentions donor 6533.
donors_with_6533 = []
for donor in islet_obs['imageid'].unique():
    if '6533' in str(donor):
        donors_with_6533.append(donor)
print(f'\n6533 donors found: {donors_with_6533}')
if len(donors_with_6533) == 0:
    print('WARNING: 6533 not found! Check single-cell data inclusion.')

## 4. Compute neighbors and UMAP using scVI latent space

In [None]:
# Clear any existing embeddings to start fresh
for key in ['X_umap', 'X_diffmap', 'X_pca']:
    if key in adata.obsm:
        del adata.obsm[key]
for key in ['neighbors', 'paga']:
    if key in adata.uns:
        del adata.uns[key]

# Compute neighbors from scVI latent means (batch-corrected)
if 'X_scVI_mean' in adata.obsm:
    print('Computing neighbors from X_scVI_mean (batch-corrected)...')
    sc.pp.neighbors(adata, n_neighbors=15, use_rep='X_scVI_mean', metric='cosine')
else:
    print('WARNING: X_scVI_mean not available, falling back to PCA on expression')
    sc.pp.pca(adata, n_comps=10)
    sc.pp.neighbors(adata, n_neighbors=15)

print('Neighbors computed')

In [None]:
# PAGA for trajectory topology
sc.tl.paga(adata, groups='donor_status')
print('PAGA connectivity:')
print(adata.uns['paga']['connectivities'].toarray())

# PAGA-initialized UMAP
sc.pl.paga(adata, plot=False)
sc.tl.umap(adata, init_pos='paga', min_dist=0.1, spread=1.5)
print(f'\nUMAP computed: {adata.obsm["X_umap"].shape}')

## 5. Compute diffusion pseudotime (DPT)

In [None]:
# Find root: ND islet with highest INS expression
ins_idx = list(adata.var_names).index('INS')
nd_mask = adata.obs['donor_status'] == 'ND'

if nd_mask.any():
    nd_indices = np.where(nd_mask)[0]
    ins_values = adata.X[nd_mask, ins_idx]
    root_idx = nd_indices[np.argmax(ins_values)]
    adata.uns['iroot'] = root_idx
    print(f'Root islet: index={root_idx}, INS={adata.X[root_idx, ins_idx]:.3f}')
    print(f'Root donor: {adata.obs.iloc[root_idx]["imageid"]}, status: {adata.obs.iloc[root_idx]["donor_status"]}')
else:
    raise ValueError('No ND islets found!')

# Compute diffusion map
sc.tl.diffmap(adata, n_comps=10)
print(f'Diffusion map: {adata.obsm["X_diffmap"].shape}')

# Compute DPT
sc.tl.dpt(adata)
print(f'\nPseudotime range: [{adata.obs["dpt_pseudotime"].min():.3f}, {adata.obs["dpt_pseudotime"].max():.3f}]')

# Store diffusion components in obs for easy access
adata.obs['DC1'] = adata.obsm['X_diffmap'][:, 0]
adata.obs['DC2'] = adata.obsm['X_diffmap'][:, 1]

## 6. Validate trajectory quality

In [None]:
# Validate the pseudotime against biological expectations and print a summary.
# Relies on `adata` (with dpt_pseudotime) and `ins_idx` from the previous cells.
# Leaves `pt`, `ins_expr`, `r_ins`, `status_order` and `all_pass` for later cells.
print('='*60)
print('TRAJECTORY VALIDATION')
print('='*60)

pt = adata.obs['dpt_pseudotime'].values

# --- Check 1: INS negatively correlated with pseudotime ---
ins_expr = adata.X[:, ins_idx]
r_ins, p_ins = stats.spearmanr(pt, ins_expr, nan_policy='omit')
print(f'\n1. INS vs pseudotime: r={r_ins:.3f}, p={p_ins:.2e}')
print(f'   Expected: r < -0.3 → {"PASS" if r_ins < -0.3 else "FAIL"}')

# --- Check 2: GCG positively correlated ---
has_gcg = 'GCG' in adata.var_names
if has_gcg:
    gcg_idx = list(adata.var_names).index('GCG')
    gcg_expr = adata.X[:, gcg_idx]
    r_gcg, p_gcg = stats.spearmanr(pt, gcg_expr, nan_policy='omit')
    print(f'\n2. GCG vs pseudotime: r={r_gcg:.3f}, p={p_gcg:.2e}')
    print(f'   Expected: r > 0.2 → {"PASS" if r_gcg > 0.2 else "FAIL"}')

# --- Check 3: Donor status ordering ---
print(f'\n3. Pseudotime by donor status:')
status_order = ['ND', 'Aab+', 'T1D']
status_means = []
missing_statuses = []
for status in status_order:
    mask = (adata.obs['donor_status'] == status).to_numpy()
    if not mask.any():
        missing_statuses.append(status)
        continue
    mean_pt = pt[mask].mean()
    std_pt = pt[mask].std()
    status_means.append(mean_pt)
    print(f'   {status:5s}: {mean_pt:.3f} ± {std_pt:.3f} (n={mask.sum()})')

if missing_statuses:
    print(f'   WARNING: no islets found for status: {missing_statuses}')

# An ordering needs at least two groups; with fewer, all() over an empty
# sequence would report a PASS without comparing anything.
ordering_correct = len(status_means) >= 2 and all(
    earlier <= later for earlier, later in zip(status_means, status_means[1:])
)
print(f'   Expected: ND < Aab+ < T1D → {"PASS" if ordering_correct else "FAIL"}')

# --- Check 4: No single donor dominates quintiles ---
print(f'\n4. Donor distribution across pseudotime quintiles:')
adata.obs['pt_quintile'] = pd.qcut(pt, q=5, labels=['Q1 (early)', 'Q2', 'Q3', 'Q4', 'Q5 (late)'])
ct = pd.crosstab(adata.obs['pt_quintile'], adata.obs['imageid'])
# Row-normalise so each quintile sums to 1, then take the largest donor share.
ct_pct = ct.div(ct.sum(axis=1), axis=0)
max_dominance = ct_pct.max().max()
print(f'   Max donor fraction in any quintile: {max_dominance:.2f}')
print(f'   Expected: < 0.80 → {"PASS" if max_dominance < 0.80 else "WARNING: donor dominance detected"}')
print(ct)

# --- Check 5: Donor 6533 included ---
has_6533 = any('6533' in str(x) for x in adata.obs['imageid'].unique())
print(f'\n5. Donor 6533 included: {"PASS" if has_6533 else "FAIL"}')

# --- Overall ---
checks = [
    ('INS correlation < -0.3', r_ins < -0.3),
    ('Donor status ordering', ordering_correct),
    ('No donor dominance', max_dominance < 0.80),
    ('6533 included', has_6533),
]
if has_gcg:
    checks.insert(1, ('GCG correlation > 0.2', r_gcg > 0.2))

print(f'\n{"="*60}')
print('VALIDATION SUMMARY')
all_pass = all(passed for _, passed in checks)
for desc, passed in checks:
    verdict = 'PASS' if passed else 'FAIL'
    print(f'  [{verdict}] {desc}')
print(f'\nOverall: {"ALL PASSED" if all_pass else "SOME CHECKS FAILED — review above"}')

## 7. Visualization

In [None]:
# Six-panel validation figure: three UMAPs on top; pseudotime violins, INS
# scatter and per-marker correlations below. Relies on `pt`, `ins_expr`,
# `r_ins` and `status_order` from the validation cell.

# One palette for every panel so donor-status colours stay consistent.
status_palette = {'ND': '#2ca02c', 'Aab+': '#ffcc00', 'T1D': '#9467bd'}

fig, axes = plt.subplots(2, 3, figsize=(18, 11))

# Row 1: UMAP
sc.pl.umap(adata, color='donor_status', ax=axes[0, 0], show=False, title='Donor Status',
           palette=status_palette)
sc.pl.umap(adata, color='dpt_pseudotime', ax=axes[0, 1], show=False, title='DPT Pseudotime',
           color_map='viridis')
sc.pl.umap(adata, color='INS', ax=axes[0, 2], show=False, title='INS Expression',
           color_map='RdYlBu_r')

# Row 2: Validation plots
# Pseudotime violin by donor status
for i, status in enumerate(status_order):
    mask = (adata.obs['donor_status'] == status).to_numpy()
    data = pt[mask]
    if data.size == 0:
        # violinplot raises on an empty dataset; keep the tick, skip the violin.
        continue
    parts = axes[1, 0].violinplot([data], positions=[i], showmeans=True, showmedians=True)
    for pc in parts['bodies']:
        pc.set_facecolor(status_palette[status])
        pc.set_alpha(0.7)
axes[1, 0].set_xticks(range(len(status_order)))
axes[1, 0].set_xticklabels(status_order)
axes[1, 0].set_ylabel('Pseudotime')
axes[1, 0].set_title('Pseudotime by Donor Status')

# INS vs pseudotime scatter
colors = adata.obs['donor_status'].map(status_palette)
axes[1, 1].scatter(pt, ins_expr, c=colors, alpha=0.5, s=15)
axes[1, 1].set_xlabel('Pseudotime')
axes[1, 1].set_ylabel('INS expression')
axes[1, 1].set_title(f'INS vs Pseudotime (r={r_ins:.3f})')

# Marker correlations bar plot
# enumerate yields each marker's column index directly, with no per-marker list search.
correlations = []
for m_idx, marker in enumerate(adata.var_names):
    r, p = stats.spearmanr(pt, adata.X[:, m_idx], nan_policy='omit')
    correlations.append({'marker': marker, 'r': r, 'p': p})
corr_df = pd.DataFrame(correlations).sort_values('r')

bar_colors = ['#d62728' if r < 0 else '#2ca02c' for r in corr_df['r']]
axes[1, 2].barh(corr_df['marker'], corr_df['r'], color=bar_colors, height=0.7)
axes[1, 2].set_xlabel('Spearman r with pseudotime')
axes[1, 2].set_title('Marker-Pseudotime Correlations')
axes[1, 2].axvline(0, color='k', linewidth=0.5)
axes[1, 2].tick_params(axis='y', labelsize=7)

plt.tight_layout()
plt.savefig('../notebooks/trajectory_validation.png', dpi=150, bbox_inches='tight')
plt.show()
print('Figure saved: trajectory_validation.png')

## 8. Save validated trajectory H5AD

In [None]:
# Clean up temporary columns
if 'pt_quintile' in adata.obs.columns:
    del adata.obs['pt_quintile']

# Add combined_islet_id for compatibility with the Shiny trajectory module
# The module expects 'combined_islet_id' (format: "imageid_Islet_N")
# In islets_core_fixed.h5ad this is stored as 'islet_id'
if 'islet_id' in adata.obs.columns and 'combined_islet_id' not in adata.obs.columns:
    adata.obs['combined_islet_id'] = adata.obs['islet_id'].copy()
    print(f'Added combined_islet_id from islet_id (e.g., {adata.obs["combined_islet_id"].iloc[0]})')

output_path = os.path.join('..', 'data', 'adata_ins_root.h5ad')
adata.write_h5ad(output_path)
print(f'Saved validated trajectory: {output_path}')
print(f'  Shape: {adata.shape}')
print(f'  Obsm: {list(adata.obsm.keys())}')
print(f'  Obs columns: {list(adata.obs.columns)}')
print(f'\nThis file replaces the problematic old version.')
print(f'Key improvements:')
print(f'  - .X contains {adata.n_vars} protein markers only (no morphology mixing)')
print(f'  - Neighbors computed from X_scVI_mean (batch-corrected)')
print(f'  - PAGA-initialized UMAP')
print(f'  - n_neighbors=15 (standard)')
print(f'  - {adata.n_obs} islets with validated pseudotime')

## 9. Compare with old version (if available)

In [None]:
# Load old version for comparison if it exists as backup
old_path = os.path.join('..', 'data', 'adata_ins_root_old.h5ad')
if os.path.exists(old_path):
    adata_old = sc.read_h5ad(old_path)
    print('Comparison with old version:')
    print(f'  Old: {adata_old.shape[0]} islets, {adata_old.shape[1]} vars')
    print(f'  New: {adata.shape[0]} islets, {adata.shape[1]} vars')
    print(f'  Old vars: {list(adata_old.var_names)}')
    print(f'  New vars: {list(adata.var_names)}')
    print(f'  Old obsm: {list(adata_old.obsm.keys())}')
    print(f'  New obsm: {list(adata.obsm.keys())}')
else:
    print(f'No old version found at {old_path} for comparison.')
    print('Consider backing up the current data/adata_ins_root.h5ad before overwriting.')