# Multi-Tissue Comparisons and Integration

Integrate and compare multiple samples/tissues.

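**Input:** Multiple annotated samples (`*_annotated.h5ad` files in `../data/processed`)
**Output:** Integrated dataset with batch correction (`integrated_tissues.h5ad`), plus a per-sample cell type composition table and figures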

In [None]:
import sys
sys.path.append('..')
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Directory of processed data. The cells below search it for
# '*_annotated.h5ad' inputs and also write their output files into it.
# The path is relative, so it resolves against the current working directory.
DATA_DIR = Path('../data/processed')
# Destination directory for the figures saved by this notebook.
FIGURES_DIR = Path('../figures/05_tissue_comparisons')
# Create the figure directory (including missing parents) up front;
# exist_ok=True makes re-running this cell harmless.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

## Load Multiple Samples

In [None]:
# Collect the annotated per-sample files.
# Path.glob() yields entries in an arbitrary, filesystem-dependent order, so
# sort them to keep the batch order (and everything derived from it, such as
# category order and plot colors) reproducible between runs and machines.
sample_files = sorted(DATA_DIR.glob('*_annotated.h5ad'))
print(f'Found {len(sample_files)} annotated samples')

if not sample_files:
    # Nothing to load: `adata` is intentionally left undefined here.
    print('No annotated samples found. Run 02_phenotyping.ipynb first.')
elif len(sample_files) == 1:
    # A single sample is loaded as-is; the integration cells below are
    # gated on having more than one sample.
    print('Only one sample found. Multi-sample integration requires at least 2 samples.')
    adata = sc.read_h5ad(sample_files[0])
else:
    # Load every sample and stack them along the observation axis.
    # Each cell is tagged in obs['batch'] with the stem of its source file.
    adatas = [sc.read_h5ad(f) for f in sample_files]
    adata = sc.concat(
        adatas,
        label='batch',
        keys=[f.stem for f in sample_files],
        # Suffix each obs name with its sample key so that cells sharing an
        # ID across samples do not end up with duplicate obs_names.
        index_unique='-',
    )
    print(f'Concatenated shape: {adata.shape}')

## Batch Correction with scVI

In [None]:
# Batch correction is only meaningful when several samples were concatenated.
# NOTE(review): scVI is normally fit on raw counts -- confirm adata.X holds
# counts rather than normalized values before trusting the embedding.
if len(sample_files) >= 2:
    # Register the data with scVI, using the per-sample label as the batch covariate.
    scvi.model.SCVI.setup_anndata(adata, batch_key='batch')

    # Build and fit the model.
    vae = scvi.model.SCVI(adata, n_layers=2, n_latent=30)
    vae.train(early_stopping=True, max_epochs=200)

    # Store the learned latent space alongside the data.
    adata.obsm['X_scvi'] = vae.get_latent_representation()

    # Rebuild the neighbor graph on the latent space, then derive the UMAP
    # embedding and Leiden clusters from that graph.
    sc.pp.neighbors(adata, use_rep='X_scvi')
    sc.tl.umap(adata)
    sc.tl.leiden(adata, key_added='leiden_integrated')

    print('Batch correction complete')

## Visualize Integration

In [None]:
if len(sample_files) > 1:
    # One UMAP panel per obs column, left to right: (column, extra plot kwargs).
    panel_specs = [
        ('batch', {}),
        ('celltype', {}),
        ('leiden_integrated', {'legend_loc': 'on data'}),
    ]

    # Only plot columns that actually exist. The composition cell already
    # treats 'celltype' as optional; plotting it unconditionally would raise
    # a KeyError whenever the column is absent from the concatenated data.
    panels = [(key, kwargs) for key, kwargs in panel_specs if key in adata.obs.columns]
    skipped = [key for key, _ in panel_specs if key not in adata.obs.columns]
    if skipped:
        print(f"Skipping UMAP panels for missing obs columns: {', '.join(skipped)}")

    if panels:
        # 6 inches per panel keeps the original 18x5 size when all three are present.
        fig, axes = plt.subplots(
            1, len(panels), figsize=(6 * len(panels), 5), squeeze=False
        )
        # squeeze=False always returns a 2-D array; flatten to a 1-D row of axes.
        axes = axes.ravel()
        for ax, (key, kwargs) in zip(axes, panels):
            sc.pl.umap(adata, color=key, ax=ax, show=False, **kwargs)
        plt.tight_layout()
        plt.savefig(FIGURES_DIR / 'integrated_umap.png', dpi=300, bbox_inches='tight')
        plt.show()

## Cross-Tissue Comparisons

In [None]:
if len(sample_files) > 1 and 'celltype' in adata.obs.columns:
    # Cell counts per (batch, celltype), pivoted to a batch x celltype table.
    # observed=False is spelled out so that categorical columns keep every
    # category combination (zero-filled); relying on the implicit default
    # triggers a FutureWarning in recent pandas and would change the table
    # shape once that default flips.
    comp = (
        adata.obs
        .groupby(['batch', 'celltype'], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    # Row-normalize so each sample's cell types sum to 100%.
    comp_pct = comp.div(comp.sum(axis=1), axis=0) * 100

    fig, ax = plt.subplots(figsize=(12, 6))
    comp_pct.plot(kind='bar', ax=ax, colormap='tab20')
    ax.set_ylabel('Percentage')
    ax.set_xlabel('Sample')
    ax.set_title('Cell Type Composition Across Samples')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    # Place the legend outside the axes so it does not cover the bars.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    fig.savefig(FIGURES_DIR / 'composition_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Save composition table (percentages, one row per sample).
    comp_pct.to_csv(DATA_DIR / 'tissue_composition_comparison.csv')
    print('Composition comparison saved')

## Save Integrated Data

In [None]:
# Write the dataset to disk. When no annotated samples were found, the
# loading cell never defines `adata`, so writing would fail with a NameError;
# report that instead of crashing.
if len(sample_files) == 0:
    print('No annotated samples were loaded; nothing to save.')
else:
    # With exactly one sample this writes that sample unchanged, since the
    # integration steps only run for two or more samples.
    output_file = DATA_DIR / 'integrated_tissues.h5ad'
    adata.write_h5ad(output_file)
    print(f'Integrated data saved to: {output_file}')
    print('Multi-tissue analysis complete!')