# Data Integration 
(Using scANVI, celltypist and GPT-annotations)

In [1]:
## Define paths
# All inputs the notebook reads are configured in this cell.

### Human Datasets
path_to_hum = 'path/to/...' # Must be a subset of >300 cells per subtype

### NMR Dataset
path_to_nmr = 'path/to/...' # Using the merge of all regions from cellbender-filtered cells

### Orthology map
path_to_orthologs = '/orthology/nmr_human_orthology.csv' # Merge of biomart nmr->human and human->nmr outputs (1:1 orthologs)

### Annotations
# NOTE(review): these three paths are empty placeholders and are not read by
# any cell shown in this notebook.
path_to_harmony_crossann = ''
path_to_celltypist_ann = ''
path_to_gpt_ann = ''

### Revision number (change every time you run it)
# Also used as the random seed of the human train/test split and as part of
# the saved UMAP figure name, so changing it changes the split.
rev_n = 10



In [2]:
import pandas as pd
import scanpy as sc
import scvi
import anndata
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


  from .autonotebook import tqdm as notebook_tqdm


## Load orthology map

In [None]:
# Read the orthology table from the CSV at `path_to_orthologs`.
# NOTE(review): `othologies` is not referenced by any later cell shown in this
# notebook, so no gene-name mapping is actually applied yet.
othologies = pd.read_csv(path_to_orthologs)

## Load Human Data
Using ≥300 cells per subtype (use ~100 per subtype for testing)

In [None]:
# Read the human dataset and tag every cell with its species of origin.
human_adata = sc.read(path_to_hum)
human_adata.obs['species'] = 'human'


In [None]:
# Basic QC on the human data: drop genes detected in fewer than 3 cells, then
# drop cells with fewer than 200 detected genes. Both filters live in scanpy's
# preprocessing module (`sc.pp`); they are not top-level `sc.*` functions.
sc.pp.filter_genes(human_adata, min_cells=3)
sc.pp.filter_cells(human_adata, min_genes=200)

## Subset Testing Human Data
Using ~100 cells per subtype

In [None]:
def subset_adata(adata, random_seed=None, test_fraction=0.20):
    """
    Randomly split a dataset into a training set and a held-out test set.

    Cells are drawn uniformly at random without replacement; the draw is not
    stratified by any annotation.

    Parameters
    ----------
    adata
        Object exposing ``n_obs`` and supporting boolean-mask row slicing
        followed by ``.copy()`` (e.g. an AnnData).
    random_seed
        Seed passed to ``numpy.random.default_rng``. ``None`` gives a
        different split on every call.
    test_fraction
        Fraction of cells, strictly between 0 and 1, placed in the test set.
        The test set always contains at least one cell.

    Returns
    -------
    (train_adata, test_adata)
        Independent copies holding the non-selected and the selected cells.

    Raises
    ------
    ValueError
        If ``test_fraction`` is not strictly between 0 and 1, or if ``adata``
        contains no cells.
    """
    n_cells = adata.n_obs
    if not (0.0 < test_fraction < 1.0):
        raise ValueError("test_fraction must be between 0 and 1 (exclusive).")
    if n_cells == 0:
        # Fail with a clear message instead of an opaque sampling error.
        raise ValueError("Cannot split a dataset that contains no cells.")

    # test_fraction < 1 guarantees int(n_cells * test_fraction) < n_cells, so
    # only the lower bound needs clamping.
    n_sample = max(1, int(n_cells * test_fraction))
    print(f"Original: {n_cells} cells, target {n_sample}")

    rng = np.random.default_rng(random_seed)
    chosen_pos = rng.choice(n_cells, size=n_sample, replace=False)

    # Boolean mask over cell positions: True marks a held-out test cell.
    mask = np.zeros(n_cells, dtype=bool)
    mask[chosen_pos] = True

    # Copy both slices so neither result is a view of the input.
    test_adata = adata[mask].copy()
    train_adata = adata[~mask].copy()

    return train_adata, test_adata


In [None]:
# Hold out 20% of the human cells as a test set, seeding the split with `rev_n`.
# NOTE(review): the training split is bound to the name `nmr_adatahuman_train`,
# which the merge step also uses; rename both together if it is cleaned up.
nmr_adatahuman_train, human_test = subset_adata(human_adata, random_seed=rev_n, test_fraction=0.2)

## Load NMR Data


In [None]:
# Read the NMR dataset and tag every cell with its species of origin.
nmr_adata = sc.read(path_to_nmr)
nmr_adata.obs['species'] = 'nmr'

## Map NMR gene names to human
(maintain NMR gene names for non-orthologs)

## Subset human genes to only 1:1 orthologs with NMR

In [None]:
# Basic QC on the NMR data: drop genes detected in fewer than 3 cells, then
# drop cells with fewer than 200 detected genes. Both filters live in scanpy's
# preprocessing module (`sc.pp`); they are not top-level `sc.*` functions.
sc.pp.filter_genes(nmr_adata, min_cells=3)
sc.pp.filter_cells(nmr_adata, min_genes=200)

## Merge datasets

In [None]:
# Merge the three datasets in a single step with `anndata.concat` (scanpy has
# no top-level `sc.concatenate`). The dict keys become the values of
# obs['species'], so cells end up tagged 'nmr', 'human' or 'human_test'.
#   * join='outer' keeps the union of genes; fill_value=0 writes zeros for
#     genes that are absent from a dataset.
#   * index_unique='-' appends '-<key>' to each cell name so barcodes shared
#     between datasets stay unique.
all_adata = anndata.concat(
    {'nmr': nmr_adata, 'human': nmr_adatahuman_train, 'human_test': human_test},
    join='outer',
    label='species',
    index_unique='-',
    fill_value=0,
)

# Preprocessing

In [None]:
# Ensure gene (var) names are unique on the merged object.
all_adata.var_names_make_unique()

In [None]:
# Save raw counts
# Stored as a copy under layers['counts'] so later changes to .X do not affect it.
all_adata.layers['counts']= all_adata.X.copy()

In [None]:
# Normalization: scale every cell to 10,000 total counts, then log1p-transform.
# Both functions live in scanpy's preprocessing module (`sc.pp`).
sc.pp.normalize_total(all_adata, target_sum=1e4)
sc.pp.log1p(all_adata)

# Keep norm+log1p in .raw
all_adata.raw = all_adata.copy()

### Find HVG

In [None]:
# Select the top 4,000 highly variable genes and subset the object to them.
# The scanpy function is `highly_variable_genes`. The 'seurat_v3' flavor works
# on raw counts, which is why the 'counts' layer is passed explicitly.
sc.pp.highly_variable_genes(
    all_adata,
    n_top_genes=4000,
    subset=True,
    flavor='seurat_v3',
    layer='counts',
    batch_key='species',
)

## Train scANVI model

In [None]:

# scANVI needs every cell to carry a label value, so cells without a
# `cell_supertype` (NaN) get an explicit placeholder category in a dedicated
# column instead of NaN being used as the unlabeled category.
SCANVI_UNLABELED = 'Unknown'
_supertype = all_adata.obs['cell_supertype'].astype(object)
all_adata.obs['scanvi_label'] = (
    _supertype.where(_supertype.notna(), SCANVI_UNLABELED).astype('category')
)

# Register the data with scANVI. The model is trained on raw counts, so it is
# pointed at the 'counts' layer rather than the log-normalised .X.
# NOTE(review): every covariate column listed below must exist in
# all_adata.obs and be free of missing values - confirm after the merge.
scvi.model.SCANVI.setup_anndata(all_adata,
                                layer='counts',
                                categorical_covariate_keys=['species', 'region','sample'],
                                continuous_covariate_keys= ['n_genes_by_counts','total_counts','pct_counts_mt','pct_counts_ribo'],
                                batch_key='dataset_name',
                                labels_key='scanvi_label',
                                unlabeled_category=SCANVI_UNLABELED)

In [None]:
# Build the scANVI model on the registered AnnData. The unlabeled category is
# declared in `SCANVI.setup_anndata`; the constructor does not take it.
scanvi_model = scvi.model.SCANVI(all_adata)


In [None]:
# Train scANVI for up to 1000 epochs. The KL warm-up length is a training-plan
# option, so it is passed through `plan_kwargs` rather than as a direct
# keyword (unknown direct keywords are forwarded to the trainer).
scanvi_model.train(max_epochs=1000, plan_kwargs={'n_epochs_kl_warmup': 500})

In [None]:
# Store the model's per-cell latent representation under obsm['X_scVI']
# (the key says scVI, but the values come from the scANVI model).
all_adata.obsm['X_scVI'] = scanvi_model.get_latent_representation()

In [None]:
# Store the model's normalised expression, requested at a library size of 1e4,
# as an extra layer.
all_adata.layers['X_scVI_normalized'] = scanvi_model.get_normalized_expression(library_size=1e4)

### Find Doublets with scVI

In [None]:
# Build a SOLO doublet classifier on top of the trained model.
# NOTE(review): `SOLO.from_scvi_model` is documented for SCVI models; confirm
# that it accepts this SCANVI model in the installed scvi-tools version.
dbl_solo_model = scvi.external.SOLO.from_scvi_model(scanvi_model)
# SOLO trains a classifier, so there is no KL warm-up setting to pass.
dbl_solo_model.train(max_epochs=1000)

In [None]:
# Soft SOLO scores per cell, plus the hard call in its own column.
df = dbl_solo_model.predict()
df['dbl_prediction'] = dbl_solo_model.predict(soft=False)
# Remove the trailing '-0' from SOLO's cell names, but only when the index
# does not already match the cell names in all_adata.
if not df.index.isin(all_adata.obs_names).all():
    df.index = df.index.map(lambda cell_name: cell_name[:-2])

In [None]:
# Copy the hard doublet call onto the cells; `.loc` looks the values up by
# cell name so they follow the order of all_adata.obs_names.
all_adata.obs['dbl_prediction'] = df.loc[all_adata.obs_names, 'dbl_prediction'].values

In [None]:
# Drop predicted doublets. `.copy()` turns the slice into a standalone object,
# because the following cells (neighbors, leiden, umap) write results into it.
all_adata = all_adata[all_adata.obs['dbl_prediction'] != 'doublet'].copy()

### Perform Clustering

In [None]:
# Build the neighbour graph on the latent space stored in obsm['X_scVI'].
sc.pp.neighbors(all_adata, use_rep='X_scVI')


In [None]:
# Leiden clustering at resolution 1.2.
sc.tl.leiden(all_adata, resolution=1.2)

In [None]:
# Compute the UMAP embedding used by the plot that follows.
sc.tl.umap(all_adata)

In [None]:
# Plot the UMAP coloured by dataset, cell supertype and Leiden cluster; the
# `save` suffix carries the revision number `rev_n`.
sc.pl.umap(all_adata, color=['dataset_name', 'cell_supertype', 'leiden'], wspace=0.4, save=f'_scanvi_rev{rev_n}.png')

0) Assumptions about your adata (what the code expects)

adata is an AnnData with:

.X normalized (you mentioned preprocessing done).

adata.obs['species'] contains 'human' for human cells and 'nmr' (or similar) for naked mole rat cells.

adata.obs['human_label'] exists for human cells (categorical labels at your chosen supertype level); for NMR this can be NaN, 'Unknown' or missing.

The gene set is the intersection of 1:1 orthologs (you already have that).

If names differ, adapt variable names in the code accordingly.

In [None]:
# Work on a copy of the merged object so the original can be recovered.
# The notebook never defines a bare `adata`; the merged object is `all_adata`.
adata_sc = all_adata.copy()

# 1) Create a labels column that scANVI can use.
#    Human cells keep their annotation; every other cell, and any human cell
#    without an annotation, is marked as unlabeled.
#    'human_label' is preferred; 'cell_supertype' (the label column used for
#    the first scANVI model) is the fallback when it is absent.
label_source = next(
    (col for col in ('human_label', 'cell_supertype') if col in adata_sc.obs.columns),
    None,
)
if label_source is None:
    raise KeyError(
        "No label column found in adata_sc.obs: expected 'human_label' or 'cell_supertype'."
    )

# Set unlabeled category name
unlabeled_cat = 'Unknown'
scvi_label = adata_sc.obs[label_source].astype('category')
if unlabeled_cat not in scvi_label.cat.categories:
    # Guarded so that re-running the cell does not fail on a duplicate category.
    scvi_label = scvi_label.cat.add_categories([unlabeled_cat])
is_unlabeled = (adata_sc.obs['species'] != 'human') | scvi_label.isna()
scvi_label.loc[is_unlabeled] = unlabeled_cat
# Keep the placeholder as the last category so the labelled classes keep a
# stable order.
ordered_categories = [cat for cat in scvi_label.cat.categories if cat != unlabeled_cat]
ordered_categories.append(unlabeled_cat)
adata_sc.obs['scvi_label'] = scvi_label.cat.reorder_categories(ordered_categories)

# If you have a batch key (donor, library, region) include it; else leave out.
# Example: use 'batch' if exists
batch_key = 'batch' if 'batch' in adata_sc.obs.columns else None

# Register anndata with scvi-tools through the model-level `setup_anndata`,
# the same API used for the first scANVI model in this notebook. Raw counts
# are used when the 'counts' layer is available.
scvi.model.SCANVI.setup_anndata(
    adata_sc,
    labels_key='scvi_label',
    unlabeled_category=unlabeled_cat,
    batch_key=batch_key,
    layer='counts' if 'counts' in adata_sc.layers else None,
)


## Train scANVI
Use human annotated cells and include NMR as unlabeled

In [None]:
from scvi.model import SCANVI

# Create the model. The label column and the unlabeled category come from the
# AnnData registration, so they are not passed again here: `labels_key` is not
# a constructor argument, and 'human_label' would not match the registered
# 'scvi_label' column in any case.
model = SCANVI(
    adata_sc,
    n_latent=20,
    n_layers=2,
    n_hidden=128,
    dropout_rate=0.1
)

# Train on CPU: set max_epochs (start modest)
# `accelerator='cpu'` is the current device switch; it replaces `use_gpu=False`.
max_epochs = 120
model.train(max_epochs=max_epochs, accelerator='cpu')


## Get scANVI predictions

In [None]:
# 1) hard predicted labels (one label per cell)
pred_labels = model.predict(adata_sc)
adata_sc.obs['scanvi_pred'] = pred_labels

# 2) per-class probabilities
#    `predict(..., soft=True)` is the supported way to obtain them. scvi-tools
#    models have no `predict_proba` or `get_posterior`, so there is nothing to
#    fall back to: a failure here should surface directly.
probs = model.predict(adata_sc, soft=True)
if isinstance(probs, pd.DataFrame):
    # Keep `probs` as a plain (n_cells, n_classes) array for the cells that use it.
    probs = probs.values


In [None]:
# Get label names in the same order as the columns of `probs`.
if isinstance(probs, pd.DataFrame):
    # Already labelled: take the names straight from the frame.
    label_names = probs.columns.tolist()
    probs = probs.to_numpy()
else:
    # Ask the model for the soft prediction of a single cell just to read the
    # class names off its columns; this is cheap and uses the public API.
    name_probe = model.predict(adata_sc, indices=[0], soft=True)
    if isinstance(name_probe, pd.DataFrame):
        label_names = name_probe.columns.tolist()
    else:
        # Last resort: label categories without the unlabeled placeholder,
        # which has no probability column of its own.
        label_names = [
            cat for cat in adata_sc.obs['scvi_label'].cat.categories
            if cat != unlabeled_cat
        ]
    probs = np.asarray(probs)

# Convert to DataFrame and attach (raises if the column count does not match
# the number of label names).
probs_df = pd.DataFrame(probs, columns=label_names, index=adata_sc.obs_names)
# top probability and argmax label
adata_sc.obs['scanvi_top_prob'] = probs_df.max(axis=1).values
adata_sc.obs['scanvi_top_label'] = probs_df.idxmax(axis=1).values

# Optional: store full probs in adata.obsm
adata_sc.obsm['scanvi_proba'] = probs


## Integrate different annotations
For each cell compute:

scANVI predicted label + probability (softmax).

Marker enrichment score for the predicted label: run DE (cluster vs others or cell vs others) and compute AUC for canonical markers. For marker lists, use human canonical markers mapped via orthology.

celltypist prediction + probability (if celltypist supports the species/model).

GPT-marker annotation score (you already have these — convert to binary match vs canonical markers).

Cluster consensus: what fraction of cells in the cluster share the predicted label.

Combine into a consensus score per cell, e.g.:

consensus_score = 0.5 * scANVI_prob + 0.3 * marker_enrichment_score + 0.2 * celltypist_prob

(Weights are configurable; I prefer classifier probability highest.) Normalize to [0,1].

## Assign final labels
Consider label assignment only when confidence threshold is passed

If consensus_score >= 0.8 → assign label.

If 0.5 <= consensus_score < 0.8 → assign label but mark "provisional / review".

If consensus_score < 0.5 → label as "unknown" or “species-specific candidate.”
Report how many cells fall in each bin and list “unknown” clusters for manual curation.

For each cluster, compute:

Label purity (fraction of cells with the same final label).

Mean scANVI probability and mean marker enrichment.

Top DE genes (pseudobulk) and compare to known marker panels.

For clusters with low purity or conflicting evidence, either:

Split cluster and re-run scANVI/probabilities, or

Keep as “ambiguous / novel” and provide marker list + suggested experiments (ISH, immuno).

In [None]:
# Differential expression per Leiden cluster (each cluster vs. the rest).
# `rank_genes_groups` requires the AnnData object and the grouping column.
# NOTE(review): grouping by 'leiden' on `all_adata` is the clustering computed
# earlier in this notebook; switch `groupby` if another annotation is intended.
sc.tl.rank_genes_groups(all_adata, groupby='leiden', method='wilcoxon')

UMAPs colored by each annotation method and by consensus_label and consensus_score (continuous heat).

Confusion matrices between methods (human cross-annotation vs celltypist vs scANVI).

Alluvial plot showing how cells move between annotations.

Per-label ROC / PR curves for marker enrichment vs assigned label.

In [None]:
# Plot the UMAP coloured by consensus label and scANVI probability.
# NOTE(review): `adata`, obs['consensus_label'] and obs['scANVI_prob'] are not
# created by any cell shown in this notebook (the probability computed earlier
# is stored as obs['scanvi_top_prob'] on `adata_sc`); this cell is a
# placeholder until the consensus step is implemented.
sc.pl.umap(adata, color=['consensus_label','scANVI_prob'])

Make a table with one row per final label containing:

number cells, purity, consensus_score_mean, canonical markers enriched, top DE genes, suggested validation (marker probes), and confidence (high/medium/low).