# Train a scANVI model using the AnnCollection dataloader wrapper

In this tutorial we show how to use the AnnCollection wrapper in scvi-tools to load several AnnData objects that are stored on disk and train a model on them.

```{note}
Running the following cell will install tutorial dependencies on Google Colab only. It will have no effect on environments other than Google Colab.
```

In [1]:
# Install the scvi-colab helper, then let it set up the tutorial dependencies.
# As the recorded output shows, the helper returns without further action
# when it is not running inside Google Colab.
!pip install --quiet scvi-colab
from scvi_colab import install

install()

                Not currently in Google Colab environment.

                Please run with `run_outside_colab=True` to override.

                Returning with no further action.
                
  warn(


In [2]:
import math
import anndata
from anndata.experimental import AnnCollection
import gdown
import numpy as np
import scanpy as sc
from scipy import sparse as sp
from scvi.dataloaders import CollectionAdapter
import scvi
from pathlib import Path
import seaborn as sns
import torch
import os
import tempfile

In [3]:
# Set the scvi-tools seed, then record the library version this notebook
# was last executed with.
scvi.settings.seed = 0
print(f"Last run with scvi-tools version: {scvi.__version__}")

Seed set to 0


Last run with scvi-tools version: 1.3.2


In [4]:
# Plot styling: scanpy figure defaults first, then the seaborn theme.
sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
# Set PyTorch's float32 matmul precision mode to "high".
torch.set_float32_matmul_precision("high")
# Temporary directory handle.
# NOTE(review): `save_dir` is not referenced again in the visible cells.
save_dir = tempfile.TemporaryDirectory()

# IPython inline-backend options: white figure background, retina figure format.
%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
%config InlineBackend.figure_format="retina"

We will use two datasets, PBMC and COVID data, both from the scvi-tools datasets repository.

In [5]:
# the data is from this scvi reproducibility notebook
# https://yoseflab.github.io/scvi-tools-reproducibility/scarches_totalvi_seurat_data/
#
# Each file is checked on its own, so if only one of the two files is present
# (e.g. after an interrupted download) only the missing one is fetched instead
# of downloading both again.
dataset_urls = {
    "pbmc_seurat_v4.h5ad": "https://drive.google.com/uc?id=1X5N9rOaIqiGxZRyr1fyZ6NpDPeATXoaC",
    "covid_cite.h5ad": "https://drive.google.com/uc?id=1JgaXNwNeoEqX7zJL-jJD3cfXDGurMrq9",
}
missing_files = {
    file_name: url
    for file_name, url in dataset_urls.items()
    if not Path(f"./{file_name}").exists()
}
if missing_files:
    for file_name, url in missing_files.items():
        gdown.download(url=url, output=file_name, quiet=False)
else:
    print("Data already downloaded")

Data already downloaded


## Preprocessing of the data

In [6]:
# Open both .h5ad files with backed="r"; the reprs shown in the next cells
# report each object as "backed at" its file on disk.
covid = sc.read('covid_cite.h5ad', backed="r")
pbmc = sc.read('pbmc_seurat_v4.h5ad', backed="r")

In [7]:
# display the summary (shape, obs columns, obsm keys) of the PBMC object
pbmc

AnnData object with n_obs × n_vars = 161764 × 20729 backed at 'pbmc_seurat_v4.h5ad'
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'X_index'
    obsm: 'protein_counts'

In [8]:
# display the summary (shape, obs columns, obsm keys) of the COVID object
covid

AnnData object with n_obs × n_vars = 57669 × 33538 backed at 'covid_cite.h5ad'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'RNA_snn_res.0.4', 'seurat_clusters', 'set', 'Resp', 'disease', 'subj_code', 'covidpt_orhealth', 'mito', 'ncount', 'nfeat', 'bust_21', 'og_clust', 'severmod_other', 'og_clusts', 'nCount_ADT', 'nFeature_ADT', 'UMAP1', 'UMAP2', 'final_clust', 'final_clust_v2', 'new_pt_id', 'Resp_og', 'final_clust_withnum', 'final_clust_review', 'Age', 'Gender', 'Gender_num'
    obsm: 'pro_exp'

Note that the COVID dataset has more genes than the PBMC dataset. With `join_vars="inner"`, AnnCollection automatically keeps only the genes that exist in both.

In [9]:
# create a fake counts layer to test training:
# the "test" layer is assigned from X itself for both datasets, and is the
# layer name later passed to `setup_anndata`
covid.layers["test"] = covid.X
pbmc.layers["test"] = pbmc.X

In [10]:
# take annotations from the `pbmc` dataset and leave
# annotations in `covid` as an Unknown.
# `covid` has no "celltype.l1" column of its own (see its repr above), so this
# adds one filled with the constant "Unknown"; the same string is passed as
# `unlabeled_category` when the model is set up.
covid.obs["celltype.l1"] = "Unknown"

Note that our count data is stored in sparse format, which is currently the only format supported when using the AnnCollection wrapper in scvi-tools.

In [11]:
# X should hold raw (integer-valued) counts: spot-check the first 10 rows of
# each dataset by densifying them and verifying there is no fractional part.
assert all(
    np.all(np.mod(backed_adata.X[:10].toarray(), 1) == 0)
    for backed_adata in (pbmc, covid)
)

In [12]:
# create an AnnCollection on a subset of the data
# we're subsetting purely for speed (the first 5000 cells of each dataset)
# - join_vars="inner" / join_obs="inner": inner joins across the two objects
# - label='dataset': adds a "dataset" column to obs (see the printed summary)
adata = AnnCollection(
    [covid[:5000], pbmc[:5000]],
    join_vars="inner",
    join_obs="inner",
    label='dataset',
)
print(adata)

AnnCollection object with n_obs × n_vars = 10000 × 20729
  constructed from 2 AnnData objects
    view of layers: 'test'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'celltype.l1', 'dataset'


## Build a wrapper AnnData around the collection

In [13]:
# Wrap the AnnCollection in scvi-tools' CollectionAdapter; this adapter object
# is what the later cells pass to `setup_anndata`, the model, and scanpy.
collection_adapter = CollectionAdapter(adata)
collection_adapter

Adapter for:
AnnCollection object with n_obs × n_vars = 10000 × 20729
  constructed from 2 AnnData objects
    view of layers: 'test'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'nFeature_ADT', 'celltype.l1', 'dataset'

In [14]:
# confirm that the "test" layer exposed by the adapter is a scipy sparse matrix
sp.issparse(collection_adapter.layers["test"])

True

In [15]:
#sc.pp.filter_genes(collection_adapter, min_counts=3)

In [16]:
#sc.pp.normalize_total(collection_adapter, target_sum=1e4)
#sc.pp.log1p(collection_adapter)

In [17]:
#sc.pp.highly_variable_genes(
#    collection_adapter,
#    n_top_genes=1000,
#    subset=True,
#    layer="test",
#    flavor="seurat_v3",
#    batch_key="dataset",
#)

In [18]:
# Register the adapter with SCANVI: counts are read from the "test" layer,
# "dataset" is the batch key, and "celltype.l1" holds the labels, with
# "Unknown" marking unlabeled cells (the value assigned to all `covid` cells).
scvi.model.SCANVI.setup_anndata(
    collection_adapter,
    layer="test",
    batch_key="dataset",
    labels_key="celltype.l1",
    unlabeled_category="Unknown",
)

In [19]:
# build a SCANVI model on the registered adapter, with n_latent=10
model = scvi.model.SCANVI(collection_adapter, n_latent=10)

In [None]:
# we're only training for a few epochs to show it works
# (at most 25 epochs, with early stopping enabled)
model.train(max_epochs=25, early_stopping=True)

[34mINFO    [0m Training for [1;36m25[0m epochs.                                                                                   


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/access/miniconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.
/home/access/miniconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argume

Training:   0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
# Compute the trained model's latent representation and store it in the
# adapter's obsm under SCVI_LATENT_KEY, so a later cell can pass that key to
# `sc.pp.neighbors(..., use_rep=...)`.
# NOTE(review): the key is named "X_scVI" although the model is SCANVI.
SCVI_LATENT_KEY = "X_scVI"
latent = model.get_latent_representation()
collection_adapter.obsm[SCVI_LATENT_KEY] = latent
latent.shape

In [None]:
# run PCA then generate UMAP plots
# (baseline embedding: the neighbor graph here is built with n_pcs=30 after PCA,
# not from the model's latent representation)
sc.tl.pca(collection_adapter)
sc.pp.neighbors(collection_adapter, n_pcs=30, n_neighbors=20)
sc.tl.umap(collection_adapter, min_dist=0.3)

In [None]:
# Colour the PCA-based UMAP by columns that actually exist in the joined obs
# of the collection: "celltype.l1" (the labels key) and "dataset" (the batch
# key). The previous keys "cell_type", "donor" and "cell_source" are not
# present in the collection's obs, so they cannot be used for colouring.
sc.pl.umap(
    collection_adapter,
    color=["celltype.l1"],
    frameon=False,
)
sc.pl.umap(
    collection_adapter,
    color=["dataset"],
    frameon=False,
)

In [None]:
# use scVI latent space for UMAP generation:
# rebuild the neighbor graph from the representation stored under
# SCVI_LATENT_KEY (instead of PCA), then recompute the UMAP
sc.pp.neighbors(collection_adapter, use_rep=SCVI_LATENT_KEY)
sc.tl.umap(collection_adapter, min_dist=0.3)

In [None]:
# Colour the latent-space UMAP by columns that actually exist in the joined obs
# of the collection: "celltype.l1" (the labels key) and "dataset" (the batch
# key). The previous keys "cell_type", "donor" and "cell_source" are not
# present in the collection's obs, so they cannot be used for colouring.
sc.pl.umap(
    collection_adapter,
    color=["celltype.l1"],
    frameon=False,
)
sc.pl.umap(
    collection_adapter,
    color=["dataset"],
    frameon=False,
)

In [None]:
# neighbors were already computed using scVI
# run Leiden clustering (resolution=0.5) and store the cluster labels under
# SCVI_CLUSTERS_KEY
SCVI_CLUSTERS_KEY = "leiden_scVI"
sc.tl.leiden(collection_adapter, key_added=SCVI_CLUSTERS_KEY, resolution=0.5)

In [None]:
# colour the UMAP by the Leiden cluster labels stored under SCVI_CLUSTERS_KEY
sc.pl.umap(
    collection_adapter,
    color=[SCVI_CLUSTERS_KEY],
    frameon=False,
)

In [None]:
# run the trained SCANVI model's `predict` on the adapter and keep the result
predictions = model.predict(collection_adapter)