# Train a scVI model using Lamin

This notebook demonstrates a scalable approach to training an [scVI](https://docs.scvi-tools.org/en/latest/user_guide/models/scvi.html) model on Census data using the [Lamin](https://lamin.ai/) dataloader.
LaminDB is a database system designed to support efficient storage, management, and querying of scientific data, particularly in machine learning, bioinformatics, and data science applications; the dataloader used here is based on its `MappedCollection`. It allows for the easy organization, sharing, and querying of complex datasets, such as those involved in research, experiments, or models.
See [here](https://docs.scvi-tools.org/en/stable/user_guide/use_case/custom_dataloaders.html) for more information.

```{note}
Running the following cell will install tutorial dependencies on Google Colab only. It will have no effect on environments other than Google Colab.
```

In [1]:
!pip install --quiet scvi-colab==0.13.0

In [2]:
!pip install --quiet "lamindb[bionty,jupyter]>=1.3.0"
!pip install --quiet biomart
!pip install --quiet bionty
!pip install --quiet cellxgene_lamin
!pip install --quiet urllib3==1.26.20
!pip install --quiet tiledbsoma
!pip install --quiet tiledb
!pip install --quiet tiledbsoma_ml
!pip install --quiet cellxgene-census

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.4/277.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.7/142.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.1/164.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.2/264.2 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# Connect to the laminlabs/cellxgene LaminDB instance
# (an anonymous user is sufficient, see the log output).
!lamin connect laminlabs/cellxgene

[93m![0m using anonymous user (to identify, call: lamin login)
[92m→[0m connected lamindb: laminlabs/cellxgene


In [4]:
# Manual workaround for a uv issue in Colab, see: https://github.com/astral-sh/uv/issues/12724
# Both uv constraint variables are blanked out before scvi-tools gets installed.
import os

os.environ.update({"UV_CONSTRAINT": "", "UV_BUILD_CONSTRAINT": ""})

In [5]:
from scvi_colab import install

# Install scvi-tools from the given development branch.
# NOTE(review): this targets a feature branch, presumably until the custom
# dataloader registry is part of a release — confirm and switch once it is.
install(branch="ori-2907-custom-dataloader-registry", for_tutorials=False)

[34mINFO    [0m scvi-colab: Installing scvi-tools.                                                                        
[34mINFO    [0m scvi-colab: Install successful. Testing import.                                                           
[92m→[0m connected lamindb: laminlabs/cellxgene


In [6]:
import os

import scanpy as sc
import scvi
from scvi.dataloaders import MappedCollectionDataModule

In [7]:
# Create a local LaminDB instance whose storage lives in ./lamindb_collection.
!lamin init --storage ./lamindb_collection  # one time for github runner (comment)
import lamindb as ln
# ln.setup.init()  # one time for github runner (comment out when running locally)

[92m→[0m initialized lamindb: anonymous/lamindb_collection


In [9]:
# Optional: uncomment to link this notebook run to the data loaded below.
# While it stays disabled, LaminDB warns that "no run & transform got linked".
# ln.track()

In [10]:
# Fetch the "covid_normal_lung" collection from the laminlabs/cellxgene instance
# and list its artifacts; the table shows that it consists of many .h5ad files.
collection = ln.Collection.using("laminlabs/cellxgene").get(name="covid_normal_lung")
artifacts = collection.artifacts.all()
# Last expression of the cell: rendered as a table with one row per artifact.
artifacts.df()

[93m![0m no run & transform got linked, call `ln.track()` & re-run


Unnamed: 0_level_0,uid,key,description,suffix,kind,otype,size,hash,n_files,n_observations,_hash_type,_key_is_virtual,_overwrite_versions,space_id,storage_id,schema_id,version,is_latest,run_id,created_at,created_by_id,_aux,_branch_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2894,7uWdx2sU0D0ujw5YXrHs,cell-census/2024-07-01/h5ads/0ba16f4b-cb87-4fa...,Myeloid,.h5ad,dataset,AnnData,84794822,F3RByNhyDKAexaU4LODH4g,,6947,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:08.908938+00:00,1,,1
2905,5VCheRCxgdRWtDnBNVQC,cell-census/2024-07-01/h5ads/0e9d47fb-89b1-42d...,Airway epithelial cells,.h5ad,dataset,AnnData,444425539,rF2Y_n0Fg_rNvc9Zq8zgvA,,29505,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:08.926925+00:00,1,,1
3001,sahSoRDphdi5QINmYKga,cell-census/2024-07-01/h5ads/2d85960a-2ba8-4f5...,Myeloid cells,.h5ad,dataset,AnnData,477060382,16ejFHaCfxlQ2GMtgb8lbQ,,40634,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.129814+00:00,1,,1
3005,HpnxGOoYonDkD4UIEP9V,cell-census/2024-07-01/h5ads/2f132ec9-24b5-422...,Lung,.h5ad,dataset,AnnData,358508307,16xp2QLMQeC910m_7DEWTQ,,39778,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.138873+00:00,1,,1
3043,h1OfBAUVyqCe47TmODzc,cell-census/2024-07-01/h5ads/3de0ad6d-4378-4f6...,Single-cell multiomic profiling of human lungs...,.h5ad,dataset,AnnData,352144365,-TosSu-93OSu_jhPk0cOQQ,,46500,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.212862+00:00,1,,1
3096,qV5esGG2dDXlcEYVNlcp,cell-census/2024-07-01/h5ads/4ebcbeeb-2208-4d3...,Vascular endothelial cells,.h5ad,dataset,AnnData,194059730,JzQWkYD4qxNfb7S-5rpDFQ,,20855,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.324041+00:00,1,,1
3157,0EvCxCNIHX5sJ5SuiJc4,cell-census/2024-07-01/h5ads/62315937-e268-4fa...,"T, NK and ILC",.h5ad,dataset,AnnData,48914732,CLWqAhLiHdiA9R9b7nboBQ,,4778,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.430883+00:00,1,,1
3189,yBlPj8fH71ZpQzzICcyd,cell-census/2024-07-01/h5ads/703f00e6-b996-48e...,PNS,.h5ad,dataset,AnnData,12575655,1UJhLvRP0EveA1DqXEahxA,,649,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.492181+00:00,1,,1
3223,5v0IoJIlnuX4q1y57LDy,cell-census/2024-07-01/h5ads/7b3368a5-c1a0-497...,B cells,.h5ad,dataset,AnnData,42340721,XMc31Q7EW4ychA4jMKJiqA,,4138,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.553054+00:00,1,,1
3238,vBAoZ3akkvDiaSXlfeM3,cell-census/2024-07-01/h5ads/810ac45f-8969-469...,Endothelium,.h5ad,dataset,AnnData,71326733,7kcyLK0LjVZYewd7Did3sg,,5467,md5-n,False,False,1,2,,2024-07-01,True,27,2024-07-12 12:34:09.578579+00:00,1,,1


In [11]:
# Define the batch key and wrap the collection in a data module.
# Creating the data module synchronizes the .h5ad files locally (see the output).
batch_keys = "assay"  # name of the field used as the batch key
datamodule = MappedCollectionDataModule(
    collection,
    batch_key=batch_keys,
    batch_size=1024,
    join="inner",  # presumably keeps only the variables shared by all files — verify against the Lamin docs
)

... synchronizing e22c2ab4-c025-4804-b7a3-0b0ebd48c87a.h5ad: 100.0%
... synchronizing 7b3368a5-c1a0-4973-9e75-d95b4150c7da.h5ad: 100.0%
... synchronizing 703f00e6-b996-48e5-bc34-00c41b9876f4.h5ad: 100.0%
... synchronizing 62315937-e268-4fa5-a032-8f7d776f3a3f.h5ad: 100.0%
... synchronizing 0ba16f4b-cb87-4fa3-9363-19fc51eec6e7.h5ad: 100.0%
... synchronizing e04daea4-4412-45b5-989e-76a9be070a89.h5ad: 100.0%
... synchronizing 2d85960a-2ba8-4f54-9aec-537fae839f5d.h5ad: 100.0%
... synchronizing e871881f-b42d-4500-906d-0972a14ba47d.h5ad: 100.0%


In [12]:
# Summary of the data module: number of observations, variables and batches.
print(datamodule.n_obs, datamodule.n_vars, datamodule.n_batch)

343281 11771 6


In [13]:
# Inspect the registry exposed by the data module; it is what gets passed to
# the model constructor.
print(datamodule.registry)

Pretty printing has been turned OFF


In [14]:
# Initialize the model from the data module's registry (no AnnData object is passed).
model = scvi.model.SCVI(registry=datamodule.registry)

In [None]:
# Train the model for 10 epochs, with the data supplied by the data module.
# batch_size matches the value the data module was created with (1024).
model.train(
    max_epochs=10,
    batch_size=1024,
    datamodule=datamodule,
)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training:   0%|          | 0/10 [00:00<?, ?it/s]

  reconst_loss = -generative_outputs[MODULE_KEYS.PX_KEY].log_prob(x).sum(-1)
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 1024. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
  reconst_loss = -generative_outputs[MODULE_KEYS.PX_KEY].log_prob(x).sum(-1)


In [None]:
# Save the model to "lamin_model", overwriting any previous save. The AnnData is
# not stored with it (save_anndata=False); the data module is passed explicitly.
model.save("lamin_model", save_anndata=False, overwrite=True, datamodule=datamodule)

In [None]:
# List the keys of the model's history (presumably the metrics logged during training).
model.history.keys()

In [None]:
# With a data module, downstream model functions need the data passed in
# explicitly: build an inference dataloader from the data module and hand it
# to the model to compute the latent representation.
inference_dataloader = datamodule.inference_dataloader()
latent = model.get_latent_representation(dataloader=inference_dataloader)

In [None]:
# Load the collection into `adata` so that the results can be plotted. The same
# join mode ("inner") is used as for the data module.
adata = collection.load(join="inner")

In [None]:
# Store the latent representation under the "scvi" key of `adata.obsm`.
# NOTE(review): assumes the rows of `latent` are in the same order as the rows
# of `adata` — confirm.
adata.obsm["scvi"] = latent

In [None]:
# Build the neighbor graph on the scVI representation (stored under the "scvi"
# key), compute the UMAP from that graph and color the embedding by dataset.
sc.pp.neighbors(adata, use_rep="scvi", key_added="scvi")
sc.tl.umap(adata, neighbors_key="scvi")
sc.pl.umap(adata, color="dataset_id", title="SCVI")

In [None]:
# Same UMAP, colored by the "tissue_general" annotation.
sc.pl.umap(adata, color="tissue_general", title="SCVI")

In [None]:
# Same UMAP, colored by the "cell_type" annotation.
sc.pl.umap(adata, color="cell_type", title="SCVI")