### Check observations with embeddings in CellxGene

In [25]:
import pandas as pd
import numpy as np
import scanpy as sc
import cellxgene_census
from cellxgene_census.experimental import get_embedding, get_embedding_metadata, get_all_available_embeddings

# Census experiment (organism) and measurement layer used throughout.
# The Census release itself is picked per section via `census_version`.
ORGANISM = "homo_sapiens"
MEASUREMENT = "RNA"

In [2]:
# List the published Census releases; the 'stable' and 'latest' aliases each
# resolve to a dated build (see `release_build` in the output).
cellxgene_census.get_census_version_directory()

OrderedDict([('stable',
              {'release_date': None,
               'release_build': '2025-01-30',
               'soma': {'uri': 's3://cellxgene-census-public-us-west-2/cell-census/2025-01-30/soma/',
                'relative_uri': '/cell-census/2025-01-30/soma/',
                's3_region': 'us-west-2'},
               'h5ads': {'uri': 's3://cellxgene-census-public-us-west-2/cell-census/2025-01-30/h5ads/',
                'relative_uri': '/cell-census/2025-01-30/h5ads/',
                's3_region': 'us-west-2'},
               'flags': {'lts': True}}),
             ('latest',
              {'release_date': None,
               'release_build': '2025-10-13',
               'soma': {'uri': 's3://cellxgene-census-public-us-west-2/cell-census/2025-10-13/soma/',
                'relative_uri': '/cell-census/2025-10-13/soma/',
                's3_region': 'us-west-2'},
               'h5ads': {'uri': 's3://cellxgene-census-public-us-west-2/cell-census/2025-10-13/h5ads/',
        

### Check "2023-12-15" version

In [23]:
# Census release inspected in this section.
census_version = "2023-12-15"

# Every embedding published for this release (all organisms, obs and var).
embs = get_all_available_embeddings(census_version)

# Keep only the cell-level (obs) embeddings of the configured organism.
obs_embedding_names = [
    emb["embedding_name"]
    for emb in embs
    if emb["experiment_name"] == ORGANISM and emb["data_type"] == "obs_embedding"
]
for embedding_name in obs_embedding_names:
    print(embedding_name)

scvi
geneformer
scgpt
uce
nmf


In [24]:
# get metadata
# Load the obs (per-cell) metadata for the release selected in `census_version`.
# The experiment is looked up through ORGANISM so this cell stays consistent with
# the embedding listing, which filters on the same constant.
with cellxgene_census.open_soma(census_version=census_version) as census:
    # Restrict to primary-data cells so cells duplicated across datasets are
    # not counted twice.
    obs_reader = census["census_data"][ORGANISM].obs.read(
        value_filter="is_primary_data == True",
    )

    # Materialise while the Census handle is still open: concat() gathers the
    # streamed result into a single table, to_pandas() converts it to a DataFrame.
    cell_metadata = obs_reader.concat().to_pandas()


In [26]:
# Preview the obs metadata; pandas elides the middle rows and columns of a
# large frame, so only the head and tail are rendered.
cell_metadata

Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,suspension_type,tissue,tissue_ontology_term_id,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,1402958,a5d5c529-8a1f-40b5-bda3-35208970070d,Smart-seq v4,EFO:0700016,native cell,CL:0000003,50-year-old human stage,HsapDv:0000144,normal,PATO:0000461,...,nucleus,middle temporal gyrus,UBERON:0002771,brain,UBERON:0000955,947847.0,6130,154.624307,4.511148e+05,34498
1,1402959,a5d5c529-8a1f-40b5-bda3-35208970070d,Smart-seq v4,EFO:0700016,vip GABAergic cortical interneuron,CL:4023016,50-year-old human stage,HsapDv:0000144,normal,PATO:0000461,...,nucleus,middle temporal gyrus,UBERON:0002771,brain,UBERON:0000955,1649220.0,7229,228.139438,4.450355e+05,34498
2,1402960,a5d5c529-8a1f-40b5-bda3-35208970070d,Smart-seq v4,EFO:0700016,lamp5 GABAergic cortical interneuron,CL:4023011,50-year-old human stage,HsapDv:0000144,normal,PATO:0000461,...,nucleus,middle temporal gyrus,UBERON:0002771,brain,UBERON:0000955,1570071.0,7832,200.468718,1.080806e+06,34498
3,1402961,a5d5c529-8a1f-40b5-bda3-35208970070d,Smart-seq v4,EFO:0700016,lamp5 GABAergic cortical interneuron,CL:4023011,50-year-old human stage,HsapDv:0000144,normal,PATO:0000461,...,nucleus,middle temporal gyrus,UBERON:0002771,brain,UBERON:0000955,1347705.0,7253,185.813457,8.796240e+05,34498
4,1402962,a5d5c529-8a1f-40b5-bda3-35208970070d,Smart-seq v4,EFO:0700016,vip GABAergic cortical interneuron,CL:4023016,50-year-old human stage,HsapDv:0000144,normal,PATO:0000461,...,nucleus,middle temporal gyrus,UBERON:0002771,brain,UBERON:0000955,1644662.0,8411,195.537035,1.150199e+06,34498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36227898,62998412,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,10x 3' v2,EFO:0009899,pericyte,CL:0000669,51-year-old human stage,HsapDv:0000145,normal,PATO:0000461,...,cell,lung,UBERON:0002048,lung,UBERON:0002048,1525.0,953,1.600210,2.051752e+01,20921
36227899,62998413,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,10x 3' v2,EFO:0009899,pericyte,CL:0000669,51-year-old human stage,HsapDv:0000145,normal,PATO:0000461,...,cell,lung,UBERON:0002048,lung,UBERON:0002048,1475.0,994,1.483903,2.803448e+01,20921
36227900,62998414,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,10x 3' v2,EFO:0009899,pericyte,CL:0000669,51-year-old human stage,HsapDv:0000145,normal,PATO:0000461,...,cell,lung,UBERON:0002048,lung,UBERON:0002048,1152.0,649,1.775039,2.127339e+01,20921
36227901,62998415,8c42cfd0-0b0a-46d5-910c-fc833d83c45e,10x 3' v2,EFO:0009899,pericyte,CL:0000669,51-year-old human stage,HsapDv:0000145,normal,PATO:0000461,...,cell,lung,UBERON:0002048,lung,UBERON:0002048,1856.0,1045,1.776077,1.465288e+01,20921


~36 million primary-data cells; obs embeddings available for scvi, geneformer, scgpt, and uce (plus nmf).

### Check "2025-01-30" version (current `stable` / LTS release)

In [15]:
# Census release inspected in this section.
census_version = "2025-01-30"

# Every embedding published for this release (all organisms, obs and var).
embs = get_all_available_embeddings(census_version)

# Keep only the cell-level (obs) embeddings of the configured organism.
obs_embedding_names = [
    emb["embedding_name"]
    for emb in embs
    if emb["experiment_name"] == ORGANISM and emb["data_type"] == "obs_embedding"
]
for embedding_name in obs_embedding_names:
    print(embedding_name)

scvi
geneformer
tf-sapiens
tf-exemplar-human


In [16]:
# get metadata
# Load the obs (per-cell) metadata for the release selected in `census_version`.
# The experiment is looked up through ORGANISM so this cell stays consistent with
# the embedding listing, which filters on the same constant.
with cellxgene_census.open_soma(census_version=census_version) as census:
    # Restrict to primary-data cells so cells duplicated across datasets are
    # not counted twice.
    obs_reader = census["census_data"][ORGANISM].obs.read(
        value_filter="is_primary_data == True",
    )

    # Materialise while the Census handle is still open: concat() gathers the
    # streamed result into a single table, to_pandas() converts it to a DataFrame.
    cell_metadata = obs_reader.concat().to_pandas()

~62 million primary-data cells; obs embeddings available for scvi, geneformer, and TranscriptFormer (tf-sapiens, tf-exemplar-human).