### Generate a contiguous (not randomly drawn) set of cells for quick testing

In [11]:
import pandas as pd
import numpy as np
import scanpy as sc
import cellxgene_census
from sklearn.model_selection import train_test_split
from cellxgene_census.experimental import get_embedding, get_embedding_metadata, get_all_available_embeddings

# Census release, organism and measurement that every query below targets
CENSUS_VERSION = "2025-01-30"
ORGANISM = "homo_sapiens"
MEASUREMENT = "RNA"

# Sampling controls: seed passed as `random_state`, and the number of cells to keep
SEED = 42
SAMPLE_SIZE = 100_000

# Name of the per-cell embedding requested through `obs_embeddings`
EMBEDDING_NAME = "geneformer"

# obs columns requested when reading cell metadata
METADATA_FIELDS = [
    "assay", "dataset_id", "cell_type", "development_stage",
    "disease", "self_reported_ethnicity", "sex", "tissue_general",
    "soma_joinid",
]

In [12]:
# Quick-test load: every cell whose soma_joinid is below SAMPLE_SIZE, together
# with the requested embedding. This is an id-range filter, not a random draw.
contiguous_filter = f"soma_joinid < {SAMPLE_SIZE}"
with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:
    adata = cellxgene_census.get_anndata(
        census,
        organism=ORGANISM,
        measurement_name=MEASUREMENT,
        obs_embeddings=[EMBEDDING_NAME],
        obs_value_filter=contiguous_filter,
    )



In [15]:
# Shape of the embedding matrix attached to the loaded cells. Keyed by
# EMBEDDING_NAME so this check follows the same constant the query requested.
adata.obsm[EMBEDDING_NAME].shape

(5000, 512)

In [None]:
# Persist the contiguous quick-test sample as an .h5ad file (path is relative
# to the current working directory).
continuous_sample_path = "adata_continuous_sample.h5ad"
adata.write(continuous_sample_path)

### Try getting stratified samples

In [8]:

# Read the obs (per-cell) metadata of every primary-data cell annotated as
# disease == 'normal', restricted to the METADATA_FIELDS columns.
with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:
    # Index the collection with the shared ORGANISM constant rather than a
    # hard-coded key, so this query targets the same organism as the
    # get_anndata calls elsewhere in the notebook.
    obs_reader = census["census_data"][ORGANISM].obs.read(
        value_filter="is_primary_data == True and disease == 'normal'",
        column_names=METADATA_FIELDS,
    )

    # Combine the read result into a single table, then convert it to pandas.
    cell_metadata = obs_reader.concat().to_pandas()

cell_metadata

Unnamed: 0,assay,dataset_id,cell_type,development_stage,disease,self_reported_ethnicity,sex,tissue_general,soma_joinid,is_primary_data
0,Smart-seq2,a5d95a42-0137-496f-8a60-101e17f263c8,naive B cell,26-year-old stage,normal,unknown,male,blood,5716,True
1,Smart-seq2,a5d95a42-0137-496f-8a60-101e17f263c8,naive B cell,26-year-old stage,normal,unknown,male,blood,5717,True
2,Smart-seq2,a5d95a42-0137-496f-8a60-101e17f263c8,naive B cell,26-year-old stage,normal,unknown,male,blood,5718,True
3,Smart-seq2,a5d95a42-0137-496f-8a60-101e17f263c8,naive B cell,26-year-old stage,normal,unknown,male,blood,5719,True
4,Smart-seq2,a5d95a42-0137-496f-8a60-101e17f263c8,naive B cell,26-year-old stage,normal,unknown,male,blood,5720,True
...,...,...,...,...,...,...,...,...,...,...
42780453,10x 3' v3,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,L2/3-6 intratelencephalic projecting glutamate...,88-year-old stage,normal,European,female,brain,106118159,True
42780454,10x 3' v3,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,pvalb GABAergic cortical interneuron,86-year-old stage,normal,European,female,brain,106118160,True
42780455,10x 3' v3,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,L2/3-6 intratelencephalic projecting glutamate...,80 year-old and over stage,normal,European,female,brain,106118161,True
42780456,10x 3' v3,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3,L2/3-6 intratelencephalic projecting glutamate...,78-year-old stage,normal,Asian,male,brain,106118164,True


In [10]:
# Uniform random pre-sample that the stratified weighting step operates on.
# The draw is capped at the number of available rows (the weighted draw that
# follows does the same), because DataFrame.sample raises when asked for more
# rows than exist without replacement.
sampled_obs = cell_metadata.sample(
    n=min(1_000_000, len(cell_metadata)),
    random_state=SEED,
)

In [12]:
# Build one stratum label per cell: "<tissue_general>_<cell_type>_<disease>".
# Column-wise string concatenation produces the same labels as a per-row
# "_".join, without a Python-level call for every row.
strata_parts = sampled_obs[["tissue_general", "cell_type", "disease"]].astype(str)
sampled_obs['stratify_group'] = (
    strata_parts["tissue_general"] + "_" + strata_parts["cell_type"] + "_" + strata_parts["disease"]
)

# Weight each cell by the inverse of its stratum size, so every stratum
# carries the same total weight no matter how many cells it holds.
group_sizes = sampled_obs['stratify_group'].value_counts()
sampled_obs['sample_weight'] = 1 / sampled_obs['stratify_group'].map(group_sizes)

# Weighted draw of at most SAMPLE_SIZE cells
sampled_obs = sampled_obs.sample(n=min(SAMPLE_SIZE, len(sampled_obs)), weights='sample_weight', random_state=SEED)

In [13]:
# Inspect the weighted sample, including the stratify_group and sample_weight
# columns added in the previous cell.
sampled_obs

Unnamed: 0,assay,dataset_id,cell_type,development_stage,disease,self_reported_ethnicity,sex,tissue_general,soma_joinid,is_primary_data,stratify_group,sample_weight
40624965,10x 3' v3,53d208b0-2cfd-4366-9866-c3c6114081bc,"activated CD4-positive, alpha-beta T cell",59-year-old stage,normal,European,male,small intestine,102833549,True,"small intestine_activated CD4-positive, alpha-...",0.062500
35269486,sci-RNA-seq3,f7c1c579-2dc0-47e2-ba19-8165c5a0e353,stromal cell,10th week post-fertilization stage,normal,unknown,female,intestine,91765509,True,intestine_stromal cell_normal,0.001028
24508509,10x 5' v1,fd072bc3-2dfb-46f8-b4e3-467cb3223182,group 3 innate lymphoid cell,14th week post-fertilization stage,normal,unknown,male,liver,69222810,True,liver_group 3 innate lymphoid cell_normal,0.022727
28625105,10x 3' v2,6725ee8e-ef5b-4e68-8901-61bd14a1fe73,Schwann cell,unknown,normal,unknown,unknown,intestine,76737521,True,intestine_Schwann cell_normal,0.016667
1989178,10x 3' v2,01209dce-3575-4bed-b1df-129f57fbc031,"CD8-positive, alpha-beta thymocyte",65-year-old stage,normal,Hispanic or Latin American,male,lung,5476227,True,"lung_CD8-positive, alpha-beta thymocyte_normal",0.014286
...,...,...,...,...,...,...,...,...,...,...,...,...
10238298,microwell-seq,2adb1f8a-a6b1-4909-8ee8-484814e2d4bf,mesenchymal stem cell,12th week post-fertilization stage,normal,Han Chinese,male,testis,23697256,True,testis_mesenchymal stem cell_normal,0.020000
6351828,10x 3' v3,9a281de7-cee5-4e80-8584-1929f46f152f,melanocyte,30-year-old stage,normal,European,male,eye,13807547,True,eye_melanocyte_normal,0.002079
15714825,10x 5' v2,1b9d8702-5af8-4142-85ed-020eb06ec4f6,"CD8-positive, alpha-beta memory T cell",eighth decade stage,normal,European,female,lamina propria,42211322,True,"lamina propria_CD8-positive, alpha-beta memory...",0.006329
22610699,10x 3' transcription profiling,c7775e88-49bf-4ba2-a03b-93f00447c958,"effector CD8-positive, alpha-beta T cell",63-year-old stage,normal,unknown,male,blood,63466230,True,"blood_effector CD8-positive, alpha-beta T cell...",0.005495


In [14]:

# Ascending list of the sampled cells' soma_joinid values; passed as
# obs_coords to the get_anndata call that follows.
sorted_joinids = sampled_obs['soma_joinid'].tolist()
sorted_joinids.sort()
sorted_joinids

[5871,
 6111,
 6393,
 6416,
 6679,
 6866,
 6868,
 6872,
 7082,
 7118,
 7135,
 7304,
 7382,
 7439,
 7518,
 7688,
 7712,
 7745,
 8038,
 8095,
 8302,
 8483,
 8558,
 8575,
 8579,
 8604,
 8643,
 8657,
 8724,
 8855,
 8911,
 9315,
 9415,
 9691,
 10005,
 10019,
 10164,
 10202,
 10314,
 10365,
 10525,
 10542,
 10564,
 10580,
 10631,
 10636,
 10694,
 10707,
 10713,
 10727,
 10864,
 11331,
 11366,
 11398,
 11489,
 11560,
 11667,
 11725,
 11910,
 11921,
 12147,
 12262,
 12344,
 12368,
 12376,
 12415,
 12530,
 12734,
 12772,
 12881,
 13144,
 13232,
 13236,
 13241,
 13405,
 13618,
 13797,
 13828,
 13856,
 14223,
 14330,
 14364,
 14454,
 14718,
 14840,
 14941,
 15138,
 15139,
 15140,
 15145,
 15249,
 15299,
 16378,
 16815,
 18462,
 18478,
 18597,
 18887,
 18945,
 19052,
 19143,
 19252,
 19278,
 19292,
 19319,
 19388,
 19577,
 20288,
 20354,
 20648,
 20883,
 21245,
 29381,
 30428,
 45785,
 45896,
 45908,
 46000,
 46038,
 46060,
 46076,
 46196,
 46218,
 46259,
 46266,
 46288,
 46294,
 46390,
 46480,
 5

In [15]:
# Load expression data, metadata columns and the embedding for exactly the
# sampled cells. The pre-sorted soma_joinid list is passed as explicit obs
# coordinates, and genes are restricted to protein-coding features.
protein_coding_filter = "feature_type=='protein_coding'"
with cellxgene_census.open_soma(census_version=CENSUS_VERSION) as census:
    adata = cellxgene_census.get_anndata(
        census,
        organism=ORGANISM,
        measurement_name=MEASUREMENT,
        obs_coords=sorted_joinids,
        obs_column_names=METADATA_FIELDS,  # metadata columns to attach to obs
        obs_embeddings=[EMBEDDING_NAME],
        var_value_filter=protein_coding_filter,
    )

KeyboardInterrupt: 

In [None]:
# Save the stratified sample. `adata` is also the name used for the contiguous
# quick-test sample, so if the stratified load was interrupted the variable
# still holds that older object. Verify the cells match before writing, so the
# wrong data is never saved under the stratified filename.
if set(adata.obs["soma_joinid"]) != set(sorted_joinids):
    raise RuntimeError(
        "adata does not contain the stratified sample; "
        "re-run the stratified get_anndata cell before saving"
    )
adata.write("adata_stratified_sample.h5ad")