In [1]:
# Default notebook parameters. The next cell re-assigns `tissue_general` and
# `dask_chunk_size`, so the values below are only the fallbacks.
# NOTE(review): this looks like a papermill-style "parameters" cell — confirm
# before restructuring these assignments.
meas = "RNA"  # measurement name passed to `exp.axis_query`
layer = "raw"  # layer name passed to `query.to_anndata`
census = "2024-07-01"  # Census release; becomes a path segment of the SOMA URI
tissue_general = "tongue"  # obs `tissue_general` value to select; falsy disables the obs filter
var_filter = None  # optional var value-filter expression; falsy disables the var filter
dask_chunk_size = 0  # Non-Dask mode, by default; any truthy value starts a local Dask cluster

In [2]:
# Parameters
# Overrides of the defaults from the previous cell for this particular run.
# NOTE(review): presumably injected by a parameterization tool (e.g. papermill)
# rather than hand-written — confirm before editing by hand.
tissue_general = "skin of body"
dask_chunk_size = 50000


In [3]:
from typing import Literal
import anndata as ad
import scanpy as sc
from somacore import AxisQuery
from tiledbsoma import Experiment, SOMATileDBContext

# TileDB settings used to read the public Census bucket: requests are not
# signed (per the `no_sign_request` key) and the bucket region is pinned.
DEFAULT_CONFIG = dict(
    [
        ("vfs.s3.no_sign_request", "true"),
        ("vfs.s3.region", "us-west-2"),
    ]
)
CENSUS_S3 = "s3://cellxgene-census-public-us-west-2/cell-census"

# Compose the experiment URI: <bucket>/<census release>/soma/census_data/<species>
species = "homo_sapiens"
soma_uri = "{}/{}/soma".format(CENSUS_S3, census)
exp_uri = "{}/census_data/{}".format(soma_uri, species)

# Open the SOMA experiment with the config above; the handle is kept in `exp`
# for the cells that follow and is displayed as this cell's output.
exp = Experiment.open(exp_uri, context=SOMATileDBContext(tiledb_config=DEFAULT_CONFIG))
exp

<Experiment 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens' (open for 'r') (2 items)
    'ms': 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/ms' (unopened)
    'obs': 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/obs' (unopened)>

In [4]:
%%time
# Select cells by `tissue_general` when one is configured; otherwise leave
# the obs axis unfiltered.
if tissue_general:
    obs_filter = f'tissue_general == "{tissue_general}"'
else:
    obs_filter = None

# Only axes that actually have a filter get an AxisQuery; the others are
# passed as None.
query = exp.axis_query(
    measurement_name=meas,
    obs_query=None if not obs_filter else AxisQuery(value_filter=obs_filter),
    var_query=None if not var_filter else AxisQuery(value_filter=var_filter),
)

CPU times: user 4.15 ms, sys: 2.47 ms, total: 6.63 ms
Wall time: 90.8 ms


In [5]:
%%time
# Number of obs rows (cells) matched by the query; displayed as the cell output.
# In the recorded run this equals the "skin of body" count in the reference
# table at the end of the notebook (1045024).
query.n_obs

CPU times: user 579 ms, sys: 351 ms, total: 930 ms
Wall time: 806 ms


1045024

In [6]:
# Start a local Dask cluster only when Dask mode is requested, i.e. when
# `dask_chunk_size` is truthy. The default (0) means non-Dask mode.
if dask_chunk_size:
    # Imported lazily so that the default non-Dask mode does not require
    # `dask.distributed` to be importable.
    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster()
    client = Client(cluster)
else:
    # Normalize the falsy sentinel (0) to None, since `dask_chunk_size` is
    # later forwarded as a keyword argument to `query.to_anndata`.
    dask_chunk_size = None
    client = None
# Display the client summary (or nothing, in non-Dask mode).
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 61.45 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39603,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 61.45 GiB

0,1
Comm: tcp://127.0.0.1:38631,Total threads: 4
Dashboard: http://127.0.0.1:36129/status,Memory: 15.36 GiB
Nanny: tcp://127.0.0.1:35941,
Local directory: /tmp/dask-scratch-space/worker-qskwg271,Local directory: /tmp/dask-scratch-space/worker-qskwg271

0,1
Comm: tcp://127.0.0.1:34127,Total threads: 4
Dashboard: http://127.0.0.1:45683/status,Memory: 15.36 GiB
Nanny: tcp://127.0.0.1:39777,
Local directory: /tmp/dask-scratch-space/worker-8mcxem9h,Local directory: /tmp/dask-scratch-space/worker-8mcxem9h

0,1
Comm: tcp://127.0.0.1:43341,Total threads: 4
Dashboard: http://127.0.0.1:39261/status,Memory: 15.36 GiB
Nanny: tcp://127.0.0.1:36013,
Local directory: /tmp/dask-scratch-space/worker-0_tnksaj,Local directory: /tmp/dask-scratch-space/worker-0_tnksaj

0,1
Comm: tcp://127.0.0.1:46489,Total threads: 4
Dashboard: http://127.0.0.1:42393/status,Memory: 15.36 GiB
Nanny: tcp://127.0.0.1:45725,
Local directory: /tmp/dask-scratch-space/worker-5mibxl8s,Local directory: /tmp/dask-scratch-space/worker-5mibxl8s


## HVG

In [7]:
%%time
# Export the query's `layer` ("raw" by default) as an AnnData object and show
# its summary. `dask_chunk_size` is None in non-Dask mode.
# NOTE(review): with a chunk size set, X is presumably Dask-backed and not yet
# loaded (the recorded wall time is only a few seconds for ~1M cells) —
# confirm against the tiledbsoma version in use.
add = query.to_anndata(layer, dask_chunk_size=dask_chunk_size)
add

CPU times: user 2.77 s, sys: 2.11 s, total: 4.89 s
Wall time: 4.65 s


AnnData object with n_obs × n_vars = 1045024 × 60530
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [8]:
%%time
# Apply scanpy's total-count normalization with default arguments. The return
# value is discarded, so `add` is presumably modified in place — confirm
# against the scanpy docs.
# NOTE(review): the recorded wall time (~158 ms) suggests the work is deferred
# when X is Dask-backed rather than computed here — confirm.
sc.pp.normalize_total(add)

CPU times: user 132 ms, sys: 30.7 ms, total: 163 ms
Wall time: 158 ms


In [9]:
%%time
# Apply scanpy's log1p transform with default arguments. The return value is
# discarded, so `add` is presumably modified in place — confirm against the
# scanpy docs.
# NOTE(review): the recorded wall time (~2 ms) suggests this is also deferred
# in Dask mode — confirm.
sc.pp.log1p(add)

CPU times: user 2.48 ms, sys: 0 ns, total: 2.48 ms
Wall time: 2.39 ms


In [10]:
%%time
# Compute highly-variable-gene statistics on the normalized, log-transformed
# data. With `inplace=False` the result is returned and kept in `hvg`; it is
# displayed below as a table with means, dispersions, mean_bin,
# dispersions_norm and highly_variable columns. Every row shown has
# highly_variable == True, consistent with `subset=True`.
# NOTE(review): whether `add` itself is also subset when `inplace=False` is
# combined with `subset=True` depends on the scanpy version — confirm.
# This is the slow step of the recorded run (wall time 6min 17s), presumably
# because work deferred by the earlier cells is executed here.
hvg = sc.pp.highly_variable_genes(add, inplace=False, subset=True)
hvg



CPU times: user 14.8 s, sys: 3.21 s, total: 18 s
Wall time: 6min 17s


Unnamed: 0,means,dispersions,mean_bin,dispersions_norm,highly_variable
0,0.931193,2.521423,"(0.726, 0.968]",0.631123,True
1,0.161581,2.260028,"(-0.00484, 0.242]",1.130201,True
6,0.659608,2.778323,"(0.484, 0.726]",1.216669,True
24,0.043459,1.909486,"(-0.00484, 0.242]",0.912260,True
30,0.212682,1.331868,"(-0.00484, 0.242]",0.553142,True
...,...,...,...,...,...
57441,0.043003,2.199967,"(-0.00484, 0.242]",1.092859,True
57514,0.019218,1.982776,"(-0.00484, 0.242]",0.957826,True
57599,0.018154,1.966757,"(-0.00484, 0.242]",0.947867,True
57670,0.016679,1.921009,"(-0.00484, 0.242]",0.919424,True


---
Census `tissue_general` counts, for reference:
```python
exp_obs = exp.obs.read().concat().to_pandas()
exp_obs.tissue_general.value_counts()
```
```
tissue_general
brain                       26281059
blood                       10835244
lung                         6231233
breast                       5555979
eye                          4190842
heart                        3629952
kidney                       2083054
liver                        1815408
small intestine              1237182
skin of body                 1045024
endocrine gland               979667
respiratory system            944355
bone marrow                   813373
spleen                        751041
lymph node                    717899
colon                         714413
placenta                      638786
reproductive system           570488
adrenal gland                 560114
nose                          470445
adipose tissue                468603
stomach                       446348
prostate gland                348664
fallopian tube                250178
pancreas                      250161
esophagus                     215951
digestive system              211912
musculature                   189452
large intestine               180996
pleural fluid                 179134
yolk sac                      169725
embryo                        165937
uterus                        148198
mucosa                        131978
spinal cord                   117463
exocrine gland                115722
intestine                     104490
immune system                  89046
bladder organ                  82797
vasculature                    63667
ovary                          53751
lamina propria                 45230
tongue                         45060
central nervous system         31780
esophagogastric junction       29105
axilla                         19792
pleura                         19695
skeletal system                14680
saliva                         14502
omentum                        14003
testis                         13211
tendon of semitendinosus       10533
gallbladder                     9769
scalp                           3029
ureter                          2390
Name: count, dtype: int64
```