In [1]:
# Default run parameters; the "Parameters" cell that follows may override any of them.
meas = "RNA"  # Measurement name passed to `exp.axis_query`
layer = "raw"  # Layer name passed to `query.to_anndata`
census = "2024-07-01"  # Census version; interpolated into the SOMA S3 URI
tissue = "tongue"  # `tissue_general` value used to select cells; falsy disables the obs filter
var_filter = None  # Optional value filter for the var axis; falsy selects every var
dask_chunk_size = 0  # Falsy: non-Dask mode (no local Dask cluster is started), by default
n_workers = None  # Dask worker count; None derives it from the CPU count
threads_per_worker = 1  # Threads per Dask worker
tdb_workers = None  # Thread-pool size given to the SOMA context; falsy uses `cpu_count()`
dashboard_port = 8787  # Port requested for the Dask dashboard

In [2]:
# Parameters
# Run-specific overrides of the defaults assigned in the previous cell.
# NOTE(review): presumably injected by a parameterization tool such as papermill —
# confirm before editing these values by hand.
tissue = "colon"
dask_chunk_size = 1000
n_workers = 16
threads_per_worker = 1
tdb_workers = 1
dashboard_port = 8786


In [3]:
from concurrent.futures import ThreadPoolExecutor
from os import cpu_count
from typing import Literal
import anndata as ad
import scanpy as sc
from somacore import AxisQuery
from tiledbsoma import Experiment, SOMATileDBContext

# Public Census bucket and the TileDB settings used to read it without signed requests.
CENSUS_S3 = "s3://cellxgene-census-public-us-west-2/cell-census"
DEFAULT_CONFIG = {"vfs.s3.no_sign_request": "true", "vfs.s3.region": "us-west-2"}

# URI layout: <bucket>/<census version>/soma/census_data/<species>
species = "homo_sapiens"
soma_uri = f"{CENSUS_S3}/{census}/soma"
exp_uri = f"{soma_uri}/census_data/{species}"

# When no pool size was requested, size the SOMA thread pool by `cpu_count()`.
tdb_workers = tdb_workers or cpu_count()

# Build the context step by step, then open the experiment with it.
tdb_pool = ThreadPoolExecutor(max_workers=tdb_workers)
soma_context = SOMATileDBContext(tiledb_config=DEFAULT_CONFIG, threadpool=tdb_pool)
exp = Experiment.open(exp_uri, context=soma_context)
exp

<Experiment 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens' (open for 'r') (2 items)
    'ms': 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/ms' (unopened)
    'obs': 's3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/obs' (unopened)>

In [4]:
%%time
# Select cells by `tissue_general` only when a tissue was requested.
if tissue:
    obs_filter = f'tissue_general == "{tissue}"'
else:
    obs_filter = None

# An axis without a filter gets no AxisQuery at all (None), i.e. it is left unrestricted.
if obs_filter:
    obs_query = AxisQuery(value_filter=obs_filter)
else:
    obs_query = None

if var_filter:
    var_query = AxisQuery(value_filter=var_filter)
else:
    var_query = None

query = exp.axis_query(measurement_name=meas, obs_query=obs_query, var_query=var_query)

CPU times: user 9.99 ms, sys: 4.02 ms, total: 14 ms
Wall time: 192 ms


In [5]:
%%time
# Number of obs rows (cells) selected by the query; displayed as the cell's output.
query.n_obs

CPU times: user 725 ms, sys: 1.44 s, total: 2.17 s
Wall time: 586 ms


714413

In [6]:
from dask.distributed import Client, LocalCluster

# Start a local Dask cluster only in Dask mode (truthy `dask_chunk_size`).
if dask_chunk_size:
    if n_workers is None:
        # Split the CPUs across Dask workers, leaving room for each worker's SOMA
        # thread pool and Dask threads. Plain floor division yields 0 whenever
        # tdb_workers * threads_per_worker exceeds the CPU count (e.g. `tdb_workers`
        # defaulted to `cpu_count()` and `threads_per_worker` > 1), and a cluster with
        # zero workers cannot run any task. `cpu_count()` may also return None.
        # Guard the operands and always keep at least one worker.
        total_cpus = cpu_count() or 1
        n_workers = max(
            1,
            total_cpus // (tdb_workers or 1) // (threads_per_worker or 1),
        )
    cluster = LocalCluster(
        n_workers=n_workers,
        threads_per_worker=threads_per_worker,
        dashboard_address=f":{dashboard_port}",
    )
    print(f"{n_workers=}, {threads_per_worker=}, {tdb_workers=}")
    client = Client(cluster)
else:
    # Non-Dask mode: normalize the falsy chunk size to None for downstream calls.
    dask_chunk_size = None
    client = None
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45311 instead


n_workers=16, threads_per_worker=1, tdb_workers=1


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:45311/status,

0,1
Dashboard: http://127.0.0.1:45311/status,Workers: 16
Total threads: 16,Total memory: 246.55 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:38851,Workers: 16
Dashboard: http://127.0.0.1:45311/status,Total threads: 16
Started: Just now,Total memory: 246.55 GiB

0,1
Comm: tcp://127.0.0.1:39943,Total threads: 1
Dashboard: http://127.0.0.1:37773/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:45049,
Local directory: /tmp/dask-scratch-space/worker-atqzoeax,Local directory: /tmp/dask-scratch-space/worker-atqzoeax

0,1
Comm: tcp://127.0.0.1:41655,Total threads: 1
Dashboard: http://127.0.0.1:38193/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:45121,
Local directory: /tmp/dask-scratch-space/worker-l4wc3efv,Local directory: /tmp/dask-scratch-space/worker-l4wc3efv

0,1
Comm: tcp://127.0.0.1:34819,Total threads: 1
Dashboard: http://127.0.0.1:40437/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:38011,
Local directory: /tmp/dask-scratch-space/worker-nen2r170,Local directory: /tmp/dask-scratch-space/worker-nen2r170

0,1
Comm: tcp://127.0.0.1:38243,Total threads: 1
Dashboard: http://127.0.0.1:45045/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:44119,
Local directory: /tmp/dask-scratch-space/worker-3wt76d97,Local directory: /tmp/dask-scratch-space/worker-3wt76d97

0,1
Comm: tcp://127.0.0.1:33807,Total threads: 1
Dashboard: http://127.0.0.1:42317/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:45349,
Local directory: /tmp/dask-scratch-space/worker-qan06u9b,Local directory: /tmp/dask-scratch-space/worker-qan06u9b

0,1
Comm: tcp://127.0.0.1:46417,Total threads: 1
Dashboard: http://127.0.0.1:43297/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:33835,
Local directory: /tmp/dask-scratch-space/worker-7nqyxbhr,Local directory: /tmp/dask-scratch-space/worker-7nqyxbhr

0,1
Comm: tcp://127.0.0.1:44233,Total threads: 1
Dashboard: http://127.0.0.1:40911/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:35341,
Local directory: /tmp/dask-scratch-space/worker-0yehr2fs,Local directory: /tmp/dask-scratch-space/worker-0yehr2fs

0,1
Comm: tcp://127.0.0.1:43553,Total threads: 1
Dashboard: http://127.0.0.1:45435/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:40121,
Local directory: /tmp/dask-scratch-space/worker-pgno156p,Local directory: /tmp/dask-scratch-space/worker-pgno156p

0,1
Comm: tcp://127.0.0.1:44609,Total threads: 1
Dashboard: http://127.0.0.1:39337/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:39805,
Local directory: /tmp/dask-scratch-space/worker-0_7mryf1,Local directory: /tmp/dask-scratch-space/worker-0_7mryf1

0,1
Comm: tcp://127.0.0.1:36751,Total threads: 1
Dashboard: http://127.0.0.1:38635/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:45145,
Local directory: /tmp/dask-scratch-space/worker-oqcw0x7a,Local directory: /tmp/dask-scratch-space/worker-oqcw0x7a

0,1
Comm: tcp://127.0.0.1:38275,Total threads: 1
Dashboard: http://127.0.0.1:40421/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:37931,
Local directory: /tmp/dask-scratch-space/worker-l0z4onfd,Local directory: /tmp/dask-scratch-space/worker-l0z4onfd

0,1
Comm: tcp://127.0.0.1:42509,Total threads: 1
Dashboard: http://127.0.0.1:41017/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:40571,
Local directory: /tmp/dask-scratch-space/worker-ff7vwjq3,Local directory: /tmp/dask-scratch-space/worker-ff7vwjq3

0,1
Comm: tcp://127.0.0.1:39569,Total threads: 1
Dashboard: http://127.0.0.1:46601/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:45755,
Local directory: /tmp/dask-scratch-space/worker-xw3susil,Local directory: /tmp/dask-scratch-space/worker-xw3susil

0,1
Comm: tcp://127.0.0.1:40967,Total threads: 1
Dashboard: http://127.0.0.1:40009/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:38807,
Local directory: /tmp/dask-scratch-space/worker-zgmw54zo,Local directory: /tmp/dask-scratch-space/worker-zgmw54zo

0,1
Comm: tcp://127.0.0.1:37849,Total threads: 1
Dashboard: http://127.0.0.1:40405/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:41621,
Local directory: /tmp/dask-scratch-space/worker-ft2rx3e7,Local directory: /tmp/dask-scratch-space/worker-ft2rx3e7

0,1
Comm: tcp://127.0.0.1:40163,Total threads: 1
Dashboard: http://127.0.0.1:34371/status,Memory: 15.41 GiB
Nanny: tcp://127.0.0.1:35853,
Local directory: /tmp/dask-scratch-space/worker-fhp8znwd,Local directory: /tmp/dask-scratch-space/worker-fhp8znwd


## Highly variable genes (HVG)

In [7]:
%%time
# Export the query's `layer` matrix as an AnnData object. `dask_chunk_size` is None in
# non-Dask mode (reset by the cluster-setup cell).
# NOTE(review): presumably X is a lazy Dask array when `dask_chunk_size` is set —
# confirm against the tiledbsoma build in use.
add = query.to_anndata(layer, dask_chunk_size=dask_chunk_size)
add

CPU times: user 2.91 s, sys: 3.06 s, total: 5.96 s
Wall time: 2.26 s


AnnData object with n_obs × n_vars = 714413 × 60530
    obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'

In [8]:
%%time
# Normalize counts per cell. The return value is discarded, so this relies on scanpy
# updating `add` in place.
sc.pp.normalize_total(add)

CPU times: user 123 ms, sys: 15.9 ms, total: 139 ms
Wall time: 133 ms


In [9]:
%%time
# log(1 + x) transform. The return value is discarded, so this relies on scanpy updating
# `add` in place; re-running this cell alone would apply the transform a second time.
sc.pp.log1p(add)

CPU times: user 1.51 ms, sys: 878 μs, total: 2.38 ms
Wall time: 2.41 ms


In [10]:
%%time
# Compute per-gene HVG statistics. With `inplace=False` the result comes back as a
# DataFrame (means, dispersions, mean_bin, dispersions_norm, highly_variable); the rows
# displayed below are all highly_variable=True.
# NOTE(review): the effect of `subset=True` combined with `inplace=False` may differ
# between scanpy versions — confirm for the version in use.
hvg = sc.pp.highly_variable_genes(add, inplace=False, subset=True)
hvg

CPU times: user 1min 33s, sys: 13.3 s, total: 1min 46s
Wall time: 3min 55s


Unnamed: 0,means,dispersions,mean_bin,dispersions_norm,highly_variable
1,0.026894,1.289713,"(-0.00486, 0.243]",0.810094,True
5,0.014958,1.619181,"(-0.00486, 0.243]",1.197983,True
6,0.131333,2.046089,"(-0.00486, 0.243]",1.700591,True
8,0.064922,1.228058,"(-0.00486, 0.243]",0.737506,True
13,0.026449,1.090968,"(-0.00486, 0.243]",0.576108,True
...,...,...,...,...,...
51229,0.018870,1.307655,"(-0.00486, 0.243]",0.831218,True
51526,0.027907,1.786350,"(-0.00486, 0.243]",1.394795,True
53899,0.022165,1.578129,"(-0.00486, 0.243]",1.149652,True
56651,0.020105,1.251600,"(-0.00486, 0.243]",0.765223,True


---
Census `tissue_general` counts, for reference:
```python
exp_obs = exp.obs.read().concat().to_pandas()
exp_obs.tissue_general.value_counts()
```
```
tissue_general
brain                       26281059
blood                       10835244
lung                         6231233
breast                       5555979
eye                          4190842
heart                        3629952
kidney                       2083054
liver                        1815408
small intestine              1237182
skin of body                 1045024
endocrine gland               979667
respiratory system            944355
bone marrow                   813373
spleen                        751041
lymph node                    717899
colon                         714413
placenta                      638786
reproductive system           570488
adrenal gland                 560114
nose                          470445
adipose tissue                468603
stomach                       446348
prostate gland                348664
fallopian tube                250178
pancreas                      250161
esophagus                     215951
digestive system              211912
musculature                   189452
large intestine               180996
pleural fluid                 179134
yolk sac                      169725
embryo                        165937
uterus                        148198
mucosa                        131978
spinal cord                   117463
exocrine gland                115722
intestine                     104490
immune system                  89046
bladder organ                  82797
vasculature                    63667
ovary                          53751
lamina propria                 45230
tongue                         45060
central nervous system         31780
esophagogastric junction       29105
axilla                         19792
pleura                         19695
skeletal system                14680
saliva                         14502
omentum                        14003
testis                         13211
tendon of semitendinosus       10533
gallbladder                     9769
scalp                           3029
ureter                          2390
Name: count, dtype: int64
```