# Querying CZI Census data via CZI CellXGene

In [None]:
!pip install -U cellxgene-census scanpy pandas numpy

In [None]:
import cellxgene_census as cxg
import scanpy as sc
import pandas as pd
import numpy as np
import os

In [None]:
# Constants
# Census release handed to cxg.open_soma().
# NOTE(review): "latest" presumably moves over time; pin a dated release if
# runs must be reproducible — TODO confirm.
VERSION = "latest"
# Directory the chunk files are written to; adjust based on need.
OUTPUT_DIR = ""
# Filename prefix of the chunk files (saved as <prefix>_chunk<N>.h5ad).
OUTPUT_FILE = ""
# Cells downloaded per chunk; adjust based on memory availability.
PARTITION_SIZE = 500_000

In [None]:
def get_soma_ids(QUERY, column_names: list):
    """
    Retrieve the obs rows (including `soma_joinid`) of adult cells matching a query.

    Parameters:
        QUERY (str): Value filter applied to the `homo_sapiens` obs table.
        column_names (list): obs columns to fetch. `development_stage` is
            added when missing because the adult filter below reads it.

    Returns:
        pandas.DataFrame: The matching obs rows whose `development_stage`
        contains "year", "decade" or "adult".
    """
    # The adult filter needs `development_stage`; request it up front rather
    # than failing with an AttributeError after the (slow) download.
    if column_names is not None and "development_stage" not in column_names:
        column_names = [*column_names, "development_stage"]

    with cxg.open_soma(census_version=VERSION) as census:
        obs = census["census_data"]["homo_sapiens"].obs
        df = obs.read(
            value_filter=QUERY,
            column_names=column_names
        ).concat().to_pandas()

    # The frame is fully materialised, so the Census handle is no longer needed.
    # Keep adult entries only. na=False treats a missing development stage as
    # "not adult"; without it the mask would contain NaN and boolean indexing
    # would raise.
    is_adult = df["development_stage"].str.contains('year|decade|adult', na=False)

    return df[is_adult]

def get_cxg_data(soma_ids, output_file, column_names, partition_size=None):
    """
    Download data in chunks and save each chunk as an H5AD file.

    Parameters:
        soma_ids (sequence of int): `soma_joinid` values of the cells to
            download. Must support `len()` and slicing.
        output_file (str): Path prefix of the output; chunk ``k`` is written
            to ``{output_file}_chunk{k}.h5ad``.
        column_names (list): obs columns stored with every chunk.
        partition_size (int, optional): Number of cells per partition.
            Defaults to the module-level ``PARTITION_SIZE``, looked up at call
            time so that re-running the constants cell takes effect.

    Raises:
        ValueError: If `partition_size` is not positive.
    """
    if partition_size is None:
        partition_size = PARTITION_SIZE
    # A negative step would make range() empty and silently download nothing.
    if partition_size <= 0:
        raise ValueError(f"partition_size must be positive, got {partition_size}")

    # Get all matching cell IDs
    total_cells = len(soma_ids)

    print(f"Found {total_cells} cells matching the query.")

    if total_cells == 0:
        print("No data found.")
        return

    # Create the target directory first, so a missing folder does not surface
    # only after the first chunk has been downloaded.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with cxg.open_soma(census_version=VERSION) as census:
        for chunk_index, start in enumerate(range(0, total_cells, partition_size)):
            chunk_ids = soma_ids[start: start + partition_size]
            print(f"Downloading cells {start} to {start + len(chunk_ids)}...")

            adata = cxg.get_anndata(
                census=census,
                organism="Homo sapiens",
                obs_coords=chunk_ids,  # Using `soma_joinid` values
                obs_column_names=column_names
            )

            chunk_file = f"{output_file}_chunk{chunk_index}.h5ad"
            adata.write_h5ad(chunk_file)
            print(f"Saved {adata.shape[0]} cells to {chunk_file}")
            # Release the chunk before the next download to limit peak memory.
            del adata

In [None]:
# obs metadata columns stored with every downloaded chunk.
# Commented-out entries are deliberately excluded; uncomment one to include it.
column_names = [
    "soma_joinid",
    "dataset_id",
    "assay",
    # "assay_ontology_term_id",
    "cell_type",
    "cell_type_ontology_term_id",
    "development_stage",
    "development_stage_ontology_term_id",
    "disease",
    "disease_ontology_term_id",
    "donor_id",
    # "is_primary_data",
    "observation_joinid",
    "self_reported_ethnicity",
    "self_reported_ethnicity_ontology_term_id",
    "sex",
    "sex_ontology_term_id",
    # "suspension_type",
    # "tissue",
    # "tissue_ontology_term_id",
    # "tissue_type",
    "tissue_general",
    "tissue_general_ontology_term_id",
    # "raw_sum",
    # "nnz",
    # "raw_mean_nnz",
    # "raw_variance_nnz",
    # "n_measured_vars",
]

In [None]:
# Cell metadata filter values, written as the literal text that is spliced
# into the query string below.

tissue_general = "['bone marrow', 'blood', 'spleen', 'lung', 'lymph node']"
disease = "['normal']"
# All 10x 3' and 10x 5' protocol assay ontologies
assay_ontology_term_id = (
    "['EFO:0030003', 'EFO:0009901', 'EFO:0009899', 'EFO:0009922', "
    "'EFO:0022604', 'EFO:0030004', 'EFO:0009900', 'EFO:0022605']"
)
suspension_type = "['cell']"
is_primary_data = "True"

# Final query string: a cell must satisfy every clause
_filter_clauses = (
    f"tissue_general in {tissue_general}",
    f"disease in {disease}",
    f"assay_ontology_term_id in {assay_ontology_term_id}",
    f"suspension_type in {suspension_type}",
    f"is_primary_data == {is_primary_data}",
)
obs_value_filter = " and ".join(_filter_clauses)

In [None]:
# Fetch the metadata of all cells matching the filter; only the columns
# needed for the adult filter and the per-tissue sampling are requested.

cells = get_soma_ids(
    obs_value_filter,
    column_names=["soma_joinid", "development_stage", "tissue_general"],
)
# Cell count per tissue (shown as the cell output)
cells["tissue_general"].value_counts()

In [None]:
# Set the desired (maximum) number of cells per tissue
n_cells = 200000

# Fixed seed so that the same cells are drawn on every run
random_seed = 0

# Sample every tissue separately. Iterating over the groups, instead of
# groupby().apply(), hands over each sub-frame with its `tissue_general`
# column intact regardless of the pandas version; observed=True skips
# categorical levels that have no cells.
sampled_per_tissue = [
    tissue_cells.sample(n=min(len(tissue_cells), n_cells), random_state=random_seed)
    for _, tissue_cells in cells.groupby('tissue_general', observed=True)
]

# pd.concat() rejects an empty list, so fall back to an empty frame with the
# same columns when no cells matched.
if sampled_per_tissue:
    downsampled_cells = pd.concat(sampled_per_tissue, ignore_index=True)
else:
    downsampled_cells = cells.iloc[0:0].copy()

# Verify the new counts
display(downsampled_cells.tissue_general.value_counts())

In [None]:
# soma_joinid of every sampled cell (at most n_cells per tissue)
downsampled_cells_by_tissue = downsampled_cells["soma_joinid"].tolist()

In [None]:
# Run the download; chunk files are written next to this path prefix
output_prefix = os.path.join(OUTPUT_DIR, OUTPUT_FILE)
get_cxg_data(
    soma_ids=downsampled_cells_by_tissue,
    output_file=output_prefix,
    column_names=column_names,
)