In [10]:
import pertpy as pt
import scanpy as sc
import numpy as np

In [11]:
# Load the example dataset through pertpy's `dixit_2016_scperturb` loader.
# The returned object is the AnnData that every later cell filters, subsamples and saves.
adata = pt.dt.dixit_2016_scperturb()

In [12]:
# Basic QC and preprocessing (all steps modify `adata` in place):
#   1. drop cells with fewer than 1000 total counts,
#   2. normalize every cell to the same total count,
#   3. drop genes detected in fewer than 50 cells,
#   4. log1p-transform the normalized values.
sc.pp.filter_cells(adata, min_counts=1000)
# `normalize_total` replaces the deprecated `normalize_per_cell`; with the default
# target it scales each cell to the median total count, matching the old default.
sc.pp.normalize_total(adata)
sc.pp.filter_genes(adata, min_cells=50)
sc.pp.log1p(adata)

In [13]:
# Subsample against high class imbalance (and to reduce the memory footprint of the
# tutorial dataset): keep exactly N_min cells from every perturbation that has at
# least N_min cells, and drop all smaller perturbation groups.
N_min = 100
# Fixed seed so that re-running the notebook always selects the same cells and the
# saved example dataset is reproducible.
rng = np.random.default_rng(0)
perturbation_labels = adata.obs["perturbation"]
counts = perturbation_labels.value_counts()
groups = counts.index[counts >= N_min]
indices = [
    # draw N_min distinct cell names from this perturbation group
    rng.choice(adata.obs_names[perturbation_labels == group], size=N_min, replace=False)
    for group in groups
]
# flatten the per-group selections into a single 1-D array of cell names
selection = np.concatenate(indices)
adata = adata[selection].copy()
sc.pp.filter_genes(adata, min_cells=3)  # sanity cleaning: drop genes that became (almost) unexpressed

In [14]:
# Select highly variable genes (HVGs) and compute a PCA embedding.
n_var_max = 2000  # max total features to select
# subset=True shrinks `adata` in place to the selected genes, so every gene that
# remains is flagged in `adata.var["highly_variable"]`.
sc.pp.highly_variable_genes(adata, n_top_genes=n_var_max, subset=True)
# No `use_highly_variable` argument: it is deprecated in recent scanpy releases and
# redundant here, because the matrix already contains only the selected genes.
sc.pp.pca(adata)

In [15]:
# Display the AnnData summary to check its shape and annotations after preprocessing.
adata

AnnData object with n_obs × n_vars = 3200 × 2000
    obs: 'perturbation', 'grna_lenient', 'target', 'moi', 'cell_line', 'celltype', 'perturbation_type', 'cancer', 'disease', 'guide_id', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'n_counts'
    var: 'gene_id', 'mt', 'ribo', 'ncounts', 'ncells', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [16]:
# Persist the preprocessed, subsampled dataset next to the notebook.
output_path = "distances_example_data.h5ad"
adata.write(output_path)