In [1]:
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from matplotlib.pyplot import rc_context

sc.settings.verbosity = 4  # show logging output
sc.settings.autosave = True  # save figures, do not show them
sc.settings.set_figure_params(dpi=400, dpi_save=500)  # set sufficiently high resolution for saving
sc.settings.figdir = '/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/integrated_objects/figures'

Read in the scanpy object containing all 240 libraries, post-filtering and embedding.

In [2]:
adata = sc.read('/directflow/SCCGGroupShare/projects/blabow/tenk10k_phase1/data_processing/scanpy/output/integrated_objects/240_libraries/240_libraries_concatenated_harmony.h5ad')
adata

AnnData object with n_obs × n_vars = 5058048 × 3342
    obs: 'cellbender_background_fraction', 'cellbender_cell_probability', 'cellbender_cell_size', 'cellbender_droplet_efficiency', 'celltypist_predicted_labels', 'celltypist_over_clustering', 'celltypist_majority_voting', 'celltypist_conf_score', 'wg2_sample', 'wg2_nCount_RNA', 'wg2_nFeature_RNA', 'wg2_percent_mt', 'wg2_azimuth_predicted_celltype_l2', 'wg2_azimuth_predicted_celltype_l2_score', 'wg2_scpred_prediction', 'Vireo_Individual_Assignment', 'Vireo_DropletType', 'scDblFinder_DropletType', 'scDblFinder_Score', 'scds_score', 'scds_DropletType', 'MajoritySinglet_DropletType', 'MajoritySinglet_Individual_Assignment', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'original_barcode', 'new_cell_name', 'sequencing_library', 'individual', 'cohort', 'onek1k_id', 'cpg_id_old', 'tob_id', 'cpg_id', 'onek1k_donor', 'ct_id', 'batc

# Clustering

In [None]:
# cluster using leiden algorithm
# update scanpy to use this faster igraph implementation??
# sc.tl.leiden(adata, flavor="igraph", n_iterations=2)

sc.tl.leiden(
    adata,
)

running Leiden clustering


In [None]:
with rc_context({"figure.figsize": (5, 5)}):
    sc.pl.umap(
        adata,
        color=["leiden"],
        add_outline=True,
        legend_loc="on data",
        legend_fontsize=12,
        legend_fontoutline=2,
        frameon=False,
)

# Re-assess QC 

In [None]:
# look at sequencing depth / complexity 
sc.pl.umap(
    adata,
    color=["log1p_total_counts", "log1p_n_genes_by_counts"],
    wspace=0.5,
    ncols=2,
)

In [None]:
# look at mitochondrial, ribosomal, and hemoglobin gene expression 
sc.pl.umap(
    adata,
    color=["pct_counts_mt", "pct_counts_ribo", "pct_counts_hb"],
    wspace=0.5,
    ncols=2,
)

We will remove the following cells:
* Cells classified as platelets, erythrocytes, or doublets by scPred (`wg2_scpred_prediction`)
* Cells that are in the "low sequencing depth" cluster

In [None]:
# List the per-cell annotation columns in adata.obs that are available for filtering.
adata.obs.columns

In [None]:
# Manually identify the cluster that corresponds to the low-seq-depth cells based
# on the UMAPs above, and set it here before running this cell.
# scanpy stores Leiden labels as strings (e.g. "7"); an int is accepted too and
# is converted below.
cluster_remove = None

# scPred labels to drop (platelets, erythrocytes, doublets).
# NOTE(review): these spellings are assumed -- confirm against
# adata.obs['wg2_scpred_prediction'].unique() before relying on the filter.
scpred_labels_remove = ["Platelet", "Eryth", "Doublet"]

if cluster_remove is None:
    # Fail loudly rather than silently keeping the low-depth cluster.
    raise ValueError(
        "Set `cluster_remove` to the Leiden label of the low-sequencing-depth "
        "cluster before filtering."
    )

# Build one boolean mask so both criteria are applied in a single subsetting step.
in_low_depth_cluster = adata.obs['leiden'].astype(str) == str(cluster_remove)
has_unwanted_label = adata.obs['wg2_scpred_prediction'].isin(scpred_labels_remove)
keep_cells = ~in_low_depth_cluster & ~has_unwanted_label

adata = adata[keep_cells]
