In [20]:
import os
from pathlib import Path

import anndata as ad
import dask.array as da
import h5py
import numpy as np
import scanpy as sc
from anndata.experimental import read_lazy

# Root directory holding the AD scRNA-seq files. Defaults to the original
# local path; set the AD_SCRNASEQ_DATADIR environment variable to run the
# notebook on another machine without editing this cell.
datadir = Path(os.environ.get(
    'AD_SCRNASEQ_DATADIR',
    '/Users/poldrack/data_unsynced/BCBS/ad_scrnaseq/'))

In [21]:
datafile = datadir / 'dad4819b-4c14-439c-b32a-2c8d68bd22e1.h5ad'

# Bind the HDF5 handle to a name instead of creating it anonymously inside
# the read_lazy call, so it can be released explicitly with
# `h5_file.close()`.
# NOTE(review): the lazily opened `adata` presumably reads from this handle
# on demand, so close it only after everything needed has been loaded into
# memory — confirm against the read_lazy documentation.
h5_file = h5py.File(datafile, 'r')

# Passed through to read_lazy unchanged; see the read_lazy documentation for
# how it affects the obs/var index.
load_annotation_index = True
adata = read_lazy(h5_file, load_annotation_index=load_annotation_index)

In [3]:
# Show the summary of the lazily opened dataset (dimensions and the
# obs / var / uns keys).
print(adata)

AnnData object with n_obs × n_vars = 1395601 × 35483
    obs: 'assay_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'donor_id', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'ADNC_colors', 'APOE4 status_colors', 'Age at death_color

Filter to a smaller set of glial cell types

In [4]:
# Collect the distinct labels found in the 'cell_type' obs column so the
# exact label strings are available when choosing which cells to keep.
unique_cell_types = np.unique(adata.obs['cell_type'])
print(unique_cell_types)

['L2/3-6 intratelencephalic projecting glutamatergic neuron'
 'L5 extratelencephalic projecting glutamatergic cortical neuron'
 'L6b glutamatergic cortical neuron' 'VIP GABAergic cortical interneuron'
 'astrocyte of the cerebral cortex'
 'caudal ganglionic eminence derived cortical interneuron'
 'cerebral cortex endothelial cell'
 'chandelier pvalb GABAergic cortical interneuron'
 'corticothalamic-projecting glutamatergic cortical neuron'
 'lamp5 GABAergic cortical interneuron' 'microglial cell'
 'near-projecting glutamatergic cortical neuron' 'oligodendrocyte'
 'oligodendrocyte precursor cell' 'pvalb GABAergic cortical interneuron'
 'sncg GABAergic cortical interneuron'
 'sst GABAergic cortical interneuron' 'vascular leptomeningeal cell']


In [5]:
# Glial populations to retain; each string must match a 'cell_type' label
# exactly.
selected_types = ['astrocyte of the cerebral cortex', 'microglial cell']

# Boolean vector over cells: True where the cell's label is one of the
# selected types.
mask = np.isin(adata.obs['cell_type'].values, selected_types)

# Count of cells passing the filter (shown as the cell's output).
mask.sum()

np.int64(129930)

In [6]:

# Select the masked cells (all genes) from the lazily opened dataset.
print("Subsetting data...")
subset_adata = adata[mask, :]

# Materialize the selection as an in-memory object.
print("Loading data into memory (this can take a few minutes)...")
subset_loaded = subset_adata.to_memory()

# filter out genes with zero counts across all selected cells
# (min_counts=1 keeps genes with at least one count; subset_loaded is
# updated in place rather than reassigned)
print("Filtering genes with zero counts...")
sc.pp.filter_genes(subset_loaded, min_counts=1)



Subsetting data...
Loading data into memory...
Filtering genes with zero counts...
View of AnnData object with n_obs × n_vars = 129930 × 35483
    obs: 'assay_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'donor_id', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_

In [7]:
# Summarize the in-memory subset after gene filtering.
print(subset_loaded)


AnnData object with n_obs × n_vars = 129930 × 33389
    obs: 'assay_ontology_term_id', 'suspension_type', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'donor_id', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'n_counts'
    uns: 'ADNC_colors', 'APOE4 status_colors', 'Age at 

In [8]:
# Convert the subset's expression matrix to a DataFrame and report its shape.
# NOTE(review): presumably this holds a dense copy of the whole matrix in
# memory — confirm subset_df is needed beyond this shape check.
subset_df = subset_loaded.to_df()
print(subset_df.shape)

(129930, 33389)


In [22]:
# Persist the filtered subset as an .h5ad file in the data directory.
subset_loaded.write_h5ad(datadir / 'glia_subset.h5ad')

In [23]:
!ls -lh /Users/poldrack/data_unsynced/BCBS/ad_scrnaseq

total 117720632
-rw-r--r--  1 poldrack  staff    50G Dec 17 13:09 dad4819b-4c14-439c-b32a-2c8d68bd22e1.h5ad
-rw-r--r--@ 1 poldrack  staff   6.6G Dec 17 16:18 glia_subset.h5ad
