In [44]:
import pandas as pd
import numpy as np
from pathlib import Path
from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import anndata
import scipy.sparse
import scanpy as sc
import os

In [2]:
# Open the local ABC Atlas cache rooted at the scratch directory and print the
# manifest the cache reports as current, so the data release used is recorded
# in the notebook output.
download_base = Path('/gpfs/scratch/blukacsy/abc_atlas')
abc_cache = AbcProjectCache.from_cache_dir(download_base)
print(f"Current manifest: {abc_cache.current_manifest}")

Current manifest: releases/20250531/manifest.json


In [3]:
# Directory names known to the cache.
abc_cache.list_directories

['ASAP-PMDBS-10X',
 'ASAP-PMDBS-taxonomy',
 'Allen-CCF-2020',
 'HMBA-10xMultiome-BG',
 'HMBA-10xMultiome-BG-Aligned',
 'HMBA-BG-taxonomy-CCN20250428',
 'MERFISH-C57BL6J-638850',
 'MERFISH-C57BL6J-638850-CCF',
 'MERFISH-C57BL6J-638850-imputed',
 'MERFISH-C57BL6J-638850-sections',
 'SEAAD',
 'SEAAD-taxonomy',
 'WHB-10Xv3',
 'WHB-taxonomy',
 'WMB-10X',
 'WMB-10XMulti',
 'WMB-10Xv2',
 'WMB-10Xv3',
 'WMB-neighborhoods',
 'WMB-taxonomy',
 'Zeng-Aging-Mouse-10Xv3',
 'Zeng-Aging-Mouse-WMB-taxonomy',
 'Zhuang-ABCA-1',
 'Zhuang-ABCA-1-CCF',
 'Zhuang-ABCA-2',
 'Zhuang-ABCA-2-CCF',
 'Zhuang-ABCA-3',
 'Zhuang-ABCA-3-CCF',
 'Zhuang-ABCA-4',
 'Zhuang-ABCA-4-CCF']

In [4]:
# Metadata tables available under the 'WMB-10X' directory.
abc_cache.list_metadata_files('WMB-10X')

['cell_metadata',
 'cell_metadata_with_cluster_annotation',
 'example_genes_all_cells_expression',
 'gene',
 'region_of_interest_metadata']

In [5]:
# Expression-matrix files available under 'WMB-10Xv2'; the names returned here
# are what get_data_path() is later called with as file_name.
abc_cache.list_data_files('WMB-10Xv2')

['WMB-10Xv2-CTXsp/log2',
 'WMB-10Xv2-CTXsp/raw',
 'WMB-10Xv2-HPF/log2',
 'WMB-10Xv2-HPF/raw',
 'WMB-10Xv2-HY/log2',
 'WMB-10Xv2-HY/raw',
 'WMB-10Xv2-Isocortex-1/log2',
 'WMB-10Xv2-Isocortex-1/raw',
 'WMB-10Xv2-Isocortex-2/log2',
 'WMB-10Xv2-Isocortex-2/raw',
 'WMB-10Xv2-Isocortex-3/log2',
 'WMB-10Xv2-Isocortex-3/raw',
 'WMB-10Xv2-Isocortex-4/log2',
 'WMB-10Xv2-Isocortex-4/raw',
 'WMB-10Xv2-MB/log2',
 'WMB-10Xv2-MB/raw',
 'WMB-10Xv2-OLF/log2',
 'WMB-10Xv2-OLF/raw',
 'WMB-10Xv2-TH/log2',
 'WMB-10Xv2-TH/raw']

In [6]:
# Per-cell metadata, keyed by cell label. 'cell_label' is forced to str so the
# labels are not reinterpreted as another dtype when the table is parsed.
cell = abc_cache.get_metadata_dataframe(
    directory='WMB-10X',
    file_name='cell_metadata',
    dtype={'cell_label': str},
).set_index('cell_label')
print(f"Total cells: {len(cell)}")

# Taxonomy lookup: one row per cluster_alias with its annotation at each level.
# keep_default_na=False is passed so annotation strings are kept verbatim
# rather than being converted to NaN.
cluster_details = abc_cache.get_metadata_dataframe(
    directory='WMB-taxonomy',
    file_name='cluster_to_cluster_annotation_membership_pivoted',
    keep_default_na=False,
).set_index('cluster_alias')

Total cells: 4042976


In [7]:
# Preview the cell metadata table (indexed by cell_label).
cell

Unnamed: 0_level_0,cell_barcode,barcoded_cell_sample_label,library_label,feature_matrix_label,entity,brain_section_label,library_method,region_of_interest_acronym,donor_label,donor_genotype,donor_sex,dataset_label,x,y,cluster_alias,abc_sample_id
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
GCGAGAAGTTAAGGGC-410_B05,GCGAGAAGTTAAGGGC,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.146826,-3.086639,1,484be5df-5d44-4bfe-9652-7b5bc739c211
AATGGCTCAGCTCCTT-411_B06,AATGGCTCAGCTCCTT,411_B06,L8TX_201029_01_E10,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550851,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.138481,-3.022000,1,5638505d-e1e8-457f-9e5b-59e3e2302417
AACACACGTTGCTTGA-410_B05,AACACACGTTGCTTGA,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.472557,-2.992709,1,a0544e29-194f-4d34-9af4-13e7377b648f
CACAGATAGAGGCGGA-410_A05,CACAGATAGAGGCGGA,410_A05,L8TX_201029_01_A10,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.379622,-3.043442,1,c777ac0b-77e1-4d76-bf8e-2b3d9e08b253
AAAGTGAAGCATTTCG-410_B05,AAAGTGAAGCATTTCG,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.909480,-2.601536,1,49860925-e82b-46df-a228-fd2f97e75d39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGTGAGCAAACGCGA-1350_C05,GTGTGAGCAAACGCGA,1350_C05,L8XR_220728_01_A05,WMB-10XMulti,cell,,10xRSeq_Mult,MB,C57BL6J-641405,wt/wt,M,WMB-10XMulti,-7.716915,0.223654,8861,ba1d0e38-bea7-4d4f-bfcd-49121938e743
TTAGCAATCCCTGTTA-1350_C05,TTAGCAATCCCTGTTA,1350_C05,L8XR_220728_01_A05,WMB-10XMulti,cell,,10xRSeq_Mult,MB,C57BL6J-641405,wt/wt,M,WMB-10XMulti,-3.115098,-3.024478,8215,342bd0bb-cbe5-479b-9c70-fef59a730255
TTTGGCTGTCGCGCAA-1350_C05,TTTGGCTGTCGCGCAA,1350_C05,L8XR_220728_01_A05,WMB-10XMulti,cell,,10xRSeq_Mult,MB,C57BL6J-641405,wt/wt,M,WMB-10XMulti,-7.950964,0.409335,8798,4634de09-d8e0-4e40-a49b-eba311de08b5
ATCCACCTCACAGACT-1320_B04,ATCCACCTCACAGACT,1320_B04,L8XR_220630_02_B10,WMB-10XMulti,cell,,10xRSeq_Mult,OLF,C57BL6J-625156,wt/wt,F,WMB-10XMulti,4.579441,12.135833,8798,5b3061de-1cb8-47b6-9368-52824e1031ce


In [8]:
# Preview the taxonomy lookup table (indexed by cluster_alias).
cluster_details

Unnamed: 0_level_0,neurotransmitter,class,subclass,supertype,cluster
cluster_alias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
2,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0327 L2 IT PPP-APr Glut_3
3,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0081 L2 IT PPP-APr Glut_2,0322 L2 IT PPP-APr Glut_2
4,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0081 L2 IT PPP-APr Glut_2,0323 L2 IT PPP-APr Glut_2
5,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0081 L2 IT PPP-APr Glut_2,0325 L2 IT PPP-APr Glut_2
...,...,...,...,...,...
34368,GABA-Glyc,27 MY GABA,288 MDRN Hoxb5 Ebf2 Gly-Gaba,1102 MDRN Hoxb5 Ebf2 Gly-Gaba_1,4955 MDRN Hoxb5 Ebf2 Gly-Gaba_1
34372,GABA-Glyc,27 MY GABA,285 MY Lhx1 Gly-Gaba,1091 MY Lhx1 Gly-Gaba_3,4901 MY Lhx1 Gly-Gaba_3
34374,GABA-Glyc,27 MY GABA,285 MY Lhx1 Gly-Gaba,1091 MY Lhx1 Gly-Gaba_3,4902 MY Lhx1 Gly-Gaba_3
34376,GABA-Glyc,27 MY GABA,285 MY Lhx1 Gly-Gaba,1091 MY Lhx1 Gly-Gaba_3,4903 MY Lhx1 Gly-Gaba_3


In [9]:
# Attach the taxonomy annotation of each cell's cluster by looking up the
# cell's 'cluster_alias' column in the index of cluster_details.
# DataFrame.join is a left join by default: every row of `cell` is kept, and
# cells whose cluster_alias is not found get NaN in the annotation columns.
cell_extended = cell.join(cluster_details, on='cluster_alias')

# A left join can only change the row count if the lookup index has duplicate
# keys; fail loudly rather than silently duplicating cells.
assert len(cell_extended) == len(cell), "cluster_alias is not unique in cluster_details"

# Report the cells that actually matched an annotation, not the row count
# (which always equals len(cell) for a left join).
print(f"Cells with annotations: {cell_extended['class'].notna().sum()}")

Cells with annotations: 4042976


In [10]:
# Preview the cell metadata with the taxonomy columns joined on.
cell_extended

Unnamed: 0_level_0,cell_barcode,barcoded_cell_sample_label,library_label,feature_matrix_label,entity,brain_section_label,library_method,region_of_interest_acronym,donor_label,donor_genotype,...,dataset_label,x,y,cluster_alias,abc_sample_id,neurotransmitter,class,subclass,supertype,cluster
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCGAGAAGTTAAGGGC-410_B05,GCGAGAAGTTAAGGGC,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,...,WMB-10Xv3,23.146826,-3.086639,1,484be5df-5d44-4bfe-9652-7b5bc739c211,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
AATGGCTCAGCTCCTT-411_B06,AATGGCTCAGCTCCTT,411_B06,L8TX_201029_01_E10,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550851,Ai14(RCL-tdT)/wt,...,WMB-10Xv3,23.138481,-3.022000,1,5638505d-e1e8-457f-9e5b-59e3e2302417,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
AACACACGTTGCTTGA-410_B05,AACACACGTTGCTTGA,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,...,WMB-10Xv3,23.472557,-2.992709,1,a0544e29-194f-4d34-9af4-13e7377b648f,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
CACAGATAGAGGCGGA-410_A05,CACAGATAGAGGCGGA,410_A05,L8TX_201029_01_A10,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,...,WMB-10Xv3,23.379622,-3.043442,1,c777ac0b-77e1-4d76-bf8e-2b3d9e08b253,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
AAAGTGAAGCATTTCG-410_B05,AAAGTGAAGCATTTCG,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,...,WMB-10Xv3,23.909480,-2.601536,1,49860925-e82b-46df-a228-fd2f97e75d39,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGTGAGCAAACGCGA-1350_C05,GTGTGAGCAAACGCGA,1350_C05,L8XR_220728_01_A05,WMB-10XMulti,cell,,10xRSeq_Mult,MB,C57BL6J-641405,wt/wt,...,WMB-10XMulti,-7.716915,0.223654,8861,ba1d0e38-bea7-4d4f-bfcd-49121938e743,GABA-Glyc,26 P GABA,278 NLL Gata3 Gly-Gaba,1074 NLL Gata3 Gly-Gaba_1,4804 NLL Gata3 Gly-Gaba_1
TTAGCAATCCCTGTTA-1350_C05,TTAGCAATCCCTGTTA,1350_C05,L8XR_220728_01_A05,WMB-10XMulti,cell,,10xRSeq_Mult,MB,C57BL6J-641405,wt/wt,...,WMB-10XMulti,-3.115098,-3.024478,8215,342bd0bb-cbe5-479b-9c70-fef59a730255,Glut,19 MB Glut,157 RN Spp1 Glut,0682 RN Spp1 Glut_1,2761 RN Spp1 Glut_1
TTTGGCTGTCGCGCAA-1350_C05,TTTGGCTGTCGCGCAA,1350_C05,L8XR_220728_01_A05,WMB-10XMulti,cell,,10xRSeq_Mult,MB,C57BL6J-641405,wt/wt,...,WMB-10XMulti,-7.950964,0.409335,8798,4634de09-d8e0-4e40-a49b-eba311de08b5,GABA-Glyc,26 P GABA,278 NLL Gata3 Gly-Gaba,1076 NLL Gata3 Gly-Gaba_3,4806 NLL Gata3 Gly-Gaba_3
ATCCACCTCACAGACT-1320_B04,ATCCACCTCACAGACT,1320_B04,L8XR_220630_02_B10,WMB-10XMulti,cell,,10xRSeq_Mult,OLF,C57BL6J-625156,wt/wt,...,WMB-10XMulti,4.579441,12.135833,8798,5b3061de-1cb8-47b6-9368-52824e1031ce,GABA-Glyc,26 P GABA,278 NLL Gata3 Gly-Gaba,1076 NLL Gata3 Gly-Gaba_3,4806 NLL Gata3 Gly-Gaba_3


In [11]:
# Sanity check on the join: a cell whose cluster_alias had no match in
# cluster_details would carry NaN in 'class'. The message names the column
# that is actually inspected.
print(f"Cells missing class: {cell_extended['class'].isna().sum()}")

Cells missing subclass: 0


In [12]:
# Choose the feature matrix to work on and resolve the local path of its log2
# expression file. The same label is used later to select the matching rows of
# the cell metadata.
feature_matrix_label = 'WMB-10Xv2-TH'
file = abc_cache.get_data_path(directory='WMB-10Xv2', file_name=f'{feature_matrix_label}/log2')
print(f"Expression matrix path: {file}")

Expression matrix path: /gpfs/scratch/blukacsy/abc_atlas/expression_matrices/WMB-10Xv2/20230630/WMB-10Xv2-TH-log2.h5ad


In [13]:
# Load the expression matrix fully into memory.
adata = anndata.read_h5ad(file)  # alternative: backed='r' to leave the matrix on disk
print(f"AnnData shape: {adata.shape}")

AnnData shape: (131212, 32285)


In [14]:
# Summary of the loaded AnnData object (obs / var / uns fields).
adata

AnnData object with n_obs × n_vars = 131212 × 32285
    obs: 'cell_barcode', 'library_label', 'anatomical_division_label'
    var: 'gene_symbol'
    uns: 'normalization', 'parent', 'parent_layer', 'parent_rows'

In [15]:
# Boolean mask over all cells: True where the cell's metadata says it belongs
# to the feature matrix chosen above.
pred = cell_extended['feature_matrix_label'].eq(feature_matrix_label)
cell_subset = cell_extended.loc[pred]
print(f"Cells in subset: {len(cell_subset)}")

Cells in subset: 130555


In [16]:
# Restrict the expression matrix to the cells present in the metadata subset.
# Indexing by cell_subset.index also puts the rows in the same order as
# cell_subset, which the later X / y / indices alignment relies on.
adata_subset = adata[cell_subset.index, :]
print(f"Filtered AnnData shape: {adata_subset.shape}")

Filtered AnnData shape: (130555, 32285)


In [17]:
# Summary of the filtered object.
adata_subset

View of AnnData object with n_obs × n_vars = 130555 × 32285
    obs: 'cell_barcode', 'library_label', 'anatomical_division_label'
    var: 'gene_symbol'
    uns: 'normalization', 'parent', 'parent_layer', 'parent_rows'

In [18]:
# Integer-encode the top-level taxonomy class: codes follow the sorted order of
# the class names that occur in this subset. Because only the classes present
# in cell_subset are enumerated, the codes are specific to this subset.
unique_classes = sorted(cell_subset['class'].unique())
class_to_int = dict(zip(unique_classes, range(len(unique_classes))))

# assign() returns a new frame, so the column is added to an independent copy
# rather than to a slice of cell_extended.
cell_subset = cell_subset.assign(class_label=cell_subset['class'].map(class_to_int))
print(f"Number of unique classes: {len(unique_classes)}")

# Lookup table from class name to integer label, for decoding predictions.
class_mapping = pd.DataFrame({
    'class': unique_classes,
    'label': list(range(len(unique_classes))),
})

Number of unique classes: 27


In [21]:
# Integer class label assigned to each cell in the subset.
cell_subset['class_label']

cell_label
CAGGTGCAGGCTAGCA-040_C01     9
TGCGCAGGTTGCGCAC-045_C01    10
CGATGTATCTTGCCGT-042_B01    16
GACTAACGTCCTCTTG-040_B01    10
GATCGTACAACTGCTA-040_B01    10
                            ..
GTACTCCGTAGTACCT-103_B01    25
GTGAAGGTCAGCTCTC-103_B01    25
ACACCCTCATGCGCAC-103_C01    25
CCTAAAGCAAGAGGCT-103_C01    25
TCTGGAAAGATGAGAG-103_C01    25
Name: class_label, Length: 130555, dtype: int64

In [22]:
# Class name <-> integer label lookup table.
class_mapping

Unnamed: 0,class,label
0,01 IT-ET Glut,0
1,03 OB-CR Glut,1
2,04 DG-IMN Glut,2
3,05 OB-IMN GABA,3
4,06 CTX-CGE GABA,4
5,07 CTX-MGE GABA,5
6,08 CNU-MGE GABA,6
7,09 CNU-LGE GABA,7
8,10 LSX GABA,8
9,11 CNU-HYa GABA,9


In [23]:
# Expression values for the selected cells; rows follow cell_subset.index
# because adata_subset was built by indexing with it.
X = adata_subset.X

In [24]:
# (n_cells, n_genes)
X.shape

(130555, 32285)

In [25]:
# Inspect the matrix type and number of stored elements before densifying.
X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 498465283 stored elements and shape (130555, 32285)>

In [28]:
# Convert to a dense ndarray if the matrix is sparse.
# NOTE: a dense float32 array of shape (130555, 32285) is about 16.9 GB
# (130555 * 32285 * 4 bytes); later copies and splits add to that, so this
# needs a high-memory node.
if scipy.sparse.issparse(X):
    print("matrix is sparse")
    X = X.toarray()

matrix is sparse


In [29]:
# Confirm X is now a dense float32 array.
X

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 7.7246933, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [7.578836 , 0.       , 6.5863624, ..., 0.       , 0.       ,
        6.5863624],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        8.734062 ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]], dtype=float32)

In [30]:
# Value range of the expression matrix (values come from the log2 file).
print(X.min())
print(X.max())

0.0
18.1334


In [80]:
# Cells per class label. The top-level classes represented here (27 of 34
# total) are extremely imbalanced: a few classes hold most of the cells while
# several have only a handful, down to a single cell.
cell_subset['class_label'].value_counts()

class_label
14    68546
24    30384
15     9475
10     9158
16     5313
13     4694
23     1114
25      724
12      395
20      306
26      149
11       85
7        36
19       28
9        27
8        24
18       17
2        16
17       15
6        12
1        10
22       10
4         8
21        4
0         2
5         2
3         1
Name: count, dtype: int64

In [32]:
# Integer class labels as a NumPy array, in cell_subset row order (the same
# order as the rows of X).
y = cell_subset['class_label'].values

In [33]:
# Distinct label ids present in y.
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

In [50]:
# Cell labels in the same row order as X and y; split alongside them so each
# row of every split can be traced back to its cell.
indices = cell_subset.index.to_numpy()

In [57]:
# Preview the cell-label array.
indices

array(['CAGGTGCAGGCTAGCA-040_C01', 'TGCGCAGGTTGCGCAC-045_C01',
       'CGATGTATCTTGCCGT-042_B01', ..., 'ACACCCTCATGCGCAC-103_C01',
       'CCTAAAGCAAGAGGCT-103_C01', 'TCTGGAAAGATGAGAG-103_C01'],
      dtype=object)

In [58]:
# 80/10/10 train/test/val split: hold out 20% of the cells, then cut the
# hold-out in half. X, y and indices are split together so row i of each output
# refers to the same cell; the fixed random_state values make the partition
# reproducible.
# NOTE: the split is not stratified. Passing stratify=y would fail as-is
# because some classes have a single cell in this subset, and scikit-learn
# needs at least two members per class to stratify.
train_features, test_features, train_labels, test_labels, train_index, test_index = train_test_split(X, y, indices, test_size = 0.2, random_state=813)
test_features, val_features, test_labels, val_labels, test_index, val_index = train_test_split(test_features, test_labels, test_index, test_size = 0.5, random_state=995)

# compute_class_weight returns one weight per entry of `classes`, in that
# order. If the random split left a class out of the training set, `weights`
# would be shorter than the number of classes and weights[k] would no longer
# belong to label k, so fail loudly instead of producing misaligned weights.
train_classes = np.unique(train_labels)
absent_classes = np.setdiff1d(np.unique(y), train_classes)
if absent_classes.size:
    raise ValueError(
        f"Classes {absent_classes.tolist()} have no cells in the training split; "
        "class weights would not line up with label ids."
    )
weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=train_labels)

In [61]:
# Make sure every split is a NumPy array and report the shapes.
# np.asarray returns its input unchanged when it is already an ndarray, whereas
# np.array copies by default; with multi-GB feature matrices that copy would
# needlessly double peak memory.
train_features = np.asarray(train_features)
test_features = np.asarray(test_features)
val_features = np.asarray(val_features)
train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)
val_labels = np.asarray(val_labels)
weights = np.asarray(weights)

train_index = np.asarray(train_index)
test_index = np.asarray(test_index)
val_index = np.asarray(val_index)

print('Train features shape:', train_features.shape)
print('Val features shape:', val_features.shape)
print('Test features shape:', test_features.shape)
print('Train labels shape:', train_labels.shape)
print('Val labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)
print('Weights shape:', weights.shape)
print()
print('Train index shape:', train_index.shape)
print('Test index shape:', test_index.shape)
print('Val index shape:', val_index.shape)

Train features shape: (104444, 32285)
Val features shape: (13056, 32285)
Test features shape: (13055, 32285)
Train labels shape: (104444,)
Val labels shape: (13056,)
Test labels shape: (13055,)
Weights shape: (27,)

Train index shape: (104444,)
Test index shape: (13055,)
Val index shape: (13056,)


In [82]:
# Persist every split under <download_base>/arrays as '<name>_classes.npy'.
# The directory is created if missing so the first run on a fresh scratch
# space does not fail on the first np.save.
arrays_dir = download_base / 'arrays'
arrays_dir.mkdir(parents=True, exist_ok=True)

arrays_to_save = {
    'train_features': train_features,
    'test_features': test_features,
    'val_features': val_features,
    'train_labels': train_labels,
    'test_labels': test_labels,
    'val_labels': val_labels,
    'weights': weights,
    # The cell-label arrays have dtype=object; np.save would pickle them and a
    # default np.load (allow_pickle=False) would then refuse to read the file.
    # Storing them as fixed-width strings keeps the files pickle-free.
    'train_index': np.asarray(train_index, dtype=str),
    'test_index': np.asarray(test_index, dtype=str),
    'val_index': np.asarray(val_index, dtype=str),
}

for array_name, array in arrays_to_save.items():
    np.save(arrays_dir / f'{array_name}_classes.npy', array)

In [83]:
# Re-inspect the filtered object to see which obs fields could describe batches.
adata_subset

View of AnnData object with n_obs × n_vars = 130555 × 32285
    obs: 'cell_barcode', 'library_label', 'anatomical_division_label'
    var: 'gene_symbol'
    uns: 'normalization', 'parent', 'parent_layer', 'parent_rows'

In [87]:
# Library label recorded for each cell.
adata_subset.obs['library_label']

cell_label
CAGGTGCAGGCTAGCA-040_C01    L8TX_180815_01_E08
TGCGCAGGTTGCGCAC-045_C01    L8TX_180829_01_C10
CGATGTATCTTGCCGT-042_B01    L8TX_180829_01_B09
GACTAACGTCCTCTTG-040_B01    L8TX_180815_01_D08
GATCGTACAACTGCTA-040_B01    L8TX_180815_01_D08
                                   ...        
GTACTCCGTAGTACCT-103_B01    L8TX_190321_01_G03
GTGAAGGTCAGCTCTC-103_B01    L8TX_190321_01_G03
ACACCCTCATGCGCAC-103_C01    L8TX_190321_01_H03
CCTAAAGCAAGAGGCT-103_C01    L8TX_190321_01_H03
TCTGGAAAGATGAGAG-103_C01    L8TX_190321_01_H03
Name: library_label, Length: 130555, dtype: category
Categories (26, object): ['L8TX_180815_01_A07', 'L8TX_180815_01_A08', 'L8TX_180815_01_B07', 'L8TX_180815_01_B08', ..., 'L8TX_180829_01_G09', 'L8TX_190321_01_F03', 'L8TX_190321_01_G03', 'L8TX_190321_01_H03']

In [88]:
# Number of cells per library.
adata_subset.obs['library_label'].value_counts()

library_label
L8TX_180829_01_E09    9689
L8TX_180829_01_C10    7120
L8TX_180815_01_G08    6938
L8TX_180829_01_G09    6933
L8TX_180815_01_D08    6930
L8TX_180815_01_C08    6274
L8TX_180815_01_E08    6237
L8TX_180815_01_F07    5178
L8TX_180815_01_H08    5136
L8TX_180815_01_D07    5135
L8TX_180829_01_F09    5100
L8TX_180815_01_H07    5035
L8TX_190321_01_H03    4956
L8TX_180815_01_A07    4752
L8TX_190321_01_F03    4721
L8TX_180815_01_G07    4611
L8TX_180829_01_A09    4414
L8TX_180829_01_B09    4278
L8TX_190321_01_G03    4237
L8TX_180815_01_E07    3976
L8TX_180829_01_C09    3817
L8TX_180815_01_B07    3766
L8TX_180815_01_A08    3340
L8TX_180815_01_C07    3297
L8TX_180815_01_B08    2761
L8TX_180815_01_F08    1924
Name: count, dtype: int64

In [None]:
# TODO: do the library labels correspond to batches?

In [None]:
# TODO: restrict to highly variable genes?

In [None]:
# TODO: reduce dimensionality with PCA?

In [None]:
# TODO: batch correction with ComBat, Harmony, or Scanorama?