# Braun, Linnarsson

- https://www.biorxiv.org/content/10.1101/2022.10.24.513487v1


In [1]:
import sys
import numpy as np
import scanpy as sc
import h5py
import anndata
import scipy.sparse
import pandas as pd

In [2]:
# Open the pooled human fetal brain HDF5 file read-only. The handle is left
# open because later cells keep reading datasets from it.
H5_PATH = '/home/sonic/scData/HCA_Braun/HumanFetalBrainPool.h5'
hdf = h5py.File(H5_PATH, mode='r')
# Handle to the 'Expression' dataset stored under the 'shoji' group.
mat = hdf['shoji']['Expression']

In [3]:
# Build the AnnData object from the expression matrix; per-cell and per-gene
# metadata are attached to it in later cells.
N_CELLS = 1665937  # number of leading rows of 'Expression' to load
expression_dataset = hdf['shoji']['Expression']
expr = expression_dataset[:N_CELLS]
a = sc.AnnData(expr)

  a = sc.AnnData(expr)


In [4]:
# Drop the notebook-level name for the raw matrix now that `a` has been built from it.
del expr

In [5]:
# Read the gene names and cell IDs as string arrays; later cells attach them
# to the AnnData object.
shoji_group = hdf['shoji']
genes = np.array(shoji_group['Gene'][:], dtype=str)
# Same row limit as the slice used for the expression matrix.
cells = np.array(shoji_group['CellID'][:1665937], dtype=str)

In [6]:
# List the names of everything stored under the 'shoji' group.
hdf['shoji'].keys()

<KeysViewHDF5 ['Accession', 'Age', 'AnnotationDefinition', 'AnnotationDescription', 'AnnotationName', 'AnnotationPosterior', 'CellClass', 'CellCycleFraction', 'CellID', 'Chemistry', 'Chromosome', 'Class', 'ClusterID', 'Clusters', 'Donor', 'DoubletFlag', 'DoubletScore', 'DropletClass', 'Embedding', 'End', 'Enrichment', 'Expression', 'Factors', 'Gene', 'GeneNonzeros', 'GeneTotalUMIs', 'Linkage', 'Loadings', 'ManifoldIndices', 'ManifoldRadius', 'ManifoldWeights', 'MeanAge', 'MeanCellCycle', 'MeanDoubletScore', 'MeanExpression', 'MeanTotalUMI', 'MitoFraction', 'NCells', 'NGenes', 'Nonzeros', 'OverallTotalUMIs', 'PrevClusters', 'Recipe', 'Region', 'SampleID', 'SelectedFeatures', 'Sex', 'Species', 'Start', 'StdevExpression', 'Subdivision', 'Subregion', 'Tissue', 'TopLevelCluster', 'TotalUMIs', 'Trinaries', 'UnsplicedFraction', 'ValidCells', 'ValidGenes']>

In [7]:
# Barcodes

In [8]:
# Attach per-cell metadata to a.obs, derive a coarse 'cluster_main' label,
# and finally index the table by cell ID.
shoji_meta = hdf['shoji']

a.obs['CellID'] = cells
a.obs['Age'] = np.array(shoji_meta['Age'][:], dtype='float32')

# String-valued fields as (obs column, HDF5 dataset name) pairs. The order of
# this list fixes the column order of a.obs.
string_fields = [
    ('Assay', 'Chemistry'),
    ('cluster_original', 'CellClass'),
    ('batch', 'Donor'),
    ('sampleID', 'SampleID'),
    ('Region', 'Region'),
    ('Sex', 'Sex'),
    ('Subdivision', 'Subdivision'),
    ('Subregion', 'Subregion'),
    ('Tissue', 'Tissue'),
]
for obs_column, h5_key in string_fields:
    a.obs[obs_column] = np.array(shoji_meta[h5_key][:], dtype=str)

# Expand the short chemistry codes into full assay names.
assay_names = {
    'v3': "10x Chromium 3' v3",
    'v2': "10x Chromium 3' v2",
}
for chemistry_code, assay_name in assay_names.items():
    a.obs.loc[a.obs['Assay'] == chemistry_code, 'Assay'] = assay_name

# Cells with an empty sex entry are labelled 'Unknown'.
a.obs.loc[a.obs['Sex'] == '', 'Sex'] = 'Unknown'

# Fields that take one fixed value for every cell in this dataset.
constant_fields = {
    'Race': 'Unknown',
    'Stage': 'Fetal',
    'PMI': 'Unknown',
    'Hemisphere': 'A',
    'Library': 'scRNA',
}
for obs_column, fixed_value in constant_fields.items():
    a.obs[obs_column] = fixed_value

# Update information: collapse the original cell classes into coarser labels.
# Any class without an entry below keeps the 'Unknown' default.
main_cluster_of = {
    'Oligo': 'OD',
    'Erythrocyte': 'Others',
    'Fibroblast': 'Others',
    'Vascular': 'Others',
    'Placodes': 'Others',
    'Neuroblast': 'Neuroblast',
    'Glioblast': 'Glioblast',
    'Neural crest': 'Neural crest',
    'Radial glia': 'RG',
    'Immune': 'MG',
    'Neuron': 'Ext',
    'Neuronal IPC': 'Ext',
}
a.obs['cluster_main'] = 'Unknown'
for original_label, main_label in main_cluster_of.items():
    a.obs.loc[a.obs['cluster_original'] == original_label, 'cluster_main'] = main_label

# Set up index: the cell ID column becomes the obs index.
a.obs = a.obs.set_index('CellID')

In [9]:
# Display the assembled per-cell metadata table.
a.obs

Unnamed: 0_level_0,Age,Assay,cluster_original,batch,sampleID,Region,Sex,Subdivision,Subregion,Tissue,Race,Stage,PMI,Hemisphere,Library,cluster_main
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10X89_1:AAACGGGAGGCTACGA,8.0,10x Chromium 3' v2,Erythrocyte,BRC2006,10X89_1,Telencephalon,Unknown,Cortex,Cortex,Cortex,Unknown,Fetal,Unknown,A,scRNA,Others
10X89_1:ACGAGGAAGAGCCTAG,8.0,10x Chromium 3' v2,Erythrocyte,BRC2006,10X89_1,Telencephalon,Unknown,Cortex,Cortex,Cortex,Unknown,Fetal,Unknown,A,scRNA,Others
10X89_1:ACGCCAGTCGATAGAA,8.0,10x Chromium 3' v2,Erythrocyte,BRC2006,10X89_1,Telencephalon,Unknown,Cortex,Cortex,Cortex,Unknown,Fetal,Unknown,A,scRNA,Others
10X89_1:CCGGGATCAGACACTT,8.0,10x Chromium 3' v2,Erythrocyte,BRC2006,10X89_1,Telencephalon,Unknown,Cortex,Cortex,Cortex,Unknown,Fetal,Unknown,A,scRNA,Others
10X89_1:CGACTTCTCCCACTTG,8.0,10x Chromium 3' v2,Erythrocyte,BRC2006,10X89_1,Telencephalon,Unknown,Cortex,Cortex,Cortex,Unknown,Fetal,Unknown,A,scRNA,Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10X302_7:AGAGAATAGAGGATCC,7.0,10x Chromium 3' v3,Vascular,XDD:398,10X302_7,Midbrain,Unknown,Midbrain,Midbrain,Midbrain,Unknown,Fetal,Unknown,A,scRNA,Others
10X302_8:AAGCCATTCCACCTCA,7.0,10x Chromium 3' v3,Vascular,XDD:398,10X302_8,Midbrain,Unknown,Midbrain,Midbrain,Midbrain,Unknown,Fetal,Unknown,A,scRNA,Others
10X302_8:ACGGTTAGTTAGGCCC,7.0,10x Chromium 3' v3,Vascular,XDD:398,10X302_8,Midbrain,Unknown,Midbrain,Midbrain,Midbrain,Unknown,Fetal,Unknown,A,scRNA,Others
10X302_8:CATGCCTCAACCCGCA,7.0,10x Chromium 3' v3,Vascular,XDD:398,10X302_8,Midbrain,Unknown,Midbrain,Midbrain,Midbrain,Unknown,Fetal,Unknown,A,scRNA,Others


In [10]:
# Count duplicated cell IDs in the obs index; 0 means every ID is unique.
# The boolean mask is reduced with NumPy's vectorised .sum() rather than the
# Python builtin sum(), which would walk the mask one element at a time.
# int() keeps the displayed result a plain Python integer.
int(a.obs.index.duplicated().sum())

0

In [11]:
# Number of cells per 'Tissue' label.
a.obs.Tissue.value_counts()

Forebrain                195398
Mesencephalon            163064
Cortex                   149766
Cerebellum               142362
Medulla                  127683
Thalamus                 109783
Striatum                  95228
Brain                     85999
Pons                      62564
Subcortical forebrain     54002
Hindbrain                 49456
Hypothalamus              46667
Dorsal midbrain           42931
Hippocampus               35157
Diencephalon              34579
Midbrain                  33639
Pons/Cereb                33488
Subcortex                 28814
Frontotemporal cortex     22270
Lower cortex              18304
Ventral midbrain          16800
Tel/diencephalon          16797
Forebrain cortex          15788
Pons/Medulla              13782
Upper cortex              13486
Occipital cortex          13353
Caudate-Putamen           13184
Enthorinal cortex         10839
Telencephalon              7052
Cortical hem               6901
Head                       6801
Name: Ti

In [12]:
# Number of cells for each (Region, Subdivision) combination.
a.obs.groupby(['Region','Subdivision']).size()

Region         Subdivision        
Brain          Brain                   85999
Cerebellum     Cerebellum             142362
Diencephalon   Diencephalon            34579
               Hypothalamus            46667
               Thalamus               109783
Forebrain      Forebrain              212195
               Subcortex              102368
Head           Head                     6801
Hindbrain      Hindbrain               63238
Medulla        Medulla                127683
Midbrain       Midbrain               196703
               Midbrain dorsal         42931
               Midbrain ventral        16800
Pons           Pons                    96052
Telencephalon  Caudate+Putamen         13184
               Cortex                 145113
               Cortex entorhinal       10839
               Cortex frontal          22270
               Cortex hemisphere A      7242
               Cortex hemisphere B     13199
               Cortex occipital        13353
               Corte

In [13]:
# Cross-check the coarse labels: cell counts per (cluster_main, cluster_original) pair.
a.obs.groupby(['cluster_main','cluster_original']).size()

cluster_main  cluster_original
Ext           Neuron              561754
              Neuronal IPC        110260
Glioblast     Glioblast           132420
MG            Immune                8102
Neural crest  Neural crest           871
Neuroblast    Neuroblast          285699
OD            Oligo                 6190
Others        Erythrocyte           8958
              Fibroblast           19161
              Placodes               873
              Vascular             11741
RG            Radial glia         519908
dtype: int64

In [14]:
# Add genes: build a column-less table whose index, named 'gene', holds the
# gene names, then display it.
df = pd.DataFrame(genes, columns=['gene']).set_index('gene')
df

marker-DsRed
marker-Cherry
marker-GFP
marker-Tomato
marker-cre
...
5_8S_rRNA
AC007325.3
AC007325.1
U6
U1


In [15]:
# Use the gene table as the variable (gene) annotation of `a`.
a.var = df
# Make repeated gene names unique; the later var listing shows suffixed names such as '5_8S_rRNA-8'.
a.var_names_make_unique()



In [16]:
# Peek at the first 22 variable names to find where the real gene names start.
list(a.var.index[0:22])

['marker-DsRed',
 'marker-Cherry',
 'marker-GFP',
 'marker-Tomato',
 'marker-cre',
 'marker-CFP',
 'marker-mOrange',
 'marker-HcRed',
 'marker-RFP',
 'transgen-Cas9',
 'marker-EGFP',
 'marker-WPRE',
 'Malat1_gRNA10',
 'Malat1_gRNA11',
 'Malat1_gRNA12',
 'Malat1_gRNA13',
 'Malat1_gRNA14',
 'Malat1_gRNA15',
 'Malat1_gRNA16',
 'Malat1_gRNA17',
 'marker-YFP',
 'DDX11L1']

In [17]:
# The first 21 var entries are the 'marker-*', 'transgen-Cas9' and
# 'Malat1_gRNA*' names listed above; 'DDX11L1' at position 21 is the first
# entry that is kept.
n_leading_entries = 21
var_names = a.var.index
genes_to_remove = var_names[:n_leading_entries].tolist()
genes_to_keep = var_names[n_leading_entries:].tolist()
a = a[:, genes_to_keep]

In [18]:
# Display the variable table after dropping the leading non-gene entries.
a.var

DDX11L1
WASH7P
MIR6859-1
MIR1302-2
OR4G4P
...
5_8S_rRNA-8
AC007325.3
AC007325.1
U6-29
U1-5


In [19]:
# a1 = a[a.obs.Subdivision=='Cortex frontal']
# a1.obs['Brain_Region'] = 'Cortex frontal'
# a1.obs['Dataset'] = 'BraunCortexFrontal'
# sc.pp.filter_cells(a1, min_counts=50)
# a1.write('/home/sonic/scData/anndata/HCA_Braun.subset_cortexFrontal.h5ad')

In [20]:
# a1 = a[a.obs.Subdivision=='Cortex']
# a1.obs['Brain_Region'] = 'Cortex'
# a1.obs['Dataset'] = 'BraunCortex'
# sc.pp.filter_cells(a1, min_counts=50)
# a1.write('/home/sonic/scData/anndata/HCA_Braun.subset_cortex.h5ad')

In [21]:
# a1 = a[a.obs.Region=='Telencephalon']
# a1.obs['Brain_Region'] = 'Telencephalon'
# a1.obs['Dataset'] = 'BraunTelencephalon'
# sc.pp.filter_cells(a1, min_counts=50)
# a1.write('/home/sonic/scData/anndata/HCA_Braun.subset_Telencephalon.h5ad')

In [22]:
# a1 = a[a.obs.Subdivision=='Striatum']
# a1.obs['Brain_Region'] = 'Striatum'
# a1.obs['Dataset'] = 'BraunStriatum'
# sc.pp.filter_cells(a1, min_counts=50)
# a1.write('/home/sonic/scData/anndata/HCA_Braun.subset_Striatum.h5ad')

In [23]:
# Randomly keep 30% of the cells, modifying `a` in place; random_state=42 fixes the seed.
sc.pp.subsample(a, fraction=0.3, random_state=42)

In [24]:
a

AnnData object with n_obs × n_vars = 499781 × 59459
    obs: 'Age', 'Assay', 'cluster_original', 'batch', 'sampleID', 'Region', 'Sex', 'Subdivision', 'Subregion', 'Tissue', 'Race', 'Stage', 'PMI', 'Hemisphere', 'Library', 'cluster_main'

In [25]:
# Keep only genes passing min_cells=1 (i.e. seen in at least one remaining cell);
# the filter also adds an 'n_cells' column to a.var.
sc.pp.filter_genes(a, min_cells=1)
a

AnnData object with n_obs × n_vars = 499781 × 46461
    obs: 'Age', 'Assay', 'cluster_original', 'batch', 'sampleID', 'Region', 'Sex', 'Subdivision', 'Subregion', 'Tissue', 'Race', 'Stage', 'PMI', 'Hemisphere', 'Library', 'cluster_main'
    var: 'n_cells'

In [26]:
# sc.pp.filter_cells(a, min_counts=200)
# a

In [27]:
# Add the final bookkeeping columns, then save the subsampled object.
OUTPUT_H5AD = '/home/sonic/scData/anndata/HCA_Braun_minCell_1_pct30.h5ad'

# 'Brain_Region' is a copy of the 'Subdivision' column.
a.obs['Brain_Region'] = a.obs['Subdivision']
# Every cell is tagged with the same dataset label.
a.obs['Dataset'] = 'Braun'
a.write(OUTPUT_H5AD)

In [28]:
# Number of cells per 'Subdivision' label in the subsampled object.
a.obs.Subdivision.value_counts()

Forebrain              63753
Midbrain               58885
Cortex                 43757
Cerebellum             42856
Medulla                38442
Thalamus               32956
Subcortex              30800
Pons                   28741
Striatum               28687
Brain                  25650
Hindbrain              18863
Hypothalamus           14076
Midbrain dorsal        12788
Diencephalon           10318
Cortex frontal          6641
Cortex temporal         5435
Midbrain ventral        5132
Hippocampus             4649
Cortex occipital        3985
Cortex hemisphere B     3944
Caudate+Putamen         3931
Cortex parietal         3921
Cortex entorhinal       3199
Telencephalon           2140
Cortex hemisphere A     2134
Cortical hem            2057
Head                    2041
Name: Subdivision, dtype: int64