In [2]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [3]:
DATASET_ID = "Pryhuber_01_HuBMAP"
RDS_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.RDS"
ENESEMBL_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/features.tsv"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [4]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [5]:
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [6]:
%%R -i RDS_PATH

suppressPackageStartupMessages(library(Seurat))
rds <- readRDS(file=RDS_PATH)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

package ‘tools’ was built under R version 4.3.3 


In [7]:
%%R -o adata

adata <- as.SingleCellExperiment(rds)
adata

class: SingleCellExperiment 
dim: 29800 348012 
metadata(0):
assays(1): counts
rownames(29800): A1BG A1BG-AS1 ... ZYX ZZEF1
rowData names(0):
colnames(348012): LAP40_AAACCCAAGATGCTAA LAP40_AAACCCAAGATTGACA ...
  LAP191_TTTGTTGTCGAACCAT-1 LAP191_TTTGTTGTCGGCTGGT-1
colData names(7): orig.ident nCount_RNA ... subclass.l5 ident
reducedDimNames(0):
mainExpName: RNA
altExpNames(0):


1: Layer ‘data’ is empty 
2: Layer ‘scale.data’ is empty 


  return AnnData(exprs, obs, var, uns, obsm, layers=layers)


In [8]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,sublass.l4,CellOntology.ID,subclass.l5,ident
LAP40_AAACCCAAGATGCTAA,LAP40,3962.0,1859,AT2,0002063,AT2-1,LAP40
LAP40_AAACCCAAGATTGACA,LAP40,5413.0,2587,AF1,4028004,AF1-1,LAP40
LAP40_AAACCCAGTTTCTATC,LAP40,2113.0,1140,AM,0000583,AM,LAP40
LAP40_AAACCCATCCAACTAG,LAP40,645.0,480,RAS,1000272,PreTBSC,LAP40
LAP40_AAACCCATCGCGGTAC,LAP40,7475.0,2958,AM,0000583,AM,LAP40
...,...,...,...,...,...,...,...
LAP191_TTTGGTTTCTGCCTCA-1,LAP191,865.0,691,CAP1,4028002,CAP1,LAP191
LAP191_TTTGTTGGTTTGCAGT-1,LAP191,2191.0,1408,AT2,0002063,AT2-2,LAP191
LAP191_TTTGTTGTCACGTAGT-1,LAP191,559.0,479,CAP2,4028003,CAP2,LAP191
LAP191_TTTGTTGTCGAACCAT-1,LAP191,1047.0,832,Ciliated,0002145,Ciliated-2,LAP191


In [9]:
l4_annot = pd.read_csv("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Pryhuber_01_HuBMAP/subclass.l4 Annotations Markers Lung 10X snRNA-seq HuBMAP1.csv", usecols=range(0,5))
l5_annot = pd.read_csv("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Pryhuber_01_HuBMAP/subclass.l5 Annotations Markers Lung 10X snRNA-seq HuBMAP1.csv", usecols=range(0,5))

l4_col_dict = {
    'Full name': AUTHOR_CELL_TYPE_DESCRIPTION_L0,
    'subclass.l4': AUTHOR_CELL_TYPE_L0,
    'CL ID Label': CELL_TYPE_ONTOLOGY_LABEL_L0,
    'CL ID': CELL_TYPE_ONTOLOGY_ID_L0,
    'Marker Genes': MARKER_GENES_L0
}

l5_col_dict = {
    'Full name': AUTHOR_CELL_TYPE_DESCRIPTION_L1,
    'subclass.l5': AUTHOR_CELL_TYPE_L1,
    'CL ID Label': CELL_TYPE_ONTOLOGY_LABEL_L1,
    'CL ID': CELL_TYPE_ONTOLOGY_ID_L1,
    'Marker Genes': MARKER_GENES_L1
}

l4_annot = l4_annot.rename(columns=l4_col_dict)
l5_annot = l5_annot.rename(columns=l5_col_dict)

l5_annot

Unnamed: 0,author_cell_type_description_level_1,author_cell_type_level_1,cell_type_ontology_term_label_level_1,cell_type_ontology_term_id_level_1,author_cell_type_markers_level_1
0,Basal cell,Basal,basal epithelial cell of tracheobronchial tree,CL:0002329,TP63; KRT15; KRT5
1,Distal basal cell SFTPB+ subset 1,Basal.SFTPB-1,basal epithelial cell of tracheobronchial tree...,CL:0002329,TP63; KRT15; SFTPB; IGFBP2; CNTN6; GPC3
2,Distal basal cell SFTPB+ high subset 2,Basal.SFTPB-2,basal epithelial cell of tracheobronchial tree...,CL:0002329,TP63; KRT15; SFTPB; MFSD2A; TMEM163; SLC34A2
3,Basal proliferating cell,Basal.p,basal epithelial cell of tracheobronchial tree...,CL:0002329,"TP63; KRT15; KRT5; MKI67, TOP2A, CIT"
4,Suprabasal cell,Suprabasal,respiratory suprabasal cell\t,CL:4033048,TP63; KRT15; SERPPINB3; SERPINB13; NOTCH3; K...
...,...,...,...,...,...
56,Neutrophil,Neutrophil,neutrophil,CL:0000775,CSF3R
57,Neutrophil activated,Neu.activ,neutrophil;activated,CL:0000775,CSF3R; FCGR3B; MXD1; PADI4
58,Neutrophil degranulating,Neu.degran,neutrophil;degranulating,CL:0000775,CSF3R; BPI; AZU1; DEFA3
59,Neutrophil proliferating,Neu.p,neutrophil;proliferating,CL:0000775,"CSF3R; MKI67, TOP2A, CIT"


In [10]:
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [11]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Cell identification by single nuclear RNA sequ...,"Elizabeth Duong, Kun Zhang, James Hagood, Xin ...",orig.ident,,"consented for release, protected under embargo",


In [12]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,D231-RML-10A4,D231,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20201216A_10X-R,HBM443.VFRD.453,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
1,D231-RML-10B2,D231,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20211020A_10X-R,HBM443.VFRD.453,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
2,D231-RML-2A3,D231,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20210514A_10X-R,HBM443.VFRD.453,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
3,D231-RML-4B4,D231,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20210514B_10X-R,HBM443.VFRD.453,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
4,D231-RML-6A4,D231,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20210514C_10X-R,HBM443.VFRD.453,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
5,D239-RML-12A3,D239,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20200317M-10X-R,HBM943.SCQQ.877,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
6,D239-RML-12A4,D239,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20200317N-10X-R,HBM943.SCQQ.877,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
7,D239-RML-3A4,D239,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20200317H-10X-R,HBM943.SCQQ.877,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
8,D239-RML-7A2,D239,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20200317I-10X-R,HBM943.SCQQ.877,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
9,D239-RML-7A3,D239,Master protocol (includes subprotocls for all ...,University of Rochester (tissue source and pre...,URMC,,LAPMAP_20200317J-10X-R,HBM943.SCQQ.877,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238


In [13]:
# Add celltype annotatino to anndata

orig_obs = adata.obs.copy()

adata_dict = {
    'sublass.l4': AUTHOR_CELL_TYPE_L0,
    'subclass.l5': AUTHOR_CELL_TYPE_L1
}
orig_obs = orig_obs.rename(columns=adata_dict)
orig_obs = orig_obs.drop(columns='CellOntology.ID')

# L4 has multiple AT2 subsets, which however have the same cell name
# making it unique
l4_annot = l4_annot.drop_duplicates(subset=AUTHOR_CELL_TYPE_L0, keep='first')

left_index = orig_obs.index
orig_obs = orig_obs.merge(l4_annot, how='left', on=AUTHOR_CELL_TYPE_L0)
new_obs = orig_obs.merge(l5_annot, how='left', on=AUTHOR_CELL_TYPE_L1)
new_obs.index = left_index

new_obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,author_cell_type_level_0,author_cell_type_level_1,ident,author_cell_type_description_level_0,cell_type_ontology_term_label_level_0,cell_type_ontology_term_id_level_0,author_cell_type_markers_level_0,author_cell_type_description_level_1,cell_type_ontology_term_label_level_1,cell_type_ontology_term_id_level_1,author_cell_type_markers_level_1
LAP40_AAACCCAAGATGCTAA,LAP40,3962.0,1859,AT2,AT2-1,LAP40,Alveolar epithelial type 2 cell,type II pneumocyte,CL:0002063,SFTPB; SFTPC; BMP1; ABCA3; LAMP3,Alveolar epithelial type 2 cell subset 1,type II pneumocyte; CCDC141 positive,CL:0002063,SFTPB; SFTPC; BMP1; ABCA3; LAMP3; CCDC141
LAP40_AAACCCAAGATTGACA,LAP40,5413.0,2587,AF1,AF1-1,LAP40,Alveolar Fibroblast 1,alveolar type 1 fibroblast,CL:4028004,ITGA8; CCBE1; GALNT17; NKD1,Alveolar Fibroblast 1 subset 1,alveolar type 1 fibroblast; PIEZO2 low,CL:4028004,ITGA8; CCBE1; GALNT17; NKD1
LAP40_AAACCCAGTTTCTATC,LAP40,2113.0,1140,AM,AM,LAP40,Alveolar macrophage cell,alveolar macrophage,CL:0000583,PPARG; MARCO,Alveolar macrophage cell,alveolar macrophage,CL:0000583,PPARG; MARCO
LAP40_AAACCCATCCAACTAG,LAP40,645.0,480,RAS,PreTBSC,LAP40,Respiratory airway secretory cell,Lung secretory cell; Respiratory airway secret...,CL:1000272,SCGB3A2; GDF15; SFTPB,Pre-terminal bronchiole secretory cell,Lung secretory cell; RAS; Pre-terminal bronchiole,CL:1000272,SCGB3A2; GDF15; SFTPB; SCGB1A1
LAP40_AAACCCATCGCGGTAC,LAP40,7475.0,2958,AM,AM,LAP40,Alveolar macrophage cell,alveolar macrophage,CL:0000583,PPARG; MARCO,Alveolar macrophage cell,alveolar macrophage,CL:0000583,PPARG; MARCO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LAP191_TTTGGTTTCTGCCTCA-1,LAP191,865.0,691,CAP1,CAP1,LAP191,Capillary general endothelial cell,alveolar capillary type 1 endothelial cell,CL:4028002,BTNL9; FCN3; IL7R,Capillary general endothelial cell,alveolar capillary type 1 endothelial cell,CL:4028002,BTNL9; FCN3; IL7R
LAP191_TTTGTTGGTTTGCAGT-1,LAP191,2191.0,1408,AT2,AT2-2,LAP191,Alveolar epithelial type 2 cell,type II pneumocyte,CL:0002063,SFTPB; SFTPC; BMP1; ABCA3; LAMP3,Alveolar epithelial type 2 cell subset 2,type II pneumocyte; LRP2 positive,CL:0002063,SFTPB; SFTPC; BMP1; ABCA3; LAMP3; LRP2; SCN1A;...
LAP191_TTTGTTGTCACGTAGT-1,LAP191,559.0,479,CAP2,CAP2,LAP191,Capillary aerocyte endothelial cell,alveolar capillary type 2 endothelial cell,CL:4028003,HPGD; EDNRB,Capillary aerocyte endothelial cell,alveolar capillary type 2 endothelial cell,CL:4028003,HPGD; EDNRB
LAP191_TTTGTTGTCGAACCAT-1,LAP191,1047.0,832,Ciliated,Ciliated-2,LAP191,Ciliated cell,ciliated columnar cell of tracheobronchial tree,CL:0002145,FOXJ1; CDHR3; RSPH1,Ciliated cell subset 2,tracheobronchial ciliated columnar cell; DHRS9...,CL:0002145,FOXJ1; CDHR3; RSPH1; PRMT8; KCNN3


In [14]:
adata.obs = new_obs

adata

AnnData object with n_obs × n_vars = 348012 × 29800
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'author_cell_type_level_0', 'author_cell_type_level_1', 'ident', 'author_cell_type_description_level_0', 'cell_type_ontology_term_label_level_0', 'cell_type_ontology_term_id_level_0', 'author_cell_type_markers_level_0', 'author_cell_type_description_level_1', 'cell_type_ontology_term_label_level_1', 'cell_type_ontology_term_id_level_1', 'author_cell_type_markers_level_1'

# Validate obs and uns from adata

In [15]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'author_cell_type_level_0',
       'author_cell_type_level_1', 'ident',
       'author_cell_type_description_level_0',
       'cell_type_ontology_term_label_level_0',
       'cell_type_ontology_term_id_level_0',
       'author_cell_type_markers_level_0',
       'author_cell_type_description_level_1',
       'cell_type_ontology_term_label_level_1',
       'cell_type_ontology_term_id_level_1',
       'author_cell_type_markers_level_1'],
      dtype='object')

In [16]:
adata.obs['orig.ident'].value_counts().index.tolist()

['LAP186',
 'LAP178',
 'LAP102',
 'LAP109',
 'LAP107',
 'LAP187',
 'LAP108',
 'LAP182',
 'LAP180',
 'LAP104',
 'LAP106',
 'LAP103',
 'LAP184',
 'LAP192',
 'LAP41',
 'LAP181',
 'LAP101',
 'LAP183',
 'LAP100',
 'LAP177',
 'LAP95',
 'LAP179',
 'LAP94',
 'LAP176',
 'LAP96',
 'LAP97',
 'LAP194',
 'LAP90',
 'LAP174',
 'LAP93',
 'LAP193',
 'LAP91',
 'LAP99',
 'LAP92',
 'LAP40',
 'LAP190',
 'LAP195',
 'LAP188',
 'LAP191',
 'LAP89',
 'LAP185',
 'LAP98',
 'LAP88',
 'LAP87',
 'LAP105',
 'LAP189']

In [17]:
# check non overlap between adata.obs['ident'] and obs['sample_ID']
adata_ids = adata.obs['orig.ident'].value_counts().index.tolist()
obs_ids = obs['library_preparation_batch'].tolist()

set(adata_ids).symmetric_difference(set(obs_ids))

set()

In [18]:
non_overlap = set(adata_ids) - set(obs_ids)
non_overlap_other_side = set(obs_ids) - set(adata_ids)

In [19]:
non_overlap

set()

In [20]:
non_overlap_other_side

set()

In [21]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 348012 × 29800
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'author_cell_type_level_0', 'author_cell_type_level_1', 'ident', 'author_cell_type_description_level_0', 'cell_type_ontology_term_label_level_0', 'cell_type_ontology_term_id_level_0', 'author_cell_type_markers_level_0', 'author_cell_type_description_level_1', 'cell_type_ontology_term_label_level_1', 'cell_type_ontology_term_id_level_1', 'author_cell_type_markers_level_1'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'

In [22]:
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'orig.ident',
    df_col = 'library_preparation_batch',
    skip = None
)

adata.obs.rename(columns={'orig.ident': 'library_preparation_batch'}, inplace=True)

adata.obs

Unnamed: 0,library_preparation_batch,nCount_RNA,nFeature_RNA,author_cell_type_level_0,author_cell_type_level_1,ident,author_cell_type_description_level_0,cell_type_ontology_term_label_level_0,cell_type_ontology_term_id_level_0,author_cell_type_markers_level_0,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
LAP40_AAACCCAAGATGCTAA,LAP40,3962.0,1859,AT2,AT2-1,LAP40,Alveolar epithelial type 2 cell,type II pneumocyte,CL:0002063,SFTPB; SFTPC; BMP1; ABCA3; LAMP3,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
LAP40_AAACCCAAGATTGACA,LAP40,5413.0,2587,AF1,AF1-1,LAP40,Alveolar Fibroblast 1,alveolar type 1 fibroblast,CL:4028004,ITGA8; CCBE1; GALNT17; NKD1,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
LAP40_AAACCCAGTTTCTATC,LAP40,2113.0,1140,AM,AM,LAP40,Alveolar macrophage cell,alveolar macrophage,CL:0000583,PPARG; MARCO,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
LAP40_AAACCCATCCAACTAG,LAP40,645.0,480,RAS,PreTBSC,LAP40,Respiratory airway secretory cell,Lung secretory cell; Respiratory airway secret...,CL:1000272,SCGB3A2; GDF15; SFTPB,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
LAP40_AAACCCATCGCGGTAC,LAP40,7475.0,2958,AM,AM,LAP40,Alveolar macrophage cell,alveolar macrophage,CL:0000583,PPARG; MARCO,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LAP191_TTTGGTTTCTGCCTCA-1,LAP191,865.0,691,CAP1,CAP1,LAP191,Capillary general endothelial cell,alveolar capillary type 1 endothelial cell,CL:4028002,BTNL9; FCN3; IL7R,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000237
LAP191_TTTGTTGGTTTGCAGT-1,LAP191,2191.0,1408,AT2,AT2-2,LAP191,Alveolar epithelial type 2 cell,type II pneumocyte,CL:0002063,SFTPB; SFTPC; BMP1; ABCA3; LAMP3,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000237
LAP191_TTTGTTGTCACGTAGT-1,LAP191,559.0,479,CAP2,CAP2,LAP191,Capillary aerocyte endothelial cell,alveolar capillary type 2 endothelial cell,CL:4028003,HPGD; EDNRB,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000237
LAP191_TTTGTTGTCGAACCAT-1,LAP191,1047.0,832,Ciliated,Ciliated-2,LAP191,Ciliated cell,ciliated columnar cell of tracheobronchial tree,CL:0002145,FOXJ1; CDHR3; RSPH1,...,3 prime tag,EFO_0008637,true,GRCh38,v110,cell ranger 3.0,yes,PATO:0000461,unknown,HsapDv:0000237


# Add author cell type markers to UNS

In [23]:
adata.uns[MARKER_GENES_L0] = l4_annot
adata.uns[MARKER_GENES_L1] = l5_annot

adata

AnnData object with n_obs × n_vars = 348012 × 29800
    obs: 'library_preparation_batch', 'nCount_RNA', 'nFeature_RNA', 'author_cell_type_level_0', 'author_cell_type_level_1', 'ident', 'author_cell_type_description_level_0', 'cell_type_ontology_term_label_level_0', 'cell_type_ontology_term_id_level_0', 'author_cell_type_markers_level_0', 'author_cell_type_description_level_1', 'cell_type_ontology_term_label_level_1', 'cell_type_ontology_term_id_level_1', 'author_cell_type_markers_level_1', 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded',

# Check author cell type annotations and Cell Ontology IDs

In [24]:
adata.obs[AUTHOR_CELL_TYPE] = adata.obs[AUTHOR_CELL_TYPE_L1]
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs[CELL_TYPE_ONTOLOGY_ID_L1]
adata.obs[CELL_TYPE_ONTOLOGY_LABEL] = adata.obs[CELL_TYPE_ONTOLOGY_LABEL_L1]
adata.obs[AUTHOR_CELL_TYPE_DESCRIPTION] = adata.obs[AUTHOR_CELL_TYPE_DESCRIPTION_L1]
adata.obs[MARKER_GENES] = adata.obs[MARKER_GENES_L1]

adata

AnnData object with n_obs × n_vars = 348012 × 29800
    obs: 'library_preparation_batch', 'nCount_RNA', 'nFeature_RNA', 'author_cell_type_level_0', 'author_cell_type_level_1', 'ident', 'author_cell_type_description_level_0', 'cell_type_ontology_term_label_level_0', 'cell_type_ontology_term_id_level_0', 'author_cell_type_markers_level_0', 'author_cell_type_description_level_1', 'cell_type_ontology_term_label_level_1', 'cell_type_ontology_term_id_level_1', 'author_cell_type_markers_level_1', 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded',

# Check whether ENSEMBL IDs in var

In [25]:
adata.var

A1BG
A1BG-AS1
A1CF
A2M
A2M-AS1
...
ZXDC
ZYG11A
ZYG11B
ZYX
ZZEF1


In [26]:
ensembl_ids = pd.read_csv(ENESEMBL_PATH, sep='\t', header=None)
ensembl_ids

Unnamed: 0,0,1,2
0,ENSG00000243485,MIR1302-2HG,Gene Expression
1,ENSG00000237613,FAM138A,Gene Expression
2,ENSG00000186092,OR4F5,Gene Expression
3,ENSG00000238009,AL627309.1,Gene Expression
4,ENSG00000239945,AL627309.3,Gene Expression
...,...,...,...
33533,ENSG00000277856,AC233755.2,Gene Expression
33534,ENSG00000275063,AC233755.1,Gene Expression
33535,ENSG00000271254,AC240274.1,Gene Expression
33536,ENSG00000277475,AC213203.1,Gene Expression


In [27]:
len(np.intersect1d(adata.var.index.tolist(), ensembl_ids[1].tolist()))

29785

In [28]:
ensembl_mapping = dict(zip(ensembl_ids[1], ensembl_ids[0]))
ensembl_mapping

{'MIR1302-2HG': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'AL627309.1': 'ENSG00000238009',
 'AL627309.3': 'ENSG00000239945',
 'AL627309.2': 'ENSG00000239906',
 'AL627309.4': 'ENSG00000241599',
 'AL732372.1': 'ENSG00000236601',
 'OR4F29': 'ENSG00000284733',
 'AC114498.1': 'ENSG00000235146',
 'OR4F16': 'ENSG00000284662',
 'AL669831.2': 'ENSG00000229905',
 'AL669831.5': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'AL645608.7': 'ENSG00000272438',
 'AL645608.3': 'ENSG00000230699',
 'AL645608.5': 'ENSG00000241180',
 'AL645608.1': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'PERM1': 'ENSG00000187642',
 'AL645608.8': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AL645608.2': 'ENSG00000224969',
 'AGRN': 'ENSG00000188157',
 'AL645608.9': 'ENSG00000273443',
 'RNF223

In [29]:
adata.var.index.name = 'index'
adata.var['gene_symbol'] = adata.var.index.astype('category')
adata.var.index = adata.var.index.astype(str)
adata.var['ensembl_id'] = adata.var['gene_symbol'].map(ensembl_mapping)

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,A1BG,ENSG00000121410
A1BG-AS1,A1BG-AS1,ENSG00000268895
A1CF,A1CF,ENSG00000148584
A2M,A2M,ENSG00000175899
A2M-AS1,A2M-AS1,ENSG00000245105
...,...,...
ZXDC,ZXDC,ENSG00000070476
ZYG11A,ZYG11A,ENSG00000203995
ZYG11B,ZYG11B,ENSG00000162378
ZYX,ZYX,ENSG00000159840


# Check raw data

In [30]:
adata.X

<348012x29800 sparse matrix of type '<class 'numpy.float32'>'
	with 580867730 stored elements in Compressed Sparse Row format>

In [49]:
adata.obs.columns

Index(['library_preparation_batch', 'nCount_RNA', 'nFeature_RNA',
       'author_cell_type_level_0', 'author_cell_type_level_1', 'ident',
       'author_cell_type_description_level_0',
       'cell_type_ontology_term_label_level_0',
       'cell_type_ontology_term_id_level_0',
       'author_cell_type_markers_level_0',
       'author_cell_type_description_level_1',
       'cell_type_ontology_term_label_level_1',
       'cell_type_ontology_term_id_level_1',
       'author_cell_type_markers_level_1', 'sample_ID', 'donor_id',
       'protocol_URL', 'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enric

In [58]:
# make checks for adata before saving raw
adata.obs = adata.obs.astype('category')

# Make specific columns numeric
adata.obs['cell_viability_percentage'] = pd.to_numeric(adata.obs['cell_viability_percentage'], errors='coerce')
adata.obs['cell_number_loaded'] = pd.to_numeric(adata.obs['cell_number_loaded'], errors='coerce')
adata.obs['sample_collection_year'] = pd.to_numeric(adata.obs['sample_collection_year'], errors='coerce')

numeric_cols = ['nCount_RNA', 'nFeature_RNA']
for col in numeric_cols:
    adata.obs[col] = pd.to_numeric(adata.obs[col], errors='coerce')

# Make df indices str
adata.obs.index = adata.obs.index.astype('str')
adata.var.index = adata.var.index.astype('str')
adata.var_names = adata.var_names.astype('str')

adata.var = adata.var.astype(str)

In [41]:
adata.uns.keys()

odict_keys(['title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments', 'author_cell_type_markers_level_0', 'author_cell_type_markers_level_1'])

In [59]:
adata.uns['author_cell_type_markers_level_0'] = adata.uns['author_cell_type_markers_level_0'].astype(str)
adata.uns['author_cell_type_markers_level_1'] = adata.uns['author_cell_type_markers_level_1'].astype(str)

In [60]:
adata.X = adata.X.astype(np.int64)

In [61]:
adata.raw = adata

In [35]:
adata.X

<348012x29800 sparse matrix of type '<class 'numpy.int64'>'
	with 580867730 stored elements in Compressed Sparse Row format>

In [35]:
adata.raw.X

<348012x29800 sparse matrix of type '<class 'numpy.int64'>'
	with 580867730 stored elements in Compressed Sparse Row format>

In [None]:
adata.X.toarray().max()

: 

: 

In [None]:
adata.raw.X.toarray().max()

5322

In [52]:
adata.var_names

Index(['A1BG', 'A1BG-AS1', 'A1CF', 'A2M', 'A2M-AS1', 'A2ML1', 'A2ML1-AS1',
       'A3GALT2', 'A4GALT', 'A4GNT',
       ...
       'ZW10', 'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B',
       'ZYX', 'ZZEF1'],
      dtype='object', name='index', length=29800)

In [53]:
adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,A1BG,ENSG00000121410
A1BG-AS1,A1BG-AS1,ENSG00000268895
A1CF,A1CF,ENSG00000148584
A2M,A2M,ENSG00000175899
A2M-AS1,A2M-AS1,ENSG00000245105
...,...,...
ZXDC,ZXDC,ENSG00000070476
ZYG11A,ZYG11A,ENSG00000203995
ZYG11B,ZYG11B,ENSG00000162378
ZYX,ZYX,ENSG00000159840


## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object

# Data Submission Status

- CHECK: Data in X
- CHECJ: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs and gene symbols in var


Revision Status:
DONE

In [62]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))