In [185]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [186]:
DATASET_ID = "Morrisey_publ"
DATA_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [187]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'
AUTHOR_CELL_TYPE_L2 = 'author_cell_type_level_2'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [188]:
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [189]:
file_names = os.listdir(DATA_PATH)
file_names_csv = [f for f in file_names if f.endswith(".csv")]

for file in file_names_csv:
    print(f"Processing {file}")
    df = pd.read_csv(join(DATA_PATH, file))
    print(df.columns)
    print(df.shape)
    print("-----------------------")

Processing EEM-scRNA-R42-2_meta.csv
Index(['cellbarcode', 'orig.ident', 'nCount_RNA', 'nFeature_RNA',
       'scrublet_score', 'scrublet_call', 'ident', 'cxds_score', 'cxds_call',
       'bcds_score', 'bcds_call', 'hybrid_score', 'hybrid_call',
       'percent.mito', 'S.Score', 'G2M.Score', 'Phase', 'nCount_SCT',
       'nFeature_SCT', 'SCT_snn_res.0.6', 'seurat_clusters', 'orig_cluster',
       'predicted.lineage_level1.score', 'predicted.lineage_level1',
       'predicted.lineage_level2.score', 'predicted.lineage_level2',
       'predicted.celltype_level1.score', 'predicted.celltype_level1',
       'predicted.celltype_level2.score', 'predicted.celltype_level2',
       'predicted.celltype_level3.score', 'predicted.celltype_level3'],
      dtype='object')
(11184, 32)
-----------------------
Processing EEM-scRNA-021_meta.csv
Index(['cellbarcode', 'orig.ident', 'nCount_RNA', 'nFeature_RNA',
       'scrublet_score', 'scrublet_call', 'ident', 'cxds_score', 'cxds_call',
       'bcds_score',

In [190]:
file_names = os.listdir(DATA_PATH)
file_names = [f for f in file_names if not f.endswith(".csv")]

adatas = {}
for file_name in file_names:
        adata = sc.read_10x_mtx(join(DATA_PATH, file_name))
        metadata = pd.read_csv(join(DATA_PATH, f"{file_name}_meta.csv"))
        if len(np.intersect1d(adata.obs.index, metadata.cellbarcode)) == 0:
            metadata['cellbarcode'] = metadata['cellbarcode'].apply(lambda x: x + "-1")
        print(f"Processing {file_name}")
        print(adata)
        print(np.intersect1d(adata.obs.index, metadata.cellbarcode).shape)
        print("-----------------------")
        adata.obs = pd.merge(
            adata.obs, metadata,
            left_index=True,
            right_on='cellbarcode',
            how='left').set_index('cellbarcode')
        adata.obs.index.name = None
        adatas[file_name] = adata

#adatas

Processing EEM-scRNA-R37-2
AnnData object with n_obs × n_vars = 10343 × 33538
    var: 'gene_ids', 'feature_types'
(10338,)
-----------------------
Processing EEM-scRNA-021
AnnData object with n_obs × n_vars = 14627 × 36601
    var: 'gene_ids', 'feature_types'
(14627,)
-----------------------
Processing EEM-scRNA-R37-1
AnnData object with n_obs × n_vars = 10591 × 33538
    var: 'gene_ids', 'feature_types'
(10585,)
-----------------------
Processing EEM-scRNA-R42-2
AnnData object with n_obs × n_vars = 11169 × 33538
    var: 'gene_ids', 'feature_types'
(11153,)
-----------------------
Processing EEM-scRNA-R42-1
AnnData object with n_obs × n_vars = 11169 × 33538
    var: 'gene_ids', 'feature_types'
(11155,)
-----------------------
Processing EEM-scRNA-R43-2
AnnData object with n_obs × n_vars = 12809 × 33538
    var: 'gene_ids', 'feature_types'
(11727,)
-----------------------
Processing EEM-scRNA-R43-1
AnnData object with n_obs × n_vars = 9722 × 33538
    var: 'gene_ids', 'feature_types'


In [191]:
gene_symbols = []
ensembl_ids = []

for adata in adatas.values():
    gene_symbols.append(adata.var.index)
    ensembl_ids.append(adata.var['gene_ids'])

# create mapping dict from gene symbol to ensembl id
gene_symbol_to_ensembl_id = {}

for gene_symbol, ensembl_id in zip(gene_symbols, ensembl_ids):
    for gs, ei in zip(gene_symbol, ensembl_id):
        gene_symbol_to_ensembl_id[gs] = ei

gene_symbol_to_ensembl_id

{'MIR1302-2HG': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'AL627309.1': 'ENSG00000238009',
 'AL627309.3': 'ENSG00000239945',
 'AL627309.2': 'ENSG00000239906',
 'AL627309.4': 'ENSG00000241599',
 'AL732372.1': 'ENSG00000236601',
 'OR4F29': 'ENSG00000284733',
 'AC114498.1': 'ENSG00000235146',
 'OR4F16': 'ENSG00000284662',
 'AL669831.2': 'ENSG00000229905',
 'AL669831.5': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'AL645608.7': 'ENSG00000272438',
 'AL645608.3': 'ENSG00000230699',
 'AL645608.5': 'ENSG00000241180',
 'AL645608.1': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'PERM1': 'ENSG00000187642',
 'AL645608.8': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AL645608.2': 'ENSG00000224969',
 'AGRN': 'ENSG00000188157',
 'AL645608.9': 'ENSG00000273443',
 'RNF223

In [192]:
adata = ad.concat(adatas, join="outer", label="library_ID", index_unique="_")

In [193]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Human distal airways contain a multipotent sec...,"Edward,E,Morrisey",,UMAP,published,


In [194]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,copd1_periph,COPD1,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-R37-1,GSM5133588,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
1,copd1_periph,COPD1,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-R37-2,GSM5133589,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
2,copd2_periph,COPD2,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-R42-1,GSM5133590,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
3,copd2_periph,COPD2,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-R42-2,GSM5133591,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
4,copd3_periph,COPD3,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-R43-1,GSM5133592,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
5,copd3_periph,COPD3,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-R43-2,GSM5133593,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
6,copd4_periph,COPD4,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-021,GSM5133594,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,MONDO:0005002,unknown,HsapDv:0000241


# Validate obs and uns from adata

In [195]:
adata_ids = adata.obs['library_ID'].value_counts().index.tolist()
obs_ids = obs['library_ID'].value_counts().index.tolist()

non_overlap = list(set(adata_ids) - set(obs_ids))
non_overlap_other_side = list(set(obs_ids) - set(adata_ids))

In [196]:
non_overlap

[]

In [197]:
non_overlap_other_side

[]

In [198]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 80430 × 38224
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'scrublet_score', 'scrublet_call', 'ident', 'cxds_score', 'cxds_call', 'bcds_score', 'bcds_call', 'hybrid_score', 'hybrid_call', 'percent.mito', 'S.Score', 'G2M.Score', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.6', 'seurat_clusters', 'orig_cluster', 'predicted.lineage_level1.score', 'predicted.lineage_level1', 'predicted.lineage_level2.score', 'predicted.lineage_level2', 'predicted.celltype_level1.score', 'predicted.celltype_level1', 'predicted.celltype_level2.score', 'predicted.celltype_level2', 'predicted.celltype_level3.score', 'predicted.celltype_level3', 'library_ID'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'

In [199]:
# Merge obs and uns
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'library_ID',
    df_col = 'library_ID',
    skip = None,
)

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,scrublet_score,scrublet_call,ident,cxds_score,cxds_call,bcds_score,bcds_call,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
AAACCTGAGAGCTGGT-1_EEM-scRNA-R37-2,EEM-scRNA-R37-2,1549.0,706.0,0.085013,False,EEM-scRNA-R37-2,66165.987685,False,0.009831,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
AAACCTGAGAGGACGG-1_EEM-scRNA-R37-2,EEM-scRNA-R37-2,5117.0,1514.0,0.049505,False,EEM-scRNA-R37-2,94836.254494,False,0.007264,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
AAACCTGAGCCCTAAT-1_EEM-scRNA-R37-2,EEM-scRNA-R37-2,5568.0,1558.0,0.040581,False,EEM-scRNA-R37-2,120219.071339,True,0.038610,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
AAACCTGAGCGGCTTC-1_EEM-scRNA-R37-2,EEM-scRNA-R37-2,1954.0,997.0,0.055944,False,EEM-scRNA-R37-2,91977.388982,False,0.045413,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
AAACCTGAGGTTCCTA-1_EEM-scRNA-R37-2,EEM-scRNA-R37-2,808.0,442.0,0.073210,False,EEM-scRNA-R37-2,44686.899631,False,0.014858,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACATACTG-1_EEM-scRNA-R43-1,EEM-scRNA-R43-1,14007.0,4487.0,0.224417,False,EEM-scRNA-R43-1,39772.229950,True,0.013615,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
TTTGTTGCACTCCGGA-1_EEM-scRNA-R43-1,EEM-scRNA-R43-1,668.0,486.0,0.135468,False,EEM-scRNA-R43-1,10336.037901,False,0.003144,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
TTTGTTGGTAATGTGA-1_EEM-scRNA-R43-1,EEM-scRNA-R43-1,803.0,186.0,0.038084,False,EEM-scRNA-R43-1,4941.719629,False,0.001179,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241
TTTGTTGGTCTGGTTA-1_EEM-scRNA-R43-1,EEM-scRNA-R43-1,742.0,505.0,0.037082,False,EEM-scRNA-R43-1,6453.623600,False,0.000816,False,...,3 prime tag,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,CellRanger 3.0.0,no,MONDO:0005002,unknown,HsapDv:0000241


In [200]:
adata

AnnData object with n_obs × n_vars = 80430 × 38224
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'scrublet_score', 'scrublet_call', 'ident', 'cxds_score', 'cxds_call', 'bcds_score', 'bcds_call', 'hybrid_score', 'hybrid_call', 'percent.mito', 'S.Score', 'G2M.Score', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.6', 'seurat_clusters', 'orig_cluster', 'predicted.lineage_level1.score', 'predicted.lineage_level1', 'predicted.lineage_level2.score', 'predicted.lineage_level2', 'predicted.celltype_level1.score', 'predicted.celltype_level1', 'predicted.celltype_level2.score', 'predicted.celltype_level2', 'predicted.celltype_level3.score', 'predicted.celltype_level3', 'library_ID', 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sam

# Add author cell type markers to UNS

Lungmap CellRef was used for reference mapping

In [201]:
adata.obs[AUTHOR_CELL_TYPE_L0] = adata.obs['predicted.celltype_level1']
adata.obs[AUTHOR_CELL_TYPE_L1] = adata.obs['predicted.celltype_level2']
adata.obs[AUTHOR_CELL_TYPE_L2] = adata.obs['predicted.celltype_level3']

In [202]:
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['predicted.celltype_level3']

In [203]:
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False).index.tolist()

['AT2',
 'AM',
 'CAP1',
 'AT1',
 'CD4_T',
 'AF1',
 'Plasma',
 'VSMC',
 'CAP2',
 'VEC',
 'Pericyte',
 'LEC',
 'B',
 'Serous',
 'RAS',
 'AEC',
 'SCMF',
 'Ciliated',
 'SVEC',
 'NK',
 'Secretory',
 'AF2',
 nan,
 'CD8_T',
 'IM',
 'Mast/Basophil',
 'ASMC',
 'Mesothelial',
 'Basal',
 'Suprabasal',
 'iMON',
 'Neutrophil',
 'cDC2',
 'PNEC',
 'pDC',
 'pMON',
 'Goblet',
 'cDC1',
 'maDC',
 'Tuft',
 'Treg',
 'Ionocyte',
 'Megakaryocyte/Platelet',
 'SMG_Basal/Duct']

In [204]:
ontology_marker_genes_df = pd.read_csv("../lungMAP/lungMAP_ontologies_marker_genes.csv")
ontology_marker_genes_df

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,Comment,full_name,abbreviation,author_cell_type_markers
0,Alveolar fibroblast 2 (AF2),CL:4028006,,Alveolar fibroblast 2,AF2,MFAP5; SCARA5; CDON; DCN; PLA2G2A; SFRP2; LUM;...
1,Alveolar fibroblast 1 (AF1),CL:4028004,,Alveolar fibroblast 1,AF1,TCF21; PCDH15; WNT2; ROBO2; LUM; SLIT2; DCN; P...
2,Alveolar macrophage (AM),CL:0000583,,Alveolar macrophage,AM,FABP4; MARCO; CYP27A1; PPARG; ABCG1; SIGLEC1; ...
3,Alveolar type 1 cell (AT1),CL:0002062,,Alveolar type 1 cell,AT1,AGER; RTKN2; SEMA3B; NTM; NCKAP5; LMO7; KHDRBS...
4,Alveolar type 2 cell (AT2),CL:0002063,,Alveolar type 2 cell,AT2,ABCA3; LAMP3; KCNJ15; SFTPC; SFTPA1; SFTPA2; S...
5,B cell (B),CL:0000236,,B cell,B,BANK1; MS4A1; CD19; BACH2; IGHM; EBF1; CD79A; ...
6,Basal cell (Basal),CL:0002633,,Basal cell,Basal,KRT5; TP63; NGFR; KRT15; KRT17; MMP10; S100A2;...
7,CD4+ T cell (CD4 T),CL:0000624,,CD4+ T cell,CD4 T,CD3E; LEF1; CD40LG; MAL; CD4; CD69; ITK; LTB; ...
8,CD8+ T cell (CD8 T),CL:0000625,,CD8+ T cell,CD8 T,CD8A; CD3E; CD8B; CCL5; NKG7; GZMA; GZMH; IL32...
9,Inflammatory monocyte (iMON),CL:0000860,,Inflammatory monocyte,iMON,VCAN; FCN1; CD14; S100A8; JARID2; S100A12; THB...


In [205]:
len(adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False).index.tolist())

44

In [206]:
len(np.intersect1d(adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=True).index.tolist(), ontology_marker_genes_df['abbreviation']))

40

In [207]:
# show non overlapping cell types

non_overlap = list(set(adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False).index.tolist()) - set(ontology_marker_genes_df['abbreviation']))
non_overlap

['SMG_Basal/Duct', nan, 'CD8_T', 'CD4_T']

In [208]:
# replace in ontology_marker_genes_df['abbreviation'] every ' ' with a '_'
ontology_marker_genes_df['abbreviation'] = ontology_marker_genes_df['abbreviation'].apply(lambda x: x.replace(' ', '_'))

list(set(adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False).index.tolist()) - set(ontology_marker_genes_df['abbreviation']))

[nan]

In [209]:
ontology_marker_genes_df.drop(columns=AUTHOR_CELL_TYPE, inplace=True)
ontology_marker_genes_df.rename(columns={'abbreviation': AUTHOR_CELL_TYPE}, inplace=True)

ontology_marker_genes_df

Unnamed: 0,cell_type_ontology_term_id,Comment,full_name,author_cell_type,author_cell_type_markers
0,CL:4028006,,Alveolar fibroblast 2,AF2,MFAP5; SCARA5; CDON; DCN; PLA2G2A; SFRP2; LUM;...
1,CL:4028004,,Alveolar fibroblast 1,AF1,TCF21; PCDH15; WNT2; ROBO2; LUM; SLIT2; DCN; P...
2,CL:0000583,,Alveolar macrophage,AM,FABP4; MARCO; CYP27A1; PPARG; ABCG1; SIGLEC1; ...
3,CL:0002062,,Alveolar type 1 cell,AT1,AGER; RTKN2; SEMA3B; NTM; NCKAP5; LMO7; KHDRBS...
4,CL:0002063,,Alveolar type 2 cell,AT2,ABCA3; LAMP3; KCNJ15; SFTPC; SFTPA1; SFTPA2; S...
5,CL:0000236,,B cell,B,BANK1; MS4A1; CD19; BACH2; IGHM; EBF1; CD79A; ...
6,CL:0002633,,Basal cell,Basal,KRT5; TP63; NGFR; KRT15; KRT17; MMP10; S100A2;...
7,CL:0000624,,CD4+ T cell,CD4_T,CD3E; LEF1; CD40LG; MAL; CD4; CD69; ITK; LTB; ...
8,CL:0000625,,CD8+ T cell,CD8_T,CD8A; CD3E; CD8B; CCL5; NKG7; GZMA; GZMH; IL32...
9,CL:0000860,,Inflammatory monocyte,iMON,VCAN; FCN1; CD14; S100A8; JARID2; S100A12; THB...


In [210]:
adata.uns[MARKER_GENES] = ontology_marker_genes_df

# Check author cell type annotations and Cell Ontology IDs

In [211]:
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

AT2                       13700
AM                         8448
CAP1                       6457
AT1                        5658
CD4_T                      4060
AF1                        3909
Plasma                     3362
VSMC                       2857
CAP2                       2692
VEC                        2506
Pericyte                   2406
LEC                        2239
B                          2174
Serous                     2147
RAS                        2014
AEC                        1929
SCMF                       1793
Ciliated                   1785
SVEC                       1742
NK                         1557
Secretory                  1317
AF2                        1206
NaN                        1165
CD8_T                       741
IM                          513
Mast/Basophil               481
ASMC                        473
Mesothelial                 206
Basal                       184
Suprabasal                  159
iMON                        158
Neutroph

In [212]:
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs[AUTHOR_CELL_TYPE].map(dict(zip(ontology_marker_genes_df[AUTHOR_CELL_TYPE], ontology_marker_genes_df['cell_type_ontology_term_id'])))

adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063        13700
CL:0000583         8448
CL:4028002         6457
CL:0002062         5658
CL:0000624         4060
CL:4028004         3909
CL:0000786         3362
CL:0000359         2857
CL:4028003         2692
CL:0002543         2506
CL:0009089         2406
CL:0002138         2239
CL:0000236         2174
CL:4033005         2147
UBERON:0001955     2014
CL:1001568         1929
CL:0000186         1793
CL:4030034         1785
UBERON:0001592     1742
CL:0000623         1557
CL:0000158         1317
CL:4028006         1206
NaN                1165
CL:0000625          741
CL:4033043          513
CL:0000097          481
CL:0019019          473
CL:0000077          206
CL:0002633          184
CL:4033048          159
CL:0000860          158
CL:0000775           91
CL:0002399           84
CL:1000223           47
CL:0000784           40
CL:0000875           33
CL:0000160           30
CL:0000990           28
CL:4033045           18
CL:0002075            6
CL:0000815            5
CL:0005006      

In [213]:
adata.obs[MARKER_GENES] = adata.obs[AUTHOR_CELL_TYPE].map(dict(zip(ontology_marker_genes_df[AUTHOR_CELL_TYPE], ontology_marker_genes_df['author_cell_type_markers'])))

adata.obs[MARKER_GENES].value_counts(dropna=False)

ABCA3; LAMP3; KCNJ15; SFTPC; SFTPA1; SFTPA2; SFTPB; PGC; NAPSA; SFTPD            13700
FABP4; MARCO; CYP27A1; PPARG; ABCG1; SIGLEC1; C1QB; APOC1; C1QA; C1QC             8448
FCN3; IL7R; GPIHBP1; TMEM100; SLC6A4; IL1RL1; EDN1; BTNL9; CLEC14A; POSTN         6457
AGER; RTKN2; SEMA3B; NTM; NCKAP5; LMO7; KHDRBS2; LIMCH1; LAMA3; SCEL              5658
CD3E; LEF1; CD40LG; MAL; CD4; CD69; ITK; LTB; TRAC; RHOH                          4060
TCF21; PCDH15; WNT2; ROBO2; LUM; SLIT2; DCN; PRKG1; SCN7A; MFAP4                  3909
CD38; CD27; SDC1; JCHAIN; IGLC3; IGHM; IGHG1; IGHG3; IGHA2; MZB1                  3362
CNN1; NTRK3; ITGA7; TAGLN; MUSTN1; C11orf96; ACTA2; MYH11; TPM2; RERGL            2857
HPGD; EDNRB; CA4; APLN; SOSTDC1; CLDN5; EMCN; RAMP2; IL1RL1; AQP1                 2692
ACKR1; EPHB4; HDAC9; VWF; CCL23; C7; LIFR; PTPRB; SLCO2A1; RAMP3                  2506
PDGFRB; LAMC3; CSPG4; TRPC6; COX4I2; HIGD1B; BGN; NDUFA4L2; PTN; EGFL6            2406
CCL21; MMRN1; PROX1; LYVE1; PPFIBP1; GNG11;

# Check whether ENSEMBL IDs in var

In [214]:
adata.var

A1BG
A1BG-AS1
A1CF
A2M
A2M-AS1
...
ZYG11B
ZYX
ZZEF1
hsa-mir-1253
hsa-mir-423


In [215]:
adata.var['gene_symbol'] = adata.var.index
adata.var.index.name = 'index'
adata.var['ensembl_id'] = adata.var['gene_symbol'].map(gene_symbol_to_ensembl_id)

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
A1BG,A1BG,ENSG00000121410
A1BG-AS1,A1BG-AS1,ENSG00000268895
A1CF,A1CF,ENSG00000148584
A2M,A2M,ENSG00000175899
A2M-AS1,A2M-AS1,ENSG00000245105
...,...,...
ZYG11B,ZYG11B,ENSG00000162378
ZYX,ZYX,ENSG00000159840
ZZEF1,ZZEF1,ENSG00000074755
hsa-mir-1253,hsa-mir-1253,ENSG00000272920


# Check raw data

In [225]:
import pandas as pd

# Make specific columns numeric
adata.obs['cell_viability_percentage'] = pd.to_numeric(
    adata.obs['cell_viability_percentage'], errors='coerce'
)
adata.obs['cell_number_loaded'] = pd.to_numeric(
    adata.obs['cell_number_loaded'], errors='coerce'
)
adata.obs['sample_collection_year'] = pd.to_numeric(
    adata.obs['sample_collection_year'], errors='coerce'
)

numeric_cols = [
    'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'scrublet_score',
    'cxds_score', 'bcds_score', 'hybrid_score', 'S.Score', 'G2M.Score',
    'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.6', 'seurat_clusters',
    'orig_cluster', 'predicted.lineage_level1.score',
    'predicted.lineage_level2.score', 'predicted.celltype_level1.score',
    'predicted.celltype_level2.score', 'predicted.celltype_level3.score'
]
for col in numeric_cols:
    adata.obs[col] = pd.to_numeric(adata.obs[col], errors='coerce')

bool_cols = ['scrublet_call', 'cxds_call', 'bcds_call', 'hybrid_call']
for col in bool_cols:
    adata.obs[col] = adata.obs[col].astype('bool')

# Make df indices str
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)
adata.var_names = adata.var_names.astype(str)


In [232]:
adata.uns[MARKER_GENES] = adata.uns[MARKER_GENES].astype(str)

In [177]:
adata.X.toarray().max()

47908

In [233]:
adata.X = adata.X.astype(np.int64)

In [234]:
adata.raw = adata

In [180]:
adata.X

<80430x38224 sparse matrix of type '<class 'numpy.int64'>'
	with 146200515 stored elements in Compressed Sparse Row format>

In [181]:
adata.raw.X

<80430x38224 sparse matrix of type '<class 'numpy.int64'>'
	with 146200515 stored elements in Compressed Sparse Row format>

In [182]:
adata.X.toarray().max()

47908

In [183]:
adata.raw.X.toarray().max()

47908

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK
- OK

# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs in var

### Revision:

DONE

In [235]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))