In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# JSON file handed to get_gspread_df as its first argument
# (presumably the Google Sheets API credentials — confirm; keep it out of version control).
GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
# Dataset identifier: passed to get_gspread_df and used as the directory name
# and file stem of the input/output paths below.
DATASET_ID = "lungMAP"
# Input AnnData file for this dataset.
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"
# Excel workbook mapping CellRef cell type names to Cell Ontology IDs.
ONTOLOGIES_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/CellRef.ontologyID-mapping_20240508.xlsx"
# Excel workbook with the CellRef marker genes per cell type.
MARKERS_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/cellref_marker_genes.xlsx"
# Output directories; the final cell writes the same AnnData object to both.
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
# obs column names for the author annotation at three levels
# (level 0/1/2 are filled from celltype_level1/2/3 further down).
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'
AUTHOR_CELL_TYPE_L2 = 'author_cell_type_level_2'

# obs column names for Cell Ontology IDs per level.
# NOTE(review): only the level-2 name is used in the visible notebook.
CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'
CELL_TYPE_ONTOLOGY_ID_L2 = 'cell_type_ontology_term_id_level_2'

# uns keys for marker gene tables per level.
# NOTE(review): only the level-2 name is used in the visible notebook.
MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'
MARKER_GENES_L2 = 'author_cell_type_markers_level_2'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
MARKER_GENES = 'author_cell_type_markers'

# Load data

In [4]:
# Fetch the "tier_1" obs and uns tables for this dataset (obs first, then uns),
# then read the AnnData object from disk.
obs, uns = (
    get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", table_name)
    for table_name in ("obs", "uns")
)
adata = sc.read_h5ad(H5AD_PATH)

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
# Run the validation workflow on the uns table and show what it returns.
# NOTE(review): the merge cells further down pass the unvalidated `uns`,
# not `validated_uns` — confirm that this is intended.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()

validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Cells of human bronchus submucosal gland and l...,"Whitsett, Xu, Morrisey",DataID,UMAP,protected under embargo,


In [6]:
# Run the validation workflow on the obs table and show what it returns.
# NOTE(review): the merge cells further down pass the unvalidated `obs`,
# not `validated_obs` — confirm that this is intended.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()

validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,CCHMC_SMG_Donor27,CCHMC_SMG_Donor27,In method section of CellRef (PMID: 37516747),CCHMC,CCHMC,,CCHMC_SMG_Donor27,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000239
1,CCHMC_SMG_Donor28,CCHMC_SMG_Donor28,In method section of CellRef (PMID: 37516747),CCHMC,CCHMC,,CCHMC_SMG_Donor28,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000237
2,CCHMC_SMG_Donor29,CCHMC_SMG_Donor29,In method section of CellRef (PMID: 37516747),CCHMC,CCHMC,,CCHMC_SMG_Donor29,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000264
3,CCHMC_SMG_Donor33,CCHMC_SMG_Donor33,In method section of CellRef (PMID: 37516747),CCHMC,CCHMC,,CCHMC_SMG_Donor33,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000240
4,CCHMC_SMG_Donor38,CCHMC_SMG_Donor38,In method section of CellRef (PMID: 37516747),CCHMC,CCHMC,,CCHMC_SMG_Donor38,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000238
5,EEM-scRNA-005,UPENN0424,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-005,GSM5133604,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000237
6,EEM-scRNA-006,UPENN0070,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-006,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000237
7,EEM-scRNA-022,UPENN0056,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-022,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000268
8,EEM-scRNA-026,UPENN0004,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-026,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000264
9,EEM-scRNA-062,UPENN0457,dx.doi.org/10.17504/protocols.io.b54gq8tw,University of Pennsylvania,University of Pennsylvania,,EEM-scRNA-062,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000240


# Validate obs and uns from adata

In [7]:
# Show the summary of the AnnData object read from H5AD_PATH (shape and annotation keys).
adata

AnnData object with n_obs × n_vars = 347970 × 32284
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'S.Score', 'G2M.Score', 'Phase', 'pMT', 'DataID', 'DonorID', 'Dataset', 'Age', 'Sex', 'lineage_level1', 'lineage_level2', 'celltype_level1', 'celltype_level2', 'celltype_level3', 'celltype_level3_fullname'
    var: '_index', 'features'
    obsm: 'X_pca', 'X_umap'

In [8]:
# When cross-checking lungMAP CellRef and HLCA v1, all were overlapping except for the following:
# Melms et al. 2021: A molecular single-cell lung atlas of lethal COVID-19
# https://www.nature.com/articles/s41586-021-03569-1

In [9]:
# Keep only the cells whose 'Dataset' label is one of the unpublished
# lungMAP datasets; .copy() turns the resulting view into a standalone object.
unpublished_datasets = ['UPenn_LungMAP', 'CCHMC_LungMAP']
is_unpublished = adata.obs['Dataset'].isin(unpublished_datasets)
adata = adata[is_unpublished].copy()

In [10]:
# Compare the sample identifiers in adata.obs['DataID'] with those in the
# Tier 1 obs table ('sample_ID'); these two columns are the merge keys below.
adata_ids = adata.obs['DataID'].value_counts().index.tolist()
obs_ids = obs['sample_ID'].value_counts().index.tolist()

# IDs that occur in adata but not in the obs table
# (presumably such cells would end up without Tier 1 metadata after the merge — verify).
non_overlap = list(set(adata_ids) - set(obs_ids))
# IDs that occur in the obs table but not in adata.
non_overlap_other_side = list(set(obs_ids) - set(adata_ids))

# Display both lists so a mismatch is visible before merging instead of
# being computed and silently discarded.
{'in adata only': non_overlap, 'in obs table only': non_overlap_other_side}

In [11]:
# Add the Tier 1 uns table to adata via AnnDataMerger, then show the
# updated object summary.
merger = AnnDataMerger(adata=adata, uns_df=uns)
adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 122445 × 32284
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'S.Score', 'G2M.Score', 'Phase', 'pMT', 'DataID', 'DonorID', 'Dataset', 'Age', 'Sex', 'lineage_level1', 'lineage_level2', 'celltype_level1', 'celltype_level2', 'celltype_level3', 'celltype_level3_fullname'
    var: '_index', 'features'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'
    obsm: 'X_pca', 'X_umap'

In [12]:
# Merge the Tier 1 obs table into adata.obs, matching adata's 'DataID'
# column against the table's 'sample_ID' column.
merger = AnnDataMerger(adata=adata, obs_df=obs)
adata = merger.add_obs_metadata(adata_col='DataID', df_col='sample_ID', skip=None)

# Make the sample identifier available under the 'sample_ID' name as well.
adata.obs['sample_ID'] = adata.obs['DataID']

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_SCT,nFeature_SCT,S.Score,G2M.Score,Phase,pMT,DataID,...,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID
Donor27_Donor27_AAACGAACACCGTGCA-1,CCHMC_SMG_Donor27,78067.0,5526,18752.0,2854,-0.034438,-0.037647,G1,3.942845,CCHMC_SMG_Donor27,...,EFO:0008565,true,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27
Donor27_Donor27_AAAGAACAGCGCCATC-1,CCHMC_SMG_Donor27,46370.0,4765,18071.0,3371,0.000111,-0.053899,S,6.671827,CCHMC_SMG_Donor27,...,EFO:0008565,true,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27
Donor27_Donor27_AAAGAACCAGGCAATG-1,CCHMC_SMG_Donor27,35041.0,4904,18520.0,4414,0.006502,-0.042120,S,6.043456,CCHMC_SMG_Donor27,...,EFO:0008565,true,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27
Donor27_Donor27_AAAGGATAGTAGCTCT-1,CCHMC_SMG_Donor27,6221.0,2345,11826.0,2439,-0.018128,-0.012995,G1,5.891771,CCHMC_SMG_Donor27,...,EFO:0008565,true,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27
Donor27_Donor27_AAAGTCCCATCCGGCA-1,CCHMC_SMG_Donor27,3455.0,1415,11196.0,1945,-0.052171,0.041244,G2M,8.294931,CCHMC_SMG_Donor27,...,EFO:0008565,true,GRCh38,GENCODE v32/Ensembl98,cell ranger 4.0.0,no,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EEM-scRNA-079_TTTGGTTTCAGACCTA,AICU191-1,2683.0,1224,10964.0,2190,-0.029333,-0.050301,G1,10.266864,EEM-scRNA-079,...,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079
EEM-scRNA-079_TTTGTTGAGGTAGGCT,AICU191-1,5823.0,1916,12200.0,2186,-0.060900,0.021290,G2M,0.290499,EEM-scRNA-079,...,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079
EEM-scRNA-079_TTTGTTGAGTTAACAG,AICU191-1,15731.0,3454,13043.0,3451,-0.049283,-0.004591,G1,4.864282,EEM-scRNA-079,...,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079
EEM-scRNA-079_TTTGTTGCACACGCCA,AICU191-1,3861.0,2051,10594.0,2663,-0.049358,-0.021161,G1,6.606452,EEM-scRNA-079,...,EFO:0008565,true,GRCh38,GENCODE v44/Ensembl110,STARsolo 2.7.9a_EmptyDrops_CR,no,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079


# Add author cell type markers to UNS

In [13]:
# Read the cell type -> Cell Ontology mapping sheet from the CellRef workbook.
ontologies = pd.read_excel(
    io=ONTOLOGIES_PATH,
    sheet_name='cellref celltype ontology ID',
)
ontologies

Unnamed: 0,LungMAP Human Lung CellRef,Cell Ontology,NOTE,Unnamed: 3,the mapping is updated
0,Alveolar fibroblast 2 (AF2),CL:4028006,,,
1,Alveolar fibroblast 1 (AF1),CL:4028004,,,
2,Alveolar macrophage (AM),CL:0000583,,,
3,Alveolar type 1 cell (AT1),CL:0002062,,,
4,Alveolar type 2 cell (AT2),CL:0002063,,,
5,B cell (B),CL:0000236,,,
6,Basal cell (Basal),CL:0002633,,,
7,CD4+ T cell (CD4 T),CL:0000624,,,
8,CD8+ T cell (CD8 T),CL:0000625,,,
9,Inflammatory monocyte (iMON),CL:0000860,,,


In [14]:
# Tidy the ontology mapping table.
# All patterns below are literal strings, so regex=False is passed explicitly
# instead of relying on the pandas-version-dependent default.

# Replace every '_' in the ontology IDs with ':'.
ontologies['Cell Ontology'] = ontologies['Cell Ontology'].str.replace('_', ':', regex=False)

# replace 'CL:0000097/CL:0000767' in 'Cell Ontology' with 'CL:0000097'
ontologies['Cell Ontology'] = ontologies['Cell Ontology'].str.replace('/CL:0000767', '', regex=False)

# Replace the free-text remark in 'NOTE' with a statement that the mapping
# is ambiguous between CL:0000097 and CL:0000767.
ontologies['NOTE'] = ontologies['NOTE'].str.replace(
    'if only a term is allowed, I think CL:0000097 is a better match',
    'Not unambiguous, could be either CL:0000097 or CL:0000767',
    regex=False,
)

# Drop the two columns that are not needed, then rename the remaining three by position.
ontologies = ontologies.drop(columns=['Unnamed: 3', 'the mapping is updated'])
ontologies.columns = [AUTHOR_CELL_TYPE, CELL_TYPE_ONTOLOGY_ID, 'Comment']

# Split 'Full name (ABBR)' into its two parts; rows that do not match the
# pattern yield NaN in both new columns.
ontologies[['full_name', 'abbreviation']] = ontologies[AUTHOR_CELL_TYPE].str.extract(r'^(.*) \((.*)\)$')

# For rows without a parenthesised abbreviation, fall back to the full label.
ontologies['abbreviation'] = ontologies['abbreviation'].fillna(ontologies[AUTHOR_CELL_TYPE])
ontologies['full_name'] = ontologies['full_name'].fillna(ontologies[AUTHOR_CELL_TYPE])

ontologies

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,Comment,full_name,abbreviation
0,Alveolar fibroblast 2 (AF2),CL:4028006,,Alveolar fibroblast 2,AF2
1,Alveolar fibroblast 1 (AF1),CL:4028004,,Alveolar fibroblast 1,AF1
2,Alveolar macrophage (AM),CL:0000583,,Alveolar macrophage,AM
3,Alveolar type 1 cell (AT1),CL:0002062,,Alveolar type 1 cell,AT1
4,Alveolar type 2 cell (AT2),CL:0002063,,Alveolar type 2 cell,AT2
5,B cell (B),CL:0000236,,B cell,B
6,Basal cell (Basal),CL:0002633,,Basal cell,Basal
7,CD4+ T cell (CD4 T),CL:0000624,,CD4+ T cell,CD4 T
8,CD8+ T cell (CD8 T),CL:0000625,,CD8+ T cell,CD8 T
9,Inflammatory monocyte (iMON),CL:0000860,,Inflammatory monocyte,iMON


In [15]:
# Read the marker gene sheet (the first two rows are skipped) and collapse
# it to one row per cell type, with that type's genes joined by '; '.
cell_markers = pd.read_excel(MARKERS_PATH, sheet_name='Data S5', skiprows=2)

grouped_data = (
    cell_markers
    .groupby('Cell type')['Genes']
    .agg('; '.join)
    .reset_index()
    .rename(columns={'Genes': 'Marker genes'})
)

grouped_data

Unnamed: 0,Cell type,Marker genes
0,AEC,DKK2; GJA5; BMX; IGFBP3; FAM107A; MT1A; PTPRB;...
1,AF1,TCF21; PCDH15; WNT2; ROBO2; LUM; SLIT2; DCN; P...
2,AF2,MFAP5; SCARA5; CDON; DCN; PLA2G2A; SFRP2; LUM;...
3,AM,FABP4; MARCO; CYP27A1; PPARG; ABCG1; SIGLEC1; ...
4,ASMC,DES; LGR6; BCHE; PLN; ASPN; PRUNE2; HSPB7; GRE...
5,AT1,AGER; RTKN2; SEMA3B; NTM; NCKAP5; LMO7; KHDRBS...
6,AT2,ABCA3; LAMP3; KCNJ15; SFTPC; SFTPA1; SFTPA2; S...
7,B,BANK1; MS4A1; CD19; BACH2; IGHM; EBF1; CD79A; ...
8,Basal,KRT5; TP63; NGFR; KRT15; KRT17; MMP10; S100A2;...
9,CAP1,FCN3; IL7R; GPIHBP1; TMEM100; SLC6A4; IL1RL1; ...


In [16]:
# Look up each abbreviation's marker gene string; abbreviations without an
# entry in grouped_data get NaN.
markers_by_cell_type = grouped_data.set_index('Cell type')['Marker genes'].to_dict()
ontologies[MARKER_GENES] = ontologies['abbreviation'].map(markers_by_cell_type)

ontologies

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,Comment,full_name,abbreviation,author_cell_type_markers
0,Alveolar fibroblast 2 (AF2),CL:4028006,,Alveolar fibroblast 2,AF2,MFAP5; SCARA5; CDON; DCN; PLA2G2A; SFRP2; LUM;...
1,Alveolar fibroblast 1 (AF1),CL:4028004,,Alveolar fibroblast 1,AF1,TCF21; PCDH15; WNT2; ROBO2; LUM; SLIT2; DCN; P...
2,Alveolar macrophage (AM),CL:0000583,,Alveolar macrophage,AM,FABP4; MARCO; CYP27A1; PPARG; ABCG1; SIGLEC1; ...
3,Alveolar type 1 cell (AT1),CL:0002062,,Alveolar type 1 cell,AT1,AGER; RTKN2; SEMA3B; NTM; NCKAP5; LMO7; KHDRBS...
4,Alveolar type 2 cell (AT2),CL:0002063,,Alveolar type 2 cell,AT2,ABCA3; LAMP3; KCNJ15; SFTPC; SFTPA1; SFTPA2; S...
5,B cell (B),CL:0000236,,B cell,B,BANK1; MS4A1; CD19; BACH2; IGHM; EBF1; CD79A; ...
6,Basal cell (Basal),CL:0002633,,Basal cell,Basal,KRT5; TP63; NGFR; KRT15; KRT17; MMP10; S100A2;...
7,CD4+ T cell (CD4 T),CL:0000624,,CD4+ T cell,CD4 T,CD3E; LEF1; CD40LG; MAL; CD4; CD69; ITK; LTB; ...
8,CD8+ T cell (CD8 T),CL:0000625,,CD8+ T cell,CD8 T,CD8A; CD3E; CD8B; CCL5; NKG7; GZMA; GZMH; IL32...
9,Inflammatory monocyte (iMON),CL:0000860,,Inflammatory monocyte,iMON,VCAN; FCN1; CD14; S100A8; JARID2; S100A12; THB...


In [None]:
# Export the ontology/marker table as CSV without the row index.
# The path is relative, so the file lands in the kernel's current working directory.
ontologies.to_csv("lungMAP_ontologies_marker_genes.csv", index = False)

In [18]:
# Store the ontology/marker table (all columns cast to categorical) in uns
# under both the generic key and the level-2 key; each key gets its own object.
marker_table = ontologies.astype('category')
adata.uns[MARKER_GENES] = marker_table
adata.uns[MARKER_GENES_L2] = marker_table.copy()

adata

AnnData object with n_obs × n_vars = 122445 × 32284
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT', 'nFeature_SCT', 'S.Score', 'G2M.Score', 'Phase', 'pMT', 'DataID', 'DonorID', 'Dataset', 'Age', 'Sex', 'lineage_level1', 'lineage_level2', 'celltype_level1', 'celltype_level2', 'celltype_level3', 'celltype_level3_fullname', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_d

# Check author cell type annotations and Cell Ontology IDs

In [19]:
# List the obs columns now available (original columns plus the merged Tier 1 metadata).
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_SCT',
       'nFeature_SCT', 'S.Score', 'G2M.Score', 'Phase', 'pMT', 'DataID',
       'DonorID', 'Dataset', 'Age', 'Sex', 'lineage_level1', 'lineage_level2',
       'celltype_level1', 'celltype_level2', 'celltype_level3',
       'celltype_level3_fullname', 'donor_id', 'protocol_URL', 'institute',
       'sample_collection_site', 'sample_collection_relative_time_point',
       'library_ID', 'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fra

In [20]:
# celltype_level3 is the finest annotation and is the one matched against
# the ontology table; show how many labels its value_counts() lists.
level3_counts = adata.obs['celltype_level3'].value_counts()
level3_counts.shape[0]

47

In [21]:
# Compare the celltype_level3 labels in adata with the 'abbreviation'
# column of the ontology table, in both directions.
adata_labels = set(adata.obs['celltype_level3'].value_counts().index.tolist())
ontology_labels = set(ontologies['abbreviation'].tolist())

# Labels in adata that have no row in the ontology table.
non_overlap = list(adata_labels - ontology_labels)
# Abbreviations in the ontology table that do not occur in adata.
non_overlap_other_side = list(ontology_labels - adata_labels)

In [22]:
# Abbreviations present in the ontology table but absent from
# adata.obs['celltype_level3']; the recorded output lists only 'Deuterosomal'.
non_overlap_other_side

['Deuterosomal']

In [23]:
# Level-specific author annotations: copy the three source columns into
# the standardised column names.
author_level_sources = {
    AUTHOR_CELL_TYPE_L0: 'celltype_level1',
    AUTHOR_CELL_TYPE_L1: 'celltype_level2',
    AUTHOR_CELL_TYPE_L2: 'celltype_level3',
}
for target_col, source_col in author_level_sources.items():
    adata.obs[target_col] = adata.obs[source_col]

# Abbreviation -> Cell Ontology ID lookup built from the ontology table;
# labels missing from the table map to NaN.
abbreviation_to_ontology_id = dict(zip(ontologies['abbreviation'], ontologies[CELL_TYPE_ONTOLOGY_ID]))
adata.obs[CELL_TYPE_ONTOLOGY_ID_L2] = adata.obs['celltype_level3'].map(abbreviation_to_ontology_id)

# Generic annotations: the finest level (celltype_level3) is reused.
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['celltype_level3']
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs['celltype_level3'].map(abbreviation_to_ontology_id)

# Check whether ENSEMBL IDs in var

In [24]:
# Inspect the var table; the recorded output shows only gene symbols ('_index', 'features').
adata.var

Unnamed: 0,_index,features
0,MIR1302-2HG,MIR1302-2HG
1,FAM138A,FAM138A
2,OR4F5,OR4F5
3,AL627309.1,AL627309.1
4,AL627309.3,AL627309.3
...,...,...
32279,AC141272.1,AC141272.1
32280,AC023491.2,AC023491.2
32281,AC007325.1,AC007325.1
32282,AC007325.4,AC007325.4


In [25]:
# Re-key the var table by gene symbol: the 'features' values become the
# index (named 'index'), '_index' is removed, and 'features' is renamed to
# 'gene_symbol'. All edits are made in place on adata.var.
gene_table = adata.var
gene_table.index = gene_table['features']
gene_table.index.name = 'index'
gene_table.drop(columns='_index', inplace=True)
gene_table.rename(columns={'features': 'gene_symbol'}, inplace=True)
gene_table

Unnamed: 0_level_0,gene_symbol
index,Unnamed: 1_level_1
MIR1302-2HG,MIR1302-2HG
FAM138A,FAM138A
OR4F5,OR4F5
AL627309.1,AL627309.1
AL627309.3,AL627309.3
...,...
AC141272.1,AC141272.1
AC023491.2,AC023491.2
AC007325.1,AC007325.1
AC007325.4,AC007325.4


In [27]:
# Read the tab-separated gene table (columns 'id' and 'symbol' in the
# recorded output) that is used to attach Ensembl IDs to the gene symbols.
ensembl_mapping_path = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/CellRef_genes.txt"
ensembl_mapping = pd.read_csv(ensembl_mapping_path, sep="\t", header=0)

ensembl_mapping

Unnamed: 0,id,symbol
0,ENSG00000243485,MIR1302-2HG
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,AL627309.1
4,ENSG00000239945,AL627309.3
...,...,...
32279,ENSG00000277836,AC141272.1
32280,ENSG00000278633,AC023491.2
32281,ENSG00000276017,AC007325.1
32282,ENSG00000278817,AC007325.4


In [28]:
# Attach an Ensembl ID to every gene symbol. If a symbol occurs more than
# once in the table, the last ID wins; symbols not in the table get NaN.
symbol_to_ensembl = dict(zip(ensembl_mapping['symbol'], ensembl_mapping['id']))
adata.var['ensembl_id'] = adata.var['gene_symbol'].map(symbol_to_ensembl)

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-2HG,MIR1302-2HG,ENSG00000243485
FAM138A,FAM138A,ENSG00000237613
OR4F5,OR4F5,ENSG00000186092
AL627309.1,AL627309.1,ENSG00000238009
AL627309.3,AL627309.3,ENSG00000239945
...,...,...
AC141272.1,AC141272.1,ENSG00000277836
AC023491.2,AC023491.2,ENSG00000278633
AC007325.1,AC007325.1,ENSG00000276017
AC007325.4,AC007325.4,ENSG00000278817


In [30]:
# Check the column dtypes of the var table after adding 'ensembl_id'.
adata.var.dtypes

gene_symbol    object
ensembl_id     object
dtype: object

# Check raw data

In [31]:
# Inspect the matrix stored in .raw (type, shape, dtype, number of stored elements).
adata.raw.X

<122445x32284 sparse matrix of type '<class 'numpy.float64'>'
	with 271059277 stored elements in Compressed Sparse Row format>

In [32]:
# Convert these obs columns to numeric dtype; errors='coerce' turns every
# entry that cannot be parsed as a number into NaN.
# NOTE(review): confirm that no meaningful non-numeric codes (e.g. in
# 'manner_of_death') are lost by the coercion.
numeric_obs_columns = (
    'cell_viability_percentage',
    'cell_number_loaded',
    'sample_collection_year',
    'manner_of_death',
)
for numeric_col in numeric_obs_columns:
    adata.obs[numeric_col] = pd.to_numeric(adata.obs[numeric_col], errors='coerce')

In [33]:
# Inspect obs after adding the annotation columns and the numeric conversion.
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_SCT,nFeature_SCT,S.Score,G2M.Score,Phase,pMT,DataID,...,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID,author_cell_type_level_0,author_cell_type_level_1,author_cell_type_level_2,cell_type_ontology_term_id_level_2,author_cell_type,cell_type_ontology_term_id
Donor27_Donor27_AAACGAACACCGTGCA-1,CCHMC_SMG_Donor27,78067.0,5526,18752.0,2854,-0.034438,-0.037647,G1,3.942845,CCHMC_SMG_Donor27,...,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27,Serous,Serous,Serous,CL:4033005,Serous,CL:4033005
Donor27_Donor27_AAAGAACAGCGCCATC-1,CCHMC_SMG_Donor27,46370.0,4765,18071.0,3371,0.000111,-0.053899,S,6.671827,CCHMC_SMG_Donor27,...,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27,Serous,Serous,Serous,CL:4033005,Serous,CL:4033005
Donor27_Donor27_AAAGAACCAGGCAATG-1,CCHMC_SMG_Donor27,35041.0,4904,18520.0,4414,0.006502,-0.042120,S,6.043456,CCHMC_SMG_Donor27,...,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27,Serous,Serous,Serous,CL:4033005,Serous,CL:4033005
Donor27_Donor27_AAAGGATAGTAGCTCT-1,CCHMC_SMG_Donor27,6221.0,2345,11826.0,2439,-0.018128,-0.012995,G1,5.891771,CCHMC_SMG_Donor27,...,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27,LEC,LEC,LEC,CL:0002138,LEC,CL:0002138
Donor27_Donor27_AAAGTCCCATCCGGCA-1,CCHMC_SMG_Donor27,3455.0,1415,11196.0,1945,-0.052171,0.041244,G2M,8.294931,CCHMC_SMG_Donor27,...,PATO:0000461,unknown,HsapDv:0000239,CCHMC_SMG_Donor27,T,CD8 T,CD8 T,CL:0000625,CD8 T,CL:0000625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EEM-scRNA-079_TTTGGTTTCAGACCTA,AICU191-1,2683.0,1224,10964.0,2190,-0.029333,-0.050301,G1,10.266864,EEM-scRNA-079,...,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079,AF1,AF1,AF1,CL:4028004,AF1,CL:4028004
EEM-scRNA-079_TTTGTTGAGGTAGGCT,AICU191-1,5823.0,1916,12200.0,2186,-0.060900,0.021290,G2M,0.290499,EEM-scRNA-079,...,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079,AT2,AT2,AT2,CL:0002063,AT2,CL:0002063
EEM-scRNA-079_TTTGTTGAGTTAACAG,AICU191-1,15731.0,3454,13043.0,3451,-0.049283,-0.004591,G1,4.864282,EEM-scRNA-079,...,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079,AM,AM,AM,CL:0000583,AM,CL:0000583
EEM-scRNA-079_TTTGTTGCACACGCCA,AICU191-1,3861.0,2051,10594.0,2663,-0.049358,-0.021161,G1,6.606452,EEM-scRNA-079,...,PATO:0000461,unknown,HsapDv:0000239,EEM-scRNA-079,AT1,AT1,AT1,CL:0002062,AT1,CL:0002062


In [34]:
# Maximum value in X; a small non-integer maximum indicates that X is not raw counts.
# .max() is called on the matrix directly so that the 122445 x 32284 matrix
# is not densified (about 31 GB as float64) just to read one number.
adata.X.max()

8.94130496645434

In [35]:
# Put the matrix from .raw into X as int64.
# The integer cast drops fractional parts silently, so first verify that
# every stored value is already integer-valued (NaN also fails this check).
raw_counts = adata.raw.X
raw_values = raw_counts.data if sp.issparse(raw_counts) else np.asarray(raw_counts)
assert np.array_equal(raw_values, np.rint(raw_values)), (
    "adata.raw.X contains non-integer values; casting to int64 would truncate them"
)
adata.X = raw_counts.astype(np.int64)

In [36]:
# Replace .raw with the current state of adata (X was set to int64 above);
# the recorded outputs below show X and raw.X with identical dtype, shape and maximum.
adata.raw = adata

In [27]:
# Confirm that X is now an int64 sparse matrix.
adata.X

<122445x32284 sparse matrix of type '<class 'numpy.int64'>'
	with 271059277 stored elements in Compressed Sparse Row format>

In [28]:
# Confirm that raw.X matches X (int64, same shape and number of stored elements).
adata.raw.X

<122445x32284 sparse matrix of type '<class 'numpy.int64'>'
	with 271059277 stored elements in Compressed Sparse Row format>

In [29]:
# Maximum count in X after the integer conversion.
# .max() is called on the matrix directly to avoid building a dense copy.
adata.X.max()

61871

In [30]:
# Maximum count in raw.X; it should equal the maximum of X.
# .max() is called on the matrix directly to avoid building a dense copy.
adata.raw.X.max()

61871

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object same as Google Sheet

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object same as Google Sheet

# Data Submission Status

- CHECK: Raw data in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- MISSING: ENSEMBL IDs in var

# Revisions:

DONE

In [46]:
# Write the same gzip-compressed AnnData file into the pre-revision and the
# post-revision output directory (in that order).
for output_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(output_dir, f"{DATASET_ID}.h5ad"), compression='gzip')