In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
DATASET_ID = "Jones_Yosef_Teichmann_Farber_Sims_unpubl"
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/LNG_BAL.h5ad"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
adata = sc.read_h5ad(H5AD_PATH)
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [11]:
adata.obsm['IA_citeseq']

Unnamed: 0,CD29,CD158_KIR2DL1S1S3S5,CD13,CD26,CD328,CD24,CD81,CD335_NKp46,CD267,CD64,...,CD163,HLA_ABC,CLEC12A,CD25,TIGIT,GPR56,CD272_BTLA,CD161,CD71,Ig_light_chain_kappa
CGAATGTCAATAAGCA-1_CZINY-0536-14,2.865287,0.938573,2.229138,1.445073,0.497841,0.814310,1.591212,1.222092,0.924770,0.767056,...,3.207902,0.805624,2.166313,0.735361,1.131723,1.262107,1.249455,0.767000,3.413924,1.687741
ATTGGTGCAGGACCCT-1_CZINY-0296-7,3.150692,1.108785,0.000000,1.741231,2.636025,1.624155,1.576952,0.806773,1.134909,0.000000,...,2.227445,1.610850,0.000000,1.447411,1.202476,0.000000,1.587987,1.370273,3.223774,2.652998
GATCTAGCAAGCTGGA-1_CZI-IA10034924-0,1.346032,0.745392,0.000000,2.907756,1.000948,0.598141,0.494489,0.479987,0.587279,0.000000,...,0.465401,0.811864,1.242215,0.277549,0.730812,0.000000,0.718137,0.970610,2.144278,0.635351
GTACTCCTCATGTCCC-1_CZINY-0415-14,2.821273,1.163267,0.906886,1.338294,0.000000,0.000000,1.078555,1.624472,0.432306,1.212327,...,0.000000,1.178647,0.000000,0.564379,0.987174,2.744332,0.692202,1.262532,0.900506,0.820304
CTCGAGGAGCAAATCA-1_CZINY-0718-5,0.760329,1.295633,1.778761,2.025420,0.000000,0.617703,1.056568,1.194696,1.630790,0.000000,...,2.385144,0.589961,1.195846,0.497666,0.572293,0.772411,1.350455,0.614312,4.233798,1.091276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACACCTCCAACCAA-1_CZINY-0533-11,1.358824,0.000000,1.603847,0.609587,0.000000,1.820306,1.088400,1.401133,1.041646,0.837987,...,1.248043,0.931391,0.000000,1.320854,1.163974,0.722213,1.358763,0.566485,0.904706,3.839344
AACACGTGTAAGCACG-1_CZINY-0289-0,2.002050,0.754334,0.895048,1.095168,1.338161,1.421913,1.103527,1.491194,1.126372,0.000000,...,1.987500,1.035060,0.000000,0.884831,1.239298,0.000000,0.944579,0.867654,1.410110,1.834831
GAATGAAAGGCGCTCT-1_CZINY-0521-0,2.931868,1.124898,1.681039,1.662697,0.895162,1.127001,1.422002,0.962425,1.108449,1.845154,...,2.962512,0.651585,0.540787,0.621478,1.155079,0.715027,1.311276,1.243934,1.822557,2.619655
AGTCTCCTCTCAGGCG-1_CZINY-0051-2,0.000000,3.556541,0.000000,0.000000,1.154812,0.000000,0.701526,0.000000,0.000000,0.696846,...,0.000000,1.239386,0.000000,0.000000,0.706267,1.983207,0.715932,2.056127,0.692224,1.401790


# Validate obs and uns from Tier 1 Metadata Template

In [30]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,"LNG samples from donors 637C, 640C, D496 and D...","Jo Jones, Sarah Teichmann, Donna Farber, Peter...","donor, chemistry",X_umap,"UNPUBLISHED data, please do not release",Lung tissue was collected from Both University...


In [31]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,583B_LNG,583B,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,CZI-IA9924325,E-MTAB-11536,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000237
1,591C_LNG,591C,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,"CZI-IA10034920,CZI-IA10034921,CZI-IA10034922,C...",E-MTAB-11536,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238
2,637C_LNG,637C,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,CZI-IA10466285,E-MTAB-11536,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000240
3,640C_LNG,640C,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,"CZI-IA10471911,CZI-IA10471912,CZI-IA10471913",E-MTAB-11536,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000242
4,689C_LNG,689C,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,"CZI-IA11485685,CZI-IA11485686",Not public,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000239
5,768B_LNG,768B,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,"CZI-IA13073099,CZI-IA13073100,CZI-IA13073101,C...",Not public,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Unknown,HsapDv:0000237
6,778C_LNG,778C,doi:10.17504/protocols.io.bz4qp8vw,"University of Cambridge, UK",IA_site_1,,"CZI-IA13003091,CZI-IA13003092,CZI-IA13003093,C...",Not public,,NCBITaxon:9606,...,5 prime tag,EFO_0008637,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Unknown,HsapDv:0000240
7,D496_LNG,D496,doi:10.17504/protocols.io.bwr9pd96,"Columbia University, New York, US",IA_site_2,,"CZINY-0097,CZINY-0098,CZINY-0099,CZINY-0100,CZ...",E-MTAB-11536,,NCBITaxon:9606,...,3 prime tag,,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000240
8,D533_LNG,D533,doi:10.17504/protocols.io.bwr9pd96,"Columbia University, New York, US",IA_site_2,,"CZINY-0481,CZINY-0482,CZINY-0483,CZINY-0484,CZ...",Not public,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Asian,HsapDv:0000242
9,D503_BAL,D503,doi:10.17504/protocols.io.bwrjpd4n,"Columbia University, New York, US",IA_site_2,,"CZINY-0049,CZINY-0050,CZINY-0051,CZINY-0052,CZ...",Not public,,NCBITaxon:9606,...,3 prime tag,,True,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000241


# Validate obs and uns from adata

In [32]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata_merged = merger.add_uns_metadata()

adata_merged.uns.keys()

dict_keys(['gross_annotation_colors', 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'])

In [33]:
# Create sample_id in anndata for unique matching
adata.obs['sample_id'] = adata.obs['donor'].astype(str) + "_" + adata.obs['tissue'].astype(str)

adata.obs

Unnamed: 0,n_genes,n_genes_by_counts,total_counts,library,donor,site,sex,sample,chemistry,tissue,batch,MMoCHi_annotation,Cell_ontology,gross_annotation,cmv,ages,sample_id
CGAATGTCAATAAGCA-1_CZINY-0536-14,4809,4809,24285.0,CZINY-0536,D534,NY,Male,D534-BAL-1,5'v2,BAL,166,macrophage,macrophage,m,positive,33,D534_BAL
ATTGGTGCAGGACCCT-1_CZINY-0296-7,1651,1651,3246.0,CZINY-0296,D523,NY,Female,D523-LNG-1,5'v2,LNG,135,macrophage,macrophage,m,positive,20,D523_LNG
GATCTAGCAAGCTGGA-1_CZI-IA10034924-0,1708,1708,4865.0,CZI-IA10034924,591C,UK,Male,591C-LNG-3,5'v2,LNG,17,cd4_tem,"effector memory CD4-positive, alpha-beta T cell",t,positive,35-39,591C_LNG
GTACTCCTCATGTCCC-1_CZINY-0415-14,1323,1323,2611.0,CZINY-0415,D529,NY,Female,D529-LNG-1,5'v2,LNG,154,nk_cd56dim,"CD16-positive, CD56-dim natural killer cell, h...",nk_ilc,positive,68,D529_LNG
CTCGAGGAGCAAATCA-1_CZINY-0718-5,622,622,1470.0,CZINY-0718,D570,NY,Male,D570-LNG-1,5'v2,LNG,202,macrophage,macrophage,m,positive,73,D570_LNG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACACCTCCAACCAA-1_CZINY-0533-11,1786,1786,4039.0,CZINY-0533,D534,NY,Male,D534-BAL-1,5'v2,BAL,166,b_naive,naive b cell,b,positive,33,D534_BAL
AACACGTGTAAGCACG-1_CZINY-0289-0,1387,1387,3420.0,CZINY-0289,D523,NY,Female,D523-BAL-1,5'v2,BAL,128,b_memory,memory b cell,b,positive,20,D523_BAL
GAATGAAAGGCGCTCT-1_CZINY-0521-0,4026,4026,17359.0,CZINY-0521,D534,NY,Male,D534-BAL-1,5'v2,BAL,166,macrophage,macrophage,m,positive,33,D534_BAL
AGTCTCCTCTCAGGCG-1_CZINY-0051-2,2270,2270,5138.0,CZINY-0051,D503,NY,Female,D503-BAL-1,3'v3,BAL,102,nk_cd56dim,"CD16-positive, CD56-dim natural killer cell, h...",nk_ilc,positive,67,D503_BAL


In [34]:
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata_merged = merger.add_obs_metadata(
    adata_col = 'sample_id',
    df_col = 'sample_ID',
    skip = None
)

adata_merged.obs

Unnamed: 0,n_genes,n_genes_by_counts,total_counts,library,donor,site,sex,sample,chemistry,tissue,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
CGAATGTCAATAAGCA-1_CZINY-0536-14,4809,4809,24285.0,CZINY-0536,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238
ATTGGTGCAGGACCCT-1_CZINY-0296-7,1651,1651,3246.0,CZINY-0296,D523,NY,Female,D523-LNG-1,5'v2,LNG,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000237
GATCTAGCAAGCTGGA-1_CZI-IA10034924-0,1708,1708,4865.0,CZI-IA10034924,591C,UK,Male,591C-LNG-3,5'v2,LNG,...,5 prime tag,EFO_0008637,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238
GTACTCCTCATGTCCC-1_CZINY-0415-14,1323,1323,2611.0,CZINY-0415,D529,NY,Female,D529-LNG-1,5'v2,LNG,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000241
CTCGAGGAGCAAATCA-1_CZINY-0718-5,622,622,1470.0,CZINY-0718,D570,NY,Male,D570-LNG-1,5'v2,LNG,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACACCTCCAACCAA-1_CZINY-0533-11,1786,1786,4039.0,CZINY-0533,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238
AACACGTGTAAGCACG-1_CZINY-0289-0,1387,1387,3420.0,CZINY-0289,D523,NY,Female,D523-BAL-1,5'v2,BAL,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000237
GAATGAAAGGCGCTCT-1_CZINY-0521-0,4026,4026,17359.0,CZINY-0521,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,5 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238
AGTCTCCTCTCAGGCG-1_CZINY-0051-2,2270,2270,5138.0,CZINY-0051,D503,NY,Female,D503-BAL-1,3'v3,BAL,...,3 prime tag,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000241


# Add author cell type markers to UNS

In [35]:
marker_genes = pd.read_excel("/home/icb/raphael.kfuri-rubens/data/hlca_v2/Jones_Yosef_Teichmann_Farber_Sims_unpubl/Jones_Yosef_Teichmann_Farber_Sims_unpubl_marker_genes.xlsx")

marker_genes.drop(marker_genes.columns[0], axis=1, inplace=True)

marker_genes

Unnamed: 0,Subset,Parent,Positive Features,Negative Features
0,lymphocyte,All,"2 of (CD3, CD19, CD20, JCHAIN_gex, CD2_gex, LI...","CD163, MRC1_gex, CD64, OLR1_gex, MPO_gex, ELAN..."
1,myelocyte,All,"1 of (CD163, CD64, OLR1_gex) OR 2 of (LAMP3_ge...","CD2_gex, CD19, CD20, CD127, TPSB2_gex, CD3, MM..."
2,mast_cell,All,"CD33 AND 1 of (TPSB2_gex, CPA3_gex)","MRC1_gex, OLR1_gex, CD64, CLEC9A_gex, LILRA4_g..."
3,neutrophil,All,"1 of (S100A8_gex, S100A9_gex) AND 2 of (MMP9_g...","MRC1_gex, OLR1_gex, CLEC9A_gex, LILRA4_gex, PL..."
4,mo_mac,myelocyte,"1 of (MARCO_gex, MRC1_gex, SELENOP_gex, MERTK_...","CD1c, CD1C_gex, MPO_gex, ELANE_gex, PRSS57_gex..."
5,dc,myelocyte,"1 of (CD1C_gex, CD1c, CLEC9A_gex, CCL19_gex, L...","MPO_gex, ELANE_gex, PRSS57_gex, CYTL1_gex, FCN..."
6,mpdc,myelocyte,"2 of (LILRA4_gex, PLD4_gex, JCHAIN_gex, CD123)","MARCO_gex, MRC1_gex, SELENOP_gex, C1QA_gex, C1..."
7,macrophage,mo_mac,"1 of (C1QA_gex, C1QB_gex, C1QC_gex, MARCO_gex,...","FCN1_gex, CLEC12A_lo, CD99"
8,monocyte_classical,mo_mac,"FCN1_gex, CD33_hi AND 2 of (S100A9_gex, S100A8...","MERTK_gex, MARCO_gex, SELENOP_gex, MS4A7_gex, ..."
9,monocyte_nonclassical,mo_mac,"2 of (CX3CR1, MS4A7_gex, C1QA_gex, C1QB_gex, C...","MARCO_gex, SELENOP_gex, SELL_gex, APOE_gex, AP..."


In [36]:
adata.uns[MARKER_GENES] = marker_genes.astype('category')

adata.uns.keys()

dict_keys(['gross_annotation_colors', 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments', 'author_cell_type_markers'])

# Check author cell type annotations and Cell Ontology IDs

In [37]:
adata.obs.columns

Index(['n_genes', 'n_genes_by_counts', 'total_counts', 'library', 'donor',
       'site', 'sex', 'sample', 'chemistry', 'tissue', 'batch',
       'MMoCHi_annotation', 'Cell_ontology', 'gross_annotation', 'cmv', 'ages',
       'sample_ID', 'donor_id', 'protocol_URL', 'institute',
       'sample_collection_site', 'sample_collection_relative_time_point',
       'library_ID', 'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 

In [38]:
# MMoCHi_annotation in obs corresponds to author cell type annotation, so creating new obs column

adata.obs[AUTHOR_CELL_TYPE] = adata.obs['MMoCHi_annotation']

adata.obs

Unnamed: 0,n_genes,n_genes_by_counts,total_counts,library,donor,site,sex,sample,chemistry,tissue,...,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,author_cell_type
CGAATGTCAATAAGCA-1_CZINY-0536-14,4809,4809,24285.0,CZINY-0536,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,macrophage
ATTGGTGCAGGACCCT-1_CZINY-0296-7,1651,1651,3246.0,CZINY-0296,D523,NY,Female,D523-LNG-1,5'v2,LNG,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000237,macrophage
GATCTAGCAAGCTGGA-1_CZI-IA10034924-0,1708,1708,4865.0,CZI-IA10034924,591C,UK,Male,591C-LNG-3,5'v2,LNG,...,EFO_0008637,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,cd4_tem
GTACTCCTCATGTCCC-1_CZINY-0415-14,1323,1323,2611.0,CZINY-0415,D529,NY,Female,D529-LNG-1,5'v2,LNG,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000241,nk_cd56dim
CTCGAGGAGCAAATCA-1_CZINY-0718-5,622,622,1470.0,CZINY-0718,D570,NY,Male,D570-LNG-1,5'v2,LNG,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000242,macrophage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACACCTCCAACCAA-1_CZINY-0533-11,1786,1786,4039.0,CZINY-0533,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,b_naive
AACACGTGTAAGCACG-1_CZINY-0289-0,1387,1387,3420.0,CZINY-0289,D523,NY,Female,D523-BAL-1,5'v2,BAL,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000237,b_memory
GAATGAAAGGCGCTCT-1_CZINY-0521-0,4026,4026,17359.0,CZINY-0521,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,macrophage
AGTCTCCTCTCAGGCG-1_CZINY-0051-2,2270,2270,5138.0,CZINY-0051,D503,NY,Female,D503-BAL-1,3'v3,BAL,...,,true,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000241,nk_cd56dim


In [39]:
# Cell_ontology does not correspond to ID format -> Revise
adata.obs['Cell_ontology'].value_counts()

macrophage                                                                    111047
CD16-positive, CD56-dim natural killer cell, human                             22196
classical monocyte                                                             18166
central memory CD4-positive, alpha-beta T cell                                  7632
effector memory CD4-positive, alpha-beta T cell                                 7575
effector memory CD8-positive, alpha-beta T cell, terminally differentiated      5224
non-classical monocyte                                                          4558
CD4-positive, alpha-beta T cell                                                 4304
CD16-negative, CD56-bright natural killer cell, human                           4185
CD4-positive, CD25-positive, alpha-beta regulatory T cell                       3793
CD8-positive, alpha-beta T cell                                                 3112
effector memory CD8-positive, alpha-beta T cell                  

In [40]:
cell_ontology_mapping = {
    'macrophage': 'CL:0000235',
    'CD16-positive, CD56-dim natural killer cell, human': 'CL:0000939',
    'classical monocyte': 'CL:0000860',
    'central memory CD4-positive, alpha-beta T cell': 'CL:0000904',
    'effector memory CD4-positive, alpha-beta T cell': 'CL:0000905',
    'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 'CL:0001062',
    'non-classical monocyte': 'CL:0000875',
    'CD4-positive, alpha-beta T cell': 'CL:0000624',
    'CD16-negative, CD56-bright natural killer cell, human': 'CL:0000938',
    'CD4-positive, CD25-positive, alpha-beta regulatory T cell': 'CL:0000792',
    'CD8-positive, alpha-beta T cell': 'CL:0000625',
    'effector memory CD8-positive, alpha-beta T cell': 'CL:0000913',
    'mast cell': 'CL:0000097',
    'immature innate lymphoid cell': 'CL:0001082',
    'naive thymus-derived CD4-positive, alpha-beta T cell': 'CL:0000895',
    'memory b cell': 'CL:0000787',
    'gamma-delta T cell': 'CL:0000798',
    'dendritic cell': 'CL:0000451',
    'naive b cell': 'CL:0000788',
    'effector memory CD4-positive, alpha-beta T cellra': 'CL:0000905',
    'naive thymus-derived CD8-positive, alpha-beta T cell': 'CL:0000900',
    'mucosal invariant T cell': 'CL:0000940',
    'plasmablast': 'CL:0000980',
    'progenitor cell': 'CL:0011026',
    'plasma cell': 'CL:0000786',
    'group 3 innate lymphoid cell': 'CL:0001071',
    'group 1 innate lymphoid cell': 'CL:0001067',
    'plasmacytoid dendritic cell': 'CL:0000784',
    'germinal center B cell': 'CL:0000844'
}

In [41]:
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs['Cell_ontology'].map(cell_ontology_mapping)

adata.obs[CELL_TYPE_ONTOLOGY_LABEL] = adata.obs['Cell_ontology']

adata.obs[CELL_TYPE_ONTOLOGY_LABEL].value_counts(dropna=False)

macrophage                                                                    111047
CD16-positive, CD56-dim natural killer cell, human                             22196
classical monocyte                                                             18166
central memory CD4-positive, alpha-beta T cell                                  7632
effector memory CD4-positive, alpha-beta T cell                                 7575
effector memory CD8-positive, alpha-beta T cell, terminally differentiated      5224
non-classical monocyte                                                          4558
CD4-positive, alpha-beta T cell                                                 4304
CD16-negative, CD56-bright natural killer cell, human                           4185
CD4-positive, CD25-positive, alpha-beta regulatory T cell                       3793
CD8-positive, alpha-beta T cell                                                 3112
effector memory CD8-positive, alpha-beta T cell                  

In [42]:
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

macrophage               111047
nk_cd56dim                22196
monocyte_classical        18166
cd4_tem                    7575
cd4_cm                     7395
cd8_temra                  5224
monocyte_nonclassical      4558
cd4_trm                    4304
nk_cd56br                  4185
cd4_treg                   3793
cd8_trm                    3112
cd8_tem                    2948
mast_cell                  2629
nk_ilc_precursor           2158
cd4_naive                  2076
b_memory                   1998
cd3_gd                     1721
b_naive                    1262
dc2                        1205
cd4_temra                  1201
cd8_naive                   995
cd8_mait                    779
plasmablast                 625
progenitor                  339
plasma_cell                 271
ilc3                        266
ilc1                        242
cd8_cm                      237
pdc                         228
dc_migratory                221
dc1                         149
b_age   

In [43]:
# make mapping dict with unique values from adata.obs[AUTHOR_CELL_TYPE] and adata.obs[CELL_TYPE_ONTOLOGY_ID]

ontology_mapping = dict(zip(adata.obs[AUTHOR_CELL_TYPE], adata.obs[CELL_TYPE_ONTOLOGY_ID]))

ontology_mapping

{'macrophage': 'CL:0000235',
 'cd4_tem': 'CL:0000905',
 'nk_cd56dim': 'CL:0000939',
 'cd8_tem': 'CL:0000913',
 'monocyte_classical': 'CL:0000860',
 'cd4_trm': 'CL:0000624',
 'monocyte_nonclassical': 'CL:0000875',
 'cd4_treg': 'CL:0000792',
 'ilc1': 'CL:0001067',
 'cd8_temra': 'CL:0001062',
 'cd4_cm': 'CL:0000904',
 'dc_migratory': 'CL:0000451',
 'cd8_mait': 'CL:0000940',
 'cd4_temra': 'CL:0000905',
 'dc2': 'CL:0000451',
 'mast_cell': 'CL:0000097',
 'cd8_trm': 'CL:0000625',
 'cd3_gd': 'CL:0000798',
 'b_memory': 'CL:0000787',
 'b_naive': 'CL:0000788',
 'nk_cd56br': 'CL:0000938',
 'cd4_naive': 'CL:0000895',
 'plasmablast': 'CL:0000980',
 'nk_ilc_precursor': 'CL:0001082',
 'cd8_cm': 'CL:0000904',
 'plasma_cell': 'CL:0000786',
 'cd8_naive': 'CL:0000900',
 'ilc3': 'CL:0001071',
 'dc1': 'CL:0000451',
 'progenitor': 'CL:0011026',
 'pdc': 'CL:0000784',
 'b_age': 'CL:0000787',
 'b_gc': 'CL:0000844'}

In [44]:
marker_genes[CELL_TYPE_ONTOLOGY_ID] = marker_genes['Subset'].map(ontology_mapping)
marker_genes

Unnamed: 0,Subset,Parent,Positive Features,Negative Features,cell_type_ontology_term_id
0,lymphocyte,All,"2 of (CD3, CD19, CD20, JCHAIN_gex, CD2_gex, LI...","CD163, MRC1_gex, CD64, OLR1_gex, MPO_gex, ELAN...",
1,myelocyte,All,"1 of (CD163, CD64, OLR1_gex) OR 2 of (LAMP3_ge...","CD2_gex, CD19, CD20, CD127, TPSB2_gex, CD3, MM...",
2,mast_cell,All,"CD33 AND 1 of (TPSB2_gex, CPA3_gex)","MRC1_gex, OLR1_gex, CD64, CLEC9A_gex, LILRA4_g...",CL:0000097
3,neutrophil,All,"1 of (S100A8_gex, S100A9_gex) AND 2 of (MMP9_g...","MRC1_gex, OLR1_gex, CLEC9A_gex, LILRA4_gex, PL...",
4,mo_mac,myelocyte,"1 of (MARCO_gex, MRC1_gex, SELENOP_gex, MERTK_...","CD1c, CD1C_gex, MPO_gex, ELANE_gex, PRSS57_gex...",
5,dc,myelocyte,"1 of (CD1C_gex, CD1c, CLEC9A_gex, CCL19_gex, L...","MPO_gex, ELANE_gex, PRSS57_gex, CYTL1_gex, FCN...",
6,mpdc,myelocyte,"2 of (LILRA4_gex, PLD4_gex, JCHAIN_gex, CD123)","MARCO_gex, MRC1_gex, SELENOP_gex, C1QA_gex, C1...",
7,macrophage,mo_mac,"1 of (C1QA_gex, C1QB_gex, C1QC_gex, MARCO_gex,...","FCN1_gex, CLEC12A_lo, CD99",CL:0000235
8,monocyte_classical,mo_mac,"FCN1_gex, CD33_hi AND 2 of (S100A9_gex, S100A8...","MERTK_gex, MARCO_gex, SELENOP_gex, MS4A7_gex, ...",CL:0000860
9,monocyte_nonclassical,mo_mac,"2 of (CX3CR1, MS4A7_gex, C1QA_gex, C1QB_gex, C...","MARCO_gex, SELENOP_gex, SELL_gex, APOE_gex, AP...",CL:0000875


In [45]:
adata.uns[MARKER_GENES] = marker_genes.astype('category')

# Check whether ENSEMBL IDs in var

In [46]:
adata.var

Unnamed: 0,gene_ids,is_highly_variable_gene_batch_key_donor_id
MIR1302-2HG,ENSG00000243485,True
FAM138A,ENSG00000237613,False
OR4F5,ENSG00000186092,False
AL627309.1,ENSG00000238009,False
AL627309.3,ENSG00000239945,False
...,...,...
AC141272.1,ENSG00000277836,True
AC023491.2,ENSG00000278633,False
AC007325.1,ENSG00000276017,False
AC007325.4,ENSG00000278817,False


In [47]:
adata.var['gene_symbol'] = adata.var.index
adata.var.index.name = 'index'
adata.var.rename(columns={'gene_ids': 'ensembl_id'}, inplace=True)
adata.var

Unnamed: 0_level_0,ensembl_id,is_highly_variable_gene_batch_key_donor_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MIR1302-2HG,ENSG00000243485,True,MIR1302-2HG
FAM138A,ENSG00000237613,False,FAM138A
OR4F5,ENSG00000186092,False,OR4F5
AL627309.1,ENSG00000238009,False,AL627309.1
AL627309.3,ENSG00000239945,False,AL627309.3
...,...,...,...
AC141272.1,ENSG00000277836,True,AC141272.1
AC023491.2,ENSG00000278633,False,AC023491.2
AC007325.1,ENSG00000276017,False,AC007325.1
AC007325.4,ENSG00000278817,False,AC007325.4


In [48]:
adata.var.dtypes

ensembl_id                                    object
is_highly_variable_gene_batch_key_donor_id      bool
gene_symbol                                   object
dtype: object

# Check raw data

In [49]:
adata.X.toarray().max()

28535.0

In [50]:
adata.obs

Unnamed: 0,n_genes,n_genes_by_counts,total_counts,library,donor,site,sex,sample,chemistry,tissue,...,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,author_cell_type,cell_type_ontology_term_id,cell_type_ontology_term_label
CGAATGTCAATAAGCA-1_CZINY-0536-14,4809,4809,24285.0,CZINY-0536,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,macrophage,CL:0000235,macrophage
ATTGGTGCAGGACCCT-1_CZINY-0296-7,1651,1651,3246.0,CZINY-0296,D523,NY,Female,D523-LNG-1,5'v2,LNG,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000237,macrophage,CL:0000235,macrophage
GATCTAGCAAGCTGGA-1_CZI-IA10034924-0,1708,1708,4865.0,CZI-IA10034924,591C,UK,Male,591C-LNG-3,5'v2,LNG,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,cd4_tem,CL:0000905,"effector memory CD4-positive, alpha-beta T cell"
GTACTCCTCATGTCCC-1_CZINY-0415-14,1323,1323,2611.0,CZINY-0415,D529,NY,Female,D529-LNG-1,5'v2,LNG,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000241,nk_cd56dim,CL:0000939,"CD16-positive, CD56-dim natural killer cell, h..."
CTCGAGGAGCAAATCA-1_CZINY-0718-5,622,622,1470.0,CZINY-0718,D570,NY,Male,D570-LNG-1,5'v2,LNG,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000242,macrophage,CL:0000235,macrophage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTACACCTCCAACCAA-1_CZINY-0533-11,1786,1786,4039.0,CZINY-0533,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,b_naive,CL:0000788,naive b cell
AACACGTGTAAGCACG-1_CZINY-0289-0,1387,1387,3420.0,CZINY-0289,D523,NY,Female,D523-BAL-1,5'v2,BAL,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,Hispanic,HsapDv:0000237,b_memory,CL:0000787,memory b cell
GAATGAAAGGCGCTCT-1_CZINY-0521-0,4026,4026,17359.0,CZINY-0521,D534,NY,Male,D534-BAL-1,5'v2,BAL,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000238,macrophage,CL:0000235,macrophage
AGTCTCCTCTCAGGCG-1_CZINY-0051-2,2270,2270,5138.0,CZINY-0051,D503,NY,Female,D503-BAL-1,3'v3,BAL,...,GRCh38,,cell ranger 6.0.0,yes,PATO:0000461,White,HsapDv:0000241,nk_cd56dim,CL:0000939,"CD16-positive, CD56-dim natural killer cell, h..."


In [51]:
#adata.obs = adata.obs.astype('category')

adata.obs['cell_viability_percentage'] = pd.to_numeric(adata.obs['cell_viability_percentage'], errors='coerce')
adata.obs['cell_number_loaded'] = pd.to_numeric(adata.obs['cell_number_loaded'], errors='coerce')
adata.obs['sample_collection_year'] = pd.to_numeric(adata.obs['sample_collection_year'], errors='coerce')

numeric_cols = ['n_genes', 'n_genes_by_counts', 'total_counts', 'batch', 'ages']
for col in numeric_cols:
    adata.obs[col] = pd.to_numeric(adata.obs[col], errors='coerce')

adata.uns[MARKER_GENES] = adata.uns[MARKER_GENES].astype('category')

adata.obs.index = adata.obs.index.astype('str')
adata.var.index = adata.var.index.astype('str')
adata.uns[MARKER_GENES].index = adata.uns[MARKER_GENES].index.astype('str')
adata.var_names = adata.var_names.astype('str')


In [52]:
adata.X = adata.X.astype(np.int64)

In [53]:
adata.raw = adata

In [54]:
adata.X

<213154x36587 sparse matrix of type '<class 'numpy.int64'>'
	with 536495167 stored elements in Compressed Sparse Row format>

In [55]:
adata.raw.X

<213154x36587 sparse matrix of type '<class 'numpy.int64'>'
	with 536495167 stored elements in Compressed Sparse Row format>

In [56]:
adata.X.toarray().max()

28535

In [57]:
adata.raw.X.toarray().max()

28535

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object  same as Google Sheet

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object same as Google Sheet

# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs and gene symbols in var

# Revisions:

DONE


In [58]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))