In [2]:
import scanpy as sc
import pandas as pd
import anndata as ad
#import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [4]:
DATASET_ID = "Kropski_Banovich_unpubl"
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'
AUTHOR_CELL_TYPE_L2 = 'author_cell_type_level_2'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'
CELL_TYPE_ONTOLOGY_ID_L2 = 'cell_type_ontology_term_id_level_2'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'
MARKER_GENES_L2 = 'author_cell_type_markers_level_2'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
MARKER_GENES = 'author_cell_type_markers'

# Load data

In [4]:
adata = sc.read_h5ad(H5AD_PATH)
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [5]:
indices_split = np.array_split(range(adata.shape[0]), 4)
indices_split

[array([    0,     1,     2, ..., 72274, 72275, 72276]),
 array([ 72277,  72278,  72279, ..., 144550, 144551, 144552]),
 array([144553, 144554, 144555, ..., 216826, 216827, 216828]),
 array([216829, 216830, 216831, ..., 289102, 289103, 289104])]

In [6]:
# Parse adata in right format for Archmap mapping
adata.obs_names_make_unique()
adata.var.index = adata.var['gene_id'].astype(str)
adata.var_names_make_unique()
adata.obs['scanvi_label'] = 'unlabeled'
adata.obs['dataset'] = 'Kropski_Banovich_unpubl'

for i, index_chunk in enumerate(indices_split):
    adata_chunk = adata[index_chunk].copy()
    adata_chunk.write_h5ad(join(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}", f'{DATASET_ID}_adata_chunk_{i}.h5ad'))

In [7]:
mapped_adata_files = [f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/"+x for x in os.listdir(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}") if x.endswith('mapped.h5ad')]

mapped_adata_files

['/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kropski_Banovich_unpubl/chunk_1_mapped.h5ad',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kropski_Banovich_unpubl/chunk_0_mapped.h5ad',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kropski_Banovich_unpubl/chunk_2_mapped.h5ad',
 '/home/icb/raphael.kfuri-rubens/data/hlca_v2/Kropski_Banovich_unpubl/chunk_3_mapped.h5ad']

In [8]:
# Load chunked and mapped Archmap AnnData objects
mapped_adatas = []
for adata_file in mapped_adata_files:
    adata = sc.read_h5ad(adata_file)
    mapped_adatas.append(adata)

adata_archmap = ad.concat(mapped_adatas,axis=0)

adata_archmap

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 2628881 × 2000
    obs: 'dataset', 'scanvi_label', '_scvi_batch', '_scvi_labels', 'type', 'query', 'scanvi_label_uncertainty_euclidean', 'uncertainty_mahalanobis', 'prediction_xgb', 'donor_id', 'tissue_ontology_term_id', 'sample_collection_site', 'disease_ontology_term_id', 'library_ID', 'suspension_type', 'cell_number_loaded', 'sample_collection_relative_time_point', 'is_primary_data', 'protocol_URL', 'sample_collection_year', 'tissue_type', 'development_stage_ontology_term_id', 'sampled_site_condition', 'author_batch_notes', 'scanvi_label_user_input', 'sequenced_fragment', 'sample_ID', 'cell_size', 'intron_inclusion', 'cell_viability_percentage', 'manner_of_death', 'sample_source', 'sequencing_platform', 'reference_genome', 'alignment_software', 'sample_collection_method', 'gene_annotation_version', 'cell_enrichment', 'library_sequencing_run', 'tissue_free_text', 'sample_preservation_method', 'institute', 'prediction_xgb_filtered_by_uncert>0.5', '

In [9]:
adata_archmap = adata_archmap[adata_archmap.obs['dataset'] == 'Kropski_Banovich_unpubl'].copy()

In [10]:
non_overlap_barcodes = set(adata.obs.index) - set(adata_archmap.obs.index)
non_overlap_barcodes_other_side = set(adata_archmap.obs.index) - set(adata.obs.index)

In [11]:
len(non_overlap_barcodes)

584944

In [12]:
len(non_overlap_barcodes_other_side)

216829

In [None]:
pd.DataFrame({
    'non_overlap_barcodes': list(non_overlap_barcodes),
    'non_overlap_barcodes_other_side': list(non_overlap_barcodes_other_side)
})

In [14]:
# add to non_overlap_barcodes in adata '-1' at the end of the barcode
adata.obs['barcode'] = adata.obs.index
adata.obs['barcode'] = adata.obs['barcode'].apply(lambda x: x + '-1' if x in non_overlap_barcodes else x)
adata.obs.index = adata.obs['barcode']

In [15]:
set(adata.obs.index) - set(adata_archmap.obs.index)

{'GACGGCTCAGGGAGAG_F01157-1',
 'GGCTGGTCAATCGAAA-1-HCATisStab7747199-1',
 'ATTACCTTCAAATGCC_SC84-1',
 'CCCTCCTGTTATCACG_HD67-1',
 'GCCTCTATCCGATATG_T85-1',
 '7239220_TTGACTTCAAGCTGGA-1',
 'GTTACAGTCCTCGCAT-1-2-1',
 'TCGCGAGTCTCATTCA_HD101-1',
 'CGGACTGCAATGCCAT_F02522-1',
 'P2_1_TCAATCTAGGATGGTC-1',
 'GTGTGCGCATGCATGT_F01851-1',
 'TCTTCGGTCTGATAC_GRO-08_biopsy-1',
 'TTATGCTGTGTGCCTG_SC29-1',
 'D344_Brus_Dis1_AACTCAGCAACCGCCA-1-14-1',
 'P3_3_TCTTTCCAGTGCGTGA-1',
 'D353_Brus_Nas1_GGAGCAATCACCTCGT-1-16-1',
 'GTATTCTTCGGCGCTA-1-4-1',
 'D344_Biop_Int1_CACATAGCAGTCAGAG-1-13-1',
 'TAAGCGTCATCTGGTA-SC45-1',
 'D337_Brus_Dis1CGAACATCATGACGGA-1-6-1',
 'TACAGTGTCTTTACAC_GRO-09_biopsy-1',
 'CTCGTCACAAGCCGTC-1-HCATisStab7659970-1',
 'TGATTTCTCAGCTCGG-1-5-1',
 'GACTACACATAACCTG_SC18-1',
 'P3_7_CTCTGGTGTTCCGGCA-1',
 'GTCAAGTAGTCATCCA_T137-1',
 'TTAAGCAGGCCGAAT_GRO-09_biopsy-1',
 'ACTGCGAGTATCGAA_GRO-10_biopsy-1',
 'P3_5_TTCTCAAAGTTAGCGG-1',
 'D322_Biop_Nas1_ATAAGAGAGTACGATA-1-0-1',
 'P3_7_GCGCCAAAGCAG

In [16]:
# one cell in adata_archmap has a different barcode
set(adata_archmap.obs.index) - set(adata.obs.index)

{'CATTCGCCAGTACACT-1',
 'ATAAGAGCATCTACGA-1-1',
 'TCGGTAAGTATCACCA-1',
 'GGGCATCCACATTAGC-1',
 'ACTGATGCAACTTGAC-1',
 'ATCGAGTTCTAACTCT-1',
 'CCCATACAGTTAGCGG-1',
 'AGTCTTTTCCGTTGTC-1',
 'CATTCGCAGACCTAGG-1-1',
 'CCTTTCTCAGTCTTCC-1-1',
 'GGCGACTCACCCTATC-1',
 'GAAATGAGTCGAAAGC-1',
 'TGAGGGAGTCTCCCTA-1',
 'ATAAGAGGTATATCCG-1',
 'CTGCCTACAGGCTGAA-1',
 'TGTATTCTCTAACGGT-1',
 'TAAACCGTCAGCGATT-1',
 'AGGGATGAGTGACATA-1',
 'TAGACCAGTTGGTGGA-1',
 'TGGGAAGTCGGTCCGA-1',
 'GATCGTAAGCGTGAGT-1',
 'TTTATGCTCTTGCAAG-1',
 'ACGGGTCAGTCTTGCA-1',
 'AGATTGCCATGGTTGT-1',
 'CTCGGAGCATTATCTC-1-1',
 'GCTGGGTTCTACGAGT-1',
 'CCACGGAGTCAATACC-1',
 'GGATGTTGTCAATACC-1',
 'ATAACGCCAATGGAGC-1',
 'TTGCGTCCAACGATGG-1',
 'CCGTGGAAGCCTTGAT-1',
 'CTGGTCTGTAGCTTGT-1',
 'GGTATTGCATGTCTCC-1',
 'CCGTACTAGCCTATGT-1',
 'GGGCACTGTGAGTGAC-1',
 'ATTACTCGTACGACCC-1',
 'TCATTACTCACTCCTG-1',
 'TATTACCAGATCCGAG-1',
 'TGCCCATTCTCGAGTA-1-1',
 'GCCTCTAGTCGAGATG-1',
 'CAGTAACAGCCAGTTT-1',
 'TACACGACAAGAAAGG-1',
 'ATCATGGCACGGTGTC-1',
 

In [17]:
cell_annotation_dict = dict(zip(adata_archmap.obs.index, adata_archmap.obs['prediction_xgb']))

In [18]:
adata.obs[AUTHOR_CELL_TYPE] = adata.obs.index.map(cell_annotation_dict)

adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

NaN                             584634
EC venous pulmonary              26642
3_AT2                            16614
Adventitial fibroblasts          14555
EC aerocyte capillary             4545
3_AT1                             2418
3_Lymphatic EC mature             2246
B cells                           1307
3_Lymphatic EC proliferating      1240
3_Smooth muscle FAM83D+           1013
AT2 proliferating                  731
CD4 T cells                        518
EC general capillary               489
3_Myofibroblasts                   174
Classical monocytes                 35
CD8 T cells                         28
2_Smooth muscle                     21
2_Hematopoietic stem cells           3
Club                                 3
DC1                                  2
2_Mesothelium                        2
Name: author_cell_type, dtype: int64

In [None]:
adata.obs_names_make_unique()

# Validate obs and uns from Tier 1 Metadata Template

In [None]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,"Control and COPD lung, 2024-04-30","Jonathan A. Kropski, Nicholas E. Banovich",library_preparation_batch,,protected under embargo,


In [None]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,THD0002,THD0002,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F01174,GSM4037324,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000238
1,THD0024,THD0024,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F03576,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000239
2,THD0025,THD0025,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F03574,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000240
3,THD0027,THD0027,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F04005,F04005,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000239
4,THD0028,THD0028,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F03861,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000239
5,THD0029,THD0029,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F04006,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000240
6,THD0030,THD0030,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F04007,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000240
7,THD0031,THD0031,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F04038,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000239
8,THD0032,THD0032,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F04127,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000241
9,THD0034,THD0034,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,TGen/VUMC,,F04244,,,NCBITaxon:9606,...,5 prime end bias,EFO_0008637,True,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000240


# Validate obs and uns from adata

In [None]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 289105 × 38224
    obs: 'background_fraction', 'cell_probability', 'cell_size', 'droplet_efficiency', 'dataset', 'barcode', 'author_cell_type'
    var: 'ambient_expression', 'feature_type', 'genome', 'gene_id', 'cellbender_analyzed', 'reference'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'
    obsm: 'gene_expression_encoding'

In [None]:
adata.obs['dataset'].value_counts(dropna=False).index.tolist()

['RICH1',
 'COPD49',
 'VUHD116',
 'RICH3',
 'THD0036',
 'VUHD122',
 'THD0044',
 'THD0043',
 'THD0041',
 'RICH7',
 'THD0032',
 'THD0025',
 'RICH9',
 'THD0040',
 'THD0039',
 'THD0024',
 'THD0034',
 'THD0042',
 'THD0002',
 'RICH10',
 'VUHD65',
 'THD0031',
 'THD0028',
 'THD0029',
 'THD0027',
 'THD0030',
 'THD0037',
 'THD0035',
 'VUHD075',
 'VUHD072',
 'THD0038',
 'VUHD073']

In [None]:
obs['sample_ID'].tolist()

['THD0002',
 'THD0024',
 'THD0025',
 'THD0027',
 'THD0028',
 'THD0029',
 'THD0030',
 'THD0031',
 'THD0032',
 'THD0034',
 'THD0035',
 'THD0036',
 'THD0037',
 'THD0038',
 'THD0039',
 'THD0040',
 'THD0041',
 'THD0042',
 'THD0043',
 'THD0044',
 'VUHD65',
 'VUHD072',
 'VUHD073',
 'VUHD075',
 'VUHD116',
 'VUHD122',
 'COPD49',
 'RICH1',
 'RICH3',
 'RICH7',
 'RICH9',
 'RICH10']

In [None]:
non_intersect_sample_id = set(obs['sample_ID'].tolist()) - set(adata.obs['dataset'].value_counts(dropna=False).index.tolist())
non_intersect_sample_id_other_side = set(adata.obs['dataset'].value_counts(dropna=False).index.tolist()) - set(obs['sample_ID'].tolist())

In [None]:
non_intersect_sample_id

set()

In [None]:
non_intersect_sample_id_other_side

set()

In [None]:
# Merge obs and uns
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'dataset',
    df_col = 'sample_ID',
    skip = None,
)

adata.obs['sample_ID'] = adata.obs['dataset']

adata.obs

Unnamed: 0_level_0,background_fraction,cell_probability,cell_size,droplet_efficiency,dataset,barcode,author_cell_type,donor_id,protocol_URL,institute,...,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACTGAGTCAATAGAGT-1,0.000161,0.999955,2951.735596,2.070680,VUHD075,ACTGAGTCAATAGAGT-1,3_AT2,VUHD075,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000238,VUHD075
CACATTTTCCGAATGT-1,0.000000,0.999955,2962.802246,1.950150,VUHD075,CACATTTTCCGAATGT-1,3_Smooth muscle FAM83D+,VUHD075,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000238,VUHD075
ATTGGACCACAGAGGT-1,0.000000,0.999955,2777.117188,1.762267,VUHD075,ATTGGACCACAGAGGT-1,B cells,VUHD075,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000238,VUHD075
GGCCGATCATTTGCCC-1,0.000000,0.999955,2599.234375,1.788824,VUHD075,GGCCGATCATTTGCCC-1,3_AT2,VUHD075,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000238,VUHD075
TTCTACATCCAAAGTC-1,0.000210,0.999955,2611.243408,1.770930,VUHD075,TTCTACATCCAAAGTC-1,EC venous pulmonary,VUHD075,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000238,VUHD075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TACGGGCAGGAATTAC-1-1,0.249995,0.639784,144.171844,0.663551,THD0040,TACGGGCAGGAATTAC-1-1,3_AT2,THD0040,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000241,THD0040
TAGCCGGTCAAGATCC-1,0.319142,0.893551,144.771790,0.644018,THD0040,TAGCCGGTCAAGATCC-1,3_AT2,THD0040,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000241,THD0040
TGTCCCAAGATCGATA-1-1,0.127657,0.752691,145.850082,0.634361,THD0040,TGTCCCAAGATCGATA-1-1,3_AT2,THD0040,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000241,THD0040
TGACAACCAGTGACAG-1,0.199996,0.536261,139.792664,0.662750,THD0040,TGACAACCAGTGACAG-1,3_AT2,THD0040,https://www.science.org/doi/full/10.1126/sciad...,Translation Genomics Research Institute,...,EFO_0008637,true,GRCh38,v98,cell ranger 7.2.0,yes,PATO:0000461,unknown,HsapDv:0000241,THD0040


In [None]:
adata

AnnData object with n_obs × n_vars = 289105 × 38224
    obs: 'background_fraction', 'cell_probability', 'cell_size', 'droplet_efficiency', 'dataset', 'barcode', 'author_cell_type', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_data', 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intron_inclusion', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term

# Add author cell type markers to UNS

In [100]:
# correspond to HLCA v1 cell type markers

# Check author cell type annotations and Cell Ontology IDs

In [100]:
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

EC venous pulmonary             101051
3_AT2                            87542
Adventitial fibroblasts          47105
3_Lymphatic EC mature            11227
EC aerocyte capillary             8168
3_AT1                             6984
B cells                           6406
3_Lymphatic EC proliferating      5991
3_Smooth muscle FAM83D+           5458
AT2 proliferating                 2766
EC general capillary              2505
CD4 T cells                       1968
3_Myofibroblasts                  1200
CD8 T cells                        256
Classical monocytes                201
2_Smooth muscle                    163
Club                                52
2_Hematopoietic stem cells          28
DC1                                 23
2_Mesothelium                        6
DC2                                  3
3_Mast cells                         2
Name: author_cell_type, dtype: int64

In [101]:
# from all cell types, remove '<number>_' prefix if present
adata.obs[AUTHOR_CELL_TYPE] = adata.obs[AUTHOR_CELL_TYPE].apply(lambda x: x.split('_')[1] if '_' in x else x)
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

EC venous pulmonary           101051
AT2                            87542
Adventitial fibroblasts        47105
Lymphatic EC mature            11227
EC aerocyte capillary           8168
AT1                             6984
B cells                         6406
Lymphatic EC proliferating      5991
Smooth muscle FAM83D+           5458
AT2 proliferating               2766
EC general capillary            2505
CD4 T cells                     1968
Myofibroblasts                  1200
CD8 T cells                      256
Classical monocytes              201
Smooth muscle                    163
Club                              52
Hematopoietic stem cells          28
DC1                               23
Mesothelium                        6
DC2                                3
Mast cells                         2
Name: author_cell_type, dtype: int64

In [102]:
ontology_mapping = pd.read_csv("/home/icb/raphael.kfuri-rubens/git/hlca-v2/notebooks/core_datasets/01_wrangling_validation_assembling/HLCA_v1/hlca_v1_annot_ontology_id_mapping_all.csv")

ontology_mapping_dict = dict(zip(ontology_mapping['cell_type_annotation'], ontology_mapping['cell_type_ontology_term_id']))

ontology_mapping_dict

{'Immune': 'CL:0000583',
 'Epithelial': 'CL:0002063',
 'Endothelial': 'CL:0002543',
 'Stroma': 'CL:2000093',
 'Myeloid': 'CL:0000583',
 'Lymphoid': 'CL:0000623',
 'Alveolar epithelium': 'CL:0002063',
 'Airway epithelium': 'CL:0002633',
 'Blood vessels': 'CL:0002543',
 'Fibroblast lineage': 'CL:2000093',
 'Smooth muscle': 'CL:0019019',
 'Submucosal Gland': 'CL:0019001',
 'Lymphatic EC': 'CL:0002138',
 'Mesothelium': 'CL:0000077',
 'Hematopoietic stem cells': 'CL:0000037',
 'Macrophages': 'CL:0000583',
 'Innate lymphoid cell NK': 'CL:0000623',
 'AT2': 'CL:0002063',
 'Basal': 'CL:0002633',
 'EC venous': 'CL:0002543',
 'T cell lineage': 'CL:0000625',
 'EC arterial': 'CL:1001568',
 'Fibroblasts': 'CL:2000093',
 'AT1': 'CL:0002062',
 'Multiciliated lineage': 'CL:0002145',
 'B cell lineage': 'CL:0000786',
 'Secretory': 'CL:0002480',
 'SM activated stress response': 'CL:0000192',
 'Monocytes': 'CL:0000860',
 'Submucosal Secretory': 'CL:0019001',
 'EC capillary': 'CL:0002144',
 'Lymphatic EC ma

In [103]:
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs[AUTHOR_CELL_TYPE].map(ontology_mapping_dict)

adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002543    101051
CL:0002063     90308
CL:4028006     47105
CL:0002138     17218
CL:0002144     10673
CL:0002062      6984
CL:0000236      6406
CL:0000499      5458
CL:0000624      1968
CL:0000186      1200
CL:0000625       256
CL:0000860       201
CL:0019019       163
CL:0000158        52
CL:0000037        28
CL:0000990        23
CL:0000077         6
CL:0002399         3
CL:0000097         2
Name: cell_type_ontology_term_id, dtype: int64

# Check whether ENSEMBL IDs in var

In [104]:
adata.var['gene_symbol'] = adata.var.index
adata.var.index.name = 'index'
adata.var.rename(columns={'gene_id': 'ensembl_id'}, inplace=True)
adata.var

Unnamed: 0_level_0,ambient_expression,feature_type,genome,ensembl_id,cellbender_analyzed,reference,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MIR1302-2HG,0.000000e+00,Gene Expression,GRCh38,ENSG00000243485,False,GRCh38,MIR1302-2HG
FAM138A,0.000000e+00,Gene Expression,GRCh38,ENSG00000237613,False,GRCh38,FAM138A
OR4F5,0.000000e+00,Gene Expression,GRCh38,ENSG00000186092,False,GRCh38,OR4F5
AL627309.1,0.000000e+00,Gene Expression,GRCh38,ENSG00000238009,False,GRCh38,AL627309.1
AL627309.3,0.000000e+00,Gene Expression,GRCh38,ENSG00000239945,False,GRCh38,AL627309.3
...,...,...,...,...,...,...,...
AC010086.3,0.000000e+00,Gene Expression,GRCh38,ENSG00000288057,False,GRCh38,AC010086.3
AC024236.1,0.000000e+00,Gene Expression,GRCh38,ENSG00000286187,False,GRCh38,AC024236.1
PRYP3,0.000000e+00,Gene Expression,GRCh38,ENSG00000169763,False,GRCh38,PRYP3
AC213203.2,0.000000e+00,Gene Expression,GRCh38,ENSG00000277475,False,GRCh38,AC213203.2


# Check raw data

In [105]:
adata.X.toarray().max()

32962

In [106]:
adata.X = adata.X.astype(np.int64)

In [107]:
adata.raw = adata

In [108]:
adata.X

<289105x38224 sparse matrix of type '<class 'numpy.int64'>'
	with 339344761 stored elements in Compressed Sparse Row format>

In [109]:
adata.raw.X

<289105x38224 sparse matrix of type '<class 'numpy.int64'>'
	with 339344761 stored elements in Compressed Sparse Row format>

In [110]:
adata.X.toarray().max()

32962

In [111]:
adata.raw.X.toarray().max()

32962

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK
- OK


# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- MISSING: Marker genes correspond to HLCA v1 as mapped with Archmap
- CHECK: ENSEMBL IDs in var

### Revision:

DONE

In [None]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))

In [5]:
adata = sc.read_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))

  utils.warn_names_duplicates("obs")


In [22]:
adata.obs.index = adata.obs.index.astype(str)
adata.obs_names_make_unique()

In [26]:
# check whether obs index unique
adata.obs.index.is_unique

False

In [24]:
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))

In [25]:
adata = sc.read_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))

  utils.warn_names_duplicates("obs")
