In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# Path to the JSON file handed to get_gspread_df (presumably the Google Sheets API
# credentials — confirm against hlca_v2.ingestion_utils).
# NOTE(review): absolute, user-specific path; the notebook only runs as-is on this account.
GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [2]:
# Dataset identifier plus every input/output location used by this notebook.
DATASET_ID = "Schiller_unpubl"

# Shared directory prefixes, spelled out once so the paths below stay in sync.
_HLCA_ROOT = "/home/icb/raphael.kfuri-rubens/data/hlca_v2"
_DATASET_DIR = f"{_HLCA_ROOT}/{DATASET_ID}"
_CORE_DIR = f"{_HLCA_ROOT}/HLCA_V2_CORE"

# Inputs: the submitted AnnData file and the marker gene / ontology spreadsheet.
H5AD_PATH = f"{_DATASET_DIR}/{DATASET_ID}.h5ad"
MARKER_GENES_PATH = f"{_DATASET_DIR}/marker_genes_cell_ontologies.xlsx"

# Outputs: directories the processed object is written to at the end.
OUTPUT_PATH_PREREVISION = f"{_CORE_DIR}/adata_prerevision"
OUTPUT_PATH_POSTREVISION = f"{_CORE_DIR}/adata_postrevision"

In [3]:
# Column-name constants for the cell type annotation fields.

# Generic fields: the finest-grained annotation is used as the dataset-wide cell type.
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Per-level fields follow the pattern "<generic name>_level_<n>".
AUTHOR_CELL_TYPE_L0, AUTHOR_CELL_TYPE_L1, AUTHOR_CELL_TYPE_L2 = (
    f'{AUTHOR_CELL_TYPE}_level_{level}' for level in range(3)
)
CELL_TYPE_ONTOLOGY_ID_L0, CELL_TYPE_ONTOLOGY_ID_L1 = (
    f'{CELL_TYPE_ONTOLOGY_ID}_level_{level}' for level in range(2)
)
CELL_TYPE_ONTOLOGY_LABEL_L0, CELL_TYPE_ONTOLOGY_LABEL_L1 = (
    f'{CELL_TYPE_ONTOLOGY_LABEL}_level_{level}' for level in range(2)
)
AUTHOR_CELL_TYPE_DESCRIPTION_L0, AUTHOR_CELL_TYPE_DESCRIPTION_L1 = (
    f'{AUTHOR_CELL_TYPE_DESCRIPTION}_level_{level}' for level in range(2)
)
MARKER_GENES_L0, MARKER_GENES_L1 = (
    f'{MARKER_GENES}_level_{level}' for level in range(2)
)

# Load data

In [4]:
# Read the submitted AnnData object from disk.
adata = sc.read_h5ad(H5AD_PATH)

# Fetch the two "tier_1" metadata sheets for this dataset (obs first, then uns).
obs, uns = (
    get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", sheet_axis)
    for sheet_axis in ("obs", "uns")
)

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
# Run the validation workflow on the tier-1 `uns` sheet and show what it returns.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()

validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Schiller_single_nuc,Herbert Schiller,"condition, donor_id",X_hlca,protected under embargo,Samples from patients with the same health sta...


In [6]:
# Run the validation workflow on the tier-1 `obs` sheet and show what it returns.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()

validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,ASK612,ASK612,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,Non-Smoker,not applicable,"same personnel, batch 1",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000240
1,ASK620,ASK620,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,Non-Smoker,not applicable,"same personnel, batch 1",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000242
2,ASK633,ASK633,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,Non-Smoker,not applicable,"same personnel, batch 1",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000242
3,ASK621,ASK621,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,Smoker,not applicable,"same personnel, batch 1",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000242
4,ASK627,ASK627,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,Smoker,not applicable,"same personnel, batch 1",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000242
5,ASK634,ASK634,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,Smoker,not applicable,"same personnel, batch 1",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000240
6,ASK543,ASK543,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,COPDII,not applicable,"same personnel, batch 2",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000239
7,ASK611,ASK611,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,COPDII,not applicable,"same personnel, batch 2",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000241
8,ASK631,ASK631,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","Asklepios Lungenklinik Gauting, Munich",day1,COPDII,not applicable,"same personnel, batch 2",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000242
9,M434WRC,M434WRC,"same multiplexed protocol, unpublished","CPC/LHI, Helmholtz Center Munich","LMU Klinikum, Munich",day1,COPDIV,not applicable,"same personnel, batch 2",NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000241


# Validate obs and uns from adata

In [7]:
# Add the tier-1 obs sheet to adata.obs (only obs is merged in this cell; uns is not).
# The 'sample_ID' column is named on both sides, presumably as the join key.
# NOTE(review): the raw `obs` sheet is passed here rather than `validated_obs` —
# confirm that this is intended.
merger = AnnDataMerger(adata=adata, obs_df=obs)
adata = merger.add_obs_metadata(adata_col='sample_ID', df_col='sample_ID', skip=None)

adata.obs

Unnamed: 0,condition,genotype,status,demultiplex_sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
AAACCCAAGCGAAACC-1,COPDII,1,singlet,COPDII_1,1617,7.388946,2898.0,7.972121,17.563837,26.293996,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000239
AAACCCACAATCTGCA-1,COPDII,1,singlet,COPDII_1,2051,7.626570,3268.0,8.092239,12.301102,19.033048,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,unknown
AAACCCACACACTTAG-1,COPDII,1,singlet,COPDII_1,1958,7.580189,3132.0,8.049747,14.112388,21.328225,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000239
AAACCCAGTACCACGC-1,COPDII,1,doublet,COPDII_1,2184,7.689371,3527.0,8.168487,11.256025,17.833853,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,unknown
AAACCCAGTGACTGAG-1,COPDII,1,singlet,COPDII_1,1456,7.284135,2514.0,7.830028,19.809069,28.758950,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGTCGCTAATG-1,Smoker,0,singlet,Smoker_0,704,6.558198,1130.0,7.030858,23.893805,36.460177,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,unknown
TTTGGTTGTAGCTAAA-1,Smoker,0,singlet,Smoker_0,728,6.591674,1222.0,7.109062,26.595745,38.952537,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,unknown
TTTGGTTTCTCATTGT-1,Smoker,0,singlet,Smoker_0,674,6.514713,1089.0,6.993933,28.466483,40.404040,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000242
TTTGTTGTCCGTATGA-1,Smoker,0,singlet,Smoker_0,813,6.701960,1661.0,7.415777,31.727875,42.865744,...,3 prime tag,EFO:0008637,true,GRCh38,v107,cell ranger 6.1.2.,yes,PATO:0000461,unknown,unknown


In [8]:
# Load the marker gene sheet, drop rows that are entirely empty, and prefix every
# value of the 'ontology' column with 'CL:'.
marker_genes_df = (
    pd.read_excel(MARKER_GENES_PATH, sheet_name='Sheet1')
    .dropna(how='all')
    .assign(ontology=lambda sheet: 'CL:' + sheet['ontology'].astype(str))
)

# Rename the three sheet columns (by position) to the standard field names.
marker_genes_df.columns = [AUTHOR_CELL_TYPE, CELL_TYPE_ONTOLOGY_ID, MARKER_GENES]

# Fix a misspelled cell type label in the sheet.
marker_genes_df[AUTHOR_CELL_TYPE] = marker_genes_df[AUTHOR_CELL_TYPE].replace(
    {'Macrophae MERTK+': 'Macrophage MERTK+'}
)

marker_genes_df

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,author_cell_type_markers
0,AT2,CL:0002063,RTKN2; AGER; NCKAP5; CLIC5; SCEL; CAV2; COL4A...
1,AT1,CL:0002062,SFTPC; SFTPA1; NAPSA; SFTPA2; WIF1; ACOXL; AGB...
2,Basal cell,CL:0000646,KRT17; TP63; CACHD1; KRT5; KRT15; AQP3; FGFR3
3,Multiciliated,CL:4030034,DNAH12;CCDC78; CAPS; ZBBX; HYDIN; CFAP157;ERI...
4,Goblet,CL:0000160,BPIFB1;MUC5B;CP; PLEKHS1; BMPR1B; ERN2; MUC4; ...
5,Club,CL:0000158,MET; STEAP4; BMP6; SCGB3A2; SFTPB;
7,Alveolar macrophages,CL:0000583,MARCO; INHBA; TREM1; ABHD5; PPARG; FABP4;SLC11A1
8,Macrophage CHIT1+,CL:0000253,SLC1A3; SDC2; CHIT1
9,Macrophage interstitial,CL:4033043,F13A1; STAB1; MRC1; RBPJ; F13A1; CD163
10,Macrophage MERTK+,CL:0000253,MERTK; SLC9A9; RGL1;HK2; CTSL; SAT1; TNS3


In [9]:
# Michael's object with cell type and ontology annotations.
# The file sits in the same dataset directory as H5AD_PATH, so the location is
# derived from that constant instead of repeating the absolute path; it therefore
# stays consistent with DATASET_ID.
adata_new = sc.read_h5ad(join(os.path.dirname(H5AD_PATH), "COPD_29_04_2024.h5ad"))

In [10]:
# Annotation columns taken over from the annotated object.
annotation_columns = ['cell_type', 'Level_3', 'Level_2', 'Level_1', 'group']

# Inner join on the cell barcode index: only cells present in both objects are kept.
# validate='one_to_one' makes the merge fail loudly if a barcode is duplicated on
# either side, instead of silently multiplying rows.
df = adata.obs.merge(
    adata_new.obs[annotation_columns],
    how='inner',
    left_index=True,
    right_index=True,
    validate='one_to_one',
)

df

Unnamed: 0,condition,genotype,status,demultiplex_sample,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,...,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,cell_type,Level_3,Level_2,Level_1,group
AAACCCAAGCGAAACC-1,COPDII,1,singlet,COPDII_1,1617,7.388946,2898.0,7.972121,17.563837,26.293996,...,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000239,Lipofibroblast,Fibroblasts,Fibroblast lineage,Stroma,outside
AAACCCACAATCTGCA-1,COPDII,1,singlet,COPDII_1,2051,7.626570,3268.0,8.092239,12.301102,19.033048,...,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,unknown,Alveolar macrophages,Macrophages,Myeloid,Immune,outside
AAACCCAGTACCACGC-1,COPDII,1,doublet,COPDII_1,2184,7.689371,3527.0,8.168487,11.256025,17.833853,...,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,unknown,Lymphatic EC,Lymphatic EC,Lymphatic EC,Endothelial,within
AAACCCAGTGACTGAG-1,COPDII,1,singlet,COPDII_1,1456,7.284135,2514.0,7.830028,19.809069,28.758950,...,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,HsapDv:0000239,EC arterial,EC arterial,Blood vessels,Endothelial,outside
AAACGAAAGATCCCGC-1,COPDII,1,singlet,COPDII_1,1469,7.293018,2574.0,7.853605,17.055167,26.107226,...,cell ranger 6.1.2.,yes,MONDO:0005002,unknown,unknown,AT1,AT1,Alveolar epithelium,Epithelial,outside
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGTCGCTAATG-1,Smoker,0,singlet,Smoker_0,704,6.558198,1130.0,7.030858,23.893805,36.460177,...,cell ranger 6.1.2.,yes,PATO:0000461,unknown,unknown,EC aerocyte capillary,EC capillary,Blood vessels,Endothelial,outside
TTTGGTTGTAGCTAAA-1,Smoker,0,singlet,Smoker_0,728,6.591674,1222.0,7.109062,26.595745,38.952537,...,cell ranger 6.1.2.,yes,PATO:0000461,unknown,unknown,Adventitial fibroblasts,Fibroblasts,Fibroblast lineage,Stroma,outside
TTTGGTTTCTCATTGT-1,Smoker,0,singlet,Smoker_0,674,6.514713,1089.0,6.993933,28.466483,40.404040,...,cell ranger 6.1.2.,yes,PATO:0000461,unknown,HsapDv:0000242,Lipofibroblast,Fibroblasts,Fibroblast lineage,Stroma,outside
TTTGTTGTCCGTATGA-1,Smoker,0,singlet,Smoker_0,813,6.701960,1661.0,7.415777,31.727875,42.865744,...,cell ranger 6.1.2.,yes,PATO:0000461,unknown,unknown,EC general capillary,EC capillary,Blood vessels,Endothelial,outside


In [11]:
# Keep only the cells whose barcode also occurs in the annotated object.
shared_cells = adata.obs_names.isin(adata_new.obs_names)
adata = adata[shared_cells].copy()
adata

AnnData object with n_obs × n_vars = 42241 × 20669
    obs: 'condition', 'genotype', 'status', 'demultiplex_sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'sample', 'n_counts', 'n_genes', 'doublet_scores', 'scDblFinder_class', 'soupx_groups', 'scrublet', 'Level_1_transfered_label', 'Level_1_transfer_uncert', 'Level_2_transfered_label', 'Level_2_transfer_uncert', 'Level_3_transfered_label', 'Level_3_transfer_uncert', 'Level_4_transfered_label', 'Level_4_transfer_uncert', 'Level_5_transfered_label', 'Level_5_transfer_uncert', 'condition_coarse', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.5', 'leiden_1.0', 'hlca_manual_lv1', 'hlca_manual_lv2', 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relat

In [12]:
# Replace obs with the merged table. The rows are selected by adata's own cell
# names so that every annotation row lands on the matching cell; assigning `df`
# directly would rely on both objects happening to share the same row order.
adata.obs = df.loc[adata.obs_names]
adata.obs.columns

Index(['condition', 'genotype', 'status', 'demultiplex_sample',
       'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'total_counts_mito',
       'log1p_total_counts_mito', 'pct_counts_mito', 'sample', 'n_counts',
       'n_genes', 'doublet_scores', 'scDblFinder_class', 'soupx_groups',
       'scrublet', 'Level_1_transfered_label', 'Level_1_transfer_uncert',
       'Level_2_transfered_label', 'Level_2_transfer_uncert',
       'Level_3_transfered_label', 'Level_3_transfer_uncert',
       'Level_4_transfered_label', 'Level_4_transfer_uncert',
       'Level_5_transfered_label', 'Level_5_transfer_uncert',
       'condition_coarse', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4',
       'leiden_0.5', 'leiden_1.0', 'hlca_manual_lv1', 'hlca_manual_lv2',
       'sample_ID', 'donor_id', 'protocol_URL', 'institute',
     

In [13]:
# Collect the distinct cell type labels on each side ...
cell_type_adata = set(adata.obs['cell_type'].unique())
cell_type_marker_genes = set(marker_genes_df[AUTHOR_CELL_TYPE].unique())

# ... and list the labels used in adata that are missing from the marker gene table.
cell_type_adata.difference(cell_type_marker_genes)

{'Basal',
 'Interstitial macrophage',
 'Lipofibroblast',
 'Lymphatic EC',
 'Regulatory T cells',
 'Smooth muscle'}

In [14]:
# The reverse check: labels in the marker gene table that never occur in adata.
cell_type_marker_genes.difference(cell_type_adata)

{'Basal cell',
 'LECs',
 'Lipofibroblasts',
 'Macrophage interstitial',
 'Regulatory T cell',
 'SMCs'}

In [15]:
# Translate the adata cell type labels that differ from the marker gene table
# (left) into the spelling used in that table (right).
cell_type_mapping = dict([
    ('Basal', 'Basal cell'),
    ('Interstitial macrophage', 'Macrophage interstitial'),
    ('Lipofibroblast', 'Lipofibroblasts'),
    ('Lymphatic EC', 'LECs'),
    ('Regulatory T cells', 'Regulatory T cell'),
    ('Smooth muscle', 'SMCs'),
])

# Labels without an entry in the mapping are carried over unchanged.
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['cell_type'].replace(to_replace=cell_type_mapping)

In [16]:
# For each cell, look up the ontology ID and the marker gene string of its author
# cell type in the marker gene table. Labels absent from the table map to NaN.
for obs_column in (CELL_TYPE_ONTOLOGY_ID, MARKER_GENES):
    lookup = dict(zip(marker_genes_df[AUTHOR_CELL_TYPE], marker_genes_df[obs_column]))
    adata.obs[obs_column] = adata.obs[AUTHOR_CELL_TYPE].map(lookup)

In [17]:
# Count cells per marker gene string; dropna=False keeps unmapped cells visible as NaN.
adata.obs[MARKER_GENES].value_counts(dropna=False)

RTKN2; AGER; NCKAP5; CLIC5; SCEL;  CAV2; COL4A3; LAMA3                     9254
ITGA8;MACF1;PIEZO2; SVEP1; CDH11; NCAM2                                    5411
SFTPC; SFTPA1; NAPSA; SFTPA2; WIF1; ACOXL; AGBL1; ABCA3; LAMP3             4953
CA4; EDN1; NOSTRIN; FCN3; SLC6A4; BTNL9                                    3158
MARCO; INHBA; TREM1; ABHD5; PPARG; FABP4;SLC11A1                           2046
COL15A1; PLVAP; VWA1; KCNE3                                                1863
COL3A1; CCDC80; ABCA10; FBLN1; FBN1; CFH; PODN                             1603
EDNRB;  HPGD; EMCN; PDE1C; CYP3A5                                          1481
ITGA1;ITGAE; CD2; CD6                                                      1445
LGR6; ACTA2; MYH11;  LMOD1; MYOCD; IRAG1; TAGLN                            1253
GUCY1A2; PDGFRB; ADARB2; LAMC3; AFF2; ADCY3; TBX5                           869
DNAH12;CCDC78; CAPS;  ZBBX; HYDIN; CFAP157;ERICH3;CFAP47; CFAP73; CDHR4     718
F13A1; STAB1; MRC1; RBPJ; F13A1; CD163  

In [18]:
# Display the object summary to inspect its contents after the obs columns were added.
adata

AnnData object with n_obs × n_vars = 42241 × 20669
    obs: 'condition', 'genotype', 'status', 'demultiplex_sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'sample', 'n_counts', 'n_genes', 'doublet_scores', 'scDblFinder_class', 'soupx_groups', 'scrublet', 'Level_1_transfered_label', 'Level_1_transfer_uncert', 'Level_2_transfered_label', 'Level_2_transfer_uncert', 'Level_3_transfered_label', 'Level_3_transfer_uncert', 'Level_4_transfered_label', 'Level_4_transfer_uncert', 'Level_5_transfered_label', 'Level_5_transfer_uncert', 'condition_coarse', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4', 'leiden_0.5', 'leiden_1.0', 'hlca_manual_lv1', 'hlca_manual_lv2', 'sample_ID', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relat

# Add author cell type markers to UNS

In [19]:
# Store the whole marker gene table in uns under the 'author_cell_type_markers' key.
adata.uns[MARKER_GENES] = marker_genes_df

# Check author cell type annotations and Cell Ontology IDs

In [20]:
# List the obs columns to check which annotation fields are present.
adata.obs.columns

Index(['condition', 'genotype', 'status', 'demultiplex_sample',
       'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'total_counts_mito',
       'log1p_total_counts_mito', 'pct_counts_mito', 'sample', 'n_counts',
       'n_genes', 'doublet_scores', 'scDblFinder_class', 'soupx_groups',
       'scrublet', 'Level_1_transfered_label', 'Level_1_transfer_uncert',
       'Level_2_transfered_label', 'Level_2_transfer_uncert',
       'Level_3_transfered_label', 'Level_3_transfer_uncert',
       'Level_4_transfered_label', 'Level_4_transfer_uncert',
       'Level_5_transfered_label', 'Level_5_transfer_uncert',
       'condition_coarse', 'leiden_0.2', 'leiden_0.3', 'leiden_0.4',
       'leiden_0.5', 'leiden_1.0', 'hlca_manual_lv1', 'hlca_manual_lv2',
       'sample_ID', 'donor_id', 'protocol_URL', 'institute',
     

In [21]:
# Cells per author cell type; dropna=False would surface missing labels as NaN.
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

AT2                        9254
Lipofibroblasts            5411
AT1                        4953
EC general capillary       3158
Alveolar macrophages       2046
EC venous systemic         1863
Adventitial fibroblasts    1603
EC aerocyte capillary      1481
CD4 T cells TRM            1445
SMCs                       1253
Pericytes                   869
Multiciliated               718
Macrophage interstitial     678
EC arterial                 670
LECs                        628
Monocytes                   608
CD8_EM/EMRA                 590
NK cells                    572
Plasma cells                551
CD4 T cells naive           455
Club                        442
CD8 T cells TRM             368
Macrophage CHIT1+           328
CD8_TRM/EM                  323
DC2                         282
Macrophage MERTK+           272
EC venous pulmonary         261
Mast cells                  251
B cells                     242
Monocytes CSF3R+/CD16-      156
Regulatory T cell           155
Basal ce

In [22]:
# Cells per 'Level_1' label (copied into author_cell_type_level_0 further down).
adata.obs['Level_1'].value_counts(dropna=False)

Epithelial     15593
Immune          9451
Stroma          9136
Endothelial     8061
Name: Level_1, dtype: int64

In [23]:
# Cells per 'Level_2' label (copied into author_cell_type_level_1 further down).
adata.obs['Level_2'].value_counts(dropna=False)

Alveolar epithelium    14207
Fibroblast lineage      7883
Blood vessels           7433
Myeloid                 4750
Lymphoid                4701
Airway epithelium       1386
Smooth muscle           1253
Lymphatic EC             628
Name: Level_2, dtype: int64

In [24]:
# Cells per 'Level_3' label (copied into author_cell_type_level_2 further down).
adata.obs['Level_3'].value_counts(dropna=False)

AT2                        9254
Fibroblasts                7883
AT1                        4953
EC capillary               4639
T cell lineage             3336
Macrophages                3324
EC venous                  2124
Smooth muscle              1253
B cell lineage              793
Monocytes                   764
Multiciliated               718
EC arterial                 670
Lymphatic EC                628
Innate lymphoid cell NK     572
Secretory                   532
Dendritic cells             411
Mast cells                  251
Basal                       136
Name: Level_3, dtype: int64

In [25]:
# Copy the annotation hierarchy into the standard per-level columns:
# Level_1 -> level 0, Level_2 -> level 1, Level_3 -> level 2.
for target_column, source_column in (
    (AUTHOR_CELL_TYPE_L0, 'Level_1'),
    (AUTHOR_CELL_TYPE_L1, 'Level_2'),
    (AUTHOR_CELL_TYPE_L2, 'Level_3'),
):
    adata.obs[target_column] = adata.obs[source_column]

In [26]:
# Cells per Cell Ontology ID; a NaN row here would mean some cell types were left unmapped.
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063      9254
CL:0002553      7014
CL:0002062      4953
CL:0002144      3158
CL:0000583      2046
CL:0002543      1863
CL:0002145      1481
CL:4033038      1445
CL:0000192      1253
CL:0009089       869
CL:4030034       718
CL:4033043       678
CL:1000413       670
CL:0009086       628
CL:0000860       608
CL:0000253       600
CL:0000913       590
CL:0000623       572
CL:0000786       551
CL:0000895       455
CL:0000158       442
CL:4033039       368
CL:0000909       323
CL:0001056       308
CL:4033008       261
CL:0000097       251
CL:0000236       242
CL:0000875       156
CL:0000815       155
CL:0000646       136
CL:0000160        90
CL:0000990        56
CL:0001058        47
Name: cell_type_ontology_term_id, dtype: int64

# Check whether ENSEMBL IDs are in var

In [40]:
# Inspect the var table to see which gene identifier columns are present.
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,highly_variable_nbatches,highly_variable_intersection,ensembl_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ATAD3B,False,0.036800,0.242076,-0.357507,0,False,ENSG00000160072,ATAD3B
PRDM16,False,0.168010,0.664629,0.025617,0,False,ENSG00000142611,PRDM16
PEX10,False,0.010376,0.226664,-0.387448,0,False,ENSG00000157911,PEX10
SKI,False,0.261257,0.610046,-0.378022,0,False,ENSG00000157933,SKI
PEX14,False,0.200704,0.592536,-0.165523,0,False,ENSG00000142655,PEX14
...,...,...,...,...,...,...,...,...
ENSG00000275063,False,0.000176,0.632660,0.991525,0,False,ENSG00000275063,ENSG00000275063
ENSG00000277856,False,0.000064,-0.458416,-2.481308,0,False,ENSG00000277856,ENSG00000277856
ENSG00000271254,False,0.009453,0.267841,-0.261455,0,False,ENSG00000271254,ENSG00000271254
ENSG00000268674,False,0.000401,0.486345,0.468138,0,False,ENSG00000268674,ENSG00000268674


In [41]:
# Standardise the var table: name the index 'index' and rename the gene identifier
# columns to 'gene_symbol' / 'ensembl_id' where the older spellings are present.
legacy_to_standard = {'gene_name': 'gene_symbol', 'ensembl_ID': 'ensembl_id'}

adata.var.index.name = 'index'
adata.var.rename(columns=legacy_to_standard, inplace=True)
adata.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm,highly_variable_nbatches,highly_variable_intersection,ensembl_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ATAD3B,False,0.036800,0.242076,-0.357507,0,False,ENSG00000160072,ATAD3B
PRDM16,False,0.168010,0.664629,0.025617,0,False,ENSG00000142611,PRDM16
PEX10,False,0.010376,0.226664,-0.387448,0,False,ENSG00000157911,PEX10
SKI,False,0.261257,0.610046,-0.378022,0,False,ENSG00000157933,SKI
PEX14,False,0.200704,0.592536,-0.165523,0,False,ENSG00000142655,PEX14
...,...,...,...,...,...,...,...,...
ENSG00000275063,False,0.000176,0.632660,0.991525,0,False,ENSG00000275063,ENSG00000275063
ENSG00000277856,False,0.000064,-0.458416,-2.481308,0,False,ENSG00000277856,ENSG00000277856
ENSG00000271254,False,0.009453,0.267841,-0.261455,0,False,ENSG00000271254,ENSG00000271254
ENSG00000268674,False,0.000401,0.486345,0.468138,0,False,ENSG00000268674,ENSG00000268674


In [51]:
# Convert both gene identifier columns, and the var index itself, to strings.
for identifier_column in ('gene_symbol', 'ensembl_id'):
    adata.var[identifier_column] = adata.var[identifier_column].astype(str)

adata.var.index = adata.var.index.astype(str)

In [52]:
# Confirm the column dtypes of var after the string conversion.
adata.var.dtypes

highly_variable                    bool
means                           float64
dispersions                     float64
dispersions_norm                float32
highly_variable_nbatches          int64
highly_variable_intersection       bool
ensembl_id                       object
gene_symbol                      object
dtype: object

# Check raw data

In [53]:
# Largest value in the SoupX counts layer. .max() is called on the matrix directly,
# so a sparse layer does not have to be expanded into a dense array first.
adata.layers['soupX_counts'].max()

3026.0

In [54]:
# Use the 'soupX_counts' layer as the main data matrix.
# NOTE(review): judging by the layer name these are presumably ambient-RNA-corrected
# counts rather than unmodified raw counts — confirm this is what "raw counts" should
# mean for the submission.
adata.X = adata.layers['soupX_counts']

In [55]:
# Cast the counts to int64. astype would silently truncate fractional values, so
# first confirm that every stored entry is already a whole number.
count_values = adata.X.data if sp.issparse(adata.X) else np.asarray(adata.X)
assert np.array_equal(count_values, np.rint(count_values)), (
    "adata.X contains non-integer values; casting to int64 would truncate them"
)
adata.X = adata.X.astype(np.int64)

In [56]:
# Re-check the var dtypes before the object is stored in .raw.
adata.var.dtypes

highly_variable                    bool
means                           float64
dispersions                     float64
dispersions_norm                float32
highly_variable_nbatches          int64
highly_variable_intersection       bool
ensembl_id                       object
gene_symbol                      object
dtype: object

In [57]:
# Store the current state of the object (integer counts in X, current var) as .raw.
adata.raw = adata

In [59]:
# Check that the var table held by .raw has the same dtypes as adata.var.
adata.raw.var.dtypes

highly_variable                    bool
means                           float64
dispersions                     float64
dispersions_norm                float32
highly_variable_nbatches          int64
highly_variable_intersection       bool
ensembl_id                       object
gene_symbol                      object
dtype: object

In [48]:
# Show the type, shape and dtype of the main matrix.
adata.X

<42241x20669 sparse matrix of type '<class 'numpy.int64'>'
	with 69689379 stored elements in Compressed Sparse Row format>

In [99]:
# Show the type, shape and dtype of the matrix stored in .raw for comparison with X.
adata.raw.X

<42241x20669 sparse matrix of type '<class 'numpy.int64'>'
	with 69689379 stored elements in Compressed Sparse Row format>

In [100]:
# Largest count in X, computed on the matrix directly so that a sparse matrix
# does not have to be expanded into a dense array first.
adata.X.max()

3026

In [101]:
# Largest count in raw.X, computed on the matrix directly so that a sparse matrix
# does not have to be expanded into a dense array first; should equal the maximum of X.
adata.raw.X.max()

3026

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object same as Google Sheet

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object same as Google Sheet


# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs and gene symbols in var

# Revisions:

DONE


In [49]:
# Write the same gzip-compressed object into both output directories
# (pre-revision first, then post-revision), named after the dataset.
for output_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(output_dir, f"{DATASET_ID}.h5ad"), compression='gzip')