In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# Path to the JSON file handed to `get_gspread_df` for reading the metadata sheets.
# The historical absolute path stays the default; set HLCA_V2_GSPREAD_JSON to point
# the notebook at a different file without editing this cell.
GSPREAD_JSON = os.environ.get(
    "HLCA_V2_GSPREAD_JSON",
    "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json",
)

In [23]:
# Dataset identifier: names the input folder/file below and the output files
# written at the end of the notebook.
DATASET_ID = "Shaykhiev_publ"

# Input AnnData file for this dataset.
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"

# Output directories; the final object is written to both.
OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION = (
    '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision',
    '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision',
)

In [3]:
# Column-name constants used throughout the notebook.
# Each annotation field has a level-0 and a level-1 variant.
AUTHOR_CELL_TYPE_L0, AUTHOR_CELL_TYPE_L1 = (
    'author_cell_type_level_0',
    'author_cell_type_level_1',
)
CELL_TYPE_ONTOLOGY_ID_L0, CELL_TYPE_ONTOLOGY_ID_L1 = (
    'cell_type_ontology_term_id_level_0',
    'cell_type_ontology_term_id_level_1',
)
CELL_TYPE_ONTOLOGY_LABEL_L0, CELL_TYPE_ONTOLOGY_LABEL_L1 = (
    'cell_type_ontology_term_label_level_0',
    'cell_type_ontology_term_label_level_1',
)
AUTHOR_CELL_TYPE_DESCRIPTION_L0, AUTHOR_CELL_TYPE_DESCRIPTION_L1 = (
    'author_cell_type_description_level_0',
    'author_cell_type_description_level_1',
)
MARKER_GENES_L0, MARKER_GENES_L1 = (
    'author_cell_type_markers_level_0',
    'author_cell_type_markers_level_1',
)

# Unsuffixed names: the finest-grained annotation is used as the generic
# dataset-wide cell type.
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
# Read the submitted AnnData file, then fetch the dataset's "tier_1" metadata
# tables ("obs" and "uns") through the project's gspread helper.
adata = sc.read_h5ad(H5AD_PATH)
obs, uns = (
    get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", table)
    for table in ("obs", "uns")
)

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
# Run the validation workflow on the Tier 1 `uns` sheet and display its result.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,A Unique Cellular Organization of Human Distal...,Renat Shaykhiev,sample_ID,umap,published,


In [6]:
# Run the validation workflow on the Tier 1 `obs` sheet and display its result.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,N1_pre-T,N1,https://cdn.10xgenomics.com/image/upload/v1660...,Weill Cornell Epigenomics Core Facility,UNC,,N1_pre-T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000237
1,N1_T,N1,https://cdn.10xgenomics.com/image/upload/v1660...,Weill Cornell Epigenomics Core Facility,UNC,,N1_T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000237
2,N2_pre-T,N2,https://cdn.10xgenomics.com/image/upload/v1660...,Weill Cornell Epigenomics Core Facility,UNC,,N2_pre-T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000264
3,N2_P,N2,https://cdn.10xgenomics.com/image/upload/v1660...,Weill Cornell Epigenomics Core Facility,UNC,,N2_P,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000264
4,N2_T,N2,https://cdn.10xgenomics.com/image/upload/v1660...,Weill Cornell Epigenomics Core Facility,UNC,,N2_T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000264
5,N3_pre-T,N3,https://assets.ctfassets.net/an68im79xiti/51xG...,Weill Cornell Epigenomics Core Facility,UNC,,N3_pre-T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000241
6,N4_pre-T,N4,https://assets.ctfassets.net/an68im79xiti/51xG...,Weill Cornell Epigenomics Core Facility,UNC,,N4_pre-T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000238
7,N4_P,N4,https://assets.ctfassets.net/an68im79xiti/51xG...,Weill Cornell Epigenomics Core Facility,UNC,,N4_P,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000238
8,N4_T,N4,https://assets.ctfassets.net/an68im79xiti/51xG...,Weill Cornell Epigenomics Core Facility,UNC,,N4_T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008565,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000238
9,N5_pre-T,N5,https://assets.ctfassets.net/an68im79xiti/51xG...,Weill Cornell Epigenomics Core Facility,UNC,,N5_pre-T,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,True,GRCh38,v98,cellranger-6.0.0,no,PATO:0000461,unknown,HsapDv:0000241


# Validate obs and uns from adata

In [7]:
# Hand the AnnData object and the Tier 1 `uns` sheet to the merger, rebind
# `adata` to whatever `add_uns_metadata()` returns, and display it.
# NOTE(review): the raw `uns` sheet is passed here, not `validated_uns` — confirm
# this is intended.
merger = AnnDataMerger(adata=adata, uns_df=uns)
adata = merger.add_uns_metadata()
adata

AnnData object with n_obs × n_vars = 115788 × 36591
    obs: 'sample_ID', 'algorithm_resolution_version', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_data', 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intron_inclusion', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'author_cell_type', 'cell_type_

In [8]:
# Run the `obs` validation workflow again, this time with the AnnData object as
# input. Rebinding `validated_obs` replaces the result of the earlier sheet check.
val_workflow = ValidationWorkflow(input=adata, axis='obs')
validated_obs = val_workflow.init_workflow()

# Add author cell type markers to UNS

In [9]:
# Copy the submitted annotation columns to the harmonised column names.
# The source columns are kept, so `obs` holds both afterwards.
for harmonised_col, submitted_col in (
    (MARKER_GENES, 'marker_genes'),
    (CELL_TYPE_ONTOLOGY_LABEL, 'cell_type_ontology_term'),
    (CELL_TYPE_ONTOLOGY_ID, 'cell_type_ontology_id'),
):
    adata.obs[harmonised_col] = adata.obs[submitted_col]
adata.obs.columns

Index(['sample_ID', 'algorithm_resolution_version', 'donor_id', 'protocol_URL',
       'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 'gene_annotation_version', 'alignment_software',
       'intron_inclusion', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id',
       'development_stage

In [10]:
# One row per author cell type (first occurrence wins) with its ontology ID,
# ontology label and marker genes.
mapping_columns = [AUTHOR_CELL_TYPE, CELL_TYPE_ONTOLOGY_ID, CELL_TYPE_ONTOLOGY_LABEL, MARKER_GENES]
cell_type_mapping_df = (
    adata.obs[mapping_columns]
    .copy()
    .drop_duplicates(subset=AUTHOR_CELL_TYPE)
    .reset_index(drop=True)
)
cell_type_mapping_df

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,cell_type_ontology_term_label,author_cell_type_markers
0,"Alveolar epithelial cells, type 2",CL:0002063,type II pneumocyte,SFTPC
1,"Secretory cells, major (common) subtype",CL:0019001,tracheobronchial serous cell,SCGB1A1; negative for BC markers
2,Monocytes,CL:0000576,monocyte,VCAN; FCN1; CD163 (shared with macrophages; bu...
3,"Fibroblasts, common subtype",CL:2000093,bronchus fibroblast of lung,
4,"Macrophages, M1-2 intermediate",CL:0000235;CL:0000890,inflammatory macrophage;alternatively activate...,
5,Terminal airway-enriched secretory cells,CL:1000333,serous cell of epithelium of lobular bronchiole,SFTPB and/or SCGB3A2; a bridge cluster between...
6,"Smooth muscle cells, vascular enriched 2",CL:0019018,blood vessel smooth muscle cell,higher expression of ADIRF than in other SM cl...
7,"Macrophages, non-inflammatory, M2-like",CL:0000890,alternatively activated macrophages,APOE; CCL18; MSR1; MRC1; FABP4
8,"Fibroblasts, matrix and myofibroblasts",,Myofibroblasts,ELN; COL1A1; MMP2 - higher expession of these ...
9,Intermediate cells,CL:4033048,respiratory suprabasal cell,intermediate expression of BC and secretory ma...


In [11]:
# Store the per-cell-type mapping table (author cell type, ontology ID, ontology
# label, marker genes) in `uns` under the marker-genes key, then display the object.
adata.uns[MARKER_GENES] = cell_type_mapping_df
adata

AnnData object with n_obs × n_vars = 115788 × 36591
    obs: 'sample_ID', 'algorithm_resolution_version', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_data', 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intron_inclusion', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'author_cell_type', 'cell_type_

# Check author cell type annotations and Cell Ontology IDs

In [12]:
# Number of cells per author cell type label.
adata.obs[AUTHOR_CELL_TYPE].value_counts()

T cells, central memory and naïve                  10103
Ciliated cells, major (common) subtype              9580
Endothelial cells, venous; fenestrated              6759
Monocytes                                           6247
Alveolar epithelial cells, type 2                   6246
Fibroblasts, common subtype                         5139
T-NK intermediate cells                             4836
Secretory cells, major (common) subtype             4734
B cells                                             4601
Ciliated cells, secretory-like                      4487
Neutrophils                                         4142
CD8+ enriched T, common subtype                     3577
Smooth muscle cells, vascular enriched 1            3523
Endothelial cells, capillary, common                3430
Endothelial cells, capillary, aerocyte-enriched     3420
Basal cells                                         2916
Mast cells                                          2713
Macrophages, non-inflammatory, 

In [13]:
# Number of cells per Cell Ontology term ID value (some values hold several
# IDs joined with ';').
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts()

CL:0000904;CL:0000895     10103
CL:0002145                 9580
CL:4033008                 6759
CL:0000576                 6247
CL:0002063                 6246
CL:2000093                 5139
CL:0000814                 4836
CL:0019018                 4804
CL:0019001                 4734
CL:0000236                 4601
CL:0000775                 4142
CL:0000900;CL:0000909      3577
CL:2000016                 3430
CL:4028002;CL:4028003      3420
CL:0002633                 2916
CL:0000097                 2713
CL:0000890                 2390
CL:4033048                 2355
CL:0000623                 2303
CL:0000235;CL:0000890      1946
CL:0000990                 1643
CL:0002370                 1544
CL:0000235                 1263
CL:4033005                 1249
CL:4033044                  900
CL:0002138                  870
CL:1000333                  719
CL:4033039                  643
CL:4033017                  634
CL:0000669                  619
CL:0000138                  536
CL:00007

In [14]:
# Number of cells per Cell Ontology term label value.
adata.obs[CELL_TYPE_ONTOLOGY_LABEL].value_counts()

central memory CD4-positive, alpha-beta T cell;naive thymus-derived CD4-positive, alpha-beta T cell    10103
ciliated cell of the bronchus                                                                           9580
vein endothelial cell of respiratory system                                                             6759
monocyte                                                                                                6247
type II pneumocyte                                                                                      6246
bronchus fibroblast of lung                                                                             5139
mature NK T cell                                                                                        4836
blood vessel smooth muscle cell                                                                         4804
tracheobronchial serous cell                                                                            4734
B cell             

In [15]:
# Number of cells per marker-gene description string.
adata.obs[MARKER_GENES].value_counts()

LEF1; CCR7; SELL; IL7R                                                                         10103
FOXJ1; CAPS; TPPP3                                                                              9580
ACKR1; PLVAP; SELE; POSTN                                                                       6759
VCAN; FCN1; CD163 (shared with macrophages; but distinguishing from Neu which are negative)     6247
SFTPC                                                                                           6246
combined markers of T and NK cells                                                              4836
SCGB1A1; negative for BC markers                                                                4734
MS4A1; BANK1; CD19; CD79A                                                                       4601
ciliated + SAA1; SAA2; SAA4                                                                     4487
CSF3R; S100A8; FCGR3B; IFITM2                                                              

# Check whether ENSEMBL IDs are in var

In [17]:
# Harmonise the gene annotation table: expose Ensembl IDs and gene symbols under
# explicit column names and index `var` by gene symbol.
#
# The rename runs first and does not use `inplace`. `DataFrame.rename` skips
# labels that are absent, so re-running this cell is a no-op for the rename
# instead of failing with a KeyError on the already-renamed 'name' column.
adata.var = adata.var.rename(columns={'gene_ids': 'ensembl_id', 'name': 'gene_symbol'})
adata.var.index = adata.var['gene_symbol']
adata.var.index.name = 'index'
# NOTE(review): gene symbols are not guaranteed to be unique — confirm the index
# has no duplicates if downstream steps rely on unique var names.

adata.var

Unnamed: 0_level_0,feature_types,ensembl_id,genome,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,Gene Expression,ENSG00000243485,GRCh38,MIR1302-2HG
FAM138A,Gene Expression,ENSG00000237613,GRCh38,FAM138A
OR4F5,Gene Expression,ENSG00000186092,GRCh38,OR4F5
AL627309.1,Gene Expression,ENSG00000238009,GRCh38,AL627309.1
AL627309.3,Gene Expression,ENSG00000239945,GRCh38,AL627309.3
...,...,...,...,...
AC141272.1,Gene Expression,ENSG00000277836,GRCh38,AC141272.1
AC023491.2,Gene Expression,ENSG00000278633,GRCh38,AC023491.2
AC007325.1,Gene Expression,ENSG00000276017,GRCh38,AC007325.1
AC007325.4,Gene Expression,ENSG00000278817,GRCh38,AC007325.4


# Check raw data

In [18]:
# Make sure X is held as a SciPy sparse matrix: a dense X is converted to CSR,
# an already-sparse X is left untouched.
if not sp.issparse(adata.X):
    adata.X = sp.csr_matrix(adata.X)

In [19]:
# Largest value in X, as a quick sanity check that the matrix holds raw counts.
# The maximum is taken on the sparse matrix directly: `.toarray()` would
# materialise the full dense n_obs x n_vars array just to read one number.
adata.X.max()

34567

In [20]:
# Also register the current object under `.raw`, so the counts are available
# both in X and in raw.
adata.raw = adata

In [21]:
# Display the matrix now stored under `.raw` (shape, dtype, sparse format).
adata.raw.X

<115788x36591 sparse matrix of type '<class 'numpy.int64'>'
	with 224323233 stored elements in Compressed Sparse Column format>

In [22]:
# List the obs columns currently present on the object.
adata.obs.columns

Index(['sample_ID', 'algorithm_resolution_version', 'donor_id', 'protocol_URL',
       'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 'gene_annotation_version', 'alignment_software',
       'intron_inclusion', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id',
       'development_stage

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object

# Data Submission Status

- CHECK: Raw data in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs and gene symbols in var

Everything looks good; we just need to check whether the semicolon-separated lists of Cell Ontology term IDs (e.g. `CL:0000235;CL:0000890`) are acceptable.

In [24]:
# Cast the numeric Tier 1 fields to numbers; entries that cannot be parsed
# become NaN (errors='coerce').
for numeric_col in ('cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year'):
    adata.obs[numeric_col] = pd.to_numeric(adata.obs[numeric_col], errors='coerce')

In [25]:
# Write the same gzip-compressed h5ad file into the pre-revision and the
# post-revision output directories.
output_filename = f"{DATASET_ID}.h5ad"
for output_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(output_dir, output_filename), compression='gzip')