In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# Path to the JSON file passed as the first argument to get_gspread_df when the
# Tier 1 sheets are fetched.
# NOTE(review): looks like a Google Sheets API credentials file stored under a
# user home directory — confirm it is kept out of version control.
GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [2]:
# Root directory of the HLCA v2 data. The default is the location this notebook
# was originally written against; set the HLCA_V2_DATA_DIR environment variable
# to run it on another machine without editing this cell.
HLCA_DATA_DIR = os.environ.get(
    "HLCA_V2_DATA_DIR",
    "/home/icb/raphael.kfuri-rubens/data/hlca_v2",
)

# Identifier of the dataset being ingested; it also names the dataset's folder
# and its .h5ad file.
DATASET_ID = "Kim_publ"

# Input AnnData file for this dataset.
H5AD_PATH = f"{HLCA_DATA_DIR}/{DATASET_ID}/{DATASET_ID}.h5ad"

# Output folders the curated object is written to at the end of the notebook.
OUTPUT_PATH_PREREVISION = f"{HLCA_DATA_DIR}/HLCA_V2_CORE/adata_prerevision"
OUTPUT_PATH_POSTREVISION = f"{HLCA_DATA_DIR}/HLCA_V2_CORE/adata_postrevision"

In [3]:
# Name constants: standardized obs/uns field names used by the cells below.
# Columns for the author's annotation hierarchy (levels 0 to 2).
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'
AUTHOR_CELL_TYPE_L2 = 'author_cell_type_level_2'

# The finest-grained author annotation is stored as the generic dataset cell type.
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
# NOTE(review): CELL_TYPE_ONTOLOGY_LABEL and AUTHOR_CELL_TYPE_DESCRIPTION are
# not referenced by any other cell visible in this notebook.
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers_rna'
MARKER_PROTEINS = 'author_cell_type_markers_protein'
# Used both as an obs column name and as the uns key of the marker table.
MARKERS = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
# Read the dataset's AnnData file from disk.
adata = sc.read_h5ad(H5AD_PATH)

# Fetch the two Tier 1 metadata sheets for this dataset: obs first, then uns.
tier1_sheet_args = (GSPREAD_JSON, DATASET_ID, "tier_1")
obs = get_gspread_df(*tier1_sheet_args, "obs")
uns = get_gspread_df(*tier1_sheet_args, "uns")

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
# Run the validation workflow on the Tier 1 `uns` sheet and display its result.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Cells of COPD lung patients and controls,"Edy Kim, Jeong Yun",patient,wnn.UMAP,"Dataset published, consented for release",


In [6]:
# Run the validation workflow on the Tier 1 `obs` sheet and display its result.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,donor_id,sample_ID,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,V148_Donor,V148,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-959,,"day4, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000238
1,V15_GOLD_I_II,V15,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-953,,"day3, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,MONDO:0005002,unknown,HsapDv:0000241
2,V159_End_stage_COPD,V159,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-962,,"day4, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,MONDO:0005002,unknown,
3,V161_Donor,V161,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-947,,"day4, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,
4,V162_End_stage_COPD,V162,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-965,,"day4, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,MONDO:0005002,unknown,HsapDv:0000242
5,V169_Donor,V169,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-998,,"N/A, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000240
6,V17_Control,V17,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-980,,"day5, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000240
7,V19_Emphysema,V19,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-933,,"day2, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,MONDO:0004849,unknown,HsapDv:0000240
8,V2_Emphysema,V2,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-977,,"day5, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,MONDO:0004849,unknown,HsapDv:0000240
9,V22_Control,V22,https://pubmed.ncbi.nlm.nih.gov/35649411/,"Brigham and Women’s Hospital, Boston MA",BWH,,BRI-983,,"day5, same personnel",NCBITaxon:9606,...,5 prime tag,EFO:0008367,True,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000242


# Validate obs and uns from adata

In [7]:
# Merge the Tier 1 obs sheet into the AnnData object
# (the uns sheet is handled in the following cell).
# Note: the sheet as fetched (`obs`) is passed here, not `validated_obs`.
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

# Presumably 'sample_id' (adata side) and 'sample_ID' (sheet side) are the join
# keys — confirm against AnnDataMerger.
# NOTE(review): the result is bound to adata_merged, but no later cell visible
# in this notebook reads adata_merged; they keep using `adata`. Presumably
# add_obs_metadata also updates `adata` in place — confirm.
adata_merged = merger.add_obs_metadata(
    adata_col = 'sample_id',
    df_col = 'sample_ID',
    skip = None
)

# Display the merged per-cell metadata.
adata_merged.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_Protein,nFeature_Protein,clonotype_id,reads,umis,v_gene,d_gene,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
BRI915_AAACCTGAGTAGCGGT-1,BRI-915,5032.0,1934,549.0,133,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000242
BRI915_AAACCTGCAGACAAGC-1,BRI-915,7761.0,2745,2101.0,183,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000242
BRI915_AAACCTGCAGCATGAG-1,BRI-915,9515.0,2771,2069.0,180,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000242
BRI915_AAACCTGCATTCCTGC-1,BRI-915,11623.0,3248,1238.0,138,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000242
BRI915_AAACCTGGTACAGTGG-1,BRI-915,15134.0,3744,1671.0,176,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BRI998_TTTGTCACACGTCAGC-1,BRI-998,18025.0,4252,1549.0,150,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000240
BRI998_TTTGTCACATTTCAGG-1,BRI-998,2870.0,1345,387.0,89,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000240
BRI998_TTTGTCATCCAGAAGG-1,BRI-998,5801.0,2021,529.0,102,clonotype1,3444,6,TRAV24,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000240
BRI998_TTTGTCATCCTTAATC-1,BRI-998,4150.0,1538,422.0,95,,-2147483648,-2147483648,,,...,5 prime tag,EFO:0008367,true,GRCh38,v93,cell ranger 4,no,PATO:0000461,unknown,HsapDv:0000240


In [8]:
# Attach the Tier 1 uns sheet to the AnnData object.
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

# Rebind `adata` to whatever add_uns_metadata returns; all later cells use it.
adata = merger.add_uns_metadata()

# Display the object summary.
adata

AnnData object with n_obs × n_vars = 109361 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_Protein', 'nFeature_Protein', 'clonotype_id', 'reads', 'umis', 'v_gene', 'd_gene', 'j_gene', 'c_gene', 'cdr3s_aa', 'cdr3s_nt', 'sample_ID', 'disease', 'cell_viability', 'percent.mt', 'G2M.Score', 'doublet_scrublet', 'percent.RPS', 'percent.RPL', 'qc.mit.Outlier', 'qc.nFeat.Outlier', 'qc.nCount.Outlier', 'qc.rps.Outlier', 'qc.rpl.Outlier', 'Level_1', 'Level_2', 'Level_3', 'Level_4', 'CellOntology', 'wnnUMAP_1', 'wnnUMAP_2', 'RNAmarkers', 'CITEmarkers', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_en

# Add author cell type markers to UNS

In [9]:
# Build a table with one row per Level_4 label, carrying its coarser levels,
# ontology ID and marker strings (source columns: RNAmarkers, CITEmarkers, CellOntology).
columns_to_keep = ['Level_4', 'Level_3', 'Level_2', 'CellOntology', 'RNAmarkers', 'CITEmarkers']

# Source column -> standardized column name.
marker_column_names = {
    'Level_2': AUTHOR_CELL_TYPE_L0,
    'Level_3': AUTHOR_CELL_TYPE_L1,
    'Level_4': AUTHOR_CELL_TYPE_L2,
    'CellOntology': CELL_TYPE_ONTOLOGY_ID,
    'RNAmarkers': MARKER_GENES,
    'CITEmarkers': MARKER_PROTEINS,
}

# drop_duplicates keeps the first cell encountered for each Level_4 label, so
# the other columns are taken from that one cell.
unique_level_4 = (
    adata.obs
    .drop_duplicates(subset=['Level_4'])[columns_to_keep]
    .reset_index(drop=True)
    .rename(columns=marker_column_names)
)
unique_level_4

Unnamed: 0,author_cell_type_level_2,author_cell_type_level_1,author_cell_type_level_0,cell_type_ontology_term_id,author_cell_type_markers_rna,author_cell_type_markers_protein
0,CD8+ T rm s3,CD8 T memory,CD8+ T cell,CL:4033040,PTGDS; SPON2; FGFBP2; FCER1G; FCGR3A; PRF1; TY...,CD16-prot; CD45RA-prot4; CD57Recombinant-prot;...
1,ncMono,Non-Classical Monocyte,Monocyte,CL:0002396,HLA-DRA; CD79A; CD74; MS4A1; TCL1A; FCER2; HLA...,CD19-prot4; IgD-prot; CD21-prot4; IgM-prot2; C...
2,Mac intra,Macrophage interstitial intravascular,Mac Alv s1,CL:4033043,S100A9; RPS4Y1; PLAC8; PRELID1; AIF1; ACTB; CY...,CLEC12A-prot4; CD35-prot6; Ig-light-chain-kapp...
3,AT2 s1,AT2 s1,AT2 s1,CL:0002063,SFTPA1; SFTPC; NNMT; PGC; LRRK2; SFTPA2; WIF1;...,CD326-or-Ep-CAM-prot2; CD324-or-E-Cadherin-pro...
4,Capillary cell s1,Capillary Cell,Endothelium,CL:0002144,IGFBP7; CLU; HSPG2; CD36; SELP; PROCR; FCN3; M...,CD144-or-VE-Cadherin-prot5; CD34-prot3; CD141-...
...,...,...,...,...,...,...
66,CD4+ T h1 s1,CD4 T resident memory,CD4+ T cell,CL:4033038,IGLV1-40; IGKV4-1; IGKV3-20; IGKC; RBPJ; NR3C1...,CD103-or-IntegrinalphaE-prot1; CD5-prot6; CD4-...
67,cMono s3,Classical Monocyte,Monocyte,CL:0000860,KLRC2; LINC02446; KRT86; IGKC; CSF1; IGKV1-5; ...,CD57Recombinant-prot7; KLRG1-or-MAFA-prot5; CD...
68,Plasmablast,B Plasma cells,B cell,CL:0000980,CCR7; LEF1; LTB; SOCS3; TCF7; SELL; MAL; IL7R;...,CD5-prot2; CD3-prot1; TCR-alpha-or-beta-prot1;...
69,Pre-B Plasma Cell IgG,Pre-B Plasma Cell IgG,Plasma Cell,CL:0000985,TRDV2; TRGV9; KLRG1; TRDC; S100B; RPS4Y1; KLRC...,TCRVdelta2-prot; CD3-prot17; KLRG1-or-MAFA-pro...


In [10]:
# Store the per-label marker table built above in uns under the MARKERS key.
adata.uns[MARKERS] = unique_level_4

# Display the object summary.
adata

AnnData object with n_obs × n_vars = 109361 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_Protein', 'nFeature_Protein', 'clonotype_id', 'reads', 'umis', 'v_gene', 'd_gene', 'j_gene', 'c_gene', 'cdr3s_aa', 'cdr3s_nt', 'sample_ID', 'disease', 'cell_viability', 'percent.mt', 'G2M.Score', 'doublet_scrublet', 'percent.RPS', 'percent.RPL', 'qc.mit.Outlier', 'qc.nFeat.Outlier', 'qc.nCount.Outlier', 'qc.rps.Outlier', 'qc.rpl.Outlier', 'Level_1', 'Level_2', 'Level_3', 'Level_4', 'CellOntology', 'wnnUMAP_1', 'wnnUMAP_2', 'RNAmarkers', 'CITEmarkers', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_en

# Check author cell type annotations and Cell Ontology IDs

In [11]:
# Copy the author-provided obs columns to their standardized names.
# Standardized column -> source column. Note that MARKERS is filled from
# 'RNAmarkers', the same source as MARKER_GENES.
standardized_obs_sources = {
    AUTHOR_CELL_TYPE: 'Level_4',
    CELL_TYPE_ONTOLOGY_ID: 'CellOntology',
    MARKER_GENES: 'RNAmarkers',
    MARKER_PROTEINS: 'CITEmarkers',
    MARKERS: 'RNAmarkers',
}
for target_column, source_column in standardized_obs_sources.items():
    adata.obs[target_column] = adata.obs[source_column]

In [12]:
# Display the object summary after adding the standardized obs columns.
adata

AnnData object with n_obs × n_vars = 109361 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_Protein', 'nFeature_Protein', 'clonotype_id', 'reads', 'umis', 'v_gene', 'd_gene', 'j_gene', 'c_gene', 'cdr3s_aa', 'cdr3s_nt', 'sample_ID', 'disease', 'cell_viability', 'percent.mt', 'G2M.Score', 'doublet_scrublet', 'percent.RPS', 'percent.RPL', 'qc.mit.Outlier', 'qc.nFeat.Outlier', 'qc.nCount.Outlier', 'qc.rps.Outlier', 'qc.rpl.Outlier', 'Level_1', 'Level_2', 'Level_3', 'Level_4', 'CellOntology', 'wnnUMAP_1', 'wnnUMAP_2', 'RNAmarkers', 'CITEmarkers', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_en

# Check whether ENSEMBL IDs in var

In [13]:
# Inspect the gene table; Ensembl IDs are expected in 'ensembl_gene_id'.
adata.var

Unnamed: 0,hgnc_symbol,ensembl_gene_id,gene_biotype,genes
MIR1302-2HG,MIR1302-2HG,ENSG00000243485,lncRNA,MIR1302-2HG
FAM138A,FAM138A,ENSG00000237613,lncRNA,FAM138A
OR4F5,OR4F5,ENSG00000186092,protein_coding,OR4F5
AL627309.1,,,,AL627309.1
AL627309.3,,,,AL627309.3
...,...,...,...,...
AC141272.1,,,,AC141272.1
AC023491.2,,,,AC023491.2
AC007325.1,,,,AC007325.1
AC007325.4,,,,AC007325.4


In [14]:
# Harmonize the var column names: 'genes' -> 'gene_symbol', 'ensembl_gene_id' -> 'ensembl_id'.
var_column_renames = {'genes': 'gene_symbol', 'ensembl_gene_id': 'ensembl_id'}
adata.var.rename(columns=var_column_renames, inplace=True)

# Give the var index an explicit name, then display the table.
adata.var.index.name = 'index'
adata.var

Unnamed: 0_level_0,hgnc_symbol,ensembl_id,gene_biotype,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MIR1302-2HG,MIR1302-2HG,ENSG00000243485,lncRNA,MIR1302-2HG
FAM138A,FAM138A,ENSG00000237613,lncRNA,FAM138A
OR4F5,OR4F5,ENSG00000186092,protein_coding,OR4F5
AL627309.1,,,,AL627309.1
AL627309.3,,,,AL627309.3
...,...,...,...,...
AC141272.1,,,,AC141272.1
AC023491.2,,,,AC023491.2
AC007325.1,,,,AC007325.1
AC007325.4,,,,AC007325.4


# Check raw data

In [15]:
# Largest value in X, taken on the sparse matrix directly. The sparse max()
# accounts for implicit zeros, so the result matches the dense computation
# without materializing the full cells x genes array via .toarray().
adata.X.max()

32554.0

In [16]:
# Raw counts must be whole numbers. Check the stored (non-zero) entries before
# casting, because astype(np.int64) would otherwise silently truncate
# fractional values (e.g. normalized data) into fake counts.
stored_values = adata.X.data
assert np.array_equal(stored_values, np.rint(stored_values)), (
    "adata.X contains non-integer values; refusing to cast to int64"
)

# Store X with an integer dtype.
adata.X = adata.X.astype(np.int64)

In [17]:
# Register the current object (integer-typed X) as .raw.
adata.raw = adata

In [18]:
# Display X to confirm its dtype and sparse format after the cast.
adata.X

<109361x36601 sparse matrix of type '<class 'numpy.int64'>'
	with 234353414 stored elements in Compressed Sparse Column format>

In [19]:
# Display raw.X to confirm it has the same dtype and shape as X.
adata.raw.X

<109361x36601 sparse matrix of type '<class 'numpy.int64'>'
	with 234353414 stored elements in Compressed Sparse Column format>

In [20]:
# Maximum of X after the integer cast, taken on the sparse matrix directly
# instead of densifying the whole matrix with .toarray().
adata.X.max()

32554

In [21]:
# Maximum of raw.X, taken on the sparse matrix directly instead of densifying
# the whole matrix with .toarray(); it should equal the maximum of X.
adata.raw.X.max()

32554

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- Validation Error: Tier 1 OBS Google Sheet: 'cryopreserved lung tissue' is not among available categories
- Validation Error: Tier 1 OBS AnnData Object: 'cryopreserved lung tissue' is not among available categories

# Data Submission Status

- CHECK: Raw data in X and in raw
- REVISE: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs and gene symbols in var

### Revision
DONE

In [None]:
# Convert these obs columns to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce').
numeric_obs_columns = ['cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year']
for obs_column in numeric_obs_columns:
    adata.obs[obs_column] = pd.to_numeric(adata.obs[obs_column], errors='coerce')

In [None]:
# Write the same gzip-compressed object to the pre-revision folder and then to
# the post-revision folder, using the dataset ID as the file name.
output_filename = f"{DATASET_ID}.h5ad"
for output_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(output_dir, output_filename), compression='gzip')