In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# Path to the Google Sheets service-account JSON passed to get_gspread_df().
# Can be overridden with the HLCA_GSPREAD_JSON environment variable so the
# notebook runs on other machines/accounts; the default is the original
# hard-coded location.
GSPREAD_JSON = os.environ.get(
    "HLCA_GSPREAD_JSON",
    "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json",
)

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
# Identifier of the dataset processed by this notebook; also used as the
# folder name and the file stem of the input/output .h5ad files.
DATASET_ID = "Meyer_01_publ"

# Common prefixes of the paths below, spelled out once.
_HLCA_V2_ROOT = "/home/icb/raphael.kfuri-rubens/data/hlca_v2"
_HLCA_V2_CORE = f"{_HLCA_V2_ROOT}/HLCA_V2_CORE"

# Input object, and the two directories the processed object is written to.
H5AD_PATH = f"{_HLCA_V2_ROOT}/{DATASET_ID}/{DATASET_ID}.h5ad"
OUTPUT_PATH_PREREVISION = f"{_HLCA_V2_CORE}/adata_prerevision"
OUTPUT_PATH_POSTREVISION = f"{_HLCA_V2_CORE}/adata_postrevision"

In [3]:
# Name constants
# Generic dataset-wide keys: the finest grained annotation is used as the
# generic dataset cell type.
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
MARKER_GENES = 'author_cell_type_markers'

# Per-level keys follow the pattern "<generic key>_level_<n>" for levels 0-2.
AUTHOR_CELL_TYPE_L0, AUTHOR_CELL_TYPE_L1, AUTHOR_CELL_TYPE_L2 = (
    f'{AUTHOR_CELL_TYPE}_level_{level}' for level in range(3)
)

CELL_TYPE_ONTOLOGY_ID_L0, CELL_TYPE_ONTOLOGY_ID_L1, CELL_TYPE_ONTOLOGY_ID_L2 = (
    f'{CELL_TYPE_ONTOLOGY_ID}_level_{level}' for level in range(3)
)

MARKER_GENES_L0, MARKER_GENES_L1, MARKER_GENES_L2 = (
    f'{MARKER_GENES}_level_{level}' for level in range(3)
)

# Load data

In [4]:
# Read the dataset's AnnData object from disk.
adata = sc.read_h5ad(H5AD_PATH)

# Fetch the "tier_1" obs and uns tables for this dataset via get_gspread_df
# (obs first, then uns).
obs, uns = (
    get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", sheet_axis)
    for sheet_axis in ("obs", "uns")
)

In [4]:
# List the columns of the Tier 1 "obs" table fetched above.
obs.columns

Index(['sample_ID', 'donor_id', 'protocol_URL', 'institute',
       'sample_collection_site', 'sample_collection_relative_time_point',
       'library_ID', 'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 'gene_annotation_version', 'alignment_software',
       'intron_inclusion', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
# Validate the "uns" table that came from the Tier 1 sheet: build a
# ValidationWorkflow for it, call init_workflow(), and display the result.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Cells collection from A spatially resolved atl...,"Kerstin, B, Meyer",-,X_umap,published,-


In [6]:
# Validate the "obs" table that came from the Tier 1 sheet: build a
# ValidationWorkflow for it, call init_workflow(), and display the result.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,5841STDY7991475,A37,,Wellcome Sanger Institute,Cambridge,,A37-LNG-1-SC-45P-1,,A37_cells,NCBITaxon:9606,...,5 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000240
1,5841STDY7991476,A37,,Wellcome Sanger Institute,Cambridge,,A37-LNG-3-SC-45P-1,,A37_cells,NCBITaxon:9606,...,5 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000240
2,5841STDY7991477,A37,,Wellcome Sanger Institute,Cambridge,,A37-LNG-4-SC-45P-1,,A37_cells,NCBITaxon:9606,...,5 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000240
3,5841STDY7991478,A37,,Wellcome Sanger Institute,Cambridge,,A37-LNG-4-SC-45P-2,,A37_cells,NCBITaxon:9606,...,5 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000240
4,5841STDY7991479,A37,,Wellcome Sanger Institute,Cambridge,,A37-LNG-5-SC-45P-1,,A37_cells,NCBITaxon:9606,...,5 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,WTDAtest7888001,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-2-SC-45N-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
71,WTDAtest7888002,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-2-SC-45P-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
72,WTDAtest7985096,A26,https://www.protocols.io/view/nuclei-isolation...,Wellcome Sanger Institute,Cambridge,,WTDAtest7985096,,A26_nuclei,NCBITaxon:9606,...,na,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000242
73,WTDAtest8433409,A48,https://www.protocols.io/view/nuclei-isolation...,Wellcome Sanger Institute,Cambridge,,WTDAtest8433409,,A48_nuclei,NCBITaxon:9606,...,na,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241


# Validate obs and uns from adata

In [7]:
# Same validation, this time with the AnnData object itself as input and the
# 'uns' axis selected.
# NOTE: this rebinds `validated_uns`, replacing the result obtained from the
# Google Sheet table earlier in the notebook.
val_workflow = ValidationWorkflow(input=adata, axis='uns')
validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,unpublished
0,Cells collection from A spatially resolved atl...,Kerstin B. Meyer,published


In [8]:
# Same validation, this time with the AnnData object itself as input and the
# 'obs' axis selected.
# NOTE: this rebinds `validated_obs`, replacing the result obtained from the
# Google Sheet table earlier in the notebook.
val_workflow = ValidationWorkflow(input=adata, axis='obs')
validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
WTDAtest7887999-AAACCTGGTGTGAATA,WTDAtest7887999,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-1-SC-45N-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
WTDAtest7887999-AAACGGGCAACCGCCA,WTDAtest7887999,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-1-SC-45N-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
WTDAtest7887999-AACCATGGTACAGCAG,WTDAtest7887999,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-1-SC-45N-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
WTDAtest7887999-AACGTTGGTGTCCTCT,WTDAtest7887999,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-1-SC-45N-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
WTDAtest7887999-AACTCCCTCCTAGTGA,WTDAtest7887999,A32,,Wellcome Sanger Institute,Cambridge,,A32-LNG-1-SC-45N-1,,A32_cells,NCBITaxon:9606,...,3 prime tag,EFO:0008563,true,GRCh38,Ensembl 93,cell ranger 3.0.2,no,PATO:0000461,unknown,HsapDv:0000237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSSS_A_LNG8757929-TTTGGTTGTAATGCGG,WSSS_A_LNG8757929,A42,https://www.protocols.io/view/single-cell-and-...,Wellcome Sanger Institute,Cambridge,,WSSS_A_LNG8757929,,A42_nuclei,NCBITaxon:9606,...,na,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241
WSSS_A_LNG8757929-TTTGGTTGTGCCCTTT,WSSS_A_LNG8757929,A42,https://www.protocols.io/view/single-cell-and-...,Wellcome Sanger Institute,Cambridge,,WSSS_A_LNG8757929,,A42_nuclei,NCBITaxon:9606,...,na,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241
WSSS_A_LNG8757929-TTTGGTTTCAAGAGTA,WSSS_A_LNG8757929,A42,https://www.protocols.io/view/single-cell-and-...,Wellcome Sanger Institute,Cambridge,,WSSS_A_LNG8757929,,A42_nuclei,NCBITaxon:9606,...,na,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241
WSSS_A_LNG8757929-TTTGTTGAGCGAGTCA,WSSS_A_LNG8757929,A42,https://www.protocols.io/view/single-cell-and-...,Wellcome Sanger Institute,Cambridge,,WSSS_A_LNG8757929,,A42_nuclei,NCBITaxon:9606,...,na,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241


# Add author cell type markers to UNS

In [9]:
# Already in object: list the keys stored in adata.uns to confirm that the
# marker-gene entry is present.
adata.uns.keys()

dict_keys(['author_cell_type_markers', 'study_PI', 'title', 'unpublished'])

In [10]:
# Display the marker-gene entry stored under adata.uns[MARKER_GENES].
adata.uns[MARKER_GENES]

Unnamed: 0,author_cell_type,author_cell_type_markers
0,AT1,AGER;RTKN2;CLIC5
1,AT2,SFTPC;SFTPA1;SFTPA2;WIF1;HHIP;CA2;ETV5;WIF1;HHIP
2,B_memory,CD27;TNFRSF13B
3,B_naive,IGHD;FCER2;TCL1A
4,B_plasma_IgA,IGHA1;IGHA2;CCR10
...,...,...
75,Secretory_Club,SCGB3A2
76,Secretory_Goblet,MUC5AC;TSPAN8;CYP2F1;CEACAM5;VSIG2;FUT6
77,Suprabasal,LY6D;PLAT;SERPINB4
78,T_reg,FOXP3;CCR4;CTLA4;IL2RA;TNFRSF4;TIGIT


# Check author cell type annotations and Cell Ontology IDs

In [11]:
# Cell type labels present in only one of the two places (marker table in uns
# vs. per-cell annotation in obs). An empty set means both use the same labels.
marker_table_cell_types = set(adata.uns[MARKER_GENES][AUTHOR_CELL_TYPE])
annotated_cell_types = set(adata.obs[AUTHOR_CELL_TYPE])
marker_table_cell_types ^ annotated_cell_types

set()

In [12]:
# Build a {author cell type -> marker string} lookup from the table in uns
# and use it to annotate every cell in obs with its cell type's markers.
marker_table = adata.uns[MARKER_GENES]
cell_type_to_markers = dict(zip(marker_table[AUTHOR_CELL_TYPE], marker_table[MARKER_GENES]))
adata.obs[MARKER_GENES] = adata.obs[AUTHOR_CELL_TYPE].map(cell_type_to_markers)

# Cells per marker string; dropna=False also counts cells left without a match.
adata.obs[MARKER_GENES].value_counts(dropna=False)

SFTPC;SFTPA1;SFTPA2;WIF1;HHIP;CA2;ETV5;WIF1;HHIP              18192
MARCO;MCEMP1;INHBA;TREM1;ABHD5;PPARG;RETN;CD5L;FABP4          17098
SFRP2;PI16;FBLN2;CD248;MFAP5                                  10170
CA4;FCN3;SLC6A4;IL7R                                           9348
FCER1G;GNLY;KLRF1;KIR2DL1;GZMB;FGFBP2;NKG7                     8116
                                                              ...  
CXCL13;FDCSP                                                     59
FOXI1;CFTR;ASCL3;HEPACAM2;PLCG2;BIK                              54
CHGA;CALCA;ASCL1;CHGB;GRP;BEX1                                   44
TUBB1;ANK1;PF4;TUBB1;CMTM5;PCSK6;STON2;PRKAR2B;SYTL4;LTBP1       14
GLDN;CDH7;DRP2;NFASC;NCMAP;MBP;PRX;MLIP                          11
Name: author_cell_type_markers, Length: 79, dtype: int64

In [13]:
# Already in object: number of cells per author cell type label
# (dropna=False also counts missing values).
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

AT2                          18192
Macro_alveolar               17098
Fibro_adventitial            10170
Endothelia_vascular_Cap_g     9348
NK_CD16hi                     8116
                             ...  
Fibro_immune_recruiting         59
Ionocyte_n_Brush                54
Neuroendocrine                  44
Megakaryocyte                   14
Schwann_Myelinating             11
Name: author_cell_type, Length: 80, dtype: int64

In [14]:
# Already in object: number of cells per Cell Ontology term ID
# (dropna=False also counts missing values).
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063    18372
CL:0000583    17098
CL:0002503    10170
CL:0000235    10108
CL:4028002     9348
              ...  
CL:0000980       62
CL:0005006       54
CL:0000165       44
CL:0000556       14
CL:0000218       11
Name: cell_type_ontology_term_id, Length: 65, dtype: int64

# Check whether ENSEMBL IDs are in var

In [15]:
# Display the gene annotation table to inspect its index and columns.
adata.var

Unnamed: 0_level_0,gene_symbol,feature_types
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression
...,...,...
ENSG00000277856,AC233755.2,Gene Expression
ENSG00000275063,AC233755.1,Gene Expression
ENSG00000271254,AC240274.1,Gene Expression
ENSG00000277475,AC213203.1,Gene Expression


In [16]:
# Expose the gene identifiers held in the var index as a regular
# 'ensembl_id' column, then rename the index itself to 'index'.
gene_index = adata.var.index
adata.var['ensembl_id'] = gene_index
gene_index.name = 'index'

# Show the updated table.
adata.var

Unnamed: 0_level_0,gene_symbol,feature_types,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression,ENSG00000243485
ENSG00000237613,FAM138A,Gene Expression,ENSG00000237613
ENSG00000186092,OR4F5,Gene Expression,ENSG00000186092
ENSG00000238009,AL627309.1,Gene Expression,ENSG00000238009
ENSG00000239945,AL627309.3,Gene Expression,ENSG00000239945
...,...,...,...
ENSG00000277856,AC233755.2,Gene Expression,ENSG00000277856
ENSG00000275063,AC233755.1,Gene Expression,ENSG00000275063
ENSG00000271254,AC240274.1,Gene Expression,ENSG00000271254
ENSG00000277475,AC213203.1,Gene Expression,ENSG00000277475


# Check raw data

In [17]:
# Largest value in the expression matrix, as a quick raw-count sanity check.
# Reduce directly on the sparse matrix: calling .toarray() first would
# materialise the complete dense cells x genes array just to take a maximum.
adata.X.max()

14469.0

In [18]:
# Convert the listed obs columns to a numeric dtype, one after another.
# NOTE(review): errors='coerce' turns every value that cannot be parsed as a
# number into NaN without warning — confirm none of these columns carry
# meaningful text values that would be lost.
for obs_column in (
    'cell_viability_percentage',
    'cell_number_loaded',
    'sample_collection_year',
    'manner_of_death',
):
    adata.obs[obs_column] = pd.to_numeric(adata.obs[obs_column], errors='coerce')

In [19]:
# Store the count matrix as 64-bit integers.
# astype() drops fractional parts silently, so first verify that every stored
# value is already a whole number; otherwise X is not a raw count matrix and
# the cast would alter the data without any warning.
# X is a sparse matrix here, so only the explicitly stored entries (.data)
# need checking — the implicit zeros are integral by definition.
stored_values = adata.X.data
assert not np.mod(stored_values, 1).any(), (
    "adata.X contains non-integer values; refusing to cast to int64"
)
adata.X = adata.X.astype(np.int64)

In [20]:
# Assign the object to its own .raw slot, right after X was cast to int64,
# so that .raw.X can be compared against X in the cells below.
adata.raw = adata

In [21]:
# Show the summary repr of X (shape, dtype, number of stored elements).
adata.X

<193108x33538 sparse matrix of type '<class 'numpy.int64'>'
	with 282988531 stored elements in Compressed Sparse Row format>

In [22]:
# Show the summary repr of raw.X for comparison with X above.
adata.raw.X

<193108x33538 sparse matrix of type '<class 'numpy.int64'>'
	with 282988531 stored elements in Compressed Sparse Row format>

In [23]:
# Maximum of X after the integer cast; expected to match the value seen
# before the cast.
# Reduce on the sparse matrix itself: .toarray() would allocate the complete
# dense int64 cells x genes array only to compute one number.
adata.X.max()

14469

In [24]:
# Maximum of raw.X; expected to equal the maximum of X.
# Reduce on the sparse matrix itself: .toarray() would allocate the complete
# dense int64 cells x genes array only to compute one number.
adata.raw.X.max()

14469

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- Validation Error: Tier 1 OBS Google Sheet: 'Frozen' is not among available categories
- Validation Error: Tier 1 OBS AnnData Object: 'Frozen' is not among available categories


# Data Submission Status

- CHECK: Raw counts in X and in raw
- REVISE: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs in var

### Revision:

DONE

In [26]:
# Write the same gzip-compressed object, named after the dataset, into the
# pre-revision directory first and the post-revision directory second.
for output_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(output_dir, f"{DATASET_ID}.h5ad"), compression='gzip')