In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# Path to the JSON file handed to get_gspread_df when the Tier 1 sheets are fetched.
# NOTE(review): presumably a Google service-account key; it is an absolute path inside
# one user's home directory, so confirm it is readable wherever this notebook is run.
GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
# Identifier of the dataset being ingested; also used as directory and file name.
DATASET_ID = "Meyer_02_unpubl"

# Root of the HLCA v2 data tree. Defaults to the original cluster location, but can be
# redirected with the HLCA_V2_DATA_ROOT environment variable so the notebook is not
# tied to a single user's home directory.
DATA_ROOT = os.environ.get("HLCA_V2_DATA_ROOT", "/home/icb/raphael.kfuri-rubens/data/hlca_v2")

# Input object and the two output directories, all derived from the shared root.
H5AD_PATH = join(DATA_ROOT, DATASET_ID, f"{DATASET_ID}.h5ad")
OUTPUT_PATH_PREREVISION = join(DATA_ROOT, "HLCA_V2_CORE", "adata_prerevision")
OUTPUT_PATH_POSTREVISION = join(DATA_ROOT, "HLCA_V2_CORE", "adata_postrevision")

In [3]:
# Column-name constants for author annotations, ontology IDs and marker genes.

# Generic dataset-wide names: the finest-grained annotation serves as the generic cell type.
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
MARKER_GENES = 'author_cell_type_markers'

# Per-level variants are the generic name plus a `_level_<n>` suffix, n = 0, 1, 2.
AUTHOR_CELL_TYPE_L0, AUTHOR_CELL_TYPE_L1, AUTHOR_CELL_TYPE_L2 = (
    f'{AUTHOR_CELL_TYPE}_level_{level}' for level in range(3)
)

CELL_TYPE_ONTOLOGY_ID_L0, CELL_TYPE_ONTOLOGY_ID_L1, CELL_TYPE_ONTOLOGY_ID_L2 = (
    f'{CELL_TYPE_ONTOLOGY_ID}_level_{level}' for level in range(3)
)

MARKER_GENES_L0, MARKER_GENES_L1, MARKER_GENES_L2 = (
    f'{MARKER_GENES}_level_{level}' for level in range(3)
)

# Load data

In [4]:
# Read the submitted AnnData object from disk.
adata = sc.read_h5ad(H5AD_PATH)

# Fetch the "tier_1" obs and uns tables for this dataset via get_gspread_df (obs first, then uns).
obs, uns = (
    get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", sheet_axis)
    for sheet_axis in ("obs", "uns")
)

In [5]:
# List the metadata columns present in obs of the loaded object.
adata.obs.columns

Index(['sample_ID', 'donor_id', 'protocol_URL', 'institute',
       'sample_collection_site', 'sample_collection_relative_time_point',
       'library_ID', 'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 'gene_annotation_version', 'alignment_software',
       'intron_inclusion', 'disease_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id',
       'development_stage_ontology_term_id', 'cell_type_ontology

# Validate obs and uns from Tier 1 Metadata Template

In [6]:
# Run the validation workflow on the `uns` table fetched with get_gspread_df.
# NOTE(review): init_workflow() presumably returns the validated table — confirm in
# hlca_v2.ingestion_utils.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()

validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Human lung five locations multiome ATAC and RN...,"Sarah,A,Teichmann, Martijn,C,Nawijn, Kerstin,B...","SampleID, Donor",X_umap,protected under embargo,data was pooled and donor metadata per cell re...


In [7]:
# Run the validation workflow on the `obs` table fetched with get_gspread_df.
# NOTE(review): init_workflow() presumably returns the validated table — confirm in
# hlca_v2.ingestion_utils.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()

validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID_ATAC
0,HCA_A_LNG12177506,A56,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000237,HCA_A_LNG11986508
1,HCA_A_LNG12177508,A60,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000240,HCA_A_LNG11986510
2,HCA_A_LNG12177516,A63,,Wellcome Sanger Institute,Cambridge,,E,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000239,HCA_A_LNG11986518
3,HCA_A_LNG12177507,A56,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000237,HCA_A_LNG11986509
4,HCA_A_LNG12177507,A60,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000240,HCA_A_LNG11986509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,HCA_A_LNG12177510,A61,,Wellcome Sanger Institute,Cambridge,,C,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241,HCA_A_LNG11986512
71,HCA_A_LNG12177510,A63,,Wellcome Sanger Institute,Cambridge,,C,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000239,HCA_A_LNG11986512
72,HCA_A_LNG12177509,A60,,Wellcome Sanger Institute,Cambridge,,C,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000240,HCA_A_LNG11986511
73,HCA_A_LNG12177509,A56,,Wellcome Sanger Institute,Cambridge,,C,,,NCBITaxon:9606,...,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000237,HCA_A_LNG11986511


# Validate obs and uns from adata

In [8]:
# Run the validation workflow on the AnnData object itself, along the `uns` axis.
# This rebinds val_workflow and validated_uns, replacing the sheet-based results.
val_workflow = ValidationWorkflow(input=adata, axis='uns')
validated_uns = val_workflow.init_workflow()

validated_uns

Unnamed: 0,title,study_PI,unpublished
0,Human lung five locations multiome ATAC and RN...,"Sarah,A,Teichmann, Martijn,C,Nawijn, Kerstin,B...",protected under embargo


In [9]:
# Run the validation workflow on the AnnData object itself, along the `obs` axis.
# This rebinds val_workflow and validated_obs, replacing the sheet-based results.
val_workflow = ValidationWorkflow(input=adata, axis='obs')
validated_obs = val_workflow.init_workflow()

validated_obs

Unnamed: 0_level_0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
rna_barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TATCCAGCAGCGCTTG-1-HCA_A_LNG12177506,HCA_A_LNG12177506,A56,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000237
GGTGATTTCAAACACC-1-HCA_A_LNG12177508,HCA_A_LNG12177508,A60,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000240
GCAAACAAGGCCATCA-1-HCA_A_LNG12177516,HCA_A_LNG12177516,A63,,Wellcome Sanger Institute,Cambridge,,E,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000239
CCCAATTGTCCCGGAA-1-HCA_A_LNG12177508,HCA_A_LNG12177508,A60,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000240
GACCTCAAGGAAGCTA-1-HCA_A_LNG12177507,HCA_A_LNG12177507,A56,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CTATGAGGTGTGTGGT-1-HCA_A_LNG12177506,HCA_A_LNG12177506,A61,,Wellcome Sanger Institute,Cambridge,,B,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241
GACAATACACGTGCTG-1-HCA_A_LNG12177517,HCA_A_LNG12177517,A58,,Wellcome Sanger Institute,Cambridge,,E,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000241
GTTAGGCGTAACTACG-1-HCA_A_LNG12177505,HCA_A_LNG12177505,A56,,Wellcome Sanger Institute,Cambridge,,A,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000237
TACAAGCTCAAAGGCA-1-HCA_A_LNG12177516,HCA_A_LNG12177516,A60,,Wellcome Sanger Institute,Cambridge,,E,,,NCBITaxon:9606,...,3 prime tag,EFO:0008637,true,GRCh38,Ensembl 93,cell ranger 3.0.2,yes,PATO:0000461,unknown,HsapDv:0000240


# Add author cell type markers to UNS

In [10]:
# Marker genes are already in the object: list the uns keys to confirm that the
# MARKER_GENES entry is present.
adata.uns.keys()

dict_keys(['annot_colors', 'author_cell_type_markers', 'study_PI', 'title', 'unpublished'])

In [11]:
# Display the marker table stored under uns[MARKER_GENES].
adata.uns[MARKER_GENES]

Unnamed: 0,author_cell_type,author_cell_type_markers
0,AT1,AGER;RTKN2;CLIC5
1,AT2,SFTPC;SFTPA1;SFTPA2;WIF1;HHIP;CA2;ETV5;WIF1;HHIP
2,B-memory,CD27;TNFRSF13B
3,B-naive,IGHD;FCER2;TCL1A
4,B-plasma-IgA,IGHA1;IGHA2;CCR10
5,B-plasma-IgG,IGHG1;IGHG3;IGHG2
6,Basal,KRT5;TP63;S100A2;KRT6A;TNS4;MMP10;KRT14;DLK2;K...
7,CD4-EM-Effector,CCR6;IL7R
8,CD4-naive-CM,LTB;LEF1;CD28;KLF2;SELL
9,CD8-EM,PDCD1;DUSP2;NKG7;CD7


In [12]:
# One row per author cell type, paired with the ontology ID from its first occurrence
# in obs (drop_duplicates keeps the first row of each label).
ontology_mapping = (
    adata.obs[[AUTHOR_CELL_TYPE, CELL_TYPE_ONTOLOGY_ID]]
    .drop_duplicates(AUTHOR_CELL_TYPE)
    .reset_index(drop=True)
)

ontology_mapping

Unnamed: 0,author_cell_type,cell_type_ontology_term_id
0,Chondrocyte,CL:0000138
1,NK_CD16hi,CL:0000939
2,Fibro-alveolar,CL:0002553
3,AT2,CL:0002063
4,Fibro-adventitial,CL:0002503
5,AT1,CL:0002062
6,CD4-EM-Effector,CL:0000905
7,Muscle-smooth-airway,CL:0019019
8,SMG-mucous,CL:4033037
9,Monocyte-CD14,CL:0001054


In [13]:
# Lookup from author cell type label to ontology ID, built from ontology_mapping.
author_to_ontology = dict(
    zip(ontology_mapping[AUTHOR_CELL_TYPE], ontology_mapping[CELL_TYPE_ONTOLOGY_ID])
)

# Add the ontology-ID column to the marker table in uns; labels missing from the
# lookup are mapped to NaN.
adata.uns[MARKER_GENES][CELL_TYPE_ONTOLOGY_ID] = adata.uns[MARKER_GENES][AUTHOR_CELL_TYPE].map(author_to_ontology)

adata.uns[MARKER_GENES]

Unnamed: 0,author_cell_type,author_cell_type_markers,cell_type_ontology_term_id
0,AT1,AGER;RTKN2;CLIC5,CL:0002062
1,AT2,SFTPC;SFTPA1;SFTPA2;WIF1;HHIP;CA2;ETV5;WIF1;HHIP,CL:0002063
2,B-memory,CD27;TNFRSF13B,CL:0000787
3,B-naive,IGHD;FCER2;TCL1A,CL:0000788
4,B-plasma-IgA,IGHA1;IGHA2;CCR10,CL:0000987
5,B-plasma-IgG,IGHG1;IGHG3;IGHG2,CL:0000985
6,Basal,KRT5;TP63;S100A2;KRT6A;TNS4;MMP10;KRT14;DLK2;K...,CL:0002633
7,CD4-EM-Effector,CCR6;IL7R,CL:0000905
8,CD4-naive-CM,LTB;LEF1;CD28;KLF2;SELL,CL:0000904
9,CD8-EM,PDCD1;DUSP2;NKG7;CD7,CL:0000913


In [27]:
# Cast every column of the marker table in uns to str.
# NOTE(review): this cell's execution count (27) is out of sequence with its neighbours
# (13 and 14), so the saved outputs of the following cells were presumably produced
# before this cast ran; on a top-to-bottom re-run the cast happens first.
# NOTE(review): astype(str) also turns missing values into the literal string 'nan' —
# confirm that is acceptable for whatever reads this table later.
adata.uns[MARKER_GENES] = adata.uns[MARKER_GENES].astype(str)

# Check author cell type annotations and Cell Ontology IDs

In [14]:
# Number of cells per author cell type label; dropna=False also counts missing labels.
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

AT2                                       6333
SMG-mucous                                3399
AT1                                       2990
SMG-serous                                2691
Macro-alveolar                            2149
NK_CD16hi                                 1677
Monocyte-CD14                             1428
Chondrocyte                               1271
Fibro-alveolar                            1229
Macro-intravascular                       1049
CD4-EM-Effector                            993
CD4-naive-CM                               984
Fibro-adventitial                          942
Mast                                       831
Endothelia-vascular-Cap-g                  789
CD8-TRM-EM                                 768
Monocyte-CD16                              673
Club                                       634
Basal                                      467
Ciliated                                   413
CD8-EM-EMRA                                385
B-plasma-IgA 

In [15]:
# Number of cells per Cell Ontology ID; dropna=False also counts cells with no ID.
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063    6333
CL:4033037    3399
CL:0002062    2990
CL:4033005    2691
CL:0000583    2149
CL:0000939    1677
CL:0001054    1428
CL:0000138    1271
CL:0002553    1229
CL:0000235    1180
CL:4033039    1051
CL:0000905     993
CL:0000904     984
CL:0002503     942
CL:0000097     831
CL:4028002     789
CL:0002396     673
CL:0000158     634
CL:0002633     467
CL:0000067     413
CL:0001062     385
CL:0000987     351
CL:4033043     312
CL:0000787     304
CL:0000815     301
CL:0000160     297
CL:0000798     289
CL:0000788     282
CL:0009089     224
CL:2000093     220
CL:0000840     214
CL:0002338     201
CL:4028003     196
CL:1001568     190
CL:0000985     178
NaN            167
CL:0000940     136
CL:0000623     131
CL:4033024     126
CL:4033008      97
CL:0019019      89
CL:0000165      77
CL:0000990      68
CL:0002543      59
CL:0000185      59
CL:0000186      56
CL:4033048      54
CL:0000913      43
CL:0001065      33
CL:0000841      30
CL:4033026      27
CL:0000057      19
Name: cell_t

In [16]:
# Lookup from author cell type label to its marker-gene string, taken from the uns table.
markers_by_cell_type = dict(
    zip(adata.uns[MARKER_GENES][AUTHOR_CELL_TYPE], adata.uns[MARKER_GENES][MARKER_GENES])
)

# Annotate every cell with the markers of its author cell type, then count the cells
# per marker string (including cells without one).
adata.obs[MARKER_GENES] = adata.obs[AUTHOR_CELL_TYPE].map(markers_by_cell_type)
adata.obs[MARKER_GENES].value_counts(dropna=False)

SFTPC;SFTPA1;SFTPA2;WIF1;HHIP;CA2;ETV5;WIF1;HHIP                                                                                                           6333
BPIFB2;MUC5B;TFF3;TFF1                                                                                                                                     3399
AGER;RTKN2;CLIC5                                                                                                                                           2990
PRR4;LPO;PIP;S100A1;PRB3;C6orf58;PRB4;ODAM;PRH2                                                                                                            2691
MARCO;MCEMP1;INHBA;TREM1;ABHD5;PPARG;RETN;CD5L;FABP4                                                                                                       2149
FCER1G;GNLY;KLRF1;KIR2DL1;GZMB;FGFBP2;NKG7                                                                                                                 1677
S100A12;EREG;CD14                       

# Check whether ENSEMBL IDs are in var

In [17]:
# Display the var table to see what its index and columns contain.
adata.var

Unnamed: 0_level_0,gene_symbol,feature_types,genome
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression,GRCh38
ENSG00000186092,OR4F5,Gene Expression,GRCh38
ENSG00000238009,AL627309.1,Gene Expression,GRCh38
ENSG00000239945,AL627309.3,Gene Expression,GRCh38
ENSG00000239906,AL627309.2,Gene Expression,GRCh38
...,...,...,...
ENSG00000277761,AC136616.2,Gene Expression,GRCh38
ENSG00000278633,AC023491.2,Gene Expression,GRCh38
ENSG00000276017,AC007325.1,Gene Expression,GRCh38
ENSG00000278817,AC007325.4,Gene Expression,GRCh38


In [18]:
# Copy the var index values into an explicit 'ensembl_id' column.
adata.var['ensembl_id'] = adata.var.index
# Rename the index itself (in place) to the generic label 'index'.
adata.var.index.rename('index', inplace=True)
adata.var

Unnamed: 0_level_0,gene_symbol,feature_types,genome,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression,GRCh38,ENSG00000243485
ENSG00000186092,OR4F5,Gene Expression,GRCh38,ENSG00000186092
ENSG00000238009,AL627309.1,Gene Expression,GRCh38,ENSG00000238009
ENSG00000239945,AL627309.3,Gene Expression,GRCh38,ENSG00000239945
ENSG00000239906,AL627309.2,Gene Expression,GRCh38,ENSG00000239906
...,...,...,...,...
ENSG00000277761,AC136616.2,Gene Expression,GRCh38,ENSG00000277761
ENSG00000278633,AC023491.2,Gene Expression,GRCh38,ENSG00000278633
ENSG00000276017,AC007325.1,Gene Expression,GRCh38,ENSG00000276017
ENSG00000278817,AC007325.4,Gene Expression,GRCh38,ENSG00000278817


# Check raw data

In [19]:
# Largest value in X. Taking the max on the matrix directly avoids toarray(), which
# would allocate a dense cells-by-genes array only to reduce it to one number.
adata.X.max()

1448

In [20]:
# Raw counts must be whole numbers. Verify that before casting, because
# astype(np.int64) would silently truncate fractional (e.g. normalised) values.
stored_values = adata.X.data if sp.issparse(adata.X) else np.asarray(adata.X)
assert np.all(np.mod(stored_values, 1) == 0), "adata.X contains non-integer values; expected raw counts"

adata.X = adata.X.astype(np.int64)

In [21]:
# Store the current object (X already cast to int64 in the previous cell) under .raw too.
adata.raw = adata

In [22]:
# Display the repr of X (shape, dtype and storage format).
adata.X

<37339x35045 sparse matrix of type '<class 'numpy.int64'>'
	with 78476531 stored elements in Compressed Sparse Row format>

In [23]:
# Display the repr of raw.X for comparison with X.
adata.raw.X

<37339x35045 sparse matrix of type '<class 'numpy.int64'>'
	with 78476531 stored elements in Compressed Sparse Row format>

In [24]:
# Largest value in X after the int64 cast. The max is taken on the sparse matrix
# itself rather than on a dense copy made with toarray().
adata.X.max()

1448

In [25]:
# Largest value in raw.X; it should match the max of X. The max is taken on the sparse
# matrix itself rather than on a dense copy made with toarray().
adata.raw.X.max()

1448

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- Validation Error: Tier 1 OBS Google Sheet: 'Frozen' is not among available categories
- Validation Error: Tier 1 OBS AnnData Object: 'Frozen' is not among available categories

# Data Submission Status

- CHECK: Raw counts in X and in raw
- REVISE: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs in var

### Revision:

DONE

In [28]:
# Write the same object, gzip-compressed, into the pre-revision and then the
# post-revision directory under the file name <DATASET_ID>.h5ad.
for out_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(out_dir, f"{DATASET_ID}.h5ad"), compression='gzip')