In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [None]:
DATASET_ID = "Fujita_publ"
DATA_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}"
RDS_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.rds"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [5]:
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [6]:
%%R -i RDS_PATH

suppressPackageStartupMessages(library(Seurat))
rds <- readRDS(file=RDS_PATH)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

package ‘tools’ was built under R version 4.3.3 


In [7]:
%%R -o adata

adata <- as.SingleCellExperiment(rds)
adata

class: SingleCellExperiment 
dim: 38224 70051 
metadata(0):
assays(1): counts
rownames(38224): MIR1302-2HG FAM138A ... AC004556.1 FAM231C
rowData names(0):
colnames(70051): AAACCCAAGTGTTGAA-1_3 AAACCCACAAACACGG-1_3 ...
  TTTGTTGTCGTAACTG-1_19 TTTGTTGTCTCGTTTA-1_19
colData names(10): orig.ident nCount_RNA ... patientID ident
reducedDimNames(0):
mainExpName: RNA
altExpNames(0):


1: Layer ‘data’ is empty 
2: Layer ‘scale.data’ is empty 


  return AnnData(exprs, obs, var, uns, obsm, layers=layers)


In [8]:
obs['sample_ID'].value_counts()

JK05    1
JK07    1
JK08    1
JK09    1
JK10    1
JK19    1
JK21    1
JK24    1
JK27    1
JK03    1
JK04    1
JK26    1
JK29    1
JK06    1
JK11    1
JK12    1
Name: sample_ID, dtype: int64

In [9]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt',
       'DoubletFinder', 'seurat_clusters', 'RNA_snn_res.0.5', 'anno',
       'patientID', 'ident'],
      dtype='object')

In [10]:
adata.obs['ident'].value_counts()

JK22_26    9167
JK04       8271
JK03       6033
JK12       5285
JK22_29    5027
JK11       5005
JK09       4558
JK22_19    3808
JK06       3634
JK22_27    3606
JK07       3574
JK08       3149
JK05       2890
JK10       2322
JK22_24    1941
JK22_21    1781
Name: ident, dtype: int64

In [11]:
adata.obs['patientID'].value_counts()

JK22_26    9167
JK04       8271
JK03       6033
JK12       5285
JK22_29    5027
JK11       5005
JK09       4558
JK22_19    3808
JK06       3634
JK22_27    3606
JK07       3574
JK08       3149
JK05       2890
JK10       2322
JK22_24    1941
JK22_21    1781
Name: patientID, dtype: int64

In [12]:
# show nonoverlap between obs['sample_ID'] and obs['ident']

adata_ids = adata.obs['ident'].unique()
obs_ids = obs['sample_ID'].unique()

non_overlap = set(adata_ids) - set(obs_ids)
non_overlap_other_side = set(obs_ids) - set(adata_ids)

In [13]:
non_overlap

{'JK22_19', 'JK22_21', 'JK22_24', 'JK22_26', 'JK22_27', 'JK22_29'}

In [14]:
non_overlap_other_side

{'JK19', 'JK21', 'JK24', 'JK26', 'JK27', 'JK29'}

In [15]:
sample_id_mapping_dict = {
    'JK22_19': 'JK19',
    'JK22_21': 'JK21',
    'JK22_24': 'JK24',
    'JK22_26': 'JK26',
    'JK22_27': 'JK27',
    'JK22_29': 'JK29'
}

In [16]:
adata.obs['sample_ID'] = adata.obs['ident'].map(sample_id_mapping_dict)
adata.obs['sample_ID'].fillna(adata.obs['ident'], inplace=True)

adata.obs['sample_ID'].value_counts(dropna=False)

JK26    9167
JK04    8271
JK03    6033
JK12    5285
JK29    5027
JK11    5005
JK09    4558
JK19    3808
JK06    3634
JK27    3606
JK07    3574
JK08    3149
JK05    2890
JK10    2322
JK24    1941
JK21    1781
Name: sample_ID, dtype: int64

In [17]:
adata.obs['anno'].value_counts(dropna=False)

NaN               12133
CD4T              11365
NK/NKT             8423
Capillary_Cell     6393
Macrophage         4289
CD8T               4136
Mono               3425
Fibroblast         3022
Mast_cell          2413
AT2                2144
DC1                2132
Vein_cell          2015
Cilia              1950
SM/Pericyte        1279
Artery_cell         912
B_cell              712
DC2                 676
AT1                 554
Club/Basal          529
Neutrophil          521
Plasma_cell         388
Prolif.immune       376
Lymphatic_Cell      264
Name: anno, dtype: int64

In [18]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Anomalous Epithelial Variations and Ectopic In...,YU FUJITA,patient,,The dataset is published. https://www.atsjourn...,


In [19]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,JK05,JK05,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK05,GSM5282538,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000241
1,JK07,JK07,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK07,GSM5282539,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000243
2,JK08,JK08,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK08,GSM5282540,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000242
3,JK09,JK09,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK09,GSM5282541,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000242
4,JK10,JK10,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK10,GSM5282542,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000242
5,JK19,JK19,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK19,GSM7882055,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000241
6,JK21,JK21,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK21,GSM7882056,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000242
7,JK24,JK24,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK24,GSM7882057,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000242
8,JK27,JK27,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK27,GSM7882059,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,MONDO:0005002,Asian,HsapDv:0000242
9,JK03,JK03,https://www.atsjournals.org/doi/10.1165/rcmb.2...,The Jikei University School of Medicine,,,JK03,GSM5282544,Batch run by different sample,NCBITaxon:9606,...,3 prime tag,EFO:0008567,True,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241


# Validate obs and uns from adata

In [21]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 70051 × 38224
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'DoubletFinder', 'seurat_clusters', 'RNA_snn_res.0.5', 'anno', 'patientID', 'ident', 'sample_ID'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'

In [22]:
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'sample_ID',
    df_col = 'sample_ID',
    skip = None
)

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,DoubletFinder,seurat_clusters,RNA_snn_res.0.5,anno,patientID,ident,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
AAACCCAAGTGTTGAA-1_3,JK03,5979.0,2711,3.110888,Singlet,12,12,Capillary_Cell,JK03,JK03,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
AAACCCACAAACACGG-1_3,JK03,8711.0,3072,2.938813,Singlet,1,1,NK/NKT,JK03,JK03,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
AAACCCAGTAGGATAT-1_3,JK03,24013.0,4975,3.202432,Singlet,5,5,Mono,JK03,JK03,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
AAACCCAGTCGATTAC-1_3,JK03,6906.0,2496,5.589343,Singlet,0,0,CD4T,JK03,JK03,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
AAACCCATCCGCTGTT-1_3,JK03,10166.0,3378,2.705095,Singlet,1,1,NK/NKT,JK03,JK03,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCAGGTCCGT-1_19,JK22_29,1567.0,640,9.125718,Singlet,13,13,SM/Pericyte,JK22_29,JK22_29,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
TTTGTTGCATTGAGGG-1_19,JK22_29,576.0,363,17.361111,,,,,JK22_29,JK22_29,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
TTTGTTGGTCTAGTGT-1_19,JK22_29,5079.0,2097,2.736759,Singlet,10,10,Vein_cell,JK22_29,JK22_29,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241
TTTGTTGTCGTAACTG-1_19,JK22_29,5333.0,2590,2.118882,Singlet,13,13,SM/Pericyte,JK22_29,JK22_29,...,3 prime tag,EFO:0008567,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241


# Add author cell type markers to UNS

In [23]:
celltype_ontology = pd.read_csv(f"{DATA_PATH}/cellontology.csv")
celltype_markergenes = pd.read_csv(f"{DATA_PATH}/markergenes.csv")

In [24]:
celltype_ontology

Unnamed: 0,annotation,Ontology
0,Capillary_Cell,CL_0002144
1,NK/NKT,CL_0000623/BTO_0006501
2,Mono,CL_0000576
3,CD4T,NCIT:C12537
4,Macrophage,CL_0000235
5,B_cell,CL:0000236
6,Neutrophil,CL_0000775
7,CD8T,NCIT_C12542
8,Mast_cell,NCIT_C12747
9,Dendritic_cell,CL_0000451


In [25]:
celltype_markergenes

Unnamed: 0,Cell type,Canonical markers
0,Epithelium,EPCAM
1,Club Cell,SCGB3A2
2,Ciliated Cell,FOXJ1; TP73; CCDC78
3,Basal Cell,KRT5; TP63
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5
6,Endothelium,CLDN5
7,Artery Cell,GJA5; BMX
8,Vein Cell,ACKR1
9,Capillary Cell,CA4


In [26]:
mapping_dict = {
    'Epithelium': None,  # No direct match; may need a generic or separate category
    'Club Cell': 'Club/Basal',
    'Ciliated Cell': 'Cilia',
    'Basal Cell': 'Club/Basal',
    'Alveolar Epithelial Type 1 Cell': 'AT1',
    'Alveolar Epithelial Type 2 Cell': 'AT2',
    'Endothelium': None,  # No direct match
    'Artery Cell': 'Artery_cell',
    'Vein Cell': 'Vein_cell',
    'Capillary Cell': 'Capillary_Cell',
    'Lymphatic Cell': 'Lymphatic_Cell',
    'Stroma': None,  # No direct match
    'Smooth Muscle': 'SM/Pericyte',
    'Pericyte': 'SM/Pericyte',
    'Fibroblast': 'Fibroblast',
    'Immune': None,  # No direct match; might include various immune cells
    'B Cell': 'B_cell',
    'Plasma Cell': 'Plasma_cell',
    'CD8+ Tcell': 'CD8T',
    'CD4+ Tcell': 'CD4T',
    'Natural Killer Cell/Natural Killer T Cell': 'NK/NKT',
    'Neutrophil': 'Neutrophil',
    'Mast Cell': 'Mast_cell',
    'Macrophage': 'Macrophage',
    'Monocyte': 'Mono',
    'Myeloid Dendritic Cell ': 'Dendritic_cell'
}


In [27]:
celltype_markergenes[AUTHOR_CELL_TYPE] = celltype_markergenes['Cell type'].map(mapping_dict)
celltype_markergenes

Unnamed: 0,Cell type,Canonical markers,author_cell_type
0,Epithelium,EPCAM,
1,Club Cell,SCGB3A2,Club/Basal
2,Ciliated Cell,FOXJ1; TP73; CCDC78,Cilia
3,Basal Cell,KRT5; TP63,Club/Basal
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5,AT1
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5,AT2
6,Endothelium,CLDN5,
7,Artery Cell,GJA5; BMX,Artery_cell
8,Vein Cell,ACKR1,Vein_cell
9,Capillary Cell,CA4,Capillary_Cell


In [28]:
len(celltype_markergenes[AUTHOR_CELL_TYPE].value_counts())

20

In [29]:
author_cell_types_list = celltype_ontology['annotation']

# proliferating immune cell not in marker genes
non_overlap = [x for x in author_cell_types_list if x not in celltype_markergenes[AUTHOR_CELL_TYPE].unique()]
non_overlap

['proliferating_immune_cell']

In [30]:
celltype_markergenes[CELL_TYPE_ONTOLOGY_ID] = celltype_markergenes[AUTHOR_CELL_TYPE].map(celltype_ontology.set_index('annotation')['Ontology'])

celltype_markergenes

Unnamed: 0,Cell type,Canonical markers,author_cell_type,cell_type_ontology_term_id
0,Epithelium,EPCAM,,
1,Club Cell,SCGB3A2,Club/Basal,CL_0000158/CL_0002633
2,Ciliated Cell,FOXJ1; TP73; CCDC78,Cilia,CL_1000271
3,Basal Cell,KRT5; TP63,Club/Basal,CL_0000158/CL_0002633
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5,AT1,CL_0002062
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5,AT2,CL_0002063
6,Endothelium,CLDN5,,
7,Artery Cell,GJA5; BMX,Artery_cell,NCIT_C49194
8,Vein Cell,ACKR1,Vein_cell,CL_0002543
9,Capillary Cell,CA4,Capillary_Cell,CL_0002144


In [31]:
set1 = set(celltype_markergenes[AUTHOR_CELL_TYPE])
set2 = set(celltype_ontology['annotation'])

non_overlap = set1 - set2
non_overlap_other_side = set2 - set1

non_overlap_other_side

{'proliferating_immune_cell'}

In [32]:
# add row with 'Proliferating Immune Cell', NaN, 'proliferating_immune_cell' and CMPO_0000241
celltype_markergenes = celltype_markergenes.append({
    'Cell type': 'Proliferating Immune Cell',
    'Canonical markers': np.nan,
    AUTHOR_CELL_TYPE: 'proliferating_immune_cell',
    CELL_TYPE_ONTOLOGY_ID: 'CMPO_0000241'
}, ignore_index=True)

celltype_markergenes

  celltype_markergenes = celltype_markergenes.append({


Unnamed: 0,Cell type,Canonical markers,author_cell_type,cell_type_ontology_term_id
0,Epithelium,EPCAM,,
1,Club Cell,SCGB3A2,Club/Basal,CL_0000158/CL_0002633
2,Ciliated Cell,FOXJ1; TP73; CCDC78,Cilia,CL_1000271
3,Basal Cell,KRT5; TP63,Club/Basal,CL_0000158/CL_0002633
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5,AT1,CL_0002062
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5,AT2,CL_0002063
6,Endothelium,CLDN5,,
7,Artery Cell,GJA5; BMX,Artery_cell,NCIT_C49194
8,Vein Cell,ACKR1,Vein_cell,CL_0002543
9,Capillary Cell,CA4,Capillary_Cell,CL_0002144


In [33]:
celltype_markergenes.rename(columns={'Canonical markers': MARKER_GENES, 'Cell type': AUTHOR_CELL_TYPE_DESCRIPTION}, inplace=True)
celltype_markergenes

Unnamed: 0,author_cell_type_description,author_cell_type_markers,author_cell_type,cell_type_ontology_term_id
0,Epithelium,EPCAM,,
1,Club Cell,SCGB3A2,Club/Basal,CL_0000158/CL_0002633
2,Ciliated Cell,FOXJ1; TP73; CCDC78,Cilia,CL_1000271
3,Basal Cell,KRT5; TP63,Club/Basal,CL_0000158/CL_0002633
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5,AT1,CL_0002062
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5,AT2,CL_0002063
6,Endothelium,CLDN5,,
7,Artery Cell,GJA5; BMX,Artery_cell,NCIT_C49194
8,Vein Cell,ACKR1,Vein_cell,CL_0002543
9,Capillary Cell,CA4,Capillary_Cell,CL_0002144


In [34]:
# replace all '_' with ':' in CELL_TYPE_ONTOLOGY_ID of celltype_markergenes
celltype_markergenes[CELL_TYPE_ONTOLOGY_ID] = celltype_markergenes[CELL_TYPE_ONTOLOGY_ID].str.replace('_', ':')
celltype_markergenes[CELL_TYPE_ONTOLOGY_ID] = celltype_markergenes[CELL_TYPE_ONTOLOGY_ID].str.replace('/', ', ')

celltype_markergenes

Unnamed: 0,author_cell_type_description,author_cell_type_markers,author_cell_type,cell_type_ontology_term_id
0,Epithelium,EPCAM,,
1,Club Cell,SCGB3A2,Club/Basal,"CL:0000158, CL:0002633"
2,Ciliated Cell,FOXJ1; TP73; CCDC78,Cilia,CL:1000271
3,Basal Cell,KRT5; TP63,Club/Basal,"CL:0000158, CL:0002633"
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5,AT1,CL:0002062
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5,AT2,CL:0002063
6,Endothelium,CLDN5,,
7,Artery Cell,GJA5; BMX,Artery_cell,NCIT:C49194
8,Vein Cell,ACKR1,Vein_cell,CL:0002543
9,Capillary Cell,CA4,Capillary_Cell,CL:0002144


In [35]:
# make all columns dtype category
celltype_markergenes = celltype_markergenes.astype('category')
celltype_markergenes

Unnamed: 0,author_cell_type_description,author_cell_type_markers,author_cell_type,cell_type_ontology_term_id
0,Epithelium,EPCAM,,
1,Club Cell,SCGB3A2,Club/Basal,"CL:0000158, CL:0002633"
2,Ciliated Cell,FOXJ1; TP73; CCDC78,Cilia,CL:1000271
3,Basal Cell,KRT5; TP63,Club/Basal,"CL:0000158, CL:0002633"
4,Alveolar Epithelial Type 1 Cell,AGER; PDPN; CLIC5,AT1,CL:0002062
5,Alveolar Epithelial Type 2 Cell,SFTPB; SFTPC; SFTPD; MUC1; ETV5,AT2,CL:0002063
6,Endothelium,CLDN5,,
7,Artery Cell,GJA5; BMX,Artery_cell,NCIT:C49194
8,Vein Cell,ACKR1,Vein_cell,CL:0002543
9,Capillary Cell,CA4,Capillary_Cell,CL:0002144


In [36]:
celltype_markergenes.dtypes

author_cell_type_description    category
author_cell_type_markers        category
author_cell_type                category
cell_type_ontology_term_id      category
dtype: object

In [37]:
adata.uns[MARKER_GENES] = celltype_markergenes

In [38]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt',
       'DoubletFinder', 'seurat_clusters', 'RNA_snn_res.0.5', 'anno',
       'patientID', 'ident', 'sample_ID', 'donor_id', 'protocol_URL',
       'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 'gene_annotation_version', 'alignment_software',
      

In [39]:
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs['anno'].map(dict(zip(celltype_markergenes[AUTHOR_CELL_TYPE], celltype_markergenes[CELL_TYPE_ONTOLOGY_ID])))
adata.obs[MARKER_GENES] = adata.obs['anno'].map(dict(zip(celltype_markergenes[AUTHOR_CELL_TYPE], celltype_markergenes[MARKER_GENES])))

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,DoubletFinder,seurat_clusters,RNA_snn_res.0.5,anno,patientID,ident,...,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,cell_type_ontology_term_id,author_cell_type_markers
AAACCCAAGTGTTGAA-1_3,JK03,5979.0,2711,3.110888,Singlet,12,12,Capillary_Cell,JK03,JK03,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,CL:0002144,CA4
AAACCCACAAACACGG-1_3,JK03,8711.0,3072,2.938813,Singlet,1,1,NK/NKT,JK03,JK03,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,"CL:0000623, BTO:0006501",KLRD1; NKG7; TYROBP; CD3E; CD8A; FCER1G
AAACCCAGTAGGATAT-1_3,JK03,24013.0,4975,3.202432,Singlet,5,5,Mono,JK03,JK03,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,CL:0000576,CD14; S100A8; FCGR3A
AAACCCAGTCGATTAC-1_3,JK03,6906.0,2496,5.589343,Singlet,0,0,CD4T,JK03,JK03,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,NCIT:C12537,CD3E; CD4; LEF1
AAACCCATCCGCTGTT-1_3,JK03,10166.0,3378,2.705095,Singlet,1,1,NK/NKT,JK03,JK03,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,"CL:0000623, BTO:0006501",KLRD1; NKG7; TYROBP; CD3E; CD8A; FCER1G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCAGGTCCGT-1_19,JK22_29,1567.0,640,9.125718,Singlet,13,13,SM/Pericyte,JK22_29,JK22_29,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,"CL:0000192, CL:0000669",CSPG4; TRPC6; PDGFRB;RGS5
TTTGTTGCATTGAGGG-1_19,JK22_29,576.0,363,17.361111,,,,,JK22_29,JK22_29,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,,
TTTGTTGGTCTAGTGT-1_19,JK22_29,5079.0,2097,2.736759,Singlet,10,10,Vein_cell,JK22_29,JK22_29,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,CL:0002543,ACKR1
TTTGTTGTCGTAACTG-1_19,JK22_29,5333.0,2590,2.118882,Singlet,13,13,SM/Pericyte,JK22_29,JK22_29,...,true,GRCh38,,version 6.1.2,yes,PATO:0000461,Asian,HsapDv:0000241,"CL:0000192, CL:0000669",CSPG4; TRPC6; PDGFRB;RGS5


# Check author cell type annotations and Cell Ontology IDs

In [40]:
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

NaN                        15317
NCIT:C12537                11365
CL:0000623, BTO:0006501     8423
CL:0002144                  6393
CL:0000235                  4289
NCIT:C12542                 4136
CL:0000576                  3425
CL:0000057                  3022
NCIT:C12747                 2413
CL:0002063                  2144
CL:0002543                  2015
CL:1000271                  1950
CL:0000192, CL:0000669      1279
NCIT:C49194                  912
CL:0000236                   712
CL:0002062                   554
CL:0000158, CL:0002633       529
CL:0000775                   521
CL:0000786                   388
CL:0002138                   264
Name: cell_type_ontology_term_id, dtype: int64

In [41]:
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['anno']
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

NaN               12133
CD4T              11365
NK/NKT             8423
Capillary_Cell     6393
Macrophage         4289
CD8T               4136
Mono               3425
Fibroblast         3022
Mast_cell          2413
AT2                2144
DC1                2132
Vein_cell          2015
Cilia              1950
SM/Pericyte        1279
Artery_cell         912
B_cell              712
DC2                 676
AT1                 554
Club/Basal          529
Neutrophil          521
Plasma_cell         388
Prolif.immune       376
Lymphatic_Cell      264
Name: author_cell_type, dtype: int64

In [42]:
adata.obs[MARKER_GENES].value_counts(dropna=False)

NaN                                        15317
CD3E; CD4; LEF1                            11365
KLRD1; NKG7; TYROBP; CD3E; CD8A; FCER1G     8423
CA4                                         6393
MARCO; MSR1; MRC1                           4289
CD3E; CD8A;GZMH; GZMB; DUSP2                4136
CD14; S100A8; FCGR3A                        3425
COL1A1; PDGFRA                              3022
MS4A2; CPA3; TPSAB1                         2413
SFTPB; SFTPC; SFTPD; MUC1; ETV5             2144
ACKR1                                       2015
FOXJ1; TP73; CCDC78                         1950
CSPG4; TRPC6; PDGFRB;RGS5                   1279
GJA5; BMX                                    912
CD79A; CD24; MS4A1; CD19                     712
AGER; PDPN; CLIC5                            554
KRT5; TP63                                   529
S100A8; S100A9; IFITM2; FCGR3B               521
CD79A; CD27; SLAMF7                          388
PROX1; PDPN                                  264
Name: author_cell_ty

# Check whether ENSEMBL IDs in var

In [52]:
adata.var

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3
...
C21orf2
AP001065.1
AP001065.2
AC004556.1
FAM231C


In [53]:
adata.var['gene_symbol'] = adata.var.index.astype(str)
adata.var

Unnamed: 0,gene_symbol
MIR1302-2HG,MIR1302-2HG
FAM138A,FAM138A
OR4F5,OR4F5
AL627309.1,AL627309.1
AL627309.3,AL627309.3
...,...
C21orf2,C21orf2
AP001065.1,AP001065.1
AP001065.2,AP001065.2
AC004556.1,AC004556.1


In [51]:
ensembl_ids = pd.read_csv(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/JKdata_genenames_and_ENSEMBL_IDs.csv", index_col=0)
ensembl_ids

Unnamed: 0,V1,V2
1,ENSG00000243485,MIR1302-2HG
2,ENSG00000237613,FAM138A
3,ENSG00000186092,OR4F5
4,ENSG00000238009,AL627309.1
5,ENSG00000239945,AL627309.3
...,...,...
38559,ENSG00000274225,AP001065.1
38560,ENSG00000277352,AP001065.2
38561,ENSG00000276345,AC004556.1
38562,ENSG00000277475,AC213203.1


In [75]:
adata.var['ensembl_id'] = adata.var['gene_symbol'].map(dict(zip(ensembl_ids['V2'], ensembl_ids['V1'])))
adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-2HG,MIR1302-2HG,ENSG00000243485
FAM138A,FAM138A,ENSG00000237613
OR4F5,OR4F5,ENSG00000186092
AL627309.1,AL627309.1,ENSG00000238009
AL627309.3,AL627309.3,ENSG00000239945
...,...,...
C21orf2,C21orf2,ENSG00000160226
AP001065.1,AP001065.1,ENSG00000274225
AP001065.2,AP001065.2,ENSG00000277352
AC004556.1,AC004556.1,ENSG00000276345


In [76]:
adata.var['ensembl_id'].value_counts(dropna=False)

NaN                27
ENSG00000229611     2
ENSG00000237980     2
ENSG00000224535     2
ENSG00000065600     2
                   ..
ENSG00000260418     1
ENSG00000029363     1
ENSG00000146410     1
ENSG00000237596     1
ENSG00000268674     1
Name: ensembl_id, Length: 37300, dtype: int64

In [77]:
adata.var.index.name = 'index'
adata.var['gene_symbol'] = adata.var.index.astype('category')

In [78]:
adata.var = adata.var.astype(str)

In [79]:
adata.var.dtypes

gene_symbol    object
ensembl_id     object
dtype: object

# Check raw data

In [36]:
adata.X.toarray().max()

71764.0

In [80]:
# make checks for adata before saving raw

# Make specific columns numeric
adata.obs['cell_viability_percentage'] = pd.to_numeric(adata.obs['cell_viability_percentage'], errors='coerce')
adata.obs['cell_number_loaded'] = pd.to_numeric(adata.obs['cell_number_loaded'], errors='coerce')
adata.obs['sample_collection_year'] = pd.to_numeric(adata.obs['sample_collection_year'], errors='coerce')


numeric_cols = ['nCount_RNA', 'nFeature_RNA', 'percent.mt', 'seurat_clusters', 'RNA_snn_res.0.5']
for col in numeric_cols:
    adata.obs[col] = pd.to_numeric(adata.obs[col], errors='coerce')

# Make df indices str
adata.obs.index = adata.obs.index.astype('str')
adata.var.index = adata.var.index.astype('str')
adata.uns[MARKER_GENES].index = adata.uns[MARKER_GENES].index.astype('str')
adata.var_names = adata.var_names.astype('str')

In [81]:
adata.X = adata.X.astype(np.int64)

In [82]:
adata.raw = adata

In [61]:
adata.X

<70051x38224 sparse matrix of type '<class 'numpy.int64'>'
	with 203942545 stored elements in Compressed Sparse Row format>

In [62]:
adata.raw.X

<70051x38224 sparse matrix of type '<class 'numpy.int64'>'
	with 203942545 stored elements in Compressed Sparse Row format>

In [63]:
adata.X.toarray().max()

71764

In [64]:
adata.raw.X.toarray().max()

71764

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object

# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs in var


### Revisions:

DONE

In [83]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"), compression='gzip')
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"), compression='gzip')