In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [2]:
DATASET_ID = "Zhang_Guo_publ"
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,Cells of pre- and early-stage lung adenocarcin...,Xiaoju Zhang,Patient,,Published and consented for release,


In [6]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,SY_CA1,SY_Donor_1,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238
1,SY_N1,SY_Donor_1,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238
2,SY_CA2,SY_Donor_2,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000240
3,SY_N2,SY_Donor_2,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000240
4,SY_CA3,SY_Donor_3,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0005061,unknown,HsapDv:0000238
5,SY_N3,SY_Donor_3,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238
6,SY_CA4,SY_Donor_4,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0005061,unknown,HsapDv:0000240
7,SY_CA5,SY_Donor_5,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0005061,unknown,HsapDv:0000240
8,SY_N5,SY_Donor_5,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000240
9,SY_CA6,SY_Donor_6,https://doi.\rorg/10.1186/s12964-023-01322-x.,Zhengzhou University Peoples Hospital,,,,,,NCBITaxon:9606,...,5 prime tag,,True,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0005061,unknown,HsapDv:0000240


In [7]:
files = os.listdir(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/")
counts = pd.read_csv(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{[f for f in files if f.startswith('counts')][0]}")

counts

  counts = pd.read_csv(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{[f for f in files if f.startswith('counts')][0]}")


Unnamed: 0.1,Unnamed: 0,genename,Ensembl,AAACCTGCACCGTTGG-1_1,AAACCTGGTTGGTGGA-1_1,AAACGGGAGAACTGTA-1_1,AAACGGGAGCTCAACT-1_1,AAACGGGTCCTTCAAT-1_1,AAAGATGCAACTGGCC-1_1,AAAGATGTCCATGAAC-1_1,...,TTTGTCACATCCCATC-1_12,TTTGTCACATTTGCCC-1_12,TTTGTCAGTAAGCACG-1_12,TTTGTCAGTACTCGCG-1_12,TTTGTCAGTGCACGAA-1_12,TTTGTCAGTTCATGGT-1_12,TTTGTCAGTTCTCATT-1_12,TTTGTCATCCGCTGTT-1_12,TTTGTCATCTCTGCTG-1_12,TTTGTCATCTTGCATT-1_12
0,2,FO538757.2,,0,1,1,1,0,1,0,...,0,0,0,0,0,0,2,0,1,0
1,3,AP006222.2,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,RP11-206L10.9,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,LINC00115,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,NOC2L,ENSG00000188976,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22542,22566,NKPD1,ENSG00000179846,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22543,22567,GAB4,ENSG00000215568,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22544,22568,VPREB1,ENSG00000169575,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22545,22569,TFF2,ENSG00000160181,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# get columns 'genename' and 'Ensembl' from the counts dataframe and remove them from the counts dataframe
features = counts[['genename', 'Ensembl']]
features.rename(columns={'genename': 'gene_name', 'Ensembl': 'gene_id'}, inplace=True)
counts = counts.drop(columns=['genename', 'Ensembl', 'Unnamed: 0']).reset_index(drop=True)

counts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.rename(columns={'genename': 'gene_name', 'Ensembl': 'gene_id'}, inplace=True)


Unnamed: 0,AAACCTGCACCGTTGG-1_1,AAACCTGGTTGGTGGA-1_1,AAACGGGAGAACTGTA-1_1,AAACGGGAGCTCAACT-1_1,AAACGGGTCCTTCAAT-1_1,AAAGATGCAACTGGCC-1_1,AAAGATGTCCATGAAC-1_1,AAAGATGTCTGTTGAG-1_1,AAAGCAATCCTTGGTC-1_1,AAATGCCAGGTGGGTT-1_1,...,TTTGTCACATCCCATC-1_12,TTTGTCACATTTGCCC-1_12,TTTGTCAGTAAGCACG-1_12,TTTGTCAGTACTCGCG-1_12,TTTGTCAGTGCACGAA-1_12,TTTGTCAGTTCATGGT-1_12,TTTGTCAGTTCTCATT-1_12,TTTGTCATCCGCTGTT-1_12,TTTGTCATCTCTGCTG-1_12,TTTGTCATCTTGCATT-1_12
0,0,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,2,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22544,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22545,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
adata = ad.AnnData(X=counts.T, var=features)
adata

  adata = ad.AnnData(X=counts.T, var=features)


AnnData object with n_obs × n_vars = 38814 × 22547
    var: 'gene_name', 'gene_id'

In [152]:
adata.obs

AAACCTGCACCGTTGG-1_1
AAACCTGGTTGGTGGA-1_1
AAACGGGAGAACTGTA-1_1
AAACGGGAGCTCAACT-1_1
AAACGGGTCCTTCAAT-1_1
...
TTTGTCAGTTCATGGT-1_12
TTTGTCAGTTCTCATT-1_12
TTTGTCATCCGCTGTT-1_12
TTTGTCATCTCTGCTG-1_12
TTTGTCATCTTGCATT-1_12


In [10]:
cell_meta_data = pd.read_csv(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{[f for f in files if f.startswith('meta')][0]}", index_col=0)
cell_meta_data

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,barcodes,nUMI,nGene,Clusters,Cell Ontology Class ID,Markers
AAACCTGCACCGTTGG-1_1,SY_CA1,696,392,AAACCTGCACCGTTGG,701,397,NK_CL:0000623,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1
AAACCTGGTTGGTGGA-1_1,SY_CA1,602,399,AAACCTGGTTGGTGGA,602,399,Monocyte_CL:0000576,CL:0000576,FCN1;VCAN;S100A8;S100A9;CD14
AAACGGGAGAACTGTA-1_1,SY_CA1,610,408,AAACGGGAGAACTGTA,612,410,Macrophage_CL:0000235,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE
AAACGGGAGCTCAACT-1_1,SY_CA1,3901,1358,AAACGGGAGCTCAACT,3902,1359,Macrophage_CL:0000235,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE
AAACGGGTCCTTCAAT-1_1,SY_CA1,2499,946,AAACGGGTCCTTCAAT,2506,953,Epithelial_CL:0000066,CL:0000066,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2
...,...,...,...,...,...,...,...,...,...
TTTGTCAGTTCATGGT-1_12,SY_N3,984,575,TTTGTCAGTTCATGGT,984,575,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A
TTTGTCAGTTCTCATT-1_12,SY_N3,5957,2236,TTTGTCAGTTCTCATT,5957,2236,NK_CL:0000623,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1
TTTGTCATCCGCTGTT-1_12,SY_N3,913,579,TTTGTCATCCGCTGTT,914,580,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A
TTTGTCATCTCTGCTG-1_12,SY_N3,4190,1653,TTTGTCATCTCTGCTG,4190,1653,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A


In [11]:
adata.obs = adata.obs.merge(cell_meta_data, left_index=True, right_index=True)
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,barcodes,nUMI,nGene,Clusters,Cell Ontology Class ID,Markers
AAACCTGCACCGTTGG-1_1,SY_CA1,696,392,AAACCTGCACCGTTGG,701,397,NK_CL:0000623,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1
AAACCTGGTTGGTGGA-1_1,SY_CA1,602,399,AAACCTGGTTGGTGGA,602,399,Monocyte_CL:0000576,CL:0000576,FCN1;VCAN;S100A8;S100A9;CD14
AAACGGGAGAACTGTA-1_1,SY_CA1,610,408,AAACGGGAGAACTGTA,612,410,Macrophage_CL:0000235,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE
AAACGGGAGCTCAACT-1_1,SY_CA1,3901,1358,AAACGGGAGCTCAACT,3902,1359,Macrophage_CL:0000235,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE
AAACGGGTCCTTCAAT-1_1,SY_CA1,2499,946,AAACGGGTCCTTCAAT,2506,953,Epithelial_CL:0000066,CL:0000066,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2
...,...,...,...,...,...,...,...,...,...
TTTGTCAGTTCATGGT-1_12,SY_N3,984,575,TTTGTCAGTTCATGGT,984,575,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A
TTTGTCAGTTCTCATT-1_12,SY_N3,5957,2236,TTTGTCAGTTCTCATT,5957,2236,NK_CL:0000623,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1
TTTGTCATCCGCTGTT-1_12,SY_N3,913,579,TTTGTCATCCGCTGTT,914,580,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A
TTTGTCATCTCTGCTG-1_12,SY_N3,4190,1653,TTTGTCATCTCTGCTG,4190,1653,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A


# Validate obs and uns from adata

In [155]:
adata.obs['orig.ident'].value_counts()

SY_CA2    7852
SY_CA4    5579
SY_N3     4116
SY_N2     3935
SY_N5     3094
SY_CA6    2963
SY_N6     2609
SY_CA5    2430
SY_CA7    2398
SY_CA3    1479
SY_N1     1351
SY_CA1    1008
Name: orig.ident, dtype: int64

In [156]:
obs['sample_ID'].value_counts()

SY_CA1    1
SY_N1     1
SY_CA2    1
SY_N2     1
SY_CA3    1
SY_N3     1
SY_CA4    1
SY_CA5    1
SY_N5     1
SY_CA6    1
SY_N6     1
SY_CA7    1
Name: sample_ID, dtype: int64

In [12]:
# show non-overlap between the two
metadata_id = set(adata.obs['orig.ident'].unique())
obs_id = set(obs['sample_ID'].unique())

non_overlap = metadata_id - obs_id
non_overlap_other_side = obs_id - metadata_id

In [13]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 38814 × 22547
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'barcodes', 'nUMI', 'nGene', 'Clusters', 'Cell Ontology Class ID', 'Markers'
    var: 'gene_name', 'gene_id'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'

In [14]:
# Merge obs and uns
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'orig.ident',
    df_col = 'sample_ID',
    skip = None
)

adata.obs['sample_ID'] = adata.obs['orig.ident']

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,barcodes,nUMI,nGene,Clusters,Cell Ontology Class ID,Markers,donor_id,...,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID
AAACCTGCACCGTTGG-1_1,SY_CA1,696,392,AAACCTGCACCGTTGG,701,397,NK_CL:0000623,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACCTGGTTGGTGGA-1_1,SY_CA1,602,399,AAACCTGGTTGGTGGA,602,399,Monocyte_CL:0000576,CL:0000576,FCN1;VCAN;S100A8;S100A9;CD14,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACGGGAGAACTGTA-1_1,SY_CA1,610,408,AAACGGGAGAACTGTA,612,410,Macrophage_CL:0000235,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACGGGAGCTCAACT-1_1,SY_CA1,3901,1358,AAACGGGAGCTCAACT,3902,1359,Macrophage_CL:0000235,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACGGGTCCTTCAAT-1_1,SY_CA1,2499,946,AAACGGGTCCTTCAAT,2506,953,Epithelial_CL:0000066,CL:0000066,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTTCATGGT-1_12,SY_N3,984,575,TTTGTCAGTTCATGGT,984,575,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3
TTTGTCAGTTCTCATT-1_12,SY_N3,5957,2236,TTTGTCAGTTCTCATT,5957,2236,NK_CL:0000623,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3
TTTGTCATCCGCTGTT-1_12,SY_N3,913,579,TTTGTCATCCGCTGTT,914,580,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3
TTTGTCATCTCTGCTG-1_12,SY_N3,4190,1653,TTTGTCATCTCTGCTG,4190,1653,T_CL:0000084,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3


# Add author cell type markers to UNS

In [15]:
adata.obs['Clusters'] = [x.split('_')[0] for x in adata.obs['Clusters']]
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,barcodes,nUMI,nGene,Clusters,Cell Ontology Class ID,Markers,donor_id,...,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID
AAACCTGCACCGTTGG-1_1,SY_CA1,696,392,AAACCTGCACCGTTGG,701,397,NK,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACCTGGTTGGTGGA-1_1,SY_CA1,602,399,AAACCTGGTTGGTGGA,602,399,Monocyte,CL:0000576,FCN1;VCAN;S100A8;S100A9;CD14,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACGGGAGAACTGTA-1_1,SY_CA1,610,408,AAACGGGAGAACTGTA,612,410,Macrophage,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACGGGAGCTCAACT-1_1,SY_CA1,3901,1358,AAACGGGAGCTCAACT,3902,1359,Macrophage,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
AAACGGGTCCTTCAAT-1_1,SY_CA1,2499,946,AAACGGGTCCTTCAAT,2506,953,Epithelial,CL:0000066,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2,SY_Donor_1,...,,true,GRCh38,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTTCATGGT-1_12,SY_N3,984,575,TTTGTCAGTTCATGGT,984,575,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3
TTTGTCAGTTCTCATT-1_12,SY_N3,5957,2236,TTTGTCAGTTCTCATT,5957,2236,NK,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3
TTTGTCATCCGCTGTT-1_12,SY_N3,913,579,TTTGTCATCCGCTGTT,914,580,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3
TTTGTCATCTCTGCTG-1_12,SY_N3,4190,1653,TTTGTCATCTCTGCTG,4190,1653,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,true,GRCh38,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3


In [16]:
adata.obs[MARKER_GENES] = adata.obs['Markers']
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs['Cell Ontology Class ID']
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['Clusters']

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,barcodes,nUMI,nGene,Clusters,Cell Ontology Class ID,Markers,donor_id,...,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sample_ID,author_cell_type_markers,cell_type_ontology_term_id,author_cell_type
AAACCTGCACCGTTGG-1_1,SY_CA1,696,392,AAACCTGCACCGTTGG,701,397,NK,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1,SY_Donor_1,...,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1,GNLY;NKG7;XCL2;XCL1;KLRF1,CL:0000623,NK
AAACCTGGTTGGTGGA-1_1,SY_CA1,602,399,AAACCTGGTTGGTGGA,602,399,Monocyte,CL:0000576,FCN1;VCAN;S100A8;S100A9;CD14,SY_Donor_1,...,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1,FCN1;VCAN;S100A8;S100A9;CD14,CL:0000576,Monocyte
AAACGGGAGAACTGTA-1_1,SY_CA1,610,408,AAACGGGAGAACTGTA,612,410,Macrophage,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE,SY_Donor_1,...,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1,APOC1;FTL;C1QA;C1QB;APOE,CL:0000235,Macrophage
AAACGGGAGCTCAACT-1_1,SY_CA1,3901,1358,AAACGGGAGCTCAACT,3902,1359,Macrophage,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE,SY_Donor_1,...,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1,APOC1;FTL;C1QA;C1QB;APOE,CL:0000235,Macrophage
AAACGGGTCCTTCAAT-1_1,SY_CA1,2499,946,AAACGGGTCCTTCAAT,2506,953,Epithelial,CL:0000066,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2,SY_Donor_1,...,,Cell Ranger 3.0.1,no,MONDO:0000503,unknown,HsapDv:0000238,SY_CA1,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2,CL:0000066,Epithelial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCAGTTCATGGT-1_12,SY_N3,984,575,TTTGTCAGTTCATGGT,984,575,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3,CD3E;CD3D;CD2;CD3G;CD8A,CL:0000084,T
TTTGTCAGTTCTCATT-1_12,SY_N3,5957,2236,TTTGTCAGTTCTCATT,5957,2236,NK,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1,SY_Donor_3,...,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3,GNLY;NKG7;XCL2;XCL1;KLRF1,CL:0000623,NK
TTTGTCATCCGCTGTT-1_12,SY_N3,913,579,TTTGTCATCCGCTGTT,914,580,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3,CD3E;CD3D;CD2;CD3G;CD8A,CL:0000084,T
TTTGTCATCTCTGCTG-1_12,SY_N3,4190,1653,TTTGTCATCTCTGCTG,4190,1653,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A,SY_Donor_3,...,,Cell Ranger 3.0.1,no,PATO:0000461,unknown,HsapDv:0000238,SY_N3,CD3E;CD3D;CD2;CD3G;CD8A,CL:0000084,T


In [17]:
cell_type_mapping_df = adata.obs[[AUTHOR_CELL_TYPE, CELL_TYPE_ONTOLOGY_ID, MARKER_GENES]].copy().drop_duplicates(subset=AUTHOR_CELL_TYPE).reset_index(drop=True)
cell_type_mapping_df

Unnamed: 0,author_cell_type,cell_type_ontology_term_id,author_cell_type_markers
0,NK,CL:0000623,GNLY;NKG7;XCL2;XCL1;KLRF1
1,Monocyte,CL:0000576,FCN1;VCAN;S100A8;S100A9;CD14
2,Macrophage,CL:0000235,APOC1;FTL;C1QA;C1QB;APOE
3,Epithelial,CL:0000066,EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2
4,Fibroblast,CL:0000057,DCN;MGP;CFD;PTGDS;LUM
5,Dendritic,CL:0000782,CCL17;CD1C;AIF1;CD1A;GPR183
6,B,CL:0000236,CD79A;MS4A1;BANK1;CD79B;CD19
7,Undefined,Undefined,
8,Plasma,CL:0000786,MZB1;JCHAIN;CD27;IGHG1;IGHA1
9,T,CL:0000084,CD3E;CD3D;CD2;CD3G;CD8A


In [18]:
adata.uns[MARKER_GENES] = cell_type_mapping_df
adata

AnnData object with n_obs × n_vars = 38814 × 22547
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'barcodes', 'nUMI', 'nGene', 'Clusters', 'Cell Ontology Class ID', 'Markers', 'donor_id', 'protocol_URL', 'institute', 'sample_collection_site', 'sample_collection_relative_time_point', 'library_ID', 'library_ID_repository', 'author_batch_notes', 'organism_ontology_term_id', 'manner_of_death', 'sample_source', 'sex_ontology_term_id', 'sample_collection_method', 'tissue_type', 'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text', 'sample_preservation_method', 'suspension_type', 'cell_enrichment', 'cell_viability_percentage', 'cell_number_loaded', 'sample_collection_year', 'assay_ontology_term_id', 'library_preparation_batch', 'library_sequencing_run', 'sequenced_fragment', 'sequencing_platform', 'is_primary_data', 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intron_inclusion', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_ter

# Check author cell type annotations and Cell Ontology IDs

In [19]:
adata.obs[AUTHOR_CELL_TYPE].value_counts()

T              11847
Macrophage      6379
NK              5880
Epithelial      3122
Endothelial     2577
Fibroblast      2513
B               2028
Dendritic       1749
Monocyte        1051
Plasma           626
Undefined        541
Mast             501
Name: author_cell_type, dtype: int64

In [20]:
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts()

CL:0000084    11847
CL:0000235     6379
CL:0000623     5880
CL:0000066     3122
CL:0000115     2577
CL:0000057     2513
CL:0000236     2028
CL:0000782     1749
CL:0000576     1051
CL:0000786      626
Undefined       541
CL:0000097      501
Name: cell_type_ontology_term_id, dtype: int64

In [21]:
adata.obs[MARKER_GENES].value_counts()

CD3E;CD3D;CD2;CD3G;CD8A                11847
APOC1;FTL;C1QA;C1QB;APOE                6379
GNLY;NKG7;XCL2;XCL1;KLRF1               5880
EPCAM;SFTPC;SCGB3A1;SFTPB;EMP2          3122
CLDN5;RAMP2;FCN3;SPARCL1;VWF;PECAM1     2577
DCN;MGP;CFD;PTGDS;LUM                   2513
CD79A;MS4A1;BANK1;CD79B;CD19            2028
CCL17;CD1C;AIF1;CD1A;GPR183             1749
FCN1;VCAN;S100A8;S100A9;CD14            1051
MZB1;JCHAIN;CD27;IGHG1;IGHA1             626
None                                     541
TPSAB1;TPSB2;CPA3;TPSD1;MS4A2            501
Name: author_cell_type_markers, dtype: int64

In [22]:
adata.var

Unnamed: 0,gene_name,gene_id
0,FO538757.2,
1,AP006222.2,
2,RP11-206L10.9,
3,LINC00115,
4,NOC2L,ENSG00000188976
...,...,...
22542,NKPD1,ENSG00000179846
22543,GAB4,ENSG00000215568
22544,VPREB1,ENSG00000169575
22545,TFF2,ENSG00000160181


# Check whether ENSEMBL IDs in var

In [24]:
adata.var.rename(columns={'gene_name': 'gene_symbol', 'gene_id': 'ensembl_id'}, inplace=True)
adata.var.index = adata.var['gene_symbol']
adata.var.index.name = 'index'

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
FO538757.2,FO538757.2,
AP006222.2,AP006222.2,
RP11-206L10.9,RP11-206L10.9,
LINC00115,LINC00115,
NOC2L,NOC2L,ENSG00000188976
...,...,...
NKPD1,NKPD1,ENSG00000179846
GAB4,GAB4,ENSG00000215568
VPREB1,VPREB1,ENSG00000169575
TFF2,TFF2,ENSG00000160181


# Check raw data

In [25]:
adata.X = adata.X.astype(np.int64)
adata.X

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
if not sp.issparse(adata.X):
    adata.X = sp.csr_matrix(adata.X)

In [27]:
adata.X.toarray().max()

24444

In [28]:
adata.raw = adata

In [29]:
adata.raw.X

<38814x22547 sparse matrix of type '<class 'numpy.int64'>'
	with 60562822 stored elements in Compressed Sparse Row format>

In [30]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'barcodes', 'nUMI', 'nGene',
       'Clusters', 'Cell Ontology Class ID', 'Markers', 'donor_id',
       'protocol_URL', 'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platform', 'is_primary_data',
       'reference_genome', 'gene_annotation_version', 'alignment_software',
       'intron_inclusion', 'disease_ontology_te

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object

# Data Submission Status

- CHECK: Raw counts in X and in raw
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs and gene symbols in var

In [31]:
adata.obs['cell_viability_percentage'] = pd.to_numeric(adata.obs['cell_viability_percentage'], errors='coerce')
adata.obs['cell_number_loaded'] = pd.to_numeric(adata.obs['cell_number_loaded'], errors='coerce')
adata.obs['sample_collection_year'] = pd.to_numeric(adata.obs['sample_collection_year'], errors='coerce')

In [32]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"), compression='gzip')
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"), compression='gzip')