In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
DATASET_ID = "Polverino_unpubl"
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"
SAMPLE_MAPPING_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/COPD_Francesca_IDs_mapping.csv"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
adata = sc.read_h5ad(H5AD_PATH)
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,All- Tissue of the human lungs,"Polverino, Francesca",-,-,"unpublished, partially under embargo",sc lung data processed in 3 different batches ...


In [6]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,COPD expl 10 LLL,COPD expl 10,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,COPD expl 10,,Batch 2,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
1,COPD expl 10 LUL,COPD expl 10,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,COPD expl 10,,Batch 1,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
2,COPD expl 2 RUL,COPD expl 2,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,COPD expl 2,,Batch 2,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
3,COPD expl 2 LUL,COPD expl 2,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,COPD expl 2,,Batch 1,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
4,COPD expl 4 LLL,COPD expl 4,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,COPD expl 4,,Batch 2,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,UA 13 RLL,UA 13,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,UA 13,,,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0004849,unknown,HsapDv:0000242
74,UA 18 LUL,UA 18,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,UA 18,,,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0004849,unknown,HsapDv:0000243
75,UA 22 LUL,UA 22,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,UA 22,,,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
76,UA 57 LLL,UA 57,Chromium Single Cell V(D)J Reagent Kits,University of Southern California,University of Arizona,,UA 57,,,NCBITaxon:9606,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241


# Validate obs and uns from adata

In [7]:
adata.obs['orig.ident'].value_counts().index.tolist()

['S74',
 'S75',
 '58689',
 'S73',
 '59523',
 'S62',
 'S66',
 '59525',
 'S64',
 'S58',
 '59524',
 'S72',
 'S80',
 'S69',
 'S18',
 'S76',
 'S61',
 'S88',
 'S83',
 'S20',
 'S59',
 '58687',
 'S79',
 'S55',
 'S94',
 '58688',
 'S19',
 'S56',
 'S77',
 'S63',
 'S82',
 'S53',
 'S91',
 'S86',
 'S3',
 'S15',
 'S96',
 'S84',
 'S52',
 'S12',
 'S51',
 'S65',
 'S67',
 'S54',
 'S1',
 'S2',
 'S95',
 'S70',
 'S87',
 'S78',
 'S68',
 'S4',
 '59526',
 'S57',
 'S6',
 'S93',
 'S85',
 'S89',
 'S23',
 'S7',
 'S5',
 'S14',
 'S8',
 'S50',
 'S22',
 'S81',
 'S49',
 'S16',
 'S17',
 'S60',
 'S90',
 'S13',
 'S9',
 'S21',
 'S11',
 'S71',
 'S92',
 'S10']

In [8]:
sample_mapping_df = pd.read_csv(SAMPLE_MAPPING_PATH)
sample_mapping = dict(zip(sample_mapping_df.iloc[:,1], sample_mapping_df.iloc[:,0]))
sample_mapping

{'COPD10LLL': 'COPD expl 10 LLL',
 'COPD10LUL': 'COPD expl 10 LUL',
 'COPD2RUL': 'COPD expl 2 RUL',
 'COPD2LUL': 'COPD expl 2 LUL',
 'COPD4LLL': 'COPD expl 4 LLL',
 'COPD4RUL': 'COPD expl 4 RUL',
 'COPD6LUL': 'COPD expl 6 LUL',
 'COPD6LLL': 'COPD expl 6 LLL',
 'COPD7LUL': 'COPD expl 7 LUL',
 'COPD7LLL': 'COPD expl 7 LLL',
 'COPD8LUL': 'COPD expl 8 LUL',
 'COPD8LLL': 'COPD expl 8 LLL',
 'COPD1LUL': 'COPD expl 1 LUL',
 'COPD3RUL': 'COPD expl 3 RUL',
 'COPD5RLL': 'COPD expl 5 RLL',
 'UA100': 'UA 100 RLL',
 'UA102': 'UA 102 RUL',
 'UA104': 'UA 104 LUL',
 'UA16': 'UA 16 RUL',
 'UA21': 'UA 21 RML',
 'UA26': 'UA 26 RUL',
 'UA28': 'UA 28 RUL',
 'UA33': 'UA 33 RML',
 'UA35': 'UA 35 LLL',
 'UA37': 'UA 37 RUL',
 'UA39': 'UA 39 LUL',
 'UA40': 'UA 40 LUL',
 'UA42': 'UA 42 RLL',
 'UA44': 'UA 44 RUL',
 'UA45': 'UA 45 RLL',
 'UA48': 'UA 48 LLL',
 'UA50': 'UA 50 RUL',
 'UA53': 'UA 53 LLL',
 'UA56': 'UA 56 LLL',
 'UA58': 'UA 58 LUL',
 'UA60': 'UA 60 RUL',
 'UA64': 'UA 64 LUL',
 'UA65': 'UA 65 LLL',
 'UA

In [9]:
adata.obs['Subjet_Lobe_Specific'].value_counts().index.tolist()

['UA45',
 'UA42',
 'COPD1LUL',
 'COPD4LLL',
 'COPD10LUL',
 'UA58',
 'COPD5RLL',
 'UA5',
 'UA64',
 'UA66',
 'UA100',
 'UA56',
 'UA16',
 'DNAZ5LUL',
 'COPD2RUL',
 'COPD4RUL',
 'UA104',
 'UA50',
 'UA28',
 'DNAZ7LLL',
 'UA102',
 'COPD6LUL',
 'COPD8LLL',
 'UA96',
 'UA26',
 'DNAZ6LUL',
 'UA90',
 'UA61',
 'UA27',
 'UA74',
 'UA59',
 'UA33',
 'COPD3RUL',
 'DNAZ7RLL',
 'UA40',
 'UA30',
 'UA73',
 'DNAZ1LLL',
 'UA70',
 'UA65',
 'UA21',
 'UA78',
 'COPD2LUL',
 'COPD8LUL',
 'UA37',
 'UA35',
 'UA36',
 'COPD6LLL',
 'UA60',
 'UA38',
 'COPD7LLL',
 'UA94',
 'UA43',
 'COPD10LLL',
 'UA32',
 'UA51',
 'UA44',
 'UA41',
 'DNAZ6LLL',
 'UA48',
 'UA69',
 'DNAZ4RML',
 'COPD7LUL',
 'UA72',
 'DNAZ2LUL',
 'DNAZ4RUL',
 'UA95',
 'UA53',
 'DNAZ4RLL',
 'DNAZ1LUL',
 'UA39',
 'UA46']

In [10]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percMALAT1', 'percSpliced',
       'percent.mito', 'batch', 'S.Score', 'G2M.Score', 'Phase',
       'predicted.ann_level_1', 'predicted.ann_level_2',
       'predicted.ann_level_3', 'predicted.ann_level_4',
       'predicted.ann_level_5', 'predicted.ann_finest_level', 'mapping.score',
       'doublet', 'cell', 'Subjet_Lobe_Specific', 'Subject', 'Smoke', 'GOLD',
       'Age', 'Sex', 'Disease', 'Disease2', 'FEV1', 'FEV1.FVC', 'QuitDate',
       'Bronchodilator', 'Leukotriene', 'Steroids', 'Lobe', 'CANCER',
       'Follicle', 'totalemph', 'lobeemph', 'lobegroup', 'lobegroup2', 'Diag2',
       'PackYear', 'USC.CORE.ID', 'percent_ribo', 'cell_passed_qc',
       'n_counts_ribo', 'log1p_n_genes', 'log1p_n_counts', 'n_counts',
       'percent_top50', 'percent_mito', 'n_genes', 'n_counts_mito',
       'n_counts_hb', 'percent_hb', 'cell_type_ontology'],
      dtype='object')

In [11]:
adata.obs['sample_ID'] = adata.obs['Subjet_Lobe_Specific'].map(sample_mapping)

adata.obs['sample_ID'].value_counts(dropna=False)

UA 45 RLL           11359
UA 42 RLL           10519
COPD expl 1 LUL      9630
COPD expl 4 LLL      7980
COPD expl 10 LUL     7740
                    ...  
UA 53 LLL             702
DNAZ 4 RLL            645
DNAZ 1 LUL            460
UA 39 LUL             400
UA 46 RUL             304
Name: sample_ID, Length: 72, dtype: int64

In [12]:
# check non overlap between adata.obs['orig.ident'] and obs['sample_ID']

adata_ids = adata.obs['sample_ID'].value_counts().index.tolist()
obs_ids = obs['sample_ID'].value_counts().index.tolist()

non_overlap = list(set(adata_ids) - set(obs_ids))
non_overlap_other_side = list(set(obs_ids) - set(adata_ids))

In [13]:
non_overlap_other_side

['UA 11 RLL', 'UA 22 LUL', 'UA 57 LLL', 'UA 71 RUL', 'UA 18 LUL', 'UA 13 RLL']

In [14]:
non_overlap

[]

In [15]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 205102 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percMALAT1', 'percSpliced', 'percent.mito', 'batch', 'S.Score', 'G2M.Score', 'Phase', 'predicted.ann_level_1', 'predicted.ann_level_2', 'predicted.ann_level_3', 'predicted.ann_level_4', 'predicted.ann_level_5', 'predicted.ann_finest_level', 'mapping.score', 'doublet', 'cell', 'Subjet_Lobe_Specific', 'Subject', 'Smoke', 'GOLD', 'Age', 'Sex', 'Disease', 'Disease2', 'FEV1', 'FEV1.FVC', 'QuitDate', 'Bronchodilator', 'Leukotriene', 'Steroids', 'Lobe', 'CANCER', 'Follicle', 'totalemph', 'lobeemph', 'lobegroup', 'lobegroup2', 'Diag2', 'PackYear', 'USC.CORE.ID', 'percent_ribo', 'cell_passed_qc', 'n_counts_ribo', 'log1p_n_genes', 'log1p_n_counts', 'n_counts', 'percent_top50', 'percent_mito', 'n_genes', 'n_counts_mito', 'n_counts_hb', 'percent_hb', 'cell_type_ontology', 'sample_ID'
    var: 'name', 'feature_type'
    uns: 'Subject_colors', 'cell_passed_qc_colors', 'predicted.ann_level_1_colo

In [16]:
# Merge obs and uns
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'sample_ID',
    df_col = 'sample_ID',
    skip = None,
)

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percMALAT1,percSpliced,percent.mito,batch,S.Score,G2M.Score,Phase,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
58687_AAACCTGCAGCTCCGA-1,58687,1252.0,721,2.476038,38.338658,12.619808,Batch3,-0.020504,0.002282,G2M,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
58687_AAAGATGCAAACGTGG-1,58687,539.0,330,0.000000,96.103896,0.371058,Batch3,0.087622,0.144198,G2M,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
58687_AAAGATGCAGGCGATA-1,58687,2471.0,1389,6.677459,81.505463,2.670983,Batch3,-0.036907,-0.043963,G1,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
58687_AAAGATGCAGGGTACA-1,58687,4127.0,1734,2.762297,87.206203,2.495760,Batch3,-0.017784,-0.072971,G1,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
58687_AAAGATGTCTGTTTGT-1,58687,5119.0,2222,2.285603,91.599922,2.402813,Batch3,-0.025609,-0.021068,G1,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S96_TTGGAACAGAGTAAGG-1,S96,1635.0,790,4.097859,50.030581,24.525994,Batch1,-0.011340,-0.042933,G1,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238
S96_TTGGCAAGTCCTAGCG-1,S96,545.0,426,5.137615,74.495413,2.385321,Batch1,0.015022,-0.017849,S,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238
S96_TTGTAGGGTATAGTAG-1,S96,1425.0,778,6.736842,60.280702,14.877193,Batch1,-0.008997,0.007106,G2M,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238
S96_TTTCCTCTCGGAAATA-1,S96,1130.0,692,6.283186,59.911504,16.637168,Batch1,-0.035149,-0.039074,G1,...,5 prime tag,"EFO:0009173, EFO:0008637, EFO:0004205",true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238


# Add author cell type markers to UNS

In [17]:
# Marker genes not provided

# Check author cell type annotations and Cell Ontology IDs

In [18]:
# Adding  author_cell_type
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['cell']

In [19]:
# Just cell ontology term and not IDs present
adata.obs['cell_type_ontology'].value_counts().index.to_list()

['macrophage',
 'CD4-positive helper T cell',
 'natural killer cell',
 'monocyte',
 'CD8-positive, alpha-beta T cell',
 'plasmacytoid dendritic cell',
 'B cell',
 'ciliated cell',
 'type II pneumocyte',
 'pro-T cell',
 'conventional dendritic cell',
 'regulatory T cell',
 'T cell',
 'nan',
 'mast cell',
 'alveolar capillary type 1 endothelial cell',
 'type I pneumocyte',
 'gamma-delta T cell',
 'lymphatic vessel',
 'myeloid dendritic cell',
 'artery',
 'alveolar type 2 fibroblast cell',
 'secretory cell',
 'respiratory basal cell',
 'plasma cell',
 'alveolar capillary type 2 endothelial cell',
 'mesothelium',
 'smooth muscle cell',
 'fibroblast of lung',
 'pulmonary vein',
 'stromal cell',
 'systemic vein',
 'pericyte',
 'alveolar type 1 fibroblast cell',
 'lung endothelial cell',
 'adult endothelial progenitor cell']

In [20]:
cell_ontology_term_id = {
    'macrophage': 'CL:0000235',
    'CD4-positive helper T cell': 'CL:0000492',
    'natural killer cell': 'CL:0000623',
    'monocyte': 'CL:0000576',
    'CD8-positive, alpha-beta T cell': 'CL:0000625',
    'plasmacytoid dendritic cell': 'CL:0000784',
    'B cell': 'CL:0000236',
    'ciliated cell': 'CL:0000064',
    'type II pneumocyte': '',
    'pro-T cell': 'CL:0002063',
    'conventional dendritic cell': 'CL:0000990',
    'regulatory T cell': 'CL:0000815',
    'T cell': 'CL:0000084',
    'nan': 'NA',
    'mast cell': 'CL:0000097',
    'alveolar capillary type 1 endothelial cell': 'CL:4028002',
    'type I pneumocyte': 'CL:0002062',
    'gamma-delta T cell': 'CL:0000798',
    'lymphatic vessel': 'UBERON:0001473',
    'myeloid dendritic cell': 'CL:0000782',
    'artery': 'UBERON:0001637',
    'alveolar type 2 fibroblast cell': 'CL:4028006',
    'secretory cell': 'CL:0000151',
    'respiratory basal cell': 'CL:0002633',
    'plasma cell': 'CL:0000786',
    'alveolar capillary type 2 endothelial cell': 'CL:4028003',
    'mesothelium': 'UBERON:0001136',
    'smooth muscle cell': 'CL:0000192',
    'fibroblast of lung': 'CL:0002553',
    'pulmonary vein': 'UBERON:0002016',
    'stromal cell': 'CL:0000499',
    'systemic vein': 'UBERON:0013140',
    'pericyte': 'CL:0000669',
    'alveolar type 1 fibroblast cell': 'CL:4028004',
    'lung endothelial cell': 'CL:1001567',
    'adult endothelial progenitor cell': 'CL:0002619'
}

In [21]:
# Add cell_type_ontology_term_id
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs['cell_type_ontology'].map(cell_ontology_term_id)

In [22]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percMALAT1,percSpliced,percent.mito,batch,S.Score,G2M.Score,Phase,...,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,author_cell_type,cell_type_ontology_term_id
58687_AAACCTGCAGCTCCGA-1,58687,1252.0,721,2.476038,38.338658,12.619808,Batch3,-0.020504,0.002282,G2M,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241,T_CD4,CL:0000492
58687_AAAGATGCAAACGTGG-1,58687,539.0,330,0.000000,96.103896,0.371058,Batch3,0.087622,0.144198,G2M,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241,T_CD4,CL:0000492
58687_AAAGATGCAGGCGATA-1,58687,2471.0,1389,6.677459,81.505463,2.670983,Batch3,-0.036907,-0.043963,G1,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241,NK,CL:0000623
58687_AAAGATGCAGGGTACA-1,58687,4127.0,1734,2.762297,87.206203,2.495760,Batch3,-0.017784,-0.072971,G1,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241,NK,CL:0000623
58687_AAAGATGTCTGTTTGT-1,58687,5119.0,2222,2.285603,91.599922,2.402813,Batch3,-0.025609,-0.021068,G1,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000241,NK,CL:0000623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S96_TTGGAACAGAGTAAGG-1,S96,1635.0,790,4.097859,50.030581,24.525994,Batch1,-0.011340,-0.042933,G1,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238,AT2,
S96_TTGGCAAGTCCTAGCG-1,S96,545.0,426,5.137615,74.495413,2.385321,Batch1,0.015022,-0.017849,S,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238,AT1,CL:0002062
S96_TTGTAGGGTATAGTAG-1,S96,1425.0,778,6.736842,60.280702,14.877193,Batch1,-0.008997,0.007106,G2M,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238,AT2,
S96_TTTCCTCTCGGAAATA-1,S96,1130.0,692,6.283186,59.911504,16.637168,Batch1,-0.035149,-0.039074,G1,...,true,GRCh38,v98,Cell Ranger Version 7.1,no,MONDO:0005002,unknown,HsapDv:0000238,AT1,CL:0002062


# Check whether ENSEMBL IDs in var

In [23]:
adata.var

Unnamed: 0_level_0,name,feature_type
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression
ENSG00000237613,FAM138A,Gene Expression
ENSG00000186092,OR4F5,Gene Expression
ENSG00000238009,AL627309.1,Gene Expression
ENSG00000239945,AL627309.3,Gene Expression
...,...,...
ENSG00000277836,AC141272.1,Gene Expression
ENSG00000278633,AC023491.2,Gene Expression
ENSG00000276017,AC007325.1,Gene Expression
ENSG00000278817,AC007325.4,Gene Expression


In [24]:
adata.var['ensembl_id'] = adata.var.index
adata.var.rename(columns={'name': 'gene_symbol'}, inplace=True)
adata.var.index.name = 'index'
adata.var

Unnamed: 0_level_0,gene_symbol,feature_type,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,MIR1302-2HG,Gene Expression,ENSG00000243485
ENSG00000237613,FAM138A,Gene Expression,ENSG00000237613
ENSG00000186092,OR4F5,Gene Expression,ENSG00000186092
ENSG00000238009,AL627309.1,Gene Expression,ENSG00000238009
ENSG00000239945,AL627309.3,Gene Expression,ENSG00000239945
...,...,...,...
ENSG00000277836,AC141272.1,Gene Expression,ENSG00000277836
ENSG00000278633,AC023491.2,Gene Expression,ENSG00000278633
ENSG00000276017,AC007325.1,Gene Expression,ENSG00000276017
ENSG00000278817,AC007325.4,Gene Expression,ENSG00000278817


# Check raw data

In [25]:
adata.X.toarray().max()

22076.0

In [26]:
adata.X = adata.X.astype(np.int64)

In [27]:
adata.raw = adata

In [28]:
adata.X

<205102x36601 sparse matrix of type '<class 'numpy.int64'>'
	with 301276859 stored elements in Compressed Sparse Column format>

In [29]:
adata.raw.X

<205102x36601 sparse matrix of type '<class 'numpy.int64'>'
	with 301276859 stored elements in Compressed Sparse Column format>

In [30]:
adata.X.toarray().max()

22076

In [31]:
adata.raw.X.toarray().max()

22076

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object  same as Google Sheet

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object same as Google Sheet

# Data Submission Status

- CHECK: Raw counts in X and in raw
- PARTIALLY MISSING: Tier 1 Metadata in OBS: check 'N/A' in 'sample_collection_method'
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- MISSING: Marker genes in UNS: not needed for now
- CHECK: ENSEMBL IDs and gene symbols in var

# Revisions:

DONE

Not ideal: some 'N/A's in 'sample_collection_method'

In [32]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))