In [1]:
import os
import warnings
from os.path import join

import anndata as ad
import anndata2ri
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as sp

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

# JSON key file handed to `get_gspread_df` (presumably Google Sheets API
# credentials - confirm). The HLCA_GSPREAD_JSON environment variable overrides
# the location so the notebook is not tied to one user's home directory; the
# original path remains the default.
GSPREAD_JSON = os.environ.get(
    "HLCA_GSPREAD_JSON",
    "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json",
)

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [8]:
# Identifier of the dataset being ingested; it names both the dataset folder
# and the input files inside that folder.
DATASET_ID = "Yildirim_unpubl"

# Output locations for the pre- and post-revision AnnData objects.
OUTPUT_PATH_POSTREVISION = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision"
OUTPUT_PATH_PREREVISION = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision"

# Input files of this dataset: the serialized R object and the count matrix.
RDS_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.rds"
MTX_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.mtx"

In [3]:
# Name constants for the cell type annotation fields, grouped by level.

# Level 0
AUTHOR_CELL_TYPE_L0 = "author_cell_type_level_0"
CELL_TYPE_ONTOLOGY_ID_L0 = "cell_type_ontology_term_id_level_0"
CELL_TYPE_ONTOLOGY_LABEL_L0 = "cell_type_ontology_term_label_level_0"
AUTHOR_CELL_TYPE_DESCRIPTION_L0 = "author_cell_type_description_level_0"
MARKER_GENES_L0 = "author_cell_type_markers_level_0"

# Level 1
AUTHOR_CELL_TYPE_L1 = "author_cell_type_level_1"
CELL_TYPE_ONTOLOGY_ID_L1 = "cell_type_ontology_term_id_level_1"
CELL_TYPE_ONTOLOGY_LABEL_L1 = "cell_type_ontology_term_label_level_1"
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = "author_cell_type_description_level_1"
MARKER_GENES_L1 = "author_cell_type_markers_level_1"

# Generic (level-free) names: the finest-grained annotation is used as the
# dataset's generic cell type.
AUTHOR_CELL_TYPE = "author_cell_type"
CELL_TYPE_ONTOLOGY_ID = "cell_type_ontology_term_id"
CELL_TYPE_ONTOLOGY_LABEL = "cell_type_ontology_term_label"
AUTHOR_CELL_TYPE_DESCRIPTION = "author_cell_type_description"
MARKER_GENES = "author_cell_type_markers"

# Load data

In [53]:
# Fetch the "tier_1" obs and uns metadata tables for this dataset (obs first,
# then uns), using the same credentials file and dataset ID for both.
obs, uns = (
    get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", axis_name)
    for axis_name in ("obs", "uns")
)

In [5]:
# Run the validation workflow on the uns metadata table and display the result.
val_workflow = ValidationWorkflow(input=uns, axis='uns')
validated_uns = val_workflow.init_workflow()

validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,COPD_lung_tissue_snRNASeq,"A,Oender,Yildirim",batch,X_umap,"unpublished, protected under embargo","Technology is 10X Flex (not in EFO),"


In [6]:
# Run the validation workflow on the obs metadata table and display the
# result. `val_workflow` is rebound here and now refers to the obs workflow.
val_workflow = ValidationWorkflow(input=obs, axis='obs')
validated_obs = val_workflow.init_workflow()

validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,A22-4,A22,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_1,,Batch_1,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,PATO:0000461,unknown,HsapDv:0000242
1,A20-21,A20,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_1,,Batch_1,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCA_009914755.4,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
2,A6-9,A6,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_1,,Batch_1,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
3,A6-42,A6,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_1,,Batch_1,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
4,A12-30,A12,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_1,,Batch_1,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,A111-9,A111,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_7,,Batch_7,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000242
74,A110-73,A110,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_8,,Batch_8,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
75,A118-44,A118,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_8,,Batch_8,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
76,B77-30,A77,https://www.10xgenomics.com/support/software/c...,Helmholtz Munich,DATAR_site_1,,Library_Batch_8,,Batch_8,NCBITaxon:9606,...,probe-based,EFO:0008637,true,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241


In [7]:
# Read the Matrix Market file and transpose it in one step.
# NOTE(review): `adata` is rebound further down by the `%%R -o adata` cell, so
# in a top-to-bottom run this object is replaced - confirm it is still needed.
adata = sc.read_mtx(MTX_PATH).T

In [7]:
# Per-cell metadata table delivered with the dataset; the first CSV column
# (cell barcodes in the output below) becomes the index.
metadata = pd.read_csv(
    f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/copd.metadata_AGYildirim_DD.csv",
    index_col=0,
)
metadata

Unnamed: 0,nCount_RNA,nFeature_RNA,percent.mt,barcodes,sample_id,disease_stage,patient_id,batch_number,nCount_SCT,nFeature_SCT,SCT_snn_res.2,seurat_clusters,assigned_celltype
AAACCAATCATTAGTCACTTTAGG-1,743,623,2.489906,AAACCAATCATTAGTCACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1486,623,45,45,Alveolar Fibroblasts
AAACCAGGTTCGATGTACTTTAGG-1,870,640,1.130952,AAACCAGGTTCGATGTACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1680,640,11,11,AT2 cells
AAACGGGCAAGCTAATACTTTAGG-1,809,659,2.107280,AAACGGGCAAGCTAATACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1566,659,4,4,EC general capillary
AAACTGTCAGCAAGCTACTTTAGG-1,612,505,1.474011,AAACTGTCAGCAAGCTACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1289,513,26,26,Transitioning AT2 cells
AAAGCATGTAAGGCCGACTTTAGG-1,2001,1340,0.949525,AAAGCATGTAAGGCCGACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,2001,1340,15,15,Alveolar Mφ
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCTGAGGCCAACAAGCTGTGA-80,1213,809,1.785714,TTTGCTGAGGCCAACAAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,1512,809,20,20,Alveolar Fibroblasts
TTTGCTGAGTCCTGCGAGCTGTGA-80,11710,4607,1.256078,TTTGCTGAGTCCTGCGAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,2468,1711,5,5,Lipid associated Mφ
TTTGGACGTCGTCCAAAGCTGTGA-80,6699,3176,0.253293,TTTGGACGTCGTCCAAAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,1974,1241,26,26,Transitioning AT2 cells
TTTGGCGGTTTAGTTGAGCTGTGA-80,6064,2760,1.669359,TTTGGCGGTTTAGTTGAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,1857,963,13,13,AT2 cells


In [9]:
# Use the metadata table as the obs annotation of `adata`, then display the object.
# NOTE(review): nothing in this cell checks that the CSV rows are in the same
# order as the matrix rows - confirm the two files are aligned.
adata.obs = metadata
adata

AnnData object with n_obs × n_vars = 195067 × 17773
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'barcodes', 'sample_id', 'disease_stage', 'patient_id', 'batch_number', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.2', 'seurat_clusters', 'assigned_celltype'

In [16]:
# Probe set reference CSV: the first 5 lines of the file are skipped and the
# first remaining column (gene_id in the output below) becomes the index.
probe_set = pd.read_csv(
    f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/Chromium_Human_Transcriptome_Probe_Set_v1.0.1_GRCh38-2020-A.csv",
    index_col=0,
    skiprows=5,
)
probe_set

Unnamed: 0_level_0,probe_seq,probe_id,included,region
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,GGTGACACCACAACAATGCAACGTATTTTGGATCTTGTCTACTGCA...,ENSG00000000003|TSPAN6|8eab823,True,spliced
ENSG00000000003,TCTGCATCTCTCTGTGGAGTACAATCTTCAAGTTTACAGCAACTCT...,ENSG00000000003|TSPAN6|9d7fe51,True,unspliced
ENSG00000000003,AAAGCTGTTCTTAATCTCATGTCTGAAAACAAATCCTACGATGGCA...,ENSG00000000003|TSPAN6|d2b5833,True,spliced
ENSG00000000005,CGTGACGGGTCTTCTCTACTTTCACTTGAGGGACCACCCACTGTTC...,ENSG00000000005|TNMD|7790621,True,unspliced
ENSG00000000005,GCCTCGACGGCAGTAAATACAACAATAACCTCTCTCATCCAGCATG...,ENSG00000000005|TNMD|923f04b,True,unspliced
...,...,...,...,...
DEPRECATED_ENSG00000278803,CCTGGAGTAGGAGCAATGGCCTTGTGTTTCCGTGACCTGTCAGACC...,DEPRECATED_ENSG00000278803|AC236972.4|53ec843,False,unspliced
DEPRECATED_ENSG00000278803,ACCCTCTCCGGGGTCCTGAATGTTTCCCGAGAAGGGAGAACTTTCC...,DEPRECATED_ENSG00000278803|AC236972.4|b4ceb8b,False,unspliced
DEPRECATED_ENSG00000283967,CAGCAGGGTTGTCATCCTCTGAGCTTCCTCCGCATCCACGGTGGGC...,DEPRECATED_ENSG00000283967|TAF11L8|5800ed2,False,unspliced
DEPRECATED_ENSG00000284873,AGCGCAGGTCCATATCAAGGAATGCAGATAAAAGAGCCCTTTGAAC...,DEPRECATED_ENSG00000284873|OOSP1|b83341e,False,unspliced


In [17]:
# `probe_id` has the form "<gene_id>|<gene_symbol>|<hash>"; the gene symbol is
# the second '|'-separated field and is stored in a new 'gene_symbol' column.
probe_set['gene_symbol'] = probe_set['probe_id'].str.split('|').str.get(1)
probe_set

Unnamed: 0_level_0,probe_seq,probe_id,included,region,gene_symbol
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,GGTGACACCACAACAATGCAACGTATTTTGGATCTTGTCTACTGCA...,ENSG00000000003|TSPAN6|8eab823,True,spliced,TSPAN6
ENSG00000000003,TCTGCATCTCTCTGTGGAGTACAATCTTCAAGTTTACAGCAACTCT...,ENSG00000000003|TSPAN6|9d7fe51,True,unspliced,TSPAN6
ENSG00000000003,AAAGCTGTTCTTAATCTCATGTCTGAAAACAAATCCTACGATGGCA...,ENSG00000000003|TSPAN6|d2b5833,True,spliced,TSPAN6
ENSG00000000005,CGTGACGGGTCTTCTCTACTTTCACTTGAGGGACCACCCACTGTTC...,ENSG00000000005|TNMD|7790621,True,unspliced,TNMD
ENSG00000000005,GCCTCGACGGCAGTAAATACAACAATAACCTCTCTCATCCAGCATG...,ENSG00000000005|TNMD|923f04b,True,unspliced,TNMD
...,...,...,...,...,...
DEPRECATED_ENSG00000278803,CCTGGAGTAGGAGCAATGGCCTTGTGTTTCCGTGACCTGTCAGACC...,DEPRECATED_ENSG00000278803|AC236972.4|53ec843,False,unspliced,AC236972.4
DEPRECATED_ENSG00000278803,ACCCTCTCCGGGGTCCTGAATGTTTCCCGAGAAGGGAGAACTTTCC...,DEPRECATED_ENSG00000278803|AC236972.4|b4ceb8b,False,unspliced,AC236972.4
DEPRECATED_ENSG00000283967,CAGCAGGGTTGTCATCCTCTGAGCTTCCTCCGCATCCACGGTGGGC...,DEPRECATED_ENSG00000283967|TAF11L8|5800ed2,False,unspliced,TAF11L8
DEPRECATED_ENSG00000284873,AGCGCAGGTCCATATCAAGGAATGCAGATAAAAGAGCCCTTTGAAC...,DEPRECATED_ENSG00000284873|OOSP1|b83341e,False,unspliced,OOSP1


In [32]:
# Keep only the first row per gene symbol so that every symbol appears once.
is_repeated_symbol = probe_set.duplicated(subset='gene_symbol', keep='first')
probe_set = probe_set[~is_repeated_symbol]
probe_set


Unnamed: 0_level_0,probe_seq,probe_id,included,region,gene_symbol
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,GGTGACACCACAACAATGCAACGTATTTTGGATCTTGTCTACTGCA...,ENSG00000000003|TSPAN6|8eab823,True,spliced,TSPAN6
ENSG00000000005,CGTGACGGGTCTTCTCTACTTTCACTTGAGGGACCACCCACTGTTC...,ENSG00000000005|TNMD|7790621,True,unspliced,TNMD
ENSG00000000419,TTGTAGCGAGTTCCAGAGACAATATCAAAATTACCCTCCTTTTGCT...,ENSG00000000419|DPM1|73ef065,True,unspliced,DPM1
ENSG00000000457,AAGCAAAGCAGGAATAGGCTTCTGCTCCCCTGAGGTAACAGGTTTT...,ENSG00000000457|SCYL3|930c2e3,True,unspliced,SCYL3
ENSG00000000460,GTGGTCTGGATTTCCAAATTTTGGCTTGACAGTCCTCTGGCTTGAC...,ENSG00000000460|C1orf112|0b8a2b7,True,unspliced,C1orf112
...,...,...,...,...,...
ENSG00000286265,CAAGGCCGCGCGAATGCCCCGGGCCAGCCCCAACCACTGTCCCTAG...,ENSG00000286265|AC007244.1|2b89ff8,True,unspliced,AC007244.1
DEPRECATED_ENSG00000034713,CCTGTCGGGATATTTCGCTCGAATCTTCGCGGACTCCACGCATCTG...,DEPRECATED_ENSG00000034713|GABARAPL2|25bf87a,False,unspliced,GABARAPL2
DEPRECATED_ENSG00000147804,GTACTTTCAACATCGCCGGGAGCATGTCGCAGAGTGCTACGTAGAG...,DEPRECATED_ENSG00000147804|SLC39A4|1998f7f,False,spliced,SLC39A4
DEPRECATED_ENSG00000163931,TCACAGCTTGTGCAATGGCATCCCTGTCGATACCAAACATCTTCAG...,DEPRECATED_ENSG00000163931|TKT|ef04907,False,unspliced,TKT


In [12]:
# Descriptive details of the probe set used for this dataset.
probe_set_details = dict(
    probe_set_file_format='2.0',
    panel_name='Chromium Human Transcriptome Probe Set v1.0.1',
    panel_type='predesigned',
    reference_genome='GRCh38',
    reference_version='2020-A',
)

In [9]:
# Turn on the anndata2ri conversion and load the rpy2 IPython extension, which
# provides the `%%R` cell magic used in the following cells.
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [10]:
%%R -i RDS_PATH

# Load Seurat without startup messages, then read the serialized object from
# RDS_PATH (passed in from Python via `-i`).
suppressPackageStartupMessages(library(Seurat))
rds <- readRDS(file=RDS_PATH)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [36]:
%%R -o adata

# Convert the loaded object to a SingleCellExperiment and print its summary.
# `-o adata` exports the result to Python, rebinding the Python name `adata`.
adata <- as.SingleCellExperiment(rds)
adata

class: SingleCellExperiment 
dim: 17773 195067 
metadata(0):
assays(2): counts logcounts
rownames(17773): SAMD11 NOC2L ... MT-ND6 MT-CYB
rowData names(0):
colnames(195067): AAACCAATCATTAGTCACTTTAGG-1 AAACCAGGTTCGATGTACTTTAGG-1
  ... TTTGGCGGTTTAGTTGAGCTGTGA-80 TTTGTGAGTGTTGCTGAGCTGTGA-80
colData names(35): orig.ident nCount_RNA ... assigned_celltype ident
reducedDimNames(1): REF.UMAP
mainExpName: RNA
altExpNames(7): prediction.score.ann_level_1
  prediction.score.ann_level_2 ... prediction.score.ann_finest_level
  SCT


1: In .check_reddim_names(x, value, withDimnames) :
  non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor.
2: In .check_reddim_names(x, value, withDimnames) :
  non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor.
3: In .check_reddim_names(x, value, withDimnames) :
  non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor.
4: In .check_reddim_names(x, value, withDimnames) :
  non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor.


  return AnnData(exprs, obs, var, uns, obsm, layers=layers)


In [37]:
# Inspect the var table of the converted object (indexed by gene symbol at this point).
adata.var

SAMD11
NOC2L
KLHL17
PLEKHN1
PERM1
...
MT-ND4L
MT-ND4
MT-ND5
MT-ND6
MT-CYB


In [38]:
# Symbols carrying a ".1" suffix and the plain symbol each is renamed to.
replacement_dict = {'TBCE.1': 'TBCE', 'HSPA14.1': 'HSPA14', 'TMSB15B.1': 'TMSB15B'}

# Apply the renaming to the probe set's 'gene_symbol' column.
probe_set['gene_symbol'] = probe_set['gene_symbol'].replace(to_replace=replacement_dict)

In [39]:
# Inspect the probe set after the symbol replacement.
probe_set

Unnamed: 0_level_0,probe_seq,probe_id,included,region,gene_symbol
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000000003,GGTGACACCACAACAATGCAACGTATTTTGGATCTTGTCTACTGCA...,ENSG00000000003|TSPAN6|8eab823,True,spliced,TSPAN6
ENSG00000000005,CGTGACGGGTCTTCTCTACTTTCACTTGAGGGACCACCCACTGTTC...,ENSG00000000005|TNMD|7790621,True,unspliced,TNMD
ENSG00000000419,TTGTAGCGAGTTCCAGAGACAATATCAAAATTACCCTCCTTTTGCT...,ENSG00000000419|DPM1|73ef065,True,unspliced,DPM1
ENSG00000000457,AAGCAAAGCAGGAATAGGCTTCTGCTCCCCTGAGGTAACAGGTTTT...,ENSG00000000457|SCYL3|930c2e3,True,unspliced,SCYL3
ENSG00000000460,GTGGTCTGGATTTCCAAATTTTGGCTTGACAGTCCTCTGGCTTGAC...,ENSG00000000460|C1orf112|0b8a2b7,True,unspliced,C1orf112
...,...,...,...,...,...
ENSG00000286265,CAAGGCCGCGCGAATGCCCCGGGCCAGCCCCAACCACTGTCCCTAG...,ENSG00000286265|AC007244.1|2b89ff8,True,unspliced,AC007244.1
DEPRECATED_ENSG00000034713,CCTGTCGGGATATTTCGCTCGAATCTTCGCGGACTCCACGCATCTG...,DEPRECATED_ENSG00000034713|GABARAPL2|25bf87a,False,unspliced,GABARAPL2
DEPRECATED_ENSG00000147804,GTACTTTCAACATCGCCGGGAGCATGTCGCAGAGTGCTACGTAGAG...,DEPRECATED_ENSG00000147804|SLC39A4|1998f7f,False,spliced,SLC39A4
DEPRECATED_ENSG00000163931,TCACAGCTTGTGCAATGGCATCCCTGTCGATACCAAACATCTTCAG...,DEPRECATED_ENSG00000163931|TKT|ef04907,False,unspliced,TKT


In [48]:
# Re-index var by Ensembl gene ID:
#   1. keep the current index (gene symbols) as a 'gene_symbol' column and
#      apply the same suffix renaming that was applied to the probe set,
#   2. look up each symbol's gene ID in the probe set,
#   3. use the looked-up IDs as the new var index.
adata.var['gene_symbol'] = adata.var.index.astype(str)
adata.var['gene_symbol'] = adata.var['gene_symbol'].replace(replacement_dict)

symbol_to_ensembl = dict(zip(probe_set['gene_symbol'], probe_set.index))
adata.var['ensembl_id'] = adata.var['gene_symbol'].map(symbol_to_ensembl)

# A symbol missing from the probe set maps to NaN and would silently end up as
# a missing index entry, so report it instead of failing quietly.
unmapped_symbols = adata.var.loc[adata.var['ensembl_id'].isna(), 'gene_symbol']
if not unmapped_symbols.empty:
    warnings.warn(
        f"{len(unmapped_symbols)} gene symbols have no Ensembl ID in the probe set "
        f"and get a missing var index, e.g. {unmapped_symbols.head(10).tolist()}"
    )

# Renaming suffixed symbols (see replacement_dict) can make two rows share one
# symbol and therefore one Ensembl ID; report duplicated IDs as well.
mapped_ids = adata.var['ensembl_id'].dropna()
duplicated_ids = mapped_ids[mapped_ids.duplicated(keep=False)].unique()
if len(duplicated_ids) > 0:
    warnings.warn(
        f"{len(duplicated_ids)} Ensembl IDs occur more than once in the var index, "
        f"e.g. {list(duplicated_ids[:10])}"
    )

adata.var.index = adata.var['ensembl_id']
adata.var.index.name = 'index'

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000187634,SAMD11,ENSG00000187634
ENSG00000188976,NOC2L,ENSG00000188976
ENSG00000187961,KLHL17,ENSG00000187961
ENSG00000187583,PLEKHN1,ENSG00000187583
ENSG00000187642,PERM1,ENSG00000187642
...,...,...
ENSG00000212907,MT-ND4L,ENSG00000212907
ENSG00000198886,MT-ND4,ENSG00000198886
ENSG00000198786,MT-ND5,ENSG00000198786
ENSG00000198695,MT-ND6,ENSG00000198695


# Validate obs and uns from adata

In [54]:
# Compare the sample IDs present in the data with those in the obs metadata
# table. Each side is materialised once (the original re-evaluated
# `unique()` / `value_counts()` for every element) and membership is tested
# against a set. The earlier bare symmetric_difference expression was dropped:
# its result was neither stored nor displayed.
adata_sample_ids = adata.obs['sample_id'].value_counts().index
sheet_sample_ids = obs['sample_ID'].unique()
adata_sample_id_set = set(adata_sample_ids)
sheet_sample_id_set = set(sheet_sample_ids)

# Sample IDs found in adata.obs but not in the metadata table.
non_overlap = [x for x in adata_sample_ids if x not in sheet_sample_id_set]

# Sample IDs found in the metadata table but not in adata.obs.
non_overlap_other_side = [x for x in sheet_sample_ids if x not in adata_sample_id_set]

In [55]:
# Sample IDs in adata.obs that are missing from the metadata table (empty list = none).
non_overlap

[]

In [56]:
# Sample IDs in the metadata table that are missing from adata.obs (empty list = none).
non_overlap_other_side

[]

In [58]:
# Add the uns metadata table to `adata` through the project merger, then show
# the updated object.
merger = AnnDataMerger(adata=adata, uns_df=uns)
adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 195067 × 17773
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'barcodes', 'sample_id', 'disease_stage', 'patient_id', 'batch_number', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'SCT_snn_res.1', 'SCT_snn_res.1.2', 'SCT_snn_res.1.4', 'SCT_snn_res.1.6', 'SCT_snn_res.1.8', 'SCT_snn_res.2', 'seurat_clusters', 'predicted.ann_level_1.score', 'predicted.ann_level_1', 'predicted.ann_level_2.score', 'predicted.ann_level_2', 'predicted.ann_level_3.score', 'predicted.ann_level_3', 'predicted.ann_level_4.score', 'predicted.ann_level_4', 'predicted.ann_level_5.score', 'predicted.ann_level_5', 'predicted.ann_finest_level.score', 'predicted.ann_finest_level', 'mapping.score', 'main_clus', 'assigned_celltype', 'ident'
    var: 'gene_symbol', 'ensembl_id'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'
    obsm: 'REF.UMAP'
    layers: 'logcounts'

In [59]:
# Add the obs metadata table to adata.obs, matching `sample_id` in the data
# against `sample_ID` in the table.
# NOTE(review): the raw `obs` table is merged here, not `validated_obs` -
# confirm that this is intended.
merger = AnnDataMerger(adata=adata, obs_df=obs)
adata = merger.add_obs_metadata(adata_col='sample_id', df_col='sample_ID', skip=None)

adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,barcodes,sample_ID,disease_stage,patient_id,batch_number,nCount_SCT,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
AAACCAATCATTAGTCACTTTAGG-1,precopd,743.0,623,2.489906,AAACCAATCATTAGTCACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1486.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
AAACCAGGTTCGATGTACTTTAGG-1,precopd,870.0,640,1.130952,AAACCAGGTTCGATGTACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1680.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
AAACGGGCAAGCTAATACTTTAGG-1,precopd,809.0,659,2.107280,AAACGGGCAAGCTAATACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1566.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
AAACTGTCAGCAAGCTACTTTAGG-1,precopd,612.0,505,1.474011,AAACTGTCAGCAAGCTACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,1289.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
AAAGCATGTAAGGCCGACTTTAGG-1,precopd,2001.0,1340,0.949525,AAAGCATGTAAGGCCGACTTTAGG-1,A110-73,COPD GOLD IV,A110,COPD_Batch_8,2001.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGCTGAGGCCAACAAGCTGTGA-80,precopd,1213.0,809,1.785714,TTTGCTGAGGCCAACAAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,1512.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
TTTGCTGAGTCCTGCGAGCTGTGA-80,precopd,11710.0,4607,1.256078,TTTGCTGAGTCCTGCGAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,2468.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
TTTGGACGTCGTCCAAAGCTGTGA-80,precopd,6699.0,3176,0.253293,TTTGGACGTCGTCCAAAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,1974.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241
TTTGGCGGTTTAGTTGAGCTGTGA-80,precopd,6064.0,2760,1.669359,TTTGGCGGTTTAGTTGAGCTGTGA-80,A11-20,COPD GOLD I,A11,COPD_Batch_1,1857.0,...,probe-based,EFO:0008637,TRUE,GRCh38,GCF_000001405.39,cellranger-7.1.0,yes,MONDO:0005002,unknown,HsapDv:0000241


# Add author cell type markers to UNS

In [60]:
# Load the marker table and drop its 'Unnamed: 1' column in one chain.
cell_markers_df = (
    pd.read_csv(f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/celltype_markers_semicolon_separated.csv")
    .drop(columns=['Unnamed: 1'])
)
# Rename the three remaining columns by position to the shared name constants.
cell_markers_df.columns = [CELL_TYPE_ONTOLOGY_ID, AUTHOR_CELL_TYPE, MARKER_GENES]
cell_markers_df

Unnamed: 0,cell_type_ontology_term_id,author_cell_type,author_cell_type_markers
0,CL:4028004,Alveolar Fibroblasts,PCDH15;NKD2;GRIA1;COL6A6;FGFR4;ACTN2;ADAMTS12;...
1,CL:0002063,AT2 cells,SCN1A;AGBL1;F11;PLA2G4F;LRP2;SCTR;MUCL3;ROS1;D...
2,CL:4028002,EC general capillary,FCN3;GPIHBP1;TMEM100;SLC6A4;RAMP3;SEMA3G;TEK;M...
3,CL:0000244,Transitioning AT2 cells,MUC4;GDF15;CRACR2B;P2RY2;KLK11;CTSE;SCNN1G;MET...
4,CL:0000583,Alveolar Mφ,INHBA;SPOCD1;PTCRA;ADTRP;CD5L;IQSEC3;KMO;SLC11...
5,CL:0000583,LILRB+CD169+ TRMφ,F13A1;LILRB5;FOLR2;STAB1;MS4A6A;HSPA6;SLCO2B1;...
6,CL:0000583,Lipid associated Mφ,MARCO;RETN;MCEMP1;RBP4;FFAR4;VSIG4;TREM2;PCOLC...
7,CL:0002062,AT1_Stem,JCAD;NES;GNG11;CAVIN1;EPAS1;IFI27;SLC9A3R2;RAM...
8,CL:0002062,AT1 cells,RTKN2;MYRF;C10orf67;SCEL;SLC5A9;C2orf91;WNT3A;...
9,CL:0000583,Monocyte derived Mφ,RBP4;MCEMP1;AGRP;CXCL5;MARCO;C1QB;FABP4;DEFB1;...


In [61]:
# Distinct cell type labels in the marker table (note the leading space in ' Ionocytes').
cell_markers_df[AUTHOR_CELL_TYPE].unique()

array(['Alveolar Fibroblasts', 'AT2 cells', 'EC general capillary',
       'Transitioning AT2 cells', 'Alveolar Mφ', 'LILRB+CD169+ TRMφ',
       'Lipid associated Mφ', 'AT1_Stem', 'AT1 cells',
       'Monocyte derived Mφ', 'Club cells', 'unknown',
       'Smooth Muscle cells', 'Plasma cells', 'Interstitial Mφ',
       'Pericytes', 'Ciliated cells', 'Club_AT2', 'DC2', 'Basal cells',
       'EC Venous+Arterial', 'B cells', ' Ionocytes', 'Lymphatic EC',
       'Non-Classical Monocytes', 'CD4 T cells', 'NK cells',
       'Lipofibroblasts', 'Plasmacytoid DC', 'Adventitial Fibroblasts',
       'EC aerocyte capillary', 'CD8 T cells', 'Mast cells',
       'Goblet cells', 'Classical Monocytes', 'unknown_2'], dtype=object)

In [62]:
# List the obs columns after the metadata merge.
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'barcodes',
       'sample_ID', 'disease_stage', 'patient_id', 'batch_number',
       'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'SCT_snn_res.1',
       'SCT_snn_res.1.2', 'SCT_snn_res.1.4', 'SCT_snn_res.1.6',
       'SCT_snn_res.1.8', 'SCT_snn_res.2', 'seurat_clusters',
       'predicted.ann_level_1.score', 'predicted.ann_level_1',
       'predicted.ann_level_2.score', 'predicted.ann_level_2',
       'predicted.ann_level_3.score', 'predicted.ann_level_3',
       'predicted.ann_level_4.score', 'predicted.ann_level_4',
       'predicted.ann_level_5.score', 'predicted.ann_level_5',
       'predicted.ann_finest_level.score', 'predicted.ann_finest_level',
       'mapping.score', 'main_clus', 'assigned_celltype', 'ident', 'donor_id',
       'protocol_URL', 'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'orga

In [65]:
# Number of cells per `ident` label; missing values are counted too.
adata.obs['ident'].value_counts(dropna=False)

AT2 cells                  40845
AT1 cells                  28951
Alveolar Fibroblasts       22832
Alveolar Mφ                18087
CD4 T cells                 7535
EC general capillary        7135
Lipid associated Mφ         6430
Adventitial Fibroblasts     6138
EC aerocyte capillary       4744
EC Venous+Arterial          4707
CD8 T cells                 4390
NK cells                    4175
Interstitial Mφ             4024
DC2                         3924
Transitioning AT2 cells     3449
Plasma cells                3154
Monocyte derived Mφ         2656
Lymphatic EC                2634
Ciliated cells              2580
Mast cells                  2310
Pericytes                   1943
Smooth Muscle cells         1792
unknown                     1779
Basal cells                 1732
B cells                     1538
Classical Monocytes         1080
Non-Classical Monocytes     1016
Goblet cells                 872
Club_AT2                     734
AT1_Stem                     641
Lipofibrob

In [76]:
# Labels that occur in only one of adata.obs['ident'] and the marker table;
# an empty set means both use exactly the same labels.
set(adata.obs['ident'].unique()).symmetric_difference(set(cell_markers_df[AUTHOR_CELL_TYPE].unique()))

set()

In [77]:
# Label fix: remove the leading space from ' Ionocytes'.
correction_dict = {' Ionocytes': 'Ionocytes'}

# Apply the fix to the obs labels and to the marker table so both stay in sync.
adata.obs['ident'] = adata.obs['ident'].replace(to_replace=correction_dict)
cell_markers_df[AUTHOR_CELL_TYPE] = cell_markers_df[AUTHOR_CELL_TYPE].replace(to_replace=correction_dict)

In [78]:
# Lookup tables keyed by author cell type label, built from the marker table.
ontology_id_by_label = dict(zip(cell_markers_df[AUTHOR_CELL_TYPE], cell_markers_df[CELL_TYPE_ONTOLOGY_ID]))
marker_genes_by_label = dict(zip(cell_markers_df[AUTHOR_CELL_TYPE], cell_markers_df[MARKER_GENES]))

# Annotate every cell with the ontology term ID and marker string of its `ident` label.
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs['ident'].map(ontology_id_by_label)
adata.obs[MARKER_GENES] = adata.obs['ident'].map(marker_genes_by_label)

In [79]:
# Number of cells per mapped ontology term ID; cells without an ID are counted as NaN.
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063     40845
CL:0002062     29592
CL:0000583     27405
CL:4028004     23138
CL:0000904      7535
CL:4028002      7135
CL:4028006      6138
CL:4028003      4744
CL:1000413      4707
CL:0000913      4390
CL:0000623      4175
CL:4033043      4024
CL:0000990      3924
CL:0000244      3449
CL:0000786      3154
BTO:0004167     2634
CL:0000067      2580
CL:0000097      2310
CL:0000669      1943
NaN             1842
CL:0002591      1792
CL:0000646      1732
CL:0000236      1538
CL:0000860      1080
CL:0000875      1016
CL:0000158       985
CL:0000160       872
CL:0001058       213
CL:0017000       175
Name: cell_type_ontology_term_id, dtype: int64

In [81]:
# Which `ident` labels do the cells without an ontology term ID carry?
has_no_ontology_id = adata.obs[CELL_TYPE_ONTOLOGY_ID].isna()
adata.obs.loc[has_no_ontology_id, 'ident'].value_counts()

unknown                    1779
unknown_2                    63
Basal cells                   0
B cells                       0
Ionocytes                     0
Lymphatic EC                  0
Non-Classical Monocytes       0
CD4 T cells                   0
NK cells                      0
Lipofibroblasts               0
Plasmacytoid DC               0
Adventitial Fibroblasts       0
EC aerocyte capillary         0
CD8 T cells                   0
Mast cells                    0
Goblet cells                  0
Classical Monocytes           0
EC Venous+Arterial            0
Alveolar Fibroblasts          0
AT2 cells                     0
AT1 cells                     0
EC general capillary          0
Transitioning AT2 cells       0
Alveolar Mφ                   0
LILRB+CD169+ TRMφ             0
Lipid associated Mφ           0
AT1_Stem                      0
Monocyte derived Mφ           0
Club_AT2                      0
Club cells                    0
Smooth Muscle cells           0
Plasma c

In [82]:
# Number of cells without a marker-gene entry.
adata.obs[MARKER_GENES].isna().sum()

0

In [83]:
# Keep the complete marker table in uns under the marker-genes key.
adata.uns[MARKER_GENES] = cell_markers_df

# Check author cell type annotations and Cell Ontology IDs

In [87]:
# Use the `ident` labels as the author cell type column, then show the number
# of cells per label (missing values included).
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['ident']
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

AT2 cells                  40845
AT1 cells                  28951
Alveolar Fibroblasts       22832
Alveolar Mφ                18087
CD4 T cells                 7535
EC general capillary        7135
Lipid associated Mφ         6430
Adventitial Fibroblasts     6138
EC aerocyte capillary       4744
EC Venous+Arterial          4707
CD8 T cells                 4390
NK cells                    4175
Interstitial Mφ             4024
DC2                         3924
Transitioning AT2 cells     3449
Plasma cells                3154
Monocyte derived Mφ         2656
Lymphatic EC                2634
Ciliated cells              2580
Mast cells                  2310
Pericytes                   1943
Smooth Muscle cells         1792
unknown                     1779
Basal cells                 1732
B cells                     1538
Classical Monocytes         1080
Non-Classical Monocytes     1016
Goblet cells                 872
Club_AT2                     734
AT1_Stem                     641
Lipofibrob

In [88]:
# Re-check the number of cells per ontology term ID (NaN = no ID assigned).
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0002063     40845
CL:0002062     29592
CL:0000583     27405
CL:4028004     23138
CL:0000904      7535
CL:4028002      7135
CL:4028006      6138
CL:4028003      4744
CL:1000413      4707
CL:0000913      4390
CL:0000623      4175
CL:4033043      4024
CL:0000990      3924
CL:0000244      3449
CL:0000786      3154
BTO:0004167     2634
CL:0000067      2580
CL:0000097      2310
CL:0000669      1943
NaN             1842
CL:0002591      1792
CL:0000646      1732
CL:0000236      1538
CL:0000860      1080
CL:0000875      1016
CL:0000158       985
CL:0000160       872
CL:0001058       213
CL:0017000       175
Name: cell_type_ontology_term_id, dtype: int64

In [89]:
# Number of cells per marker-gene string (each value is one long ';'-separated list).
adata.obs[MARKER_GENES].value_counts(dropna=False)

SCN1A;AGBL1;F11;PLA2G4F;LRP2;SCTR;MUCL3;ROS1;DMBT1;LRRC36;SLC26A9;SLC46A2;MFSD2A;SLC6A14;LRRK2;RASGRF1;ACOXL;ZNF385B;LGI3;FGG;PGC;SLC22A31;FREM2;ABCA3;FMO5;LAMP3;CCDC141;TMEM163;CRTAC1;ADGRF1;ELF5;ARHGEF38;HHIP;SLC22A3;ALPL;C4orf19;WIF1;STEAP4;ALOX15B;TTN;HAS3;PPP4R4;LAD1;SFTPC;KCNJ15;RND1;SDR16C5;CACNB4;SFTPA1;KCNQ3;IL20RA;FASN;CSF3R;FHDC1;RAP1GAP;CAPN8;TTC39A;NRGN;SFTPB;NAPSA;CDKL2;SFTPD;CFAP221;CHI3L1;PDE4C;HPN;PLIN5;PARM1;PLCH1;C8orf34;CACNA2D2;CXCL17;SNX25;SLC34A2;ETV5;KIAA1324L;CTSH;ARFGEF3;SLC7A2;HMGCS1;MTRR;TCIM;PTP4A3;MUC1;MET;SNX30;PTPN13;IRX3;PIGR;CEP70;ANK3;EHF;CXCL2;BMP1;KNDC1;C16orf89;SIAE;GPRC5C;CREB3L1;NTN4;MLPH;LGALSL;TFCP2L1;MALL;MSMO1;C4BPA;SECISBP2L;RASEF;POLR3H;SLPI;SPRY4;P3H2;FAM20A;PMM1;SHC3;GSTA4;FAM184A;DRAM1;TJP3;ETV1;AHCYL2;EPS8;PIGA;LPCAT1;SELENBP1;MBIP;FGFR2;CA13;CACHD1;CAMSAP3;DUOXA1;ADGRV1;PFKFB2;PEBP4;PDE4D;SCGB3A2;ACSS3;LMO3;NPC2;VWA2;MAST4;AQP3;MTUS1;NECAB3;PLCXD1;SLC4A4;SOCS2;CACNB1;TMEM243;AFF3;VEPH1;AREG;ATP2C2;CD38;PON3;SFTA3;ACADL;ACACA;CAPN3;BMP2

In [90]:
# List the obs columns again after adding the cell type annotation columns.
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'barcodes',
       'sample_ID', 'disease_stage', 'patient_id', 'batch_number',
       'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'SCT_snn_res.1',
       'SCT_snn_res.1.2', 'SCT_snn_res.1.4', 'SCT_snn_res.1.6',
       'SCT_snn_res.1.8', 'SCT_snn_res.2', 'seurat_clusters',
       'predicted.ann_level_1.score', 'predicted.ann_level_1',
       'predicted.ann_level_2.score', 'predicted.ann_level_2',
       'predicted.ann_level_3.score', 'predicted.ann_level_3',
       'predicted.ann_level_4.score', 'predicted.ann_level_4',
       'predicted.ann_level_5.score', 'predicted.ann_level_5',
       'predicted.ann_finest_level.score', 'predicted.ann_finest_level',
       'mapping.score', 'main_clus', 'assigned_celltype', 'ident', 'donor_id',
       'protocol_URL', 'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'orga

# Check whether ENSEMBL IDs are in var

In [92]:
# Inspect var: the index is expected to hold Ensembl gene IDs at this point.
adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000187634,SAMD11,ENSG00000187634
ENSG00000188976,NOC2L,ENSG00000188976
ENSG00000187961,KLHL17,ENSG00000187961
ENSG00000187583,PLEKHN1,ENSG00000187583
ENSG00000187642,PERM1,ENSG00000187642
...,...,...
ENSG00000212907,MT-ND4L,ENSG00000212907
ENSG00000198886,MT-ND4,ENSG00000198886
ENSG00000198786,MT-ND5,ENSG00000198786
ENSG00000198695,MT-ND6,ENSG00000198695


# Check raw data

In [55]:
# Largest value stored in X. The max is taken on the sparse matrix itself:
# calling .toarray() first would allocate the full dense matrix just to
# reduce it to a single number.
adata.X.max()

20334.0

In [93]:
# Re-display the adata.obs column names (same expression as the earlier
# obs.columns cell).
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'barcodes',
       'sample_ID', 'disease_stage', 'patient_id', 'batch_number',
       'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res.0.8', 'SCT_snn_res.1',
       'SCT_snn_res.1.2', 'SCT_snn_res.1.4', 'SCT_snn_res.1.6',
       'SCT_snn_res.1.8', 'SCT_snn_res.2', 'seurat_clusters',
       'predicted.ann_level_1.score', 'predicted.ann_level_1',
       'predicted.ann_level_2.score', 'predicted.ann_level_2',
       'predicted.ann_level_3.score', 'predicted.ann_level_3',
       'predicted.ann_level_4.score', 'predicted.ann_level_4',
       'predicted.ann_level_5.score', 'predicted.ann_level_5',
       'predicted.ann_finest_level.score', 'predicted.ann_finest_level',
       'mapping.score', 'main_clus', 'assigned_celltype', 'ident', 'donor_id',
       'protocol_URL', 'institute', 'sample_collection_site',
       'sample_collection_relative_time_point', 'library_ID',
       'library_ID_repository', 'author_batch_notes',
       'orga

In [105]:
# Tally how many obs rows fall into each 'main_clus' value.
adata.obs['main_clus'].value_counts()

0     9391
1     9097
2     8982
3     7647
4     7455
5     7374
6     7020
7     6980
8     6918
9     6813
10    5860
11    5507
12    5457
13    4890
14    4738
15    4678
16    4524
17    4513
18    4441
19    4371
20    3911
21    3910
22    3541
23    3487
24    3454
25    3429
26    3253
27    3128
28    2949
29    2934
30    2634
31    2614
32    2581
33    2539
34    2515
35    2384
36    2048
37    1871
38    1842
39    1780
40    1742
41    1589
42    1565
43    1549
44    1282
45     999
46     943
47     524
48     461
49     390
50     213
51     176
52     174
Name: main_clus, dtype: int64

In [117]:
# Pre-save dtype normalisation for adata (run before storing .raw / writing).

# Start from an all-categorical obs table; selected columns are turned back
# into numeric dtypes below.
adata.obs = adata.obs.astype('category')

# Columns that must end up numeric.
numeric_cols = [
    'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_SCT', 'nFeature_SCT',
    'SCT_snn_res.0.8', 'SCT_snn_res.1', 'SCT_snn_res.1.2', 'SCT_snn_res.1.4',
    'SCT_snn_res.1.6', 'SCT_snn_res.1.8', 'SCT_snn_res.2', 'seurat_clusters',
    'predicted.ann_level_1.score', 'predicted.ann_level_2.score',
    'predicted.ann_level_3.score', 'predicted.ann_level_4.score',
    'predicted.ann_level_5.score', 'predicted.ann_finest_level.score',
    'mapping.score', 'main_clus',
]

# Three additional metadata columns are converted first, then numeric_cols.
# Entries that cannot be parsed as numbers become NaN (errors='coerce').
for obs_col in [
    'cell_viability_percentage',
    'cell_number_loaded',
    'sample_collection_year',
    *numeric_cols,
]:
    adata.obs[obs_col] = pd.to_numeric(adata.obs[obs_col], errors='coerce')

# Force string-typed indices on obs, var and the marker-gene table in uns.
for frame in (adata.obs, adata.var, adata.uns[MARKER_GENES]):
    frame.index = frame.index.astype('str')
adata.var_names = adata.var_names.astype('str')

# Store the marker-gene table with categorical columns.
adata.uns[MARKER_GENES] = adata.uns[MARKER_GENES].astype('category')

In [107]:
# Cast X to int64. The cast silently truncates fractional values (and turns
# NaN into garbage), so first confirm that every stored entry is already a
# whole number. Assumes adata.X is a scipy sparse matrix exposing `.data`
# (the cell output below reports Compressed Sparse Row format).
assert np.array_equal(adata.X.data, np.round(adata.X.data)), (
    "adata.X contains non-integer values; refusing to cast to int64"
)
adata.X = adata.X.astype(np.int64)

In [108]:
# Assign the AnnData object to its own .raw slot; the cells below display
# adata.raw.X and its maximum alongside adata.X.
adata.raw = adata

In [109]:
# Display the repr of adata.X to inspect its shape and dtype after the int64 cast.
adata.X

<195067x17773 sparse matrix of type '<class 'numpy.int64'>'
	with 319339179 stored elements in Compressed Sparse Row format>

In [110]:
# Display the repr of adata.raw.X for comparison with adata.X.
adata.raw.X

<195067x17773 sparse matrix of type '<class 'numpy.int64'>'
	with 319339179 stored elements in Compressed Sparse Row format>

In [111]:
# Largest value in X after the int64 cast. Reduce on the sparse matrix
# directly: densifying the 195067x17773 matrix with .toarray() would allocate
# ~3.5e9 int64 entries just to compute one number.
adata.X.max()

20334

In [112]:
# Largest value in raw.X, for comparison with adata.X. Reduce on the sparse
# matrix directly instead of materialising a dense copy via .toarray().
adata.raw.X.max()

20334

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object  same as Google Sheet

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object same as Google Sheet

# Data Submission Status

- CHECK: Raw data in X
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
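- MISSING: ENSEMBL IDs not there because features are not clear (NOTE: the `adata.var` output above shows ENSG indices and an `ensembl_id` column — re-verify this status)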

# Revisions:

DONE

In [120]:
# Write the same gzip-compressed .h5ad file into the pre-revision directory
# first, then the post-revision directory.
h5ad_name = f"{DATASET_ID}.h5ad"
for out_dir in (OUTPUT_PATH_PREREVISION, OUTPUT_PATH_POSTREVISION):
    adata.write_h5ad(join(out_dir, h5ad_name), compression='gzip')