In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join
import scipy.sparse as sp
import numpy as np

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

package ‘methods’ was built under R version 4.3.3 
1: package ‘datasets’ was built under R version 4.3.3 
2: package ‘utils’ was built under R version 4.3.3 
3: package ‘grDevices’ was built under R version 4.3.3 
4: package ‘graphics’ was built under R version 4.3.3 
5: package ‘stats’ was built under R version 4.3.3 


In [2]:
DATASET_ID = "Pryhuber_02_LungMAP"
RDS_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.rds"
ENSEMBL_ID_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/features.tsv"
OUTPUT_PATH_PREREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_prerevision'
OUTPUT_PATH_POSTREVISION = '/home/icb/raphael.kfuri-rubens/data/hlca_v2/HLCA_V2_CORE/adata_postrevision'

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

# Validate obs and uns from Tier 1 Metadata Template

In [5]:
val_workflow = ValidationWorkflow(
    input = uns,
    axis = 'uns'
)

validated_uns = val_workflow.init_workflow()
validated_uns

Unnamed: 0,title,study_PI,batch_condition,default_embedding,unpublished,comments
0,LungMAP Dissociated Adult Human Lung Cells - C...,"Gloria,S,Pryhuber","Donor.Id, capture batch, seq batch date ID",ref.umap,unpublished,Reference genome: GRCh38 + SARS-CoV-2 (ncbi nu...


In [6]:
val_workflow = ValidationWorkflow(
    input = obs,
    axis = 'obs'
)

validated_obs = val_workflow.init_workflow()
validated_obs

Unnamed: 0,sample_ID,donor_id,protocol_URL,institute,sample_collection_site,sample_collection_relative_time_point,library_ID,library_ID_repository,author_batch_notes,organism_ontology_term_id,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
0,D116-RML-MIX-2,D116,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A5,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173483,Black/AA,HsapDv:0000239
1,D122-RML-MIX-2,D122,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A23,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173483,Hispanic,HsapDv:0000238
2,D239-CBL-MIX-3,D239,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A36,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173483,Black/AA,HsapDv:0000238
3,D271-RML-MIX-2,D271,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A1,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,White,HsapDv:0000240
4,D283-RML-MIX-2,D283,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A35,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,White,HsapDv:0000241
5,D291-RUL-MIX-3,D291,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A20,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173483,White,HsapDv:0000237
6,D292-RUL-MIX-3,D292,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A17,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,White,HsapDv:0000237
7,D305-RUL-MIX-2,D305,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A27,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,White,HsapDv:0000241
8,D307-CBL-MIX-2,D307,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A40,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,Black/AA,HsapDv:0000264
9,D312-RUL-MIX-3,D312,https://www.protocols.io/view/lungmap2-urmc-ce...,University of Rochester,BRINDL_site_2,,A10,,,NCBITaxon:9606,...,3 prime tag,EFO_0008637,True,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,White,HsapDv:0000240


In [7]:
anndata2ri.activate()
%load_ext rpy2.ipython

  anndata2ri.activate()


In [8]:
%%R -i RDS_PATH

suppressPackageStartupMessages(library(Seurat))
rds <- readRDS(file=RDS_PATH)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

package ‘tools’ was built under R version 4.3.3 


In [9]:
%%R -o adata

adata <- as.SingleCellExperiment(rds)
adata

class: SingleCellExperiment 
dim: 30669 110600 
metadata(0):
assays(2): counts logcounts
rownames(30669): OR4F5 AL627309.1 ... AC007325.2 Sars-CoV2
rowData names(0):
colnames(110600): AAACCCAAGCAGCCCT-1_3_1 AAACCCAAGTGACACG-1_3_1 ...
  TTTGTTGTCAGCTGAT-1_7_7 TTTGTTGTCCATAGAC-1_7_7
colData names(13): Donor.Id nCount_RNA ... annotation_type ident
reducedDimNames(0):
mainExpName: SCT
altExpNames(2): ADT RNA


1: In .check_reddim_names(x, value, withDimnames) :
  non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor.
2: In .check_reddim_names(x, value, withDimnames) :
  non-NULL 'rownames(value)' should be the same as 'colnames(x)' for
'reducedDim<-'. This will be an error in the next release of
Bioconductor.


  return AnnData(exprs, obs, var, uns, obsm, layers=layers)


# Validate obs and uns from adata

In [10]:
adata.obs.columns

Index(['Donor.Id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'percent.mt',
       'predicted.ann_level_1', 'predicted.ann_level_2',
       'predicted.ann_level_3', 'predicted.ann_level_4',
       'predicted.ann_level_5', 'predicted.ann_finest_level',
       'annotation_type', 'ident'],
      dtype='object')

In [11]:
adata.obs['ident']

AAACCCAAGCAGCCCT-1_3_1      EC venous systemic
AAACCCAAGTGACACG-1_3_1    Alveolar fibroblasts
AAACCCACATGTGTCA-1_3_1             EC arterial
AAACCCAGTACGTGTT-1_3_1                NK cells
AAACCCAGTCCTGAAT-1_3_1     Monocyte-derived Mφ
                                  ...         
TTTGTTGGTTGCCAAT-1_7_7    Alveolar macrophages
TTTGTTGGTTTGAACC-1_7_7     Classical monocytes
TTTGTTGTCACTCACC-1_7_7     Monocyte-derived Mφ
TTTGTTGTCAGCTGAT-1_7_7    Alveolar macrophages
TTTGTTGTCCATAGAC-1_7_7     Monocyte-derived Mφ
Name: ident, Length: 110600, dtype: category
Categories (48, object): ['EC venous systemic', 'Alveolar fibroblasts', 'EC arterial', 'NK cells', ..., 'Mesothelium', 'Goblet (nasal)', 'Plasmacytoid DCs', 'Neuroendocrine']

In [12]:
non_overlap = [x for x in adata.obs['Donor.Id'].value_counts().index if x not in obs['donor_id'].unique()]

non_overlap_other_side = [x for x in obs['donor_id'].unique() if x not in adata.obs['Donor.Id'].value_counts().index]

In [13]:
non_overlap

[]

In [14]:
non_overlap_other_side

[]

In [15]:
merger = AnnDataMerger(
    adata = adata,
    uns_df = uns
)

adata = merger.add_uns_metadata()

adata

AnnData object with n_obs × n_vars = 110600 × 30669
    obs: 'Donor.Id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'percent.mt', 'predicted.ann_level_1', 'predicted.ann_level_2', 'predicted.ann_level_3', 'predicted.ann_level_4', 'predicted.ann_level_5', 'predicted.ann_finest_level', 'annotation_type', 'ident'
    uns: 'title', 'study_PI', 'batch_condition', 'default_embedding', 'unpublished', 'comments'
    layers: 'logcounts'

In [16]:
merger = AnnDataMerger(
    adata = adata,
    obs_df = obs
)

adata = merger.add_obs_metadata(
    adata_col = 'Donor.Id',
    df_col = 'donor_id',
    skip = None
)

adata.obs.rename(columns={'Donor.Id': 'donor_id'}, inplace=True)

adata.obs

Unnamed: 0,donor_id,nCount_RNA,nFeature_RNA,nCount_ADT,percent.mt,predicted.ann_level_1,predicted.ann_level_2,predicted.ann_level_3,predicted.ann_level_4,predicted.ann_level_5,...,sequenced_fragment,sequencing_platform,is_primary_data,reference_genome,gene_annotation_version,alignment_software,intron_inclusion,disease_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id
AAACCCAAGCAGCCCT-1_3_1,D376,9072.0,2954,118.0,3.516314,Endothelial,Blood vessels,EC venous,EC venous systemic,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,Asian,HsapDv:0000237
AAACCCAAGTGACACG-1_3_1,D376,10798.0,3512,56.0,2.046675,Stroma,Fibroblast lineage,Fibroblasts,Alveolar fibroblasts,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,Asian,HsapDv:0000237
AAACCCACATGTGTCA-1_3_1,D376,3209.0,1633,100.0,4.144593,Endothelial,Blood vessels,EC arterial,,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,Asian,HsapDv:0000237
AAACCCAGTACGTGTT-1_3_1,D376,4064.0,1569,126.0,5.634843,Immune,Lymphoid,Innate lymphoid cell NK,NK cells,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,Asian,HsapDv:0000237
AAACCCAGTCCTGAAT-1_3_1,D376,7436.0,2521,91.0,15.935987,Immune,Myeloid,Macrophages,Interstitial macrophages,Monocyte-derived Mφ,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,NCIT:C173484,Asian,HsapDv:0000237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTTGCCAAT-1_7_7,D404,1327.0,735,382.0,2.788244,Immune,Myeloid,Macrophages,Alveolar macrophages,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,More than 1 race,HsapDv:0000264
TTTGTTGGTTTGAACC-1_7_7,D404,16326.0,4232,510.0,3.613867,Immune,Myeloid,Monocytes,Classical monocytes,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,More than 1 race,HsapDv:0000264
TTTGTTGTCACTCACC-1_7_7,D404,3671.0,1707,306.0,5.502588,Immune,Myeloid,Macrophages,Interstitial macrophages,Monocyte-derived Mφ,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,More than 1 race,HsapDv:0000264
TTTGTTGTCAGCTGAT-1_7_7,D404,1065.0,728,19199.0,5.915493,Immune,Myeloid,Macrophages,Interstitial macrophages,,...,3 prime tag,EFO_0008637,true,GRCh38,GENCODE v32 / Ensembl 98 + SARS-CoV-2 (ncbi nu...,cellranger 6.0.1,no,PATO:0000461,More than 1 race,HsapDv:0000264


# Add author cell type markers to UNS

In [17]:
marker_data = {
    AUTHOR_CELL_TYPE: [
        "Alveolar macrophages", "NK cells", "AT2", "Alveolar Mφ CCL3+", "Suprabasal", "Basal resting",
        "EC venous pulmonary", "CD8 T cells", "EC arterial", "Peribronchial fibroblasts", "CD4 T cells", 
        "AT1", "Multiciliated (non-nasal)", "Plasma cells", "Goblet (nasal)", "Club (nasal)", 
        "SM activated stress response", "Classical monocytes", "Monocyte derived Mφ", 
        "Alveolar Mφ proliferating", "Club (non-nasal)", "SMG serous (bronchial)", "EC venous systemic", 
        "Non classical monocytes", "EC general capillary", "Adventitial fibroblasts", "Lymphatic EC mature", 
        "EC aerocyte capillary", "Smooth muscle", "Alveolar fibroblasts", "Multiciliated (nasal)", 
        "Goblet (bronchial)", "Neuroendocrine", "Lymphatic EC differentiating", "DC2", "Transitional Club AT2", 
        "DC1", "Myofibroblasts", "B cells", "Mast cells", "Interstitial Mφ perivascular", 
        "SMG mucous", "AT2 proliferating", "Goblet (subsegmental)", "Pericytes", "SMG duct", 
        "Mesothelium", "SMG serous (nasal)", "Ionocyte", "Alveolar Mφ MT-positive", "Fibromyocytes", 
        "Deuterosomal", "Tuft", "Plasmacytoid DCs", "T cells proliferating", "Subpleural fibroblasts", 
        "Lymphatic EC proliferating", "Migratory DCs"
    ],
    MARKER_GENES: [
        "MS4A7;C1QA;HLA-DQB1;HLA-DMA;HLA-DPB1;HLA-DPA1;ACP5;C1QC;CTSS;HLA-DQA1",
        "GZMA;CD7;CCL4;CST7;NKG7;GNLY;CTSW;CCL5;GZMB;PRF1",
        "SEPP1;PGC;NAPSA;SFTPD;SLC34A2;CYB5A;MUC1;S100A14;SFTA2;SFTA3",
        "MCEMP1;UPP1;HLA-DQA1;C5AR1;HLA-DMA;AIF1;LST1;LINC01272;MRC1;CCL18",
        "PRDX2;KRT19;SFN;TACSTD2;KRT5;LDHB;KRT17;KLK11;S100A2;SERPINB4",
        "CYR61;PERP;IGFBP2;KRT19;KRT5;KRT17;KRT15;S100A2;LAMB3;BCAM",
        "VWF;MGP;GNG11;RAMP2;SPARCL1;IGFBP7;IFI27;CLDN5;ACKR1;AQP1",
        "CD8A;CD3E;CCL4;CD2;CXCR4;GZMA;NKG7;IL32;CD3D;CCL5",
        "SPARCL1;SOX17;IFI27;TM4SF1;A2M;CLEC14A;GIMAP7;CRIP2;CLDN5;PECAM1",
        "IGFBP7;COL1A2;COL3A1;A2M;BGN;DCN;MGP;LUM;MFAP4;C1S",
        "CORO1A;KLRB1;CD3E;LTB;CXCR4;IL7R;TRAC;IL32;CD2;CD3D",
        "SFTA2;CEACAM6;FXYD3;CAV1;TSPAN13;KRT7;ADIRF;HOPX;AGER;EMP2",
        "SNTN;FAM229B;TMEM231;C5orf49;C12orf75;GSTA1;C11orf97;RP11-356K23.1;CD24;RP11-295M3.4",
        "ITM2C;TNFRSF17;FKBP11;IGKC;IGHA1;IGHG1;CD79A;JCHAIN;MZB1;ISG20",
        "KRT7;MUC1;MUC5AC;MSMB;CP;LMO7;LCN2;CEACAM6;BPIFB1;PIGR",
        "ELF3;C19orf33;KRT8;KRT19;TACSTD2;MUC1;S100A14;CXCL17;PSCA;FAM3D",
        "C11orf96;HES4;PLAC9;FLNA;KANK2;TPM2;PLN;SELM;GPX3;LBH",
        "LST1;IL1B;LYZ;COTL1;S100A9;VCAN;S100A8;S100A12;AIF1;FCN1",
        "LYZ;ACP5;TYROBP;LGALS1;CD68;AIF1;CTSL;EMP3;FCER1G;LAPTM5",
        "H2AFV;STMN1;LSM4;GYPC;PTTG1;KIAA0101;FABP4;CKS1B;UBE2C;HMGN2",
        "SCGB3A1;CYP2F1;GSTA1;HES4;TSPAN8;TFF3;MSMB;BPIFB1;SCGB1A1;PIGR",
        "AZGP1;ZG16B;PIGR;NDRG2;LPO;C6orf58;DMBT1;PRB3;FAM3D;RP11-1143G9.4",
        "VWF;MGP;GNG11;PLVAP;RAMP2;SPARCL1;IGFBP7;A2M;CLEC14A;ACKR1",
        "PSAP;FCGR3A;FCN1;CORO1A;COTL1;FCER1G;LAPTM5;CTSS;AIF1;LST1",
        "EPAS1;GNG11;IFI27;TM4SF1;EGFL7;AQP1;VWF;FCN3;SPARCL1;CLDN5",
        "COL6A2;SFRP2;IGFBP7;IGFBP6;COL3A1;C1S;MMP2;MGP;SPARC;COL1A2",
        "PPFIBP1;GNG11;RAMP2;CCL21;MMRN1;IGFBP7;SDPR;TM4SF1;CLDN5;ECSCR",
        "EMCN;HPGD;IFI27;CA4;EGFL7;AQP1;IL1RL1;SPARCL1;SDPR;CLDN5",
        "PRKCDBP;NDUFA4L2;MYL9;ACTA2;MGP;CALD1;TPM1;TAGLN;IGFBP7;TPM2",
        "LUM;COL6A1;CYR61;C1R;COL1A2;MFAP4;A2M;C1S;ADH1B;GPX3",
        "RP11-356K23.1;EFHC1;CAPS;ROPN1L;RSPH1;C9orf116;TMEM190;DNALI1;PIFO;ODF3B",
        "MUC5AC;MSMB;PI3;MDK;ANKRD36C;TFF3;PIGR;SAA1;CP;BPIFB1",
        "UCHL1;TFF3;APOA1BP;CLDN3;SEC11C;NGFRAP1;SCG5;HIGD1A;PHGR1;CD24",
        "AKAP12;TFF3;SDPR;CLDN5;TCF4;TFPI;TIMP3;GNG11;CCL21;IGFBP7",
        "ITGB2;LAPTM5;HLA-DRB1;HLA-DPB1;HLA-DPA1;HLA-DMB;HLA-DQB1;HLA-DQA1;HLA-DMA;LST1",
        "CXCL17;C16orf89;RNASE1;KRT7;SCGB1A1;PIGR;SCGB3A2;KLK11;SFTA1P;FOLR1",
        "HLA-DPA1;CPNE3;CORO1A;CPVL;C1orf54;WDFY4;LSP1;HLA-DQB1;HLA-DQA1;HLA-DMA",
        "CALD1;CYR61;TAGLN;MT1X;PRELP;TPM2;GPX3;CTGF;IGFBP5;SPARCL1",
        "CD69;CORO1A;LIMD2;BANK1;LAPTM5;CXCR4;LTB;CD79A;CD37;MS4A1",
        "VWA5A;RGS13;C1orf186;HPGDS;CPA3;GATA2;MS4A2;KIT;TPSAB1;TPSB2",
        "MRC1;RNASE1;FGL2;RNASE6;HLA-DPA1;GPR183;CD14;HLA-DPB1;MS4A6A;AIF1",
        "FKBP11;TCN1;GOLM1;TFF3;PIGR;KLK11;MARCKSL1;CRACR2B;SELM;MSMB",
        "CDK1;LSM3;CKS1B;EIF1AX;UBE2C;MRPL14;PRC1;CENPW;EMP2;DHFR",
        "MDK;MUC5B;SCGB1A1;CP;C3;TSPAN8;TFF3;MSMB;PIGR;BPIFB1",
        "MYL9;SPARC;SPARCL1;IGFBP7;COL4A1;GPX3;PDGFRB;CALD1;COX4I2;TPM2",
        "PIP;ZG16B;PIGR;SAA1;MARCKSL1;ALDH1A3;SELM;LTF;RARRES1;AZGP1",
        "CEBPD;LINC01133;MRPL33;UPK3B;CFB;SEPP1;EID1;HP;CUX1;MRPS21",
        "ZG16B;MUC7;C6orf58;PRB3;LTF;LYZ;PRR4;AZGP1;PIGR;RP11-1143G9.4",
        "FOXI1;ATP6V1A;GOLM1;TMEM61;SEC11C;SCNN1B;ASCL3;CLCNKB;HEPACAM2;CD24",
        "GSTO1;LGALS1;CTSZ;MT2A;APOC1;CTSL;UPP1;CCL18;FABP4;MT1X",
        "NEXN;ACTG2;LMOD1;IGFBP7;PPP1R14A;DES;FLNA;TPM2;PLN;SELM",
        "RSPH9;PIFO;RUVBL2;C11orf88;FAM183A;MORN2;SAXO2;CFAP126;FAM229B;C5orf49",
        "MUC20;KHDRBS1;ZNF428;BIK;CRYM;LRMP;HES6;KIT;AZGP1;RASSF6",
        "IL3RA;TCF4;LTB;GZMB;JCHAIN;ITM2C;IRF8;PLD4;IRF7;C12orf75",
        "TRAC;HMGN2;IL32;CORO1A;ARHGDIB;STMN1;RAC2;IL2RG;HMGB2;CD3D",
        "SERPING1;C1R;COL1A2;NNMT;COL3A1;MT1E;MT1X;PLA2G2A;SELM;MT1M",
        "S100A16;TUBB;HMGN2;COX20;LSM2;HMGN1;ARPC1A;ECSCR;EID1;MARCKS",
        "IL2RG;HLA-DRB5;TMEM176A;BIRC3;TYMP;CCL22;SYNGR2;CD83;LSP1;HLA-DQA1"
    ]
}

marker_genes_df = pd.DataFrame(marker_data)

marker_genes_df

Unnamed: 0,author_cell_type,author_cell_type_markers
0,Alveolar macrophages,MS4A7;C1QA;HLA-DQB1;HLA-DMA;HLA-DPB1;HLA-DPA1;...
1,NK cells,GZMA;CD7;CCL4;CST7;NKG7;GNLY;CTSW;CCL5;GZMB;PRF1
2,AT2,SEPP1;PGC;NAPSA;SFTPD;SLC34A2;CYB5A;MUC1;S100A...
3,Alveolar Mφ CCL3+,MCEMP1;UPP1;HLA-DQA1;C5AR1;HLA-DMA;AIF1;LST1;L...
4,Suprabasal,PRDX2;KRT19;SFN;TACSTD2;KRT5;LDHB;KRT17;KLK11;...
5,Basal resting,CYR61;PERP;IGFBP2;KRT19;KRT5;KRT17;KRT15;S100A...
6,EC venous pulmonary,VWF;MGP;GNG11;RAMP2;SPARCL1;IGFBP7;IFI27;CLDN5...
7,CD8 T cells,CD8A;CD3E;CCL4;CD2;CXCR4;GZMA;NKG7;IL32;CD3D;CCL5
8,EC arterial,SPARCL1;SOX17;IFI27;TM4SF1;A2M;CLEC14A;GIMAP7;...
9,Peribronchial fibroblasts,IGFBP7;COL1A2;COL3A1;A2M;BGN;DCN;MGP;LUM;MFAP4...


# Check author cell type annotations and Cell Ontology IDs

In [18]:
object_cell_types = adata.obs['ident'].value_counts(dropna=False).index.tolist()

In [19]:
# get hlca v1 ontology mapping
pd.set_option('display.max_rows', 100)
hlca_v1_ontology_mapping_df = pd.read_csv("../HLCA_v1/hlca_v1_finest_annot_ontology_id_mapping.csv")

hlca_v1_ontology_mapping_dict = dict(zip(hlca_v1_ontology_mapping_df['ann_finest_level'], hlca_v1_ontology_mapping_df['cell_type_ontology_term_id']))
hlca_v1_ontology_mapping_dict

{'Alveolar macrophages': 'CL:0000583',
 'NK cells': 'CL:0000623',
 'AT2': 'CL:0002063',
 'Alveolar Mph CCL3+': 'CL:0000583',
 'Suprabasal': 'CL:0002633',
 'Basal resting': 'CL:0002633',
 'EC venous pulmonary': 'CL:0002543',
 'CD8 T cells': 'CL:0000625',
 'EC arterial': 'CL:1001568',
 'Peribronchial fibroblasts': 'CL:2000093',
 'CD4 T cells': 'CL:0000624',
 'AT1': 'CL:0002062',
 'Multiciliated (non-nasal)': 'CL:0002145',
 'Plasma cells': 'CL:0000786',
 'Hillock-like': 'CL:4030023',
 'Goblet (nasal)': 'CL:0002480',
 'Club (nasal)': 'CL:0000158',
 'SM activated stress response': 'CL:0000192',
 'Classical monocytes': 'CL:0000860',
 'Monocyte-derived Mph': 'CL:0000861',
 'Alveolar Mph proliferating': 'CL:0000583',
 'Club (non-nasal)': 'CL:0000158',
 'SMG serous (bronchial)': 'CL:0019001',
 'EC venous systemic': 'CL:0002543',
 'Non-classical monocytes': 'CL:0000875',
 'EC general capillary': 'CL:0002144',
 'Adventitial fibroblasts': 'CL:4028006',
 'Lymphatic EC mature': 'CL:0002138',
 'EC ae

In [20]:
final_mapping_dict = {key: hlca_v1_ontology_mapping_dict.get(key, None) for key in object_cell_types}

final_mapping_dict

{'Classical monocytes': 'CL:0000860',
 'Monocyte-derived Mφ': None,
 'Alveolar macrophages': 'CL:0000583',
 'CD8 T cells': 'CL:0000625',
 'NK cells': 'CL:0000623',
 'EC general capillary': 'CL:0002144',
 'Non-classical monocytes': 'CL:0000875',
 'Interstitial Mφ perivascular': None,
 'AT2': 'CL:0002063',
 'CD4 T cells': 'CL:0000624',
 'AT1': 'CL:0002062',
 'Alveolar fibroblasts': 'CL:4028004',
 'DC2': 'CL:0002399',
 'Smooth muscle': 'CL:0019019',
 'B cells': 'CL:0000236',
 'EC arterial': 'CL:1001568',
 'EC aerocyte capillary': 'CL:0002144',
 'Mast cells': 'CL:0000097',
 'EC venous pulmonary': 'CL:0002543',
 'Plasma cells': 'CL:0000786',
 'Transitional Club-AT2': None,
 'Multiciliated (non-nasal)': 'CL:0002145',
 'EC venous systemic': 'CL:0002543',
 'Basal resting': 'CL:0002633',
 'Adventitial fibroblasts': 'CL:4028006',
 'Lymphatic EC mature': 'CL:0002138',
 'Pericytes': 'CL:0009089',
 'Alveolar Mφ proliferating': None,
 'Club (nasal)': 'CL:0000158',
 'Myofibroblasts': 'CL:0000186',
 '

In [21]:
# fill in Nones:
final_mapping_dict['Monocyte-derived Mφ'] = 'CL:0000861'
final_mapping_dict['Interstitial Mφ perivascular'] = 'CL:1001603'
final_mapping_dict['Transitional Club-AT2'] = 'CL:0002632'
final_mapping_dict['Alveolar Mφ proliferating'] = 'CL:0000583'
final_mapping_dict['Alveolar Mφ CCL3+'] = 'CL:0000583'
final_mapping_dict['Alveolar Mφ MT-positive'] = 'CL:0000583'

final_mapping_dict

{'Classical monocytes': 'CL:0000860',
 'Monocyte-derived Mφ': 'CL:0000861',
 'Alveolar macrophages': 'CL:0000583',
 'CD8 T cells': 'CL:0000625',
 'NK cells': 'CL:0000623',
 'EC general capillary': 'CL:0002144',
 'Non-classical monocytes': 'CL:0000875',
 'Interstitial Mφ perivascular': 'CL:1001603',
 'AT2': 'CL:0002063',
 'CD4 T cells': 'CL:0000624',
 'AT1': 'CL:0002062',
 'Alveolar fibroblasts': 'CL:4028004',
 'DC2': 'CL:0002399',
 'Smooth muscle': 'CL:0019019',
 'B cells': 'CL:0000236',
 'EC arterial': 'CL:1001568',
 'EC aerocyte capillary': 'CL:0002144',
 'Mast cells': 'CL:0000097',
 'EC venous pulmonary': 'CL:0002543',
 'Plasma cells': 'CL:0000786',
 'Transitional Club-AT2': 'CL:0002632',
 'Multiciliated (non-nasal)': 'CL:0002145',
 'EC venous systemic': 'CL:0002543',
 'Basal resting': 'CL:0002633',
 'Adventitial fibroblasts': 'CL:4028006',
 'Lymphatic EC mature': 'CL:0002138',
 'Pericytes': 'CL:0009089',
 'Alveolar Mφ proliferating': 'CL:0000583',
 'Club (nasal)': 'CL:0000158',
 'M

In [22]:
adata.obs[AUTHOR_CELL_TYPE] = adata.obs['ident']
adata.obs[CELL_TYPE_ONTOLOGY_ID] = adata.obs[AUTHOR_CELL_TYPE].map(final_mapping_dict)

In [23]:
adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=False)

Classical monocytes             28301
Monocyte-derived Mφ             22705
Alveolar macrophages            13493
CD8 T cells                      9086
NK cells                         5328
EC general capillary             4053
Non-classical monocytes          3752
Interstitial Mφ perivascular     3735
AT2                              3130
CD4 T cells                      2920
AT1                              1994
Alveolar fibroblasts             1521
DC2                              1364
Smooth muscle                    1347
B cells                          1155
EC arterial                       990
EC aerocyte capillary             790
Mast cells                        713
EC venous pulmonary               597
Plasma cells                      534
Transitional Club-AT2             491
Multiciliated (non-nasal)         461
EC venous systemic                411
Basal resting                     319
Adventitial fibroblasts           237
Lymphatic EC mature               195
Pericytes   

In [24]:
adata.obs[CELL_TYPE_ONTOLOGY_ID].value_counts(dropna=False)

CL:0000860    28301
CL:0000861    22705
CL:0000583    13664
CL:0000625     9086
CL:0000623     5328
CL:0002144     4843
CL:0000875     3752
CL:1001603     3735
CL:0002063     3155
CL:0000624     2920
CL:0002062     1994
CL:4028004     1521
CL:0002399     1364
CL:0019019     1347
CL:0000236     1155
CL:0002543     1008
CL:1001568      990
CL:0000097      713
CL:0000786      534
CL:0002632      491
CL:0002145      461
CL:0002633      344
CL:0002138      266
CL:4028006      237
CL:0000158      203
CL:0009089      184
CL:0000186       98
CL:2000093       89
CL:0000084       32
CL:0000451       19
CL:0000784       15
CL:0005012       14
CL:0000192       14
CL:0005006       10
CL:0002480        3
CL:0000077        2
CL:1000223        2
CL:0000057        1
Name: cell_type_ontology_term_id, dtype: int64

In [25]:
cell_types_adata = adata.obs[AUTHOR_CELL_TYPE].value_counts(dropna=True).index.tolist()

marker_genes_df = marker_genes_df[marker_genes_df[AUTHOR_CELL_TYPE].isin(cell_types_adata)]

marker_genes_df[CELL_TYPE_ONTOLOGY_ID] = marker_genes_df[AUTHOR_CELL_TYPE].map(final_mapping_dict)

marker_genes_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  marker_genes_df[CELL_TYPE_ONTOLOGY_ID] = marker_genes_df[AUTHOR_CELL_TYPE].map(final_mapping_dict)


Unnamed: 0,author_cell_type,author_cell_type_markers,cell_type_ontology_term_id
0,Alveolar macrophages,MS4A7;C1QA;HLA-DQB1;HLA-DMA;HLA-DPB1;HLA-DPA1;...,CL:0000583
1,NK cells,GZMA;CD7;CCL4;CST7;NKG7;GNLY;CTSW;CCL5;GZMB;PRF1,CL:0000623
2,AT2,SEPP1;PGC;NAPSA;SFTPD;SLC34A2;CYB5A;MUC1;S100A...,CL:0002063
3,Alveolar Mφ CCL3+,MCEMP1;UPP1;HLA-DQA1;C5AR1;HLA-DMA;AIF1;LST1;L...,CL:0000583
4,Suprabasal,PRDX2;KRT19;SFN;TACSTD2;KRT5;LDHB;KRT17;KLK11;...,CL:0002633
5,Basal resting,CYR61;PERP;IGFBP2;KRT19;KRT5;KRT17;KRT15;S100A...,CL:0002633
6,EC venous pulmonary,VWF;MGP;GNG11;RAMP2;SPARCL1;IGFBP7;IFI27;CLDN5...,CL:0002543
7,CD8 T cells,CD8A;CD3E;CCL4;CD2;CXCR4;GZMA;NKG7;IL32;CD3D;CCL5,CL:0000625
8,EC arterial,SPARCL1;SOX17;IFI27;TM4SF1;A2M;CLEC14A;GIMAP7;...,CL:1001568
9,Peribronchial fibroblasts,IGFBP7;COL1A2;COL3A1;A2M;BGN;DCN;MGP;LUM;MFAP4...,CL:2000093


In [26]:
adata.uns[MARKER_GENES] = marker_genes_df.astype(str)

# Check whether ENSEMBL IDs in var

In [27]:
adata.var

OR4F5
AL627309.1
AL627309.3
AL627309.5
AL627309.4
...
AC023491.2
AC007325.1
AC007325.4
AC007325.2
Sars-CoV2


In [28]:
gene_names = pd.read_csv(ENSEMBL_ID_PATH, sep='\t', header=None, names=['gene_id', 'gene_name', 'feature_type'])

gene_names

Unnamed: 0,gene_id,gene_name,feature_type
0,ENSG00000243485,MIR1302-2HG,Gene Expression
1,ENSG00000237613,FAM138A,Gene Expression
2,ENSG00000186092,OR4F5,Gene Expression
3,ENSG00000238009,AL627309.1,Gene Expression
4,ENSG00000239945,AL627309.3,Gene Expression
...,...,...,...
36653,Podoplanin,Podoplanin_A0127,Antibody Capture
36654,VEGFR3,VEGFR3_A0865,Antibody Capture
36655,CD186,CD186_A0804,Antibody Capture
36656,CCR5,CCR5_A0141,Antibody Capture


In [29]:
gene_names

Unnamed: 0,gene_id,gene_name,feature_type
0,ENSG00000243485,MIR1302-2HG,Gene Expression
1,ENSG00000237613,FAM138A,Gene Expression
2,ENSG00000186092,OR4F5,Gene Expression
3,ENSG00000238009,AL627309.1,Gene Expression
4,ENSG00000239945,AL627309.3,Gene Expression
...,...,...,...
36653,Podoplanin,Podoplanin_A0127,Antibody Capture
36654,VEGFR3,VEGFR3_A0865,Antibody Capture
36655,CD186,CD186_A0804,Antibody Capture
36656,CCR5,CCR5_A0141,Antibody Capture


In [30]:
# 9 non-mapped ENSEMBL IDs
len(np.intersect1d(adata.var.index.tolist(), gene_names['gene_name'].tolist()))

30660

In [31]:
#show non-mapped ENSEMBL IDs
non_mapped = np.setdiff1d(adata.var.index.tolist(), gene_names['gene_name'].tolist())

#remove '.1' at end of non-mapped ENSEMBL IDs
non_mapped_mod = [x[:-2] if x[-2:] == '.1' else x for x in non_mapped]

non_mapped

array(['ARMCX5-GPRASP2.1', 'CYB561D2.1', 'GGT1.1', 'GOLGA8M.1',
       'HSPA14.1', 'LINC01238.1', 'MATR3.1', 'TBCE.1', 'TMSB15B.1'],
      dtype='<U17')

In [32]:
# show rows in gene_names with non_mapped_mod
gene_names[gene_names['gene_name'].isin(non_mapped_mod)]

Unnamed: 0,gene_id,gene_name,feature_type
3235,ENSG00000285053,TBCE,Gene Expression
3237,ENSG00000284770,TBCE,Gene Expression
5946,ENSG00000237940,LINC01238,Gene Expression
5950,ENSG00000261186,LINC01238,Gene Expression
6504,ENSG00000114395,CYB561D2,Gene Expression
6507,ENSG00000271858,CYB561D2,Gene Expression
10565,ENSG00000280987,MATR3,Gene Expression
10567,ENSG00000015479,MATR3,Gene Expression
17692,ENSG00000284024,HSPA14,Gene Expression
17693,ENSG00000187522,HSPA14,Gene Expression


In [33]:
# change gene_name in row of non_unique gene names to unmapped
gene_names.loc[gene_names['gene_id'] == 'ENSG00000285053', 'gene_name'] = 'TBCE.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000261186', 'gene_name'] = 'LINC01238.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000271858', 'gene_name'] = 'CYB561D2.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000015479', 'gene_name'] = 'MATR3.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000284024', 'gene_name'] = 'HSPA14.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000261480', 'gene_name'] = 'GOLGA8M.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000286070', 'gene_name'] = 'GGT1.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000286237', 'gene_name'] = 'ARMCX5-GPRASP2.1'
gene_names.loc[gene_names['gene_id'] == 'ENSG00000269226', 'gene_name'] = 'TMSB15B.1'

In [34]:
np.setdiff1d(adata.var.index.tolist(), gene_names['gene_name'].tolist())

array([], dtype='<U17')

In [35]:
ensembl_dict = dict(zip(gene_names['gene_name'], gene_names['gene_id']))
ensembl_dict

{'MIR1302-2HG': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'AL627309.1': 'ENSG00000238009',
 'AL627309.3': 'ENSG00000239945',
 'AL627309.2': 'ENSG00000239906',
 'AL627309.5': 'ENSG00000241860',
 'AL627309.4': 'ENSG00000241599',
 'AP006222.2': 'ENSG00000286448',
 'AL732372.1': 'ENSG00000236601',
 'OR4F29': 'ENSG00000284733',
 'AC114498.1': 'ENSG00000235146',
 'OR4F16': 'ENSG00000284662',
 'AL669831.2': 'ENSG00000229905',
 'LINC01409': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC01128': 'ENSG00000228794',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'AL645608.6': 'ENSG00000272438',
 'AL645608.2': 'ENSG00000230699',
 'AL645608.4': 'ENSG00000241180',
 'LINC02593': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'PERM1': 'ENSG00000187642',
 'AL645608.7': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AL6

In [36]:
feature_dict = dict(zip(gene_names['gene_name'], gene_names['feature_type']))

In [37]:
adata.var['gene_symbol'] = adata.var.index
adata.var['ensembl_id'] = adata.var['gene_symbol'].map(ensembl_dict)
adata.var['feature_type'] = adata.var['gene_symbol'].map(feature_dict)
adata.var.index.name = 'index'

adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id,feature_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OR4F5,OR4F5,ENSG00000186092,Gene Expression
AL627309.1,AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,AL627309.3,ENSG00000239945,Gene Expression
AL627309.5,AL627309.5,ENSG00000241860,Gene Expression
AL627309.4,AL627309.4,ENSG00000241599,Gene Expression
...,...,...,...
AC023491.2,AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,AC007325.4,ENSG00000278817,Gene Expression
AC007325.2,AC007325.2,ENSG00000277196,Gene Expression


In [38]:
adata.var['ensembl_id'].isna().sum()

0

In [39]:
adata.var['gene_symbol'].isna().sum()

0

In [40]:
adata.var['feature_type'].isna().sum()

0

In [41]:
adata.var = adata.var.astype(str)

adata.var.index = adata.var.index.astype(str)

In [42]:
adata.var

Unnamed: 0_level_0,gene_symbol,ensembl_id,feature_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OR4F5,OR4F5,ENSG00000186092,Gene Expression
AL627309.1,AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,AL627309.3,ENSG00000239945,Gene Expression
AL627309.5,AL627309.5,ENSG00000241860,Gene Expression
AL627309.4,AL627309.4,ENSG00000241599,Gene Expression
...,...,...,...
AC023491.2,AC023491.2,ENSG00000278633,Gene Expression
AC007325.1,AC007325.1,ENSG00000276017,Gene Expression
AC007325.4,AC007325.4,ENSG00000278817,Gene Expression
AC007325.2,AC007325.2,ENSG00000277196,Gene Expression


In [43]:
adata.var.dtypes

gene_symbol     object
ensembl_id      object
feature_type    object
dtype: object

In [44]:
adata.obs.columns

Index(['donor_id', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'percent.mt',
       'predicted.ann_level_1', 'predicted.ann_level_2',
       'predicted.ann_level_3', 'predicted.ann_level_4',
       'predicted.ann_level_5', 'predicted.ann_finest_level',
       'annotation_type', 'ident', 'sample_ID', 'protocol_URL', 'institute',
       'sample_collection_site', 'sample_collection_relative_time_point',
       'library_ID', 'library_ID_repository', 'author_batch_notes',
       'organism_ontology_term_id', 'manner_of_death', 'sample_source',
       'sex_ontology_term_id', 'sample_collection_method', 'tissue_type',
       'sampled_site_condition', 'tissue_ontology_term_id', 'tissue_free_text',
       'sample_preservation_method', 'suspension_type', 'cell_enrichment',
       'cell_viability_percentage', 'cell_number_loaded',
       'sample_collection_year', 'assay_ontology_term_id',
       'library_preparation_batch', 'library_sequencing_run',
       'sequenced_fragment', 'sequencing_platfor

In [45]:
adata.uns[MARKER_GENES].dtypes

author_cell_type              object
author_cell_type_markers      object
cell_type_ontology_term_id    object
dtype: object

# Check raw data

In [52]:
# make checks for adata before saving raw

string_cols = ['cell_number_loaded', 'library_sequencing_run']

for col in string_cols:
    adata.obs[col] = adata.obs[col].astype(str)

numeric_cols = ['nCount_RNA', 'nFeature_RNA', 'nCount_ADT', 'percent.mt',
    'cell_viability_percentage', 'sample_collection_year',
    'manner_of_death']

for col in numeric_cols:
    adata.obs[col] = pd.to_numeric(adata.obs[col], errors='coerce')

# Make df indices str
adata.obs.index = adata.obs.index.astype(str)
adata.var.index = adata.var.index.astype(str)
adata.var_names = adata.var_names.astype(str)

adata.var = adata.var.astype(str)

In [135]:
for col in adata.obs.columns:
    print(f"Column: {col}")
    print(adata.obs[col].dtype)
    print(adata.obs[col].value_counts(dropna=False))
    print("------------------------------------")

Column: donor_id
category
D116    11182
D291     9790
D312     9700
D122     8721
D239     8041
D404     7855
D373     6623
D341     6526
D292     6251
D305     6123
D356     5894
D271     5559
D307     4538
D391     4128
D403     3853
D376     3217
D283     2599
Name: donor_id, dtype: int64
------------------------------------
Column: nCount_RNA
float64
577.0      74
600.0      63
578.0      63
584.0      62
575.0      61
           ..
27297.0     1
63452.0     1
19881.0     1
71064.0     1
23410.0     1
Name: nCount_RNA, Length: 35134, dtype: int64
------------------------------------
Column: nFeature_RNA
Int32
430     98
391     93
442     91
396     90
389     90
        ..
8452     1
8556     1
7817     1
7316     1
<NA>     0
Name: nFeature_RNA, Length: 7452, dtype: Int64
------------------------------------
Column: nCount_ADT
float64
625.0      164
640.0      160
633.0      155
678.0      153
604.0      152
          ... 
4305.0       1
3875.0       1
3226.0       1
7674.0      

In [131]:
adata.obs.dtypes

donor_id                                    category
nCount_RNA                                   float64
nFeature_RNA                                   Int32
nCount_ADT                                   float64
percent.mt                                   float64
predicted.ann_level_1                       category
predicted.ann_level_2                       category
predicted.ann_level_3                       category
predicted.ann_level_4                       category
predicted.ann_level_5                       category
predicted.ann_finest_level                  category
annotation_type                             category
ident                                       category
sample_ID                                   category
protocol_URL                                category
institute                                   category
sample_collection_site                      category
sample_collection_relative_time_point       category
library_ID                                  ca

In [147]:
adata.X.toarray().max()

15619.0

In [53]:
adata.X = adata.X.astype(np.int64)

In [54]:
adata.raw = adata

In [150]:
adata.X

<110600x30669 sparse matrix of type '<class 'numpy.int64'>'
	with 292010395 stored elements in Compressed Sparse Row format>

In [151]:
adata.raw.X

<110600x30669 sparse matrix of type '<class 'numpy.int64'>'
	with 292010395 stored elements in Compressed Sparse Row format>

In [152]:
adata.X.toarray().max()

15619

In [153]:
adata.raw.X.toarray().max()

15619

## Validation result

### UNS Validation
- OK: Tier 1 UNS Google Sheet
- OK: Tier 1 UNS AnnData Object

### OBS Validation
- OK: Tier 1 OBS Google Sheet
- OK: Tier 1 OBS AnnData Object

# Data Submission Status

- CHECK: Data in X
- CHECK: Tier 1 Metadata in OBS
- CHECK: Cell Ontology IDs in OBS
- CHECK: Author cell type in OBS
- CHECK: Marker genes in UNS
- CHECK: ENSEMBL IDs in var


### REVISION

DONE

In [55]:
adata.write_h5ad(join(OUTPUT_PATH_PREREVISION, f"{DATASET_ID}.h5ad"))
adata.write_h5ad(join(OUTPUT_PATH_POSTREVISION, f"{DATASET_ID}.h5ad"))