In [1]:
import scanpy as sc
import pandas as pd
import anndata as ad
import anndata2ri
import os
from os.path import join

from hlca_v2.ingestion_utils import get_gspread_df, ValidationWorkflow, AnnDataMerger

GSPREAD_JSON = "/home/icb/raphael.kfuri-rubens/data/hlca_v2/google_sheets_api/hlca-v2-8d5fea10d8f3.json"

In [2]:
DATASET_ID = "Zemin_Zhang_publ"
H5AD_PATH = f"/home/icb/raphael.kfuri-rubens/data/hlca_v2/{DATASET_ID}/{DATASET_ID}.h5ad"

In [3]:
# Name constants
AUTHOR_CELL_TYPE_L0 = 'author_cell_type_level_0'
AUTHOR_CELL_TYPE_L1 = 'author_cell_type_level_1'

CELL_TYPE_ONTOLOGY_ID_L0 = 'cell_type_ontology_term_id_level_0'
CELL_TYPE_ONTOLOGY_ID_L1 = 'cell_type_ontology_term_id_level_1'

CELL_TYPE_ONTOLOGY_LABEL_L0 = 'cell_type_ontology_term_label_level_0'
CELL_TYPE_ONTOLOGY_LABEL_L1 = 'cell_type_ontology_term_label_level_1'

AUTHOR_CELL_TYPE_DESCRIPTION_L0 = 'author_cell_type_description_level_0'
AUTHOR_CELL_TYPE_DESCRIPTION_L1 = 'author_cell_type_description_level_1'

MARKER_GENES_L0 = 'author_cell_type_markers_level_0'
MARKER_GENES_L1 = 'author_cell_type_markers_level_1'

# Finest grained annotation will be generic dataset cell type
AUTHOR_CELL_TYPE = 'author_cell_type'
CELL_TYPE_ONTOLOGY_ID = 'cell_type_ontology_term_id'
CELL_TYPE_ONTOLOGY_LABEL = 'cell_type_ontology_term_label'
MARKER_GENES = 'author_cell_type_markers'
AUTHOR_CELL_TYPE_DESCRIPTION = 'author_cell_type_description'

# Load data

In [4]:
adata = sc.read_h5ad(H5AD_PATH)
#obs = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "obs")
#uns = get_gspread_df(GSPREAD_JSON, DATASET_ID, "tier_1", "uns")

In [11]:
adata.obs.columns

Index(['celltype', 'majorType', 'sampleID', 'PatientID', 'datasets', 'City',
       'Age', 'Sex', 'Sample type', 'CoVID-19 severity', 'Sample time',
       'Sampling day (Days after symptom onset)', 'SARS-CoV-2',
       'Single cell sequencing platform', 'BCR single cell sequencing',
       'TCR single cell sequencing', 'Outcome', 'Comorbidities',
       'COVID-19-related medication and anti-microbials', 'Leukocytes [G/L]',
       'Neutrophils [G/L]', 'Lymphocytes [G/L]', 'Unpublished'],
      dtype='object')

In [16]:
sample_types_lung = [
    'fresh BALF',
    'fresh Sputum',
    'fresh PFMC'
]

adata = adata[adata.obs['Sample type'].isin(sample_types_lung)]
adata

View of AnnData object with n_obs × n_vars = 63516 × 27943
    obs: 'celltype', 'majorType', 'sampleID', 'PatientID', 'datasets', 'City', 'Age', 'Sex', 'Sample type', 'CoVID-19 severity', 'Sample time', 'Sampling day (Days after symptom onset)', 'SARS-CoV-2', 'Single cell sequencing platform', 'BCR single cell sequencing', 'TCR single cell sequencing', 'Outcome', 'Comorbidities', 'COVID-19-related medication and anti-microbials', 'Leukocytes [G/L]', 'Neutrophils [G/L]', 'Lymphocytes [G/L]', 'Unpublished'
    var: 'EnsemblID'

In [19]:
adata.obs['CoVID-19 severity'].value_counts()

severe/critical    54078
mild/moderate       9438
Name: CoVID-19 severity, dtype: int64

-> no healthy lung samples
-> only BALF, sputum, PFMC in mild-severe CODID19 patients