# 01 - Preprocessing COVID dataset

In [29]:
from pathlib import Path
import os
import scanpy as sc
import pandas as pd
import numpy as np
import hdf5plugin

In [19]:
# Directory that holds the COVID dataset files, relative to this notebook
covidDataPath = Path("..") / "data" / "covid"

# Read the COVID AnnData object stored as an .h5ad file in that directory
covidData = sc.read(covidDataPath / "covidObj.h5ad")

In [20]:
# Report the matrix dimensions: rows of X are cells, columns are genes
print('COVID dataset \n Cells: {} \n Genes: {}\n'.format(covidData.X.shape[0], covidData.X.shape[1]))

COVID dataset 
 Cells: 375438 
 Genes: 14063



In [21]:
# Display the per-observation metadata table of the loaded dataset
# (the rendered preview is truncated to the first and last rows).
covidData.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Infection_status,Severity,log10GenesPerUMI,percent.mt,percent.Heme,Study,seurat_clusters,new.annot2
560_cell.588_AGTTGGAGAACG_1,560_cell,285.0,236,Covid,Severe,0.800229,11.561265,0.000000,Wilk,26,4
560_cell.637_TTGACACATACC_1,560_cell,555.0,284,Covid,Severe,0.821141,8.294501,17.334576,Wilk,34,4
560_cell.639_CACCATAAGAAT_1,560_cell,523.0,284,Covid,Severe,0.828072,2.131783,15.116279,Wilk,55,14
560_cell.647_TTGTGAAACAGT_1,560_cell,675.0,295,Covid,Severe,0.813503,7.190083,23.636364,Wilk,10,4
560_cell.652_CTAAGTTGCTTT_1,560_cell,461.0,301,Covid,Severe,0.836481,9.232264,7.580175,Wilk,50,1
...,...,...,...,...,...,...,...,...,...,...,...
TGTACCTTACACTCGAGAAAGGGTCAG_22_5,BN-28,1629.0,659,Healthy,Healthy,0.882532,21.009867,0.000000,SS_C2,15,8
AACTGTATTACTATCCTCCTAGATAGA_22_5,BN-05,709.0,390,Covid,Non-severe,0.907000,20.129870,0.779221,SS_C2,23,4
CACATTGCAGCTAACTCACAGGCATTT_22_5,BN-28,446.0,310,Healthy,Healthy,0.931754,17.892644,0.397614,SS_C2,23,4
TCTCTTCAACAATTGATCAGCCGCAAG_22_5,BN-12,893.0,586,Covid,Severe,0.940330,13.244353,0.410678,SS_C2,31,2


In [22]:
# Column layout of the cluster-annotation table from the Mukund et al.
# supplementary information.
# URL: https://www.frontiersin.org/journals/immunology/articles/10.3389/fimmu.2021.738073/full#supplementary-material
column_names = [
    'cluster',
    'number of cells',
    'SingleR Annotation',
    'Human_gene_atlas enrichment(of ClusterMarkers)',
    'Final Cluster Identities',
]

# clusterIdentities.tsv is read with header=None, so the names above are
# applied to its columns. The resulting dict maps cluster number -> cluster name.
clusterIdentities = pd.read_csv(
    covidDataPath / 'clusterIdentities.tsv',
    sep="\t",
    header=None,
    names=column_names,
)
cluster_identity_map = dict(
    zip(clusterIdentities["cluster"], clusterIdentities["Final Cluster Identities"])
)

# Translate every cell's Seurat cluster number into its final cluster identity.
celltype = covidData.obs["seurat_clusters"].map(cluster_identity_map)

# Clusters missing from the annotation table map to NaN, which would later get
# the category code -1: that code is counted as an extra "cell type" by
# np.unique but has no entry in id2type. Fail loudly instead of carrying a
# silent placeholder label forward.
unmapped_clusters = covidData.obs.loc[celltype.isna(), "seurat_clusters"].unique()
if len(unmapped_clusters) > 0:
    raise ValueError(
        "No 'Final Cluster Identities' entry in clusterIdentities.tsv for "
        f"seurat_clusters: {sorted(map(str, unmapped_clusters))}"
    )

# New metadata column 'celltype' holding the cluster names
covidData.obs['celltype'] = celltype.astype("category")

# The 5 studies contained in the dataset. This list is informational only:
# batch ids below are the category codes of obs["Study"], which follow the
# sorted order of the study names, not the order of this list.
study_names = [
    'Wilk',
    'Lee',
    'Bali',
    'SS_C1',
    'SS_C2'
    ]

# Build each categorical view once and derive both the integer labels and the
# id -> name lookup from it, so the two can never disagree.
celltype_categorical = covidData.obs["celltype"].cat  # already categorical
study_categorical = covidData.obs["Study"].astype("category").cat

# Unique numerical labels for cell types and studies
celltype_id_labels = celltype_categorical.codes.values
batch_id_labels = study_categorical.codes.values

id2type = dict(enumerate(celltype_categorical.categories))
id2batch = dict(enumerate(study_categorical.categories))

covidData.obs["celltype_id"] = celltype_id_labels
covidData.obs["batch_id"] = batch_id_labels

# Check the number of unique cell types and studies against the paper
num_types_cell = len(np.unique(celltype_id_labels))
num_types_batch = len(np.unique(batch_id_labels))

print(f"Number of unique final cluster annotations: {num_types_cell}")
print(f"Number of unique studies: {num_types_batch}")

print(id2batch)
print(id2type)

Number of unique final cluster annotations: 20
Number of unique studies: 5
{0: 'Bali', 1: 'Lee', 2: 'SS_C1', 3: 'SS_C2', 4: 'Wilk'}
{0: 'B lymphoblast', 1: 'CD14+ Monocytes', 2: 'CD16+ Monocytes', 3: 'CD34+ Progenitor cells', 4: 'CD4+ T cells', 5: 'CD8+ T cells', 6: 'Intermediate monocytes', 7: 'Low-density basophils', 8: 'Low-density neutrophils', 9: 'MAIT cells', 10: 'Megakaryocytes', 11: 'Myeloid dendritic cells', 12: 'NK cells', 13: 'Naïve B cells', 14: 'Naïve CD4+ T cells', 15: 'Naïve CD8+ T cells', 16: 'Non-switched memory B cells', 17: 'Plasmablasts', 18: 'Plasmacytoid dendritic cells', 19: 'gd T cells'}


In [None]:
# Save the annotated AnnData object as a Zstandard-compressed .h5ad file.
# The original call was missing the comma between the path and the
# `compression=` keyword, which made the cell a SyntaxError.
# NOTE(review): the recorded "ValueError: '_index' is a reserved name for
# dataframe columns" presumably comes from a column literally named '_index'
# in one of the object's dataframes (possibly covidData.raw.var) -- confirm
# and rename that column before relying on this write.
covidData.write_h5ad(
    covidDataPath / 'covidData.h5ad',
    compression=hdf5plugin.FILTERS["zstd"],
)

ValueError: '_index' is a reserved name for dataframe columns.

In [27]:
# List the column names of the variable (per-gene) annotation table
covidData.var.columns

Index(['features'], dtype='object')

In [None]:
# Rename features -> gene_name
covidData.var.rename(columns={'features': 'gene_name'}, inplace=True)

# Set gene_name as variable index
covidData.var.set_index(covidData.var["gene_name"], inplace=True)
covidData.var["gene_name"] = covidData.var.index.tolist()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Infection_status,Severity,log10GenesPerUMI,percent.mt,percent.Heme,Study,seurat_clusters,new.annot2,celltype,celltype_id,batch_id
560_cell.588_AGTTGGAGAACG_1,560_cell,285.0,236,Covid,Severe,0.800229,11.561265,0.000000,Wilk,26,4,CD14+ Monocytes,1,4
560_cell.637_TTGACACATACC_1,560_cell,555.0,284,Covid,Severe,0.821141,8.294501,17.334576,Wilk,34,4,CD14+ Monocytes,1,4
560_cell.639_CACCATAAGAAT_1,560_cell,523.0,284,Covid,Severe,0.828072,2.131783,15.116279,Wilk,55,14,Low-density neutrophils,8,4
560_cell.647_TTGTGAAACAGT_1,560_cell,675.0,295,Covid,Severe,0.813503,7.190083,23.636364,Wilk,10,4,CD14+ Monocytes,1,4
560_cell.652_CTAAGTTGCTTT_1,560_cell,461.0,301,Covid,Severe,0.836481,9.232264,7.580175,Wilk,50,1,CD4+ T cells,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGTACCTTACACTCGAGAAAGGGTCAG_22_5,BN-28,1629.0,659,Healthy,Healthy,0.882532,21.009867,0.000000,SS_C2,15,8,MAIT cells,9,3
AACTGTATTACTATCCTCCTAGATAGA_22_5,BN-05,709.0,390,Covid,Non-severe,0.907000,20.129870,0.779221,SS_C2,23,4,CD14+ Monocytes,1,3
CACATTGCAGCTAACTCACAGGCATTT_22_5,BN-28,446.0,310,Healthy,Healthy,0.931754,17.892644,0.397614,SS_C2,23,4,CD14+ Monocytes,1,3
TCTCTTCAACAATTGATCAGCCGCAAG_22_5,BN-12,893.0,586,Covid,Severe,0.940330,13.244353,0.410678,SS_C2,31,2,CD8+ T cells,5,3
