In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Work from the raw-data directory so the dataset path used when loading
# can be given relative to it.
RAW_DATA_DIR = "/Volumes/RicoData2/ReHeat2/raw"
os.chdir(RAW_DATA_DIR)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Load the dataset; the path is resolved against the current working directory.
sc_dat = sc.read_h5ad("Koenig2022_DCM/Koenig2022_DCM.h5ad")
# Copy the obs index labels into a regular column: the merges further down
# discard the index and use this column to restore it.
sc_dat.obs["barcode"] = sc_dat.obs_names.to_numpy()

In [3]:
# List the metadata columns available in .obs of the loaded object.
sc_dat.obs.columns

Index(['orig_ident', 'nCount_RNA', 'nFeature_RNA', 'percent_mito',
       'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res_0_03', 'SCT_snn_res_0_04',
       'SCT_snn_res_0_05', 'SCT_snn_res_0_06', 'SCT_snn_res_0_07',
       'SCT_snn_res_0_08', 'SCT_snn_res_0_09', 'SCT_snn_res_0_1',
       'SCT_snn_res_0_2', 'SCT_snn_res_0_3', 'SCT_snn_res_0_4',
       'SCT_snn_res_0_5', 'SCT_snn_res_0_6', 'SCT_snn_res_0_7',
       'SCT_snn_res_0_8', 'SCT_snn_res_0_9', 'SCT_snn_res_1',
       'seurat_clusters', 'Names', 'Condition', 'ident', 'barcode'],
      dtype='object')

In [4]:
# Show the distinct Condition values (first occurrence of each is kept).
sc_dat.obs["Condition"].drop_duplicates()

H_ZC-11-292_TAAGTGCAGCAGGTCA    Donor
H_ZC-LVAD_CCTTCGATCCTAAGTG        DCM
Name: Condition, dtype: category
Categories (2, object): ['DCM', 'Donor']

In [5]:
# Lookup table translating each Condition label of this dataset into its
# `disease_code` and `heart_failure` labels.
disease_df = pd.DataFrame(
    {
        "Condition": ["Donor", "DCM"],
        "disease_code": ["NF", "DCM"],
        "heart_failure": ["NF", "HF"],
    }
)

In [6]:
# Display the Condition -> disease_code / heart_failure lookup table.
disease_df

Unnamed: 0,Condition,disease_code,heart_failure
0,Donor,NF,NF
1,DCM,DCM,HF


In [7]:
# Attach the disease annotations to every observation through its Condition.
# merge() discards the original obs index, so the result is re-indexed by
# barcode and put back into the row order of sc_dat.obs before the values
# are written into obs.
new_codes = (
    sc_dat.obs
    .merge(disease_df, how="left", on="Condition")
    .loc[:, ["barcode", "Condition", "disease_code", "heart_failure"]]
    .set_index("barcode")
)
new_codes = new_codes.loc[sc_dat.obs.index.values, :]
for annotation in ("disease_code", "heart_failure"):
    sc_dat.obs[annotation] = new_codes[annotation].values

In [8]:
# Inspect the distinct (Condition, disease_code, heart_failure) combinations now present in obs.
sc_dat.obs[["Condition","disease_code","heart_failure"]].drop_duplicates()

Unnamed: 0,Condition,disease_code,heart_failure
H_ZC-11-292_TAAGTGCAGCAGGTCA,Donor,NF,NF
H_ZC-LVAD_CCTTCGATCCTAAGTG,DCM,DCM,HF


In [9]:
# One row per distinct label in Names; every row starts with the placeholder
# "none" in cell_type_uni until a unified cell type is assigned.
available_cells = (
    sc_dat.obs.loc[:, ["Names"]]
    .drop_duplicates()
    .assign(cell_type_uni="none")
)
available_cells

Unnamed: 0,Names,cell_type_uni
H_ZC-11-292_TAAGTGCAGCAGGTCA,Endocardium,none
H_ZC-11-292_TACACGACACGGTGTC,Endothelium,none
TWCM-11-74_TTGGCAACAAACGCGA,Cardiomyocytes,none
TWCM-13-208_GTAACTGTCGGAAACG,Mast_Cells,none
H_ZC-11-292_GCTTGAATCTGTTTGT,Macrophages,none
H_ZC-11-292_GAAACTCAGCCACGCT,Monocytes,none
H_ZC-11-292_GGGATGACACAGGTTT,B_Cells,none
TWCM-13-104_ACACCAAGTACCTACA,Fibroblasts,none
H_ZC-11-292_GGACAGAAGCTGGAAC,Pericytes,none
H_ZC-LVAD_CCTACCATCTGCCAGG,Smooth_Muscle,none


In [10]:
# (pattern searched for in Names, unified cell-type label). The pairs are
# applied in order, so a later match overwrites an earlier one; rows that
# match no pattern keep whatever cell_type_uni they already had.
# str.contains treats each pattern as a regular expression (pandas default).
# NOTE(review): "Endocardium" is not matched by any pattern below - confirm
# that leaving it unassigned is intended.
# NOTE(review): 'Lymphatic' is grouped under "Lymphoid" - verify this is the
# intended unified label.
name_patterns = [
    ('Endothelium', "Endo"),
    ('Cardiomyocytes', "CM"),
    ('Mast_Cells', "Myeloid"),
    ('Macrophages', "Myeloid"),
    ('Monocytes', "Myeloid"),
    ('B_Cells', "Lymphoid"),
    ('Fibroblast', "Fib"),
    ('Pericytes', "PC"),
    ('Smooth_Muscle', "vSMCs"),
    ('T/NK_Cells', "Lymphoid"),
    ('Lymphatic', "Lymphoid"),
]
for name_pattern, unified_label in name_patterns:
    is_match = available_cells["Names"].str.contains(name_pattern)
    available_cells.loc[is_match, 'cell_type_uni'] = unified_label

In [11]:
# Build a per-observation table carrying the unified cell-type annotation.
# merge() discards the original obs index, so the table is re-indexed by
# barcode and reordered to match sc_dat.obs before the column is copied over.
new_cts = (
    sc_dat.obs
    .merge(available_cells, how="left", on="Names")
    .loc[:, ["barcode", "cell_type_uni"]]
    .set_index("barcode")
)
new_cts = new_cts.loc[sc_dat.obs.index.values, :]
sc_dat.obs["cell_type_uni"] = new_cts["cell_type_uni"].values

In [12]:
# Inspect the distinct (Names, cell_type_uni) pairs now present in obs.
sc_dat.obs[["Names", "cell_type_uni"]].drop_duplicates()

Unnamed: 0,Names,cell_type_uni
H_ZC-11-292_TAAGTGCAGCAGGTCA,Endocardium,none
H_ZC-11-292_TACACGACACGGTGTC,Endothelium,Endo
TWCM-11-74_TTGGCAACAAACGCGA,Cardiomyocytes,CM
TWCM-13-208_GTAACTGTCGGAAACG,Mast_Cells,Myeloid
H_ZC-11-292_GCTTGAATCTGTTTGT,Macrophages,Myeloid
H_ZC-11-292_GAAACTCAGCCACGCT,Monocytes,Myeloid
H_ZC-11-292_GGGATGACACAGGTTT,B_Cells,Lymphoid
TWCM-13-104_ACACCAAGTACCTACA,Fibroblasts,Fib
H_ZC-11-292_GGACAGAAGCTGGAAC,Pericytes,PC
H_ZC-LVAD_CCTACCATCTGCCAGG,Smooth_Muscle,vSMCs


In [13]:
# Drop every observation whose cell type is still the "none" placeholder.
# The mask is built as a 1-D boolean array (one entry per observation) rather
# than an (n, 1) column array, so the row selection does not rely on the
# indexer being flattened implicitly.
is_annotated = (sc_dat.obs["cell_type_uni"] != "none").to_numpy()
# Subsetting an AnnData yields a view; copy it explicitly because .obs is
# reassigned afterwards, which would otherwise trigger an implicit copy.
sc_dat = sc_dat[is_annotated, :].copy()

In [14]:
# Restrict obs to the relevant metadata columns, in this order.
obs_columns_to_keep = [
    'orig_ident',
    "cell_type_uni",
    'Condition',
    'disease_code',
    'heart_failure',
]
sc_dat.obs = sc_dat.obs.loc[:, obs_columns_to_keep]

In [15]:
# Rename the sample and cell-type columns; all other columns keep their names.
obs_renames = {
    "orig_ident": "sample_id",
    "cell_type_uni": "cell_type",
}
sc_dat.obs = sc_dat.obs.rename(columns=obs_renames)

In [16]:
# Write the simplified object to disk in h5ad format.
simplified_path = "/Volumes/RicoData2/ReHeat2/simplified/Koenig2022_DCM.h5ad"
sc_dat.write_h5ad(simplified_path)