In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Switch the working directory to the raw-data folder; the relative path given
# to `sc.read_h5ad` below resolves against it.
# NOTE(review): this is an absolute, machine-specific path — adjust per environment.
os.chdir("/Volumes/RicoData2/ReHeat2/raw")

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Read the annotated dataset (path is relative to the current working directory).
sc_dat = sc.read_h5ad("Armute2023_LVAD/GSE226314_global.h5ad")

In [3]:
# Keep the cell barcodes (the obs index) as a regular column as well, so they
# remain available after the column-based merges performed later.
sc_dat.obs["barcode"] = sc_dat.obs_names.values

In [4]:
# Show which per-cell metadata columns are available.
sc_dat.obs.columns.tolist()

['orig_ident',
 'nCount_RNA',
 'nFeature_RNA',
 'percent_mt',
 'scrublet_score',
 'scrublet_cluster_score',
 'bh_pval',
 'nCount_SCT',
 'nFeature_SCT',
 'SCT_snn_res_0_3',
 'seurat_clusters',
 'cell_type',
 'condition',
 'RNA_snn_res_0_2',
 'RNA_snn_res_0_3',
 'RNA_snn_res_0_4',
 'RNA_snn_res_0_5',
 'RNA_snn_res_0_6',
 'Fibro_z',
 'Pericyte_z',
 'SMC_z',
 'SCT_snn_res_0_1',
 'SCT_snn_res_0_2',
 'SCT_snn_res_0_4',
 'SCT_snn_res_0_5',
 'ident',
 'barcode']

In [5]:
# Show one representative cell per distinct `condition` value.
sc_dat.obs["condition"].drop_duplicates()

TWCM-190-R-post_AACCATGGTTACTGAC      Rpost
TWCM-229-R-pre_CGGACTGCAATCTACG        Rpre
TWCM-359-NR-post_CTTGGCTGTTCGCTAA    NRpost
TWCM-359-NR-pre_ACATACGCACATCTTT      NRpre
TWCM-11-103_ACACTGAGTGTGACGA          Donor
Name: condition, dtype: category
Categories (5, object): ['Donor', 'NRpost', 'NRpre', 'Rpost', 'Rpre']

In [6]:
# Lookup table translating each `condition` label into harmonised annotations.
# One row per condition; columns are, in order:
# condition, disease_code, response, biopsy, heart_failure.
disease_df = pd.DataFrame(
    [
        ("Rpost", "DCM_rec", "recovered", "post", "HF"),
        ("Rpre", "DCM", "recovered", "pre", "HF"),
        ("NRpost", "DCM", "not_recovered", "post", "HF"),
        ("NRpre", "DCM", "not_recovered", "pre", "HF"),
        ("Donor", "NF", "not_applicable", "not_applicable", "NF"),
    ],
    columns=["condition", "disease_code", "response", "biopsy", "heart_failure"],
)

In [7]:
# Display the condition lookup table for a visual check.
disease_df

Unnamed: 0,condition,disease_code,response,biopsy,heart_failure
0,Rpost,DCM_rec,recovered,post,HF
1,Rpre,DCM,recovered,pre,HF
2,NRpost,DCM,not_recovered,post,HF
3,NRpre,DCM,not_recovered,pre,HF
4,Donor,NF,not_applicable,not_applicable,NF


In [8]:
# Attach the per-condition annotations to every cell (left join on `condition`)
# and keep only the barcode plus the newly added annotation columns.
annotation_columns = ["barcode", "disease_code", "heart_failure", "response", "biopsy"]
new_codes = pd.merge(sc_dat.obs, disease_df, how='left', on='condition')[annotation_columns]

In [9]:
# Index the annotations by barcode and select the rows in the order of
# `sc_dat.obs`, so they can be copied over positionally.
new_codes = new_codes.set_index("barcode")
new_codes = new_codes.loc[sc_dat.obs.index.values, :]

In [10]:
# Copy each annotation column into `sc_dat.obs`. Raw values are used, so the
# assignment is positional and depends on `new_codes` being in obs order.
for annotation in ("disease_code", "heart_failure", "response", "biopsy"):
    sc_dat.obs[annotation] = new_codes[annotation].values

In [11]:
# Sanity check: show each distinct combination of condition and derived annotations.
sc_dat.obs[["condition","disease_code","heart_failure", "response", "biopsy"]].drop_duplicates()

Unnamed: 0,condition,disease_code,heart_failure,response,biopsy
TWCM-190-R-post_AACCATGGTTACTGAC,Rpost,DCM_rec,HF,recovered,post
TWCM-229-R-pre_CGGACTGCAATCTACG,Rpre,DCM,HF,recovered,pre
TWCM-359-NR-post_CTTGGCTGTTCGCTAA,NRpost,DCM,HF,not_recovered,post
TWCM-359-NR-pre_ACATACGCACATCTTT,NRpre,DCM,HF,not_recovered,pre
TWCM-11-103_ACACTGAGTGTGACGA,Donor,NF,NF,not_applicable,not_applicable


In [12]:
# One row per distinct `cell_type` label (first occurrence is kept, indexed by
# the barcode of that cell).
available_cells = sc_dat.obs.loc[:, ["cell_type"]].drop_duplicates()
available_cells

Unnamed: 0,cell_type
TWCM-190-R-post_AACCATGGTTACTGAC,Adipocyte
TWCM-13-192_ATTATCCGTTCCATGA,Fibroblast
TWCM-14-173_TTGCCGTGTCAATACC,Epicardium
TWCM-359-NR-post_CGTCACTGTCCGAACC,Mast
TWCM-397-NR-post_AAAGATGCACAGGAGT,Myeloid
TWCM-190-R-post_AAACCTGAGCGGATCA,Cardiomyocyte
TWCM-229-R-pre_ATCATGGTCTTGAGAC,Endocardium
TWCM-239-R-post_TACTTGTCAAAGGTGC,Endothelium
TWCM-373-NR-post_AGTGAGGAGTGGCACA,Glia
TWCM-410-NR-post_GGGAGATAGGAGTTGC,SMC


In [13]:
# Initialise the unified label with the sentinel "none" for every cell type.
available_cells = available_cells.assign(cell_type_uni="none")
available_cells

Unnamed: 0,cell_type,cell_type_uni
TWCM-190-R-post_AACCATGGTTACTGAC,Adipocyte,none
TWCM-13-192_ATTATCCGTTCCATGA,Fibroblast,none
TWCM-14-173_TTGCCGTGTCAATACC,Epicardium,none
TWCM-359-NR-post_CGTCACTGTCCGAACC,Mast,none
TWCM-397-NR-post_AAAGATGCACAGGAGT,Myeloid,none
TWCM-190-R-post_AAACCTGAGCGGATCA,Cardiomyocyte,none
TWCM-229-R-pre_ATCATGGTCTTGAGAC,Endocardium,none
TWCM-239-R-post_TACTTGTCAAAGGTGC,Endothelium,none
TWCM-373-NR-post_AGTGAGGAGTGGCACA,Glia,none
TWCM-410-NR-post_GGGAGATAGGAGTTGC,SMC,none


In [14]:
# Translate the dataset-specific cell-type labels into the unified vocabulary.
# Each key is passed to `str.contains` on `cell_type`; rules are applied in the
# order listed, so a later rule would overwrite an earlier one if both matched.
# Labels matched by no rule keep their current `cell_type_uni` value.
# NOTE(review): 'Lymphatic' is mapped to "Lymphoid" — confirm this is intended.
unified_label_by_pattern = {
    'Fibroblast': "Fib",
    'Myeloid': "Myeloid",
    'Mast': "Myeloid",
    'Cardiomyocyte': "CM",
    'Endothelium': "Endo",
    'SMC': "vSMCs",
    'TNKCells': "Lymphoid",
    'Lymphatic': "Lymphoid",
    'Pericyte': "PC",
}
for pattern, unified_label in unified_label_by_pattern.items():
    matches = available_cells["cell_type"].str.contains(pattern)
    available_cells.loc[matches, 'cell_type_uni'] = unified_label

In [15]:
# Expand the per-label mapping to one row per cell (left join on `cell_type`).
# The barcode column is then promoted to the index and the rows are selected in
# the order of `sc_dat.obs`.
per_cell_labels = pd.merge(sc_dat.obs, available_cells, how='left', on='cell_type')
new_cts = per_cell_labels[["barcode", "cell_type_uni"]].set_index("barcode")
new_cts = new_cts.loc[sc_dat.obs.index.values, :]

In [16]:
# Copy the unified label into obs; raw values are assigned positionally, which
# relies on `new_cts` having been put in obs order beforehand.
sc_dat.obs["cell_type_uni"] = new_cts["cell_type_uni"].values

In [17]:
# Sanity check: show how each original cell-type label was mapped.
sc_dat.obs[["cell_type", "cell_type_uni"]].drop_duplicates()

Unnamed: 0,cell_type,cell_type_uni
TWCM-190-R-post_AACCATGGTTACTGAC,Adipocyte,none
TWCM-13-192_ATTATCCGTTCCATGA,Fibroblast,Fib
TWCM-14-173_TTGCCGTGTCAATACC,Epicardium,none
TWCM-359-NR-post_CGTCACTGTCCGAACC,Mast,Myeloid
TWCM-397-NR-post_AAAGATGCACAGGAGT,Myeloid,Myeloid
TWCM-190-R-post_AAACCTGAGCGGATCA,Cardiomyocyte,CM
TWCM-229-R-pre_ATCATGGTCTTGAGAC,Endocardium,none
TWCM-239-R-post_TACTTGTCAAAGGTGC,Endothelium,Endo
TWCM-373-NR-post_AGTGAGGAGTGGCACA,Glia,none
TWCM-410-NR-post_GGGAGATAGGAGTTGC,SMC,vSMCs


In [18]:
# Drop every cell whose label was not mapped to a unified cell type, i.e. whose
# `cell_type_uni` is still the sentinel "none".
# A 1-D boolean mask is used for row selection, and `.copy()` turns the
# resulting view into an independent AnnData object so that the `.obs`
# reassignments that follow do not operate on a view of the full dataset.
keep_cell = (sc_dat.obs["cell_type_uni"] != "none").to_numpy()
sc_dat = sc_dat[keep_cell, :].copy()

In [19]:
# Reduce the per-cell metadata to the columns that are exported.
kept_obs_columns = [
    "orig_ident",
    "condition",
    "cell_type_uni",
    "disease_code",
    "heart_failure",
    "response",
    "biopsy",
]
sc_dat.obs = sc_dat.obs.loc[:, kept_obs_columns]

In [20]:
# Rename columns for the exported file: `orig_ident` becomes `sample_id` and the
# unified label takes over the `cell_type` name.
obs_renames = {"orig_ident": "sample_id", "cell_type_uni": "cell_type"}
sc_dat.obs = sc_dat.obs.rename(columns=obs_renames)

In [21]:
# Write the simplified object to disk.
# NOTE(review): absolute, machine-specific output path — adjust per environment.
sc_dat.write_h5ad("/Volumes/RicoData2/ReHeat2/simplified/Armute2023_LVAD.h5ad")