In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Work from the raw-data directory so the relative paths used by later cells
# (e.g. "./Simonson2023_ICM/...") resolve against it.
raw_data_dir = "/Volumes/RicoData2/ReHeat2/raw"
os.chdir(raw_data_dir)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Load the Simonson 2023 ICM object; the path is relative to the working
# directory set in the first cell.
sc_dat = sc.read_h5ad("./Simonson2023_ICM/ICM_scportal_05.24.2022.h5ad")
# Keep the cell barcodes (the obs index) as an ordinary column as well, so they
# remain available after operations that reset the index.
sc_dat.obs["barcode"] = sc_dat.obs.index.to_numpy()

In [11]:
# Make the object lighter by dropping the "cellranger_raw" layer; none of the
# cells shown in this notebook read it.
# NOTE(review): this cell carries execution count 11 although it sits between
# cells 2 and 3 -- confirm the notebook still runs top to bottom on a fresh kernel.
del sc_dat.layers["cellranger_raw"]

In [3]:
# List the per-cell metadata columns that are available in `sc_dat.obs`.
sc_dat.obs.columns

Index(['biosample_id', 'donor_id', 'disease', 'sex', 'age',
       'cell_type_leiden0.5', 'sub_cluster', 'cellbender_ncount',
       'cellbender_ngenes', 'cellranger_percent_mito', 'exon_prop',
       'cellbender_entropy', 'cellranger_doublet_scores', 'barcode'],
      dtype='object')

In [4]:
# Shape of the table of distinct (donor_id, disease) combinations.
sc_dat.obs[['donor_id', 'disease']].drop_duplicates().shape

(15, 2)

In [5]:
# Shape of the table of distinct donors; the same row count as the
# (donor_id, disease) table means every donor carries a single disease label.
sc_dat.obs[['donor_id']].drop_duplicates().shape

(15, 1)

In [6]:
# Show the distinct disease labels present in the data (one example cell each).
sc_dat.obs[['disease']].drop_duplicates()

Unnamed: 0,disease
TCCCACACAATAGTGA-1-0,ICM
ACCAACAAGGGTTTCT-1-1,NF


In [7]:
# Lookup table from the dataset's own disease label to the harmonised disease
# code and to the heart-failure status (HF = heart failure, NF = non-failing).
disease_df = pd.DataFrame(
    {
        "disease": ["ICM", "NF"],
        "disease_code": ["ICM", "NF"],
        "heart_failure": ["HF", "NF"],
    }
)

In [8]:
# Attach the harmonised disease labels to every cell.
# The labels are looked up directly per cell, which keeps the rows in the order
# of `sc_dat.obs`. This avoids merging (which resets the index) and then
# re-aligning the result by barcode, a step that breaks if barcodes are ever
# duplicated.
code_by_disease = dict(zip(disease_df["disease"], disease_df["disease_code"]))
hf_by_disease = dict(zip(disease_df["disease"], disease_df["heart_failure"]))
cell_disease = sc_dat.obs["disease"].astype(str)

# Fail loudly instead of silently writing NaN for an unexpected disease label.
unmapped_diseases = sorted(set(cell_disease) - set(code_by_disease))
if unmapped_diseases:
    raise ValueError(f"No entry in disease_df for disease label(s): {unmapped_diseases}")

# Same columns as before, indexed like `sc_dat.obs`.
new_codes = pd.DataFrame(
    {
        "disease": cell_disease.to_numpy(),
        "disease_code": cell_disease.map(code_by_disease).to_numpy(),
        "heart_failure": cell_disease.map(hf_by_disease).to_numpy(),
    },
    index=sc_dat.obs.index,
)
sc_dat.obs["disease_code"] = new_codes["disease_code"].values
sc_dat.obs["heart_failure"] = new_codes["heart_failure"].values

In [9]:
sc_dat.obs[["disease","disease_code","heart_failure"]].drop_duplicates()

Unnamed: 0,disease,disease_code,heart_failure
TCCCACACAATAGTGA-1-0,ICM,ICM,HF
ACCAACAAGGGTTTCT-1-1,NF,NF,NF


In [12]:
# One row per distinct original cluster label; each row keeps the barcode of
# the first cell that carries that label as its index.
available_cells = sc_dat.obs.loc[:, ["cell_type_leiden0.5"]].drop_duplicates()
available_cells

Unnamed: 0,cell_type_leiden0.5
TCCCACACAATAGTGA-1-0,Cardiomyocyte I
ATTCGTTGTAACAGTA-1-0,Endothelial II
TCTGCCACACAATGTC-1-0,Lymphatic endothelial
ATACTTCCAGAGAGGG-1-0,Proliferating macrophage
ATTCCTAAGTTCCAGT-1-0,Fibroblast
TCCTCCCCACAAATGA-1-0,Pericyte
CAACAGTTCCCTCTAG-1-0,Neuronal
TGGTAGTAGGCGCTTC-1-0,VSMC
AGACACTTCTACAGGT-1-0,Lymphocyte
TGAGGAGGTTACCTTT-1-0,Macrophage


In [13]:
# Start every cluster with the placeholder "none"; clusters that still carry it
# after the mapping rules are applied are removed further down.
available_cells = available_cells.assign(cell_type_uni="none")
available_cells

Unnamed: 0,cell_type_leiden0.5,cell_type_uni
TCCCACACAATAGTGA-1-0,Cardiomyocyte I,none
ATTCGTTGTAACAGTA-1-0,Endothelial II,none
TCTGCCACACAATGTC-1-0,Lymphatic endothelial,none
ATACTTCCAGAGAGGG-1-0,Proliferating macrophage,none
ATTCCTAAGTTCCAGT-1-0,Fibroblast,none
TCCTCCCCACAAATGA-1-0,Pericyte,none
CAACAGTTCCCTCTAG-1-0,Neuronal,none
TGGTAGTAGGCGCTTC-1-0,VSMC,none
AGACACTTCTACAGGT-1-0,Lymphocyte,none
TGAGGAGGTTACCTTT-1-0,Macrophage,none


In [14]:
# Ordered (pattern, regex flags, harmonised label) rules. They are applied in
# sequence, so a later rule overwrites an earlier one when both match a label.
# Only the macrophage rule ignores case, which lets it catch
# "Proliferating macrophage".
# NOTE(review): "Endothelial" is matched case-sensitively, so
# "Lymphatic endothelial" keeps "none" and is dropped later -- confirm that
# this exclusion is intended.
cell_type_rules = [
    ("Cardiomyocyte", 0, "CM"),
    ("Endothelial", 0, "Endo"),
    ("Macrophage", re.IGNORECASE, "Myeloid"),
    ("Fibroblast", 0, "Fib"),
    ("Pericyte", 0, "PC"),
    ("VSMC", 0, "vSMCs"),
    ("Lymphocyte", 0, "Lymphoid"),
    ("Mast", 0, "Myeloid"),
]
for pattern, pattern_flags, uni_label in cell_type_rules:
    matches = available_cells["cell_type_leiden0.5"].str.contains(pattern, flags=pattern_flags)
    available_cells.loc[matches, "cell_type_uni"] = uni_label

In [15]:
# Inspect the cluster -> harmonised cell-type table; rows still labelled "none"
# were not matched by any rule.
available_cells

Unnamed: 0,cell_type_leiden0.5,cell_type_uni
TCCCACACAATAGTGA-1-0,Cardiomyocyte I,CM
ATTCGTTGTAACAGTA-1-0,Endothelial II,Endo
TCTGCCACACAATGTC-1-0,Lymphatic endothelial,none
ATACTTCCAGAGAGGG-1-0,Proliferating macrophage,Myeloid
ATTCCTAAGTTCCAGT-1-0,Fibroblast,Fib
TCCTCCCCACAAATGA-1-0,Pericyte,PC
CAACAGTTCCCTCTAG-1-0,Neuronal,none
TGGTAGTAGGCGCTTC-1-0,VSMC,vSMCs
AGACACTTCTACAGGT-1-0,Lymphocyte,Lymphoid
TGAGGAGGTTACCTTT-1-0,Macrophage,Myeloid


In [16]:
# Build a per-cell table with the harmonised cell-type annotation.
# Each cell's original cluster label is looked up directly, which keeps the rows
# in the order of `sc_dat.obs`. This avoids merging (which resets the index)
# and then re-aligning by barcode, a step that breaks if barcodes are ever
# duplicated.
uni_by_cluster = dict(
    zip(
        available_cells["cell_type_leiden0.5"].astype(str),
        available_cells["cell_type_uni"],
    )
)
new_cts = pd.DataFrame(
    {"cell_type_uni": sc_dat.obs["cell_type_leiden0.5"].astype(str).map(uni_by_cluster).to_numpy()},
    index=sc_dat.obs.index,
)
sc_dat.obs["cell_type_uni"] = new_cts["cell_type_uni"].values

In [18]:
# Check the mapping as it now appears on the cells themselves.
sc_dat.obs[["cell_type_leiden0.5", "cell_type_uni"]].drop_duplicates()

Unnamed: 0,cell_type_leiden0.5,cell_type_uni
TCCCACACAATAGTGA-1-0,Cardiomyocyte I,CM
ATTCGTTGTAACAGTA-1-0,Endothelial II,Endo
TCTGCCACACAATGTC-1-0,Lymphatic endothelial,none
ATACTTCCAGAGAGGG-1-0,Proliferating macrophage,Myeloid
ATTCCTAAGTTCCAGT-1-0,Fibroblast,Fib
TCCTCCCCACAAATGA-1-0,Pericyte,PC
CAACAGTTCCCTCTAG-1-0,Neuronal,none
TGGTAGTAGGCGCTTC-1-0,VSMC,vSMCs
AGACACTTCTACAGGT-1-0,Lymphocyte,Lymphoid
TGAGGAGGTTACCTTT-1-0,Macrophage,Myeloid


In [19]:
# Drop the cells whose cluster was not mapped to a harmonised cell type.
# The row mask is an explicit 1-D boolean array (one entry per cell) rather than
# an (n, 1) array. The .copy() turns the resulting AnnData view into a
# standalone object, so the `.obs` edits in the following cells act on real
# data instead of forcing an implicit copy of a view.
keep_cells = (sc_dat.obs["cell_type_uni"] != "none").to_numpy()
sc_dat = sc_dat[keep_cells, :].copy()

In [20]:
# Keep only the per-cell metadata that is relevant downstream.
relevant_obs_columns = [
    "donor_id",
    "disease",
    "sex",
    "age",
    "cell_type_uni",
    "disease_code",
    "heart_failure",
]
sc_dat.obs = sc_dat.obs[relevant_obs_columns]

In [21]:
# Rename the donor and harmonised cell-type columns to `sample_id` and `cell_type`.
obs_column_renames = {"donor_id": "sample_id", "cell_type_uni": "cell_type"}
sc_dat.obs = sc_dat.obs.rename(columns=obs_column_renames)

In [22]:
# Inspect the final per-cell metadata before the object is written to disk.
sc_dat.obs

Unnamed: 0,sample_id,disease,sex,age,cell_type,disease_code,heart_failure
TCCCACACAATAGTGA-1-0,P1364,ICM,female,55,CM,ICM,HF
AATGCCACAACTTCTT-1-0,P1364,ICM,female,55,CM,ICM,HF
ATTACTCCAATAGTAG-1-0,P1364,ICM,female,55,CM,ICM,HF
ATCGTGACACCTGTCT-1-0,P1364,ICM,female,55,CM,ICM,HF
CAGTTAGCAACCGCTG-1-0,P1364,ICM,female,55,CM,ICM,HF
...,...,...,...,...,...,...,...
TCGTAGACACACTTAG-1-14,P1801,NF,male,42,Endo,NF,NF
TGAATGCTCATGCCAA-1-14,P1801,NF,male,42,PC,NF,NF
GTGGAGATCGGCTGAC-1-14,P1801,NF,male,42,Lymphoid,NF,NF
TGAGTCAGTGGGCTCT-1-14,P1801,NF,male,42,Myeloid,NF,NF


In [23]:
# Save the simplified object (filtered cells, reduced and renamed obs columns).
simplified_h5ad_path = "/Volumes/RicoData2/ReHeat2/simplified/Simonson2023_ICM.h5ad"
sc_dat.write_h5ad(simplified_h5ad_path)