In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Work from the raw-data directory; the relative read path used below is resolved against it.
os.chdir("/Volumes/RicoData2/ReHeat2/raw")

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
# Read the Reichart 2022 DCM AnnData object (path is relative to the working directory).
sc_dat = sc.read_h5ad("./Reichart2022_DCM/scell_all.h5ad")
# Mirror the obs index into an ordinary column so it is still available after
# merges that reset the index.
sc_dat.obs["barcode"] = sc_dat.obs_names.values

In [3]:
# Overwrite the main matrix with the one stored under .raw
# (presumably the unnormalised counts — TODO confirm against the dataset description).
sc_dat.X = sc_dat.raw.X

In [4]:
# Inspect the per-gene annotation table.
sc_dat.var

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000243485,False,MIR1302-2HG,NCBITaxon:9606,gene
ENSG00000237613,False,FAM138A,NCBITaxon:9606,gene
ENSG00000186092,False,OR4F5,NCBITaxon:9606,gene
ENSG00000238009,False,RP11-34P13.7,NCBITaxon:9606,gene
ENSG00000239945,False,RP11-34P13.8,NCBITaxon:9606,gene
...,...,...,...,...
ENSG00000277856,False,ENSG00000277856.1,NCBITaxon:9606,gene
ENSG00000275063,False,ENSG00000275063.1,NCBITaxon:9606,gene
ENSG00000271254,False,ENSG00000271254.6,NCBITaxon:9606,gene
ENSG00000277475,False,ENSG00000277475.1,NCBITaxon:9606,gene


In [6]:
# List the per-cell metadata columns that are available.
sc_dat.obs.columns

Index(['Sample', 'donor_id', 'Region_x', 'Primary.Genetic.Diagnosis',
       'n_genes', 'n_counts', 'percent_mito', 'percent_ribo',
       'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states',
       'Assigned', 'self_reported_ethnicity_ontology_term_id',
       'disease_ontology_term_id', 'cell_type_ontology_term_id',
       'sex_ontology_term_id', 'assay_ontology_term_id',
       'organism_ontology_term_id', 'is_primary_data',
       'tissue_ontology_term_id', 'development_stage_ontology_term_id',
       'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex',
       'tissue', 'self_reported_ethnicity', 'development_stage', 'barcode'],
      dtype='object')

In [10]:
# Export one row per unique (Sample, donor_id, Region_x, tissue) combination.
# NOTE(review): this cell's execution count is out of sequence with its position;
# as placed it runs before the LV/Assigned filter — confirm that is intended.
(
    sc_dat.obs[["Sample", "donor_id", "Region_x", "tissue"]]
    .drop_duplicates()
    .to_csv(
        "/Users/ricardoramirez/Dropbox/PostDoc/Research/ReHeaT2/results/reichart_cleanup/sampling_specs.csv"
    )
)

In [7]:
# Keep only cells with Region_x == "LV" that are also flagged as Assigned.
# Both conditions are combined into one 1-D boolean mask so the object is subset
# a single time, and the result is materialised with .copy(): later cells add
# columns to .obs, and doing that on a view forces an implicit view-to-copy
# conversion (presumably the source of the warning printed when disease_code is
# first assigned — TODO confirm).
keep_lv_assigned = (sc_dat.obs["Region_x"].to_numpy() == "LV") & (
    sc_dat.obs["Assigned"].to_numpy() == True  # noqa: E712 - elementwise comparison, as before
)
sc_dat = sc_dat[keep_lv_assigned, :].copy()

In [8]:
# Show the obs column names as a plain list.
list(sc_dat.obs.columns)

['Sample',
 'donor_id',
 'Region_x',
 'Primary.Genetic.Diagnosis',
 'n_genes',
 'n_counts',
 'percent_mito',
 'percent_ribo',
 'scrublet_score_z',
 'scrublet_score_log',
 'solo_score',
 'cell_states',
 'Assigned',
 'self_reported_ethnicity_ontology_term_id',
 'disease_ontology_term_id',
 'cell_type_ontology_term_id',
 'sex_ontology_term_id',
 'assay_ontology_term_id',
 'organism_ontology_term_id',
 'is_primary_data',
 'tissue_ontology_term_id',
 'development_stage_ontology_term_id',
 'suspension_type',
 'cell_type',
 'assay',
 'disease',
 'organism',
 'sex',
 'tissue',
 'self_reported_ethnicity',
 'development_stage',
 'barcode']

In [9]:
# Unique donor/disease pairs, ordered by disease label.
sc_dat.obs[["donor_id", "disease"]].drop_duplicates().sort_values(by = "disease")

Unnamed: 0,donor_id,disease
259846,H84,non-compaction cardiomyopathy
379094,H35,dilated cardiomyopathy
340887,H02,dilated cardiomyopathy
353920,H81,dilated cardiomyopathy
877014,IC_H03,dilated cardiomyopathy
...,...,...
236661,H51,normal
221966,H53,normal
455279,H46,normal
38958,ED_H20,normal


In [10]:
# Unique donor identifiers present in the object.
sc_dat.obs[["donor_id"]].drop_duplicates()

Unnamed: 0,donor_id
2428,ED_H25
7466,DT4
26042,DP2
28877,DO1
38958,ED_H20
...,...
856954,D7
868756,IC_H01
871234,IC_H02
877014,IC_H03


In [11]:
# Lookup table with one row per disease label found in obs["disease"]:
# (disease, short disease_code, heart_failure group).
disease_df = pd.DataFrame(
    [
        ("dilated cardiomyopathy", "DCM", "HF"),
        ("normal", "NF", "NF"),
        ("arrhythmogenic right ventricular cardiomyopathy", "ARVC", "HF"),
        ("non-compaction cardiomyopathy", "NCC", "HF"),
    ],
    columns=["disease", "disease_code", "heart_failure"],
)

In [12]:
# Display the disease lookup table.
disease_df

Unnamed: 0,disease,disease_code,heart_failure
0,dilated cardiomyopathy,DCM,HF
1,normal,NF,NF
2,arrhythmogenic right ventricular cardiomyopathy,ARVC,HF
3,non-compaction cardiomyopathy,NCC,HF


In [13]:
# Left-join the lookup table onto the per-cell metadata by disease label and keep
# only the barcode together with the two new code columns. The merge result gets a
# fresh integer index, which is why the barcode column is carried along.
new_codes = pd.merge(
    sc_dat.obs,
    disease_df,
    how="left",
    on="disease",
).loc[:, ["barcode", "disease_code", "heart_failure"]]

In [14]:
# Inspect the merged codes (still on the integer index produced by the merge).
new_codes

Unnamed: 0,barcode,disease_code,heart_failure
0,2428,NF,NF
1,2429,NF,NF
2,2430,NF,NF
3,2431,NF,NF
4,2432,NF,NF
...,...,...,...
612452,878976,DCM,HF
612453,878977,DCM,HF
612454,878978,DCM,HF
612455,878979,DCM,HF


In [15]:
# Use the barcode as the row index so rows can be looked up by cell.
new_codes = new_codes.set_index("barcode")

In [16]:
# Reorder the rows to follow the order of sc_dat.obs, looking each barcode up by label.
new_codes = new_codes.loc[sc_dat.obs.index.values, :]

In [17]:
# Inspect the codes again, now indexed by barcode.
new_codes

Unnamed: 0_level_0,disease_code,heart_failure
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
2428,NF,NF
2429,NF,NF
2430,NF,NF
2431,NF,NF
2432,NF,NF
...,...,...
878976,DCM,HF
878977,DCM,HF
878978,DCM,HF
878979,DCM,HF


In [18]:
# Attach the mapped codes to the per-cell metadata. Values are copied positionally
# (as plain arrays), relying on new_codes having been put in obs order beforehand.
for code_col in ("disease_code", "heart_failure"):
    sc_dat.obs[code_col] = new_codes[code_col].values

  sc_dat.obs["disease_code"] = new_codes["disease_code"].values


In [19]:
# Sanity check: each disease label should pair with exactly one code / heart-failure group.
sc_dat.obs[["disease_code","heart_failure","disease"]].drop_duplicates()

Unnamed: 0,disease_code,heart_failure,disease
2428,NF,NF,normal
7466,DCM,HF,dilated cardiomyopathy
184124,ARVC,HF,arrhythmogenic right ventricular cardiomyopathy
259846,NCC,HF,non-compaction cardiomyopathy


In [20]:
# Build the lookup of matched cell types: start from the distinct cell_type labels in obs.
available_cells = sc_dat.obs[["cell_type"]].drop_duplicates()

In [21]:
# Display the distinct cell_type labels.
available_cells

Unnamed: 0,cell_type
2428,mural cell
2429,cardiac muscle cell
2431,endothelial cell
2440,cardiac neuron
2441,fibroblast of cardiac tissue
2459,lymphocyte
2487,mast cell
3034,myeloid cell
3692,fat cell


In [22]:
# Default harmonised label; cells still labelled "none" are filtered out further down.
available_cells["cell_type_uni"] = "none"

In [23]:
# Ordered (substring, harmonised label) rules matched against cell_type.
# "mast" and "myeloid" both map to the same label.
substring_to_label = (
    ("muscle", "CM"),
    ("endothelial", "Endo"),
    ("fibroblast", "Fib"),
    ("lymphocyte", "Lymphoid"),
    ("mast", "Myeloid"),
    ("myeloid", "Myeloid"),
)
for substring, label in substring_to_label:
    matches = available_cells["cell_type"].str.contains(substring)
    available_cells.loc[matches, "cell_type_uni"] = label

In [24]:
# Map every cell to its harmonised label: left-join on cell_type, index the result
# by barcode, then put the rows in the same order as sc_dat.obs.
new_cts = (
    sc_dat.obs.merge(available_cells, how="left", on="cell_type")[["barcode", "cell_type_uni"]]
    .set_index("barcode")
    .loc[sc_dat.obs.index.values, :]
)

In [25]:
# Copy the harmonised labels into obs positionally; new_cts was already aligned to the obs index.
sc_dat.obs["cell_type_uni"] = new_cts["cell_type_uni"].values

In [26]:
# Override the harmonised label from the finer cell_states annotation.
# The match is a plain substring test on cell_states and is applied to every cell,
# whatever its cell_type; the "SMC" rule runs last and therefore wins on overlap.
for state_substring, label in (("PC", "PC"), ("SMC", "vSMCs")):
    state_hit = sc_dat.obs["cell_states"].str.contains(state_substring)
    sc_dat.obs.loc[state_hit, "cell_type_uni"] = label

In [27]:
# Discard cells whose harmonised label is still the "none" placeholder.
has_label = sc_dat.obs["cell_type_uni"].to_numpy() != "none"
sc_dat = sc_dat[has_label, :]

In [28]:
# Review the resulting original-label -> harmonised-label pairs.
sc_dat.obs[["cell_type","cell_type_uni"]].drop_duplicates()

Unnamed: 0,cell_type,cell_type_uni
2428,mural cell,vSMCs
2429,cardiac muscle cell,CM
2431,endothelial cell,Endo
2432,mural cell,PC
2441,fibroblast of cardiac tissue,Fib
2459,lymphocyte,Lymphoid
2487,mast cell,Myeloid
3034,myeloid cell,Myeloid


In [29]:
# Trim obs down to the metadata columns that are kept in the simplified object.
obs_columns_to_keep = [
    "donor_id",
    "Primary.Genetic.Diagnosis",
    "cell_type_uni",
    "assay",
    "suspension_type",
    "disease",
    "sex",
    "disease_code",
    "heart_failure",
]
sc_dat.obs = sc_dat.obs[obs_columns_to_keep]

In [30]:
# Rename two obs columns: donor_id -> sample_id and cell_type_uni -> cell_type.
sc_dat.obs = sc_dat.obs.rename(
    columns={
        "donor_id": "sample_id",
        "cell_type_uni": "cell_type",
    }
)

In [31]:
# Write the filtered object with its trimmed obs table to disk.
# NOTE(review): .raw is never cleared in this notebook, so it is presumably written as well — confirm.
sc_dat.write_h5ad("/Volumes/RicoData2/ReHeat2/simplified/Reichart2022_DCM.h5ad")