In [1]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Run from the project root so that the relative "./data_DCMACM_Science/..."
# paths used in the rest of the notebook resolve.
# The location can be overridden with the MOFACELL_DIR environment variable;
# when it is unset the original hard-coded directory is used, so existing
# setups behave exactly as before.
os.chdir(os.environ.get("MOFACELL_DIR", "/Users/ricardoramirez/Dropbox/PhD/Research/MOFAcell"))

In [2]:
# Load the complete single-cell AnnData object from the project data folder.
sc_dat = sc.read_h5ad("./data_DCMACM_Science/scell_all.h5ad")

In [40]:
# Filter the object so that it only contains LV cells (Region_x == "LV")

In [3]:
# Which distinct values does the Region_x annotation take?
sc_dat.obs.loc[:, ["Region_x"]].drop_duplicates()

Unnamed: 0,Region_x
0,RV
2428,LV


In [4]:
# Keep only the cells whose Region_x annotation equals "LV".
lv_mask = (sc_dat.obs["Region_x"] == "LV").values
sc_dat = sc_dat[lv_mask, :]

In [5]:
# List every per-cell annotation column stored in .obs.
sc_dat.obs.columns.tolist()

['Sample',
 'donor_id',
 'Region_x',
 'Primary.Genetic.Diagnosis',
 'n_genes',
 'n_counts',
 'percent_mito',
 'percent_ribo',
 'scrublet_score_z',
 'scrublet_score_log',
 'solo_score',
 'cell_states',
 'Assigned',
 'self_reported_ethnicity_ontology_term_id',
 'disease_ontology_term_id',
 'cell_type_ontology_term_id',
 'sex_ontology_term_id',
 'assay_ontology_term_id',
 'organism_ontology_term_id',
 'is_primary_data',
 'tissue_ontology_term_id',
 'development_stage_ontology_term_id',
 'suspension_type',
 'cell_type',
 'assay',
 'disease',
 'organism',
 'sex',
 'tissue',
 'self_reported_ethnicity',
 'development_stage']

In [6]:
# Only annotated cells are kept in the next step; first check which values
# the "Assigned" flag takes.
sc_dat.obs.loc[:, ["Assigned"]].drop_duplicates()

Unnamed: 0,Assigned
2428,True
2578,False


In [7]:
# Keep only the cells flagged Assigned == True.
# A 1-D mask replaces the (n, 1) array produced by obs[["Assigned"]].values,
# and the subset is materialised with .copy(): plain indexing returns a view,
# and later cells add new columns to sc_dat.obs, which would otherwise force an
# implicit view-to-actual conversion (with a warning) at that point.
assigned_mask = sc_dat.obs["Assigned"].eq(True).to_numpy()
sc_dat = sc_dat[assigned_mask, :].copy()

In [8]:
# Collect the distinct cell_type labels; this table is extended below into the
# lookup from original labels to unified cell-type names.
available_cells = sc_dat.obs.loc[:, ["cell_type"]].drop_duplicates()

In [9]:
# Display the de-duplicated cell_type labels found in the filtered object.
available_cells

Unnamed: 0,cell_type
2428,mural cell
2429,cardiac muscle cell
2431,endothelial cell
2440,cardiac neuron
2441,fibroblast of cardiac tissue
2459,lymphocyte
2487,mast cell
3034,myeloid cell
3692,fat cell


In [10]:
# Add the unified-name column, pre-filled with the placeholder "none" so that
# labels left unmatched by the keyword rules below are easy to spot.
available_cells = available_cells.assign(cell_type_uni="none")

In [11]:
# Translate each original cell_type label into a unified short name.
# A label receives the name of every keyword it contains; the rules are applied
# in the order listed, so a later rule overrides an earlier one if a label
# happens to contain both keywords.
# Matching is literal (regex=False) and missing labels count as "no match"
# (na=False) so the boolean mask never contains NaN.
CELL_TYPE_KEYWORDS = {
    "muscle": "CM",
    "endothelial": "Endo",
    "neuron": "Neuronal",
    "fibroblast": "Fib",
    "lymphocyte": "Lymphoid",
    "mast": "Mast",
    "myeloid": "Myeloid",
    "fat": "Adipo",
    "mural": "Mur",
}
for keyword, unified_name in CELL_TYPE_KEYWORDS.items():
    is_match = available_cells["cell_type"].str.contains(keyword, regex=False, na=False)
    available_cells.loc[is_match, "cell_type_uni"] = unified_name

In [12]:
# Display the lookup table: original cell_type next to its unified name.
# Any row still showing "none" was not matched by a keyword rule.
available_cells

Unnamed: 0,cell_type,cell_type_uni
2428,mural cell,Mur
2429,cardiac muscle cell,CM
2431,endothelial cell,Endo
2440,cardiac neuron,Neuronal
2441,fibroblast of cardiac tissue,Fib
2459,lymphocyte,Lymphoid
2487,mast cell,Mast
3034,myeloid cell,Myeloid
3692,fat cell,Adipo


In [13]:
# Copy each cell's index label into an ordinary column so that it is still
# available after the merge below, which does not keep the original index.
sc_dat.obs["barcode"] = sc_dat.obs_names.values

  sc_dat.obs["barcode"] = sc_dat.obs.index.values


In [14]:
# Look up the unified cell type of every cell.
# Only the columns that are actually needed take part in the merge: this avoids
# copying every .obs column, and it keeps the cell re-runnable, because a
# cell_type_uni column already present in .obs would otherwise be suffixed
# (_x/_y) and make the final column selection fail.
# validate="many_to_one" asserts that each cell_type appears once in the
# lookup table, so the result has exactly one row per cell.
new_cts = sc_dat.obs[["barcode", "cell_type"]].merge(
    available_cells[["cell_type", "cell_type_uni"]],
    on="cell_type",
    how="left",
    validate="many_to_one",
)[["barcode", "cell_type_uni"]]

In [15]:
# Write the unified names back into .obs, aligning rows through the barcode
# (index label) carried along in new_cts.
merged_barcodes = new_cts["barcode"].values
merged_labels = new_cts["cell_type_uni"].values
sc_dat.obs.loc[merged_barcodes, "cell_type_uni"] = merged_labels

In [16]:
# Sanity check: unified name next to the original label for each cell.
sc_dat.obs.loc[:, ["cell_type_uni", "cell_type"]]

Unnamed: 0,cell_type_uni,cell_type
2428,Mur,mural cell
2429,CM,cardiac muscle cell
2430,CM,cardiac muscle cell
2431,Endo,endothelial cell
2432,Mur,mural cell
...,...,...
878976,CM,cardiac muscle cell
878977,Endo,endothelial cell
878978,Fib,fibroblast of cardiac tissue
878979,Fib,fibroblast of cardiac tissue


In [17]:
# Override the unified name for cells whose cell_states label contains "PC" or
# "SMC". The rules run in this order, so "SMC" wins if a label contains both.
# na=False treats cells with a missing cell_states value as "no match"; without
# it the mask would contain NaN and the .loc assignment would raise.
# regex=False makes the match an explicit literal substring test.
# NOTE(review): the match is not restricted to a particular cell_type, so any
# state name containing "PC" or "SMC" is relabelled - confirm this is intended.
STATE_OVERRIDES = (("PC", "PC"), ("SMC", "vSMCs"))
for state_keyword, unified_name in STATE_OVERRIDES:
    has_keyword = sc_dat.obs["cell_states"].str.contains(state_keyword, regex=False, na=False)
    sc_dat.obs.loc[has_keyword, "cell_type_uni"] = unified_name

In [18]:
# Build the metadata table: one row per distinct combination of the
# donor-describing columns.
donor_columns = ["donor_id", "Primary.Genetic.Diagnosis", "disease", "sex"]
meta_data = sc_dat.obs.loc[:, donor_columns].drop_duplicates()

In [19]:
# Display the de-duplicated donor metadata table before writing it to disk.
meta_data

Unnamed: 0,donor_id,Primary.Genetic.Diagnosis,disease,sex
2428,ED_H25,control,normal,male
7466,DT4,TTN,dilated cardiomyopathy,male
26042,DP2,PLN,dilated cardiomyopathy,female
28877,DO1,TNNT2,dilated cardiomyopathy,male
38958,ED_H20,control,normal,female
...,...,...,...,...
856954,D7,control,normal,male
868756,IC_H01,TTN,dilated cardiomyopathy,male
871234,IC_H02,TTN,dilated cardiomyopathy,male
877014,IC_H03,DSP,dilated cardiomyopathy,male


In [20]:
# Persist the donor metadata table (the index is written as the first column).
meta_data.to_csv(path_or_buf="./data_DCMACM_Science/meta_data.csv")

In [21]:
# Number of cells per (donor, unified cell type) pair, as a flat table with the
# columns donor_id, cell_type_uni and counts.
cell_type_numbers = (
    sc_dat.obs
    .groupby(["donor_id", "cell_type_uni"])["cell_type_uni"]
    .count()
    .rename("counts")
    .reset_index()
)

In [22]:
# Expose the matrix kept in .raw as a named layer so it can be selected by
# name when building the pseudobulk profiles.
# NOTE(review): presumably .raw.X holds the un-normalised counts - confirm.
raw_matrix = sc_dat.raw.X
sc_dat.layers["counts"] = raw_matrix

In [None]:
# Aggregate the "counts" layer into one pseudobulk profile per
# (donor_id, cell_type_uni) pair. Both filtering thresholds are set to 0.
padata = dc.get_pseudobulk(
    sc_dat,
    sample_col="donor_id",
    groups_col="cell_type_uni",
    layer="counts",
    min_prop=0,
    min_smpls=0,
)

In [62]:
# Turn the pseudobulk matrix into a labelled table (rows: pseudobulk samples,
# columns: the .var index) and write it to disk.
pb_dat = pd.DataFrame(
    padata.X,
    index=padata.obs.index.values,
    columns=padata.var.index.values,
)
pb_dat.to_csv("./data_DCMACM_Science/pb_data.csv")

In [63]:
# Sample-level annotation of the pseudobulk profiles: keep the pseudobulk
# sample name as "colname" and attach the number of cells behind each profile.
pb_coldata = (
    padata.obs
    .assign(colname=padata.obs.index.values)
    .merge(cell_type_numbers, how="left", on=["cell_type_uni", "donor_id"])
)

In [64]:
# Keep only the annotation columns that are exported, in this order.
coldata_columns = [
    "donor_id",
    "Primary.Genetic.Diagnosis",
    "disease",
    "colname",
    "cell_type_uni",
    "counts",
]
pb_coldata = pb_coldata.loc[:, coldata_columns]

In [65]:
# Display the pseudobulk sample annotation (one row per donor / cell type pair).
pb_coldata

Unnamed: 0,donor_id,Primary.Genetic.Diagnosis,disease,colname,cell_type_uni,counts
0,D1,control,normal,D1_Adipo,Adipo,58
1,D2,control,normal,D2_Adipo,Adipo,72
2,D4,control,normal,D4_Adipo,Adipo,112
3,D5,control,normal,D5_Adipo,Adipo,66
4,D6,control,normal,D6_Adipo,Adipo,25
...,...,...,...,...,...,...
673,H84,TPM1,non-compaction cardiomyopathy,H84_vSMCs,vSMCs,203
674,IC_H01,TTN,dilated cardiomyopathy,IC_H01_vSMCs,vSMCs,110
675,IC_H02,TTN,dilated cardiomyopathy,IC_H02_vSMCs,vSMCs,202
676,IC_H03,DSP,dilated cardiomyopathy,IC_H03_vSMCs,vSMCs,38


In [66]:
# Persist the pseudobulk sample annotation next to the expression table.
pb_coldata.to_csv(path_or_buf="./data_DCMACM_Science/pb_coldata.csv")