In [26]:
#!/usr/bin/env python
# coding: utf-8
import sys
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import decoupler as dc
import regex as re
# Work from the directory that holds the raw datasets so that the relative
# input path used when loading the data resolves. The location can be
# overridden with the REHEAT2_RAW_DIR environment variable; when it is unset
# the original hard-coded location is used.
RAW_DIR = os.environ.get("REHEAT2_RAW_DIR", "/Volumes/RicoData2/ReHeat2/raw")
os.chdir(RAW_DIR)

In [51]:
# Load the dataset and keep each cell's barcode as a regular obs column,
# so it is still available after operations that discard the index.
sc_dat = sc.read_h5ad("./Chaffin2022_DCM/counts.h5ad")
sc_dat.obs["barcode"] = sc_dat.obs_names.values

In [52]:
# Which per-cell metadata columns are available?
sc_dat.obs.columns.tolist()

['biosample_id',
 'donor_id',
 'disease',
 'sex',
 'age',
 'lvef',
 'cell_type_leiden0.6',
 'SubCluster',
 'cellbender_ncount',
 'cellbender_ngenes',
 'cellranger_percent_mito',
 'exon_prop',
 'cellbender_entropy',
 'cellranger_doublet_scores',
 'barcode']

In [53]:
# Are biosample_id and donor_id the same thing? List the distinct
# (biosample_id, donor_id, disease) combinations to find out.
# Plan: merge over donor_id.
sc_dat.obs.loc[:, ['biosample_id', 'donor_id', 'disease']].drop_duplicates()

Unnamed: 0,biosample_id,donor_id,disease
TTCTTCCGTTCAACGT-1-0,LV_1622_2_nf,P1622,NF
ACAGCCGCAAGCGAGT-1-1,LV_1422_1_hcm,P1422,HCM
CAACCAAAGACCCGCT-1-2,LV_1722_2_hcm,P1722,HCM
CTGCTCAAGGCATCAG-1-3,LV_1462_1_hcm,P1462,HCM
AACCTGAGTGTACATC-1-4,LV_1558_2_nf,P1558,NF
...,...,...,...
CAACAGTGTACGATGG-1-75,LV_1472_1_dcm,P1472,DCM
GTCAAACCACCCTATC-1-76,LV_1735_2_hcm,P1735,HCM
ATAGACCGTCACCGAC-1-77,LV_1600_2_nf,P1600,NF
ATGCATGCATGCGTGC-1-78,LV_1606_1_dcm,P1606,DCM


In [54]:
# Number of distinct (donor_id, disease) pairs.
sc_dat.obs.loc[:, ['donor_id', 'disease']].drop_duplicates().shape

(42, 2)

In [55]:
# Number of distinct donor_id values; compare with the pair count above to
# see whether any donor carries more than one disease label.
sc_dat.obs.loc[:, ['donor_id']].drop_duplicates().shape

(42, 1)

In [56]:
# Are the values in X raw counts? Inspect the column sums of the first four
# variables.
sc_dat.X[:, :4].sum(axis=0)

matrix([[2.000e+00, 0.000e+00, 1.000e+00, 4.602e+03]], dtype=float32)

In [57]:
# Show the AnnData summary (dimensions, obs/var columns, obsm, layers).
sc_dat

AnnData object with n_obs × n_vars = 592689 × 36601
    obs: 'biosample_id', 'donor_id', 'disease', 'sex', 'age', 'lvef', 'cell_type_leiden0.6', 'SubCluster', 'cellbender_ncount', 'cellbender_ngenes', 'cellranger_percent_mito', 'exon_prop', 'cellbender_entropy', 'cellranger_doublet_scores', 'barcode'
    var: 'gene_ids', 'feature_types', 'genome'
    obsm: 'X_umap'
    layers: 'cellranger_raw'

In [58]:
# Make the object lighter by dropping the 'cellranger_raw' layer (X is kept).
# Guarded so that re-running this cell after the layer is already gone does
# not raise KeyError.
if "cellranger_raw" in sc_dat.layers:
    del sc_dat.layers["cellranger_raw"]

In [59]:
# Distinct disease labels present in the data.
sc_dat.obs.loc[:, ["disease"]].drop_duplicates()

Unnamed: 0,disease
TTCTTCCGTTCAACGT-1-0,NF
ACAGCCGCAAGCGAGT-1-1,HCM
ATAGACCAGCAGCCCT-1-8,DCM


In [60]:
# Lookup table: original disease label -> disease code and heart-failure status.
# One row per disease label found in the data.
disease_df = pd.DataFrame(
    [
        ('NF', 'NF', 'NF'),
        ('HCM', 'HCM', 'HF'),
        ('DCM', 'DCM', 'HF'),
    ],
    columns=['disease', 'disease_code', 'heart_failure'],
)

In [61]:
# Show the disease lookup table.
disease_df

Unnamed: 0,disease,disease_code,heart_failure
0,NF,NF,NF
1,HCM,HCM,HF
2,DCM,DCM,HF


In [62]:
# Look up the disease code and heart-failure status of every cell.
# The merged frame gets a fresh integer index, so the 'barcode' column is
# carried along to identify the rows afterwards.
code_columns = ["barcode", "disease_code", "heart_failure"]
new_codes = sc_dat.obs.merge(disease_df, how='left', on='disease').loc[:, code_columns]

In [63]:
# Show the per-cell codes produced by the merge.
new_codes

Unnamed: 0,barcode,disease_code,heart_failure
0,TTCTTCCGTTCAACGT-1-0,NF,NF
1,CATCCACCATCTAACG-1-0,NF,NF
2,ACCCAAACAGCTAACT-1-0,NF,NF
3,AAGGAATCAACTGGTT-1-0,NF,NF
4,TACCCGTAGCGTGCTC-1-0,NF,NF
...,...,...,...
592684,TTATTGCGTCGGTGTC-1-79,NF,NF
592685,GTCACGGGTTGTATGC-1-79,NF,NF
592686,GTCATGATCTTTCGAT-1-79,NF,NF
592687,GCGATCGTCAGAGTGG-1-79,NF,NF


In [64]:
# Index the codes by barcode and put the rows in the same order as sc_dat.obs.
new_codes = new_codes.set_index("barcode").loc[sc_dat.obs.index.values, :]

In [65]:
# Copy both code columns into obs. Raw values are assigned (not the Series),
# so the assignment is positional and relies on new_codes being in obs order.
for code_col in ("disease_code", "heart_failure"):
    sc_dat.obs[code_col] = new_codes[code_col].values

In [66]:
sc_dat.obs[["disease_code","heart_failure","disease"]].drop_duplicates()

Unnamed: 0,disease_code,heart_failure,disease
TTCTTCCGTTCAACGT-1-0,NF,NF,NF
ACAGCCGCAAGCGAGT-1-1,HCM,HF,HCM
ATAGACCAGCAGCCCT-1-8,DCM,HF,DCM


In [67]:
# Build the lookup of matched cell types, starting from the distinct
# 'cell_type_leiden0.6' labels found in the data.
available_cells = sc_dat.obs.loc[:, ["cell_type_leiden0.6"]].drop_duplicates()

In [68]:
# Show the distinct cluster labels.
available_cells

Unnamed: 0,cell_type_leiden0.6
TTCTTCCGTTCAACGT-1-0,Cardiomyocyte_I
CAGCAGCCAGTTCTAG-1-0,Adipocyte
ATTACCTGTTGTTGTG-1-0,Proliferating_macrophage
AAAGGGCTCTATTCGT-1-0,Cardiomyocyte_III
GTGACGCCATGAGATA-1-0,Macrophage
TCAGCAACATAATGAG-1-0,Endocardial
TACACCCAGTTCCATG-1-0,Fibroblast_I
AACAAAGTCATATGGC-1-0,Endothelial_III
TCCGAAATCCGCCTAT-1-0,Neuronal
CTGTGAACATACCAGT-1-0,Cardiomyocyte_II


In [69]:
# Start every cluster label as "none"; labels that still carry this
# placeholder later on are the ones no mapping rule matched.
available_cells = available_cells.assign(cell_type_uni="none")

In [70]:
# Collapse the study-specific cluster names into unified cell-type labels.
# Rules are applied in order, so a later rule overrides an earlier one when a
# label matches more than one pattern. Labels that match no rule keep
# whatever value 'cell_type_uni' already holds.
#
# Case-insensitivity is requested through pandas' own `case=` switch instead
# of `flags=re.IGNORECASE`: in this notebook `re` is bound to the third-party
# `regex` package, while pandas documents `flags` as being passed to the
# standard-library `re` module.
cell_type_rules = [
    # (substring pattern, case-sensitive?, unified label)
    ("Cardiomyocyte", True, "CM"),
    ("Macrophage", False, "Myeloid"),
    ("Fibroblast", False, "Fib"),
    ("VSMC", True, "vSMCs"),
    ("Endothelial", False, "Endo"),
    ("Mast", False, "Myeloid"),
    ("Pericyte", False, "PC"),
    ("Lymphocyte", False, "Lymphoid"),
]
for pattern, case_sensitive, unified_label in cell_type_rules:
    matches = available_cells["cell_type_leiden0.6"].str.contains(pattern, case=case_sensitive)
    available_cells.loc[matches, "cell_type_uni"] = unified_label

In [71]:
# Look up the unified cell type of every cell, then index the result by
# barcode and put the rows in the same order as sc_dat.obs.
new_cts = (
    sc_dat.obs
    .merge(available_cells, how='left', on='cell_type_leiden0.6')
    .loc[:, ["barcode", "cell_type_uni"]]
    .set_index("barcode")
    .loc[sc_dat.obs.index.values, :]
)

In [72]:
# Copy the unified labels into obs. Raw values are assigned (not the Series),
# so the assignment is positional and relies on new_cts being in obs order.
unified_labels = new_cts["cell_type_uni"].values
sc_dat.obs["cell_type_uni"] = unified_labels

In [73]:
# Verify the mapping: distinct (original label, unified label) pairs.
sc_dat.obs.loc[:, ["cell_type_leiden0.6", "cell_type_uni"]].drop_duplicates()

Unnamed: 0,cell_type_leiden0.6,cell_type_uni
TTCTTCCGTTCAACGT-1-0,Cardiomyocyte_I,CM
CAGCAGCCAGTTCTAG-1-0,Adipocyte,none
ATTACCTGTTGTTGTG-1-0,Proliferating_macrophage,Myeloid
AAAGGGCTCTATTCGT-1-0,Cardiomyocyte_III,CM
GTGACGCCATGAGATA-1-0,Macrophage,Myeloid
TCAGCAACATAATGAG-1-0,Endocardial,none
TACACCCAGTTCCATG-1-0,Fibroblast_I,Fib
AACAAAGTCATATGGC-1-0,Endothelial_III,Endo
TCCGAAATCCGCCTAT-1-0,Neuronal,none
CTGTGAACATACCAGT-1-0,Cardiomyocyte_II,CM


In [74]:
# Keep only the cells whose cluster was mapped to a unified cell type
# ("none" marks clusters that no mapping rule matched).
# The mask is taken from the 1-D column rather than from a one-column
# DataFrame, and the subset is materialised with .copy() so that the .obs
# assignments in the following cells act on a real AnnData object rather
# than on a view of the full dataset.
keep_cell = (sc_dat.obs["cell_type_uni"] != "none").to_numpy()
sc_dat = sc_dat[keep_cell, :].copy()

In [75]:
# Trim obs down to the columns that are relevant for the simplified object.
obs_columns_to_keep = [
    'donor_id', 'disease', 'sex', 'age', 'lvef',
    'cell_type_uni', 'disease_code', 'heart_failure',
]
sc_dat.obs = sc_dat.obs.loc[:, obs_columns_to_keep]

In [76]:
# Give the donor and unified cell-type columns their final names.
obs_renames = {"donor_id": "sample_id", "cell_type_uni": "cell_type"}
sc_dat.obs = sc_dat.obs.rename(columns=obs_renames)

In [77]:
# Write the simplified object to disk. The target directory can be overridden
# with the REHEAT2_SIMPLIFIED_DIR environment variable; when it is unset the
# original hard-coded location is used.
SIMPLIFIED_DIR = os.environ.get("REHEAT2_SIMPLIFIED_DIR", "/Volumes/RicoData2/ReHeat2/simplified")
sc_dat.write_h5ad(os.path.join(SIMPLIFIED_DIR, "Chaffin2022_DCM.h5ad"))