# Process Kuppe et al. data

## Set up Env

In [4]:
import gc
import os

import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix

In [5]:
import liana as li

In [6]:
# load muon and mofax
import muon as mu
import mofax as mofa

In [7]:
from prep_utils import filter_samples, filter_celltypes

### Load data

Params

In [8]:
# --- dataset name and the .obs columns used throughout the notebook ---
dataset = "kuppe"
sample_key, condition_key, batch_key = "sample", "patient_group", "sex"
groupby = "cell_type"

# --- sample-level QC thresholds passed to filter_samples ---
# (presumably bounds on per-sample z-scored counts — see prep_utils to confirm)
min_cells_per_sample = 700
sample_zcounts_min, sample_zcounts_max = -2, 3

# --- cell-type filtering thresholds passed to filter_celltypes ---
min_cells = 10   # min number of cells per cell type
min_samples = 5  # min number of samples that pass the threshold per cell type

In [9]:
# Open the input file in backed (read-only) mode. The file name is derived from
# `dataset` so it stays consistent with the `{dataset}_processed.h5ad` output
# written later in this notebook.
adata = sc.read_h5ad(os.path.join('data', f"{dataset}.h5ad"), backed='r')

In [10]:
# Keep only cells from the two patient groups of interest. `Series.isin` is a
# vectorised membership test (replacing a per-row Python loop), and the column
# is looked up through `condition_key` instead of a second hard-coded name.
keep_groups = ['ischemic', 'myogenic']
msk = adata.obs[condition_key].isin(keep_groups).to_numpy()
adata = adata[msk]

In [11]:
# NOTE: the analysis starts from `.raw` rather than the main matrix
# (presumably `.raw` holds the unnormalised counts — TODO confirm); library-size
# normalisation and log1p are applied later in this notebook.
adata = adata.raw.to_adata()
# Make sure X is stored as a SciPy CSR sparse matrix.
adata.X = csr_matrix(adata.X)

In [12]:
# Use gene symbols as the variable names.
# Going through the `var_names` setter (rather than assigning `var.index`
# directly) lets AnnData validate the new index. Casting to `str` avoids ending
# up with a categorical index if 'feature_name' is stored as a categorical
# column (NOTE(review): typical for CELLxGENE exports — confirm).
adata.var_names = adata.var['feature_name'].astype(str)

Filter samples, cell types, and genes

In [13]:
# Sample-level QC via the project helper in prep_utils, using the thresholds
# from the Params cell. (Presumably drops samples with too few cells or outlying
# counts — see prep_utils for the exact rule.)
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

               sample
patient_group        
ischemic           10
myogenic           13


In [14]:
# Cell-type filtering via the project helper in prep_utils, using `min_cells`
# and `min_samples` from the Params cell.
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [15]:
# Drop genes detected in fewer than `min_cells_per_gene` cells.
# (Normalisation is not done here; it happens in the "Normalize" section.)
min_cells_per_gene = 30
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)
adata



AnnData object with n_obs × n_vars = 165129 × 26305
    obs: 'sample', 'n_counts', 'n_genes', 'percent_mito', 'doublet_score', 'dissociation_score', 'cell_type_original', 'patient_region_id', 'donor_id', 'patient_group', 'major_labl', 'final_cluster', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_name', 'feature_reference', 'feature_biotype', 'n_cells'
    uns: 'X_approximate_distribution', 'batch_condition', 'cell_type_colors', 'cell_type_original_colors', 'default_embedding', 'schema_version', 'title'
    obsm: 'X_harmony', 'X_pca', 'X_umap'

In [16]:
# Force a garbage-collection pass after the filtering steps above, which each
# rebound `adata` to a new object.
# (`gc` is imported with the other modules in the first import cell.)
gc.collect()

2789

##### Abbreviate cell types

In [17]:
# Full cell-type label -> short abbreviation used for the `cell_abbr` column.
cell_type_abbreviations = dict([
    ("neuronal receptor cell", "NRC"),
    ("mast cell", "MC"),
    ("cardiac muscle myoblast", "CMM"),
    ("smooth muscle myoblast", "SMM"),
    ("pericyte", "PC"),
    ("lymphoid lineage restricted progenitor cell", "LLRPC"),
    ("immature innate lymphoid cell", "IILC"),
    ("fibroblast of cardiac tissue", "FCT"),
    ("cardiac endothelial cell", "CEC"),
    ("adipocyte of epicardial fat of left ventricle", "AEFLV"),
    ("native cell", "NC"),
])

In [18]:
# Recode the 'cell_type' column into short labels stored in 'cell_abbr'.
# `Series.replace` on a categorical column is deprecated in recent pandas, so
# categorical data is recoded with `cat.rename_categories` (labels missing from
# the mapping pass through unchanged, as they did with `replace`). Plain
# object/string columns keep using `replace`.
cell_types = adata.obs['cell_type']
if isinstance(cell_types.dtype, pd.CategoricalDtype):
    adata.obs['cell_abbr'] = cell_types.cat.rename_categories(cell_type_abbreviations)
else:
    adata.obs['cell_abbr'] = cell_types.replace(cell_type_abbreviations)

In [19]:
# From here on, group cells by the abbreviated labels in 'cell_abbr'.
groupby = "cell_abbr"

### Normalize

In [20]:
# Library-size normalise each cell to a total of 1e4 counts, then apply log1p.
# Both calls modify `adata` in place (their return values are not used).
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### Run LIANA

In [21]:
# Run LIANA's rank_aggregate method once per sample, grouping cells by
# `groupby`. `use_raw=False` points it at `adata.X`, which was normalised and
# log-transformed in the previous section.
# NOTE(review): `n_perms=None` presumably disables permutation testing —
# confirm against the LIANA documentation.
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    sample_key=sample_key,
    use_raw=False,
    n_perms=None,
    verbose=True,
)

Now running: CK376: 100%|██████████| 23/23 [04:37<00:00, 12.06s/it]


### Add Metadata & Write

In [22]:
# Record which .obs columns hold the sample, batch and condition labels, so the
# written file carries them in .uns.
for uns_name, obs_column in (
    ('sample_key', sample_key),
    ('batch_key', batch_key),
    ('condition_key', condition_key),
):
    adata.uns[uns_name] = obs_column

In [23]:
# Sanity check: all three metadata keys must be present in .uns before writing.
required_uns_keys = {'sample_key', 'batch_key', 'condition_key'}
assert required_uns_keys.issubset(adata.uns_keys())

In [24]:
# Persist the processed object as data/<dataset>_processed.h5ad.
processed_path = os.path.join('data', f"{dataset}_processed.h5ad")
adata.write_h5ad(processed_path)

## Classify

In [25]:
import scanpy as sc
from classify_utils import classifier_pipe

In [26]:
# Re-open the processed file in backed (read-only) mode for classification.
processed_path = os.path.join('data', f"{dataset}_processed.h5ad")
adata = sc.read_h5ad(processed_path, backed='r')



In [27]:
# Run the classification pipeline from classify_utils on the processed data.
# NOTE(review): judging by the log below, it builds views for several score
# keys and runs MOFA and a tensor factorization for each; the results are later
# read from adata.uns['auc'] — confirm in classify_utils.
classifier_pipe(adata, dataset)

Creating views with: lr_means


100%|██████████| 121/121 [00:05<00:00, 22.83it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=428 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=419 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=497 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=477 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=447 features.

100%|██████████| 23/23 [02:00<00:00,  5.24s/it]


Running Tensor Factorization
Creating views with: expr_prod


100%|██████████| 121/121 [00:07<00:00, 16.46it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=428 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=419 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=497 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=477 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=447 features.

100%|██████████| 23/23 [02:03<00:00,  5.39s/it]


Running Tensor Factorization
Creating views with: lr_logfc


100%|██████████| 121/121 [00:05<00:00, 24.07it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=428 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=419 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=497 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=477 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=447 features.

100%|██████████| 23/23 [01:51<00:00,  4.86s/it]


Running Tensor Factorization
Creating views with: expr_prod


100%|██████████| 121/121 [00:05<00:00, 24.17it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=428 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=419 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=497 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=477 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=447 features.

100%|██████████| 23/23 [02:01<00:00,  5.28s/it]


Running Tensor Factorization
Creating views with: lrscore


100%|██████████| 121/121 [00:04<00:00, 27.37it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=428 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=419 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=497 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=477 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=447 features.

100%|██████████| 23/23 [01:56<00:00,  5.05s/it]


Running Tensor Factorization
Creating views with: lr_probs


100%|██████████| 121/121 [00:06<00:00, 18.95it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=275 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=266 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=334 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=292 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=292 features.

100%|██████████| 23/23 [02:06<00:00,  5.51s/it]


Running Tensor Factorization
Creating views with: magnitude_rank


100%|██████████| 121/121 [00:04<00:00, 25.70it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='CMM&NRC' group='group1' with N=23 samples and D=399 features...
Loaded view='NRC&CMM' group='group1' with N=23 samples and D=388 features...
Loaded view='CMM&CMM' group='group1' with N=23 samples and D=476 features...
Loaded view='NC&CMM' group='group1' with N=23 samples and D=439 features...
Loaded view='CMM&SMM' group='group1' with N=23 samples and D=417 features.

100%|██████████| 23/23 [02:04<00:00,  5.41s/it]


Running Tensor Factorization


In [32]:
# Show the per-fold classification results, lowest AUC first.
# (Expects 'auc' to be present in adata.uns — presumably set by classifier_pipe.)
adata.uns['auc'].sort_values(by='auc')

Unnamed: 0,reduction_name,score_key,fold,auc,tpr,fpr,train_split,test_split,dataset
51,tensor,lr_probs,0,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, 15, 1...","[4, 7, 12, 17, 21]",kuppe
48,mofa,lrscore,4,0.5,"[0.0, 0.5, 0.5, 1.0]","[0.0, 0.0, 1.0, 1.0]","[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1, 16, 20, 22]",kuppe
49,tensor,lrscore,4,0.5,"[0.0, 0.5, 0.5, 1.0]","[0.0, 0.0, 1.0, 1.0]","[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1, 16, 20, 22]",kuppe
50,mofa,lr_probs,0,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, 15, 1...","[4, 7, 12, 17, 21]",kuppe
52,mofa,lr_probs,1,0.5,"[0.0, 1.0]","[0.0, 1.0]","[0, 1, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 18...","[2, 10, 11, 13, 14]",kuppe
...,...,...,...,...,...,...,...,...,...
20,mofa,lr_logfc,0,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 13, 14, 15, 1...","[4, 7, 12, 17, 21]",kuppe
19,tensor,expr_prod,4,1.0,"[0.0, 0.5, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1, 16, 20, 22]",kuppe
18,mofa,expr_prod,4,1.0,"[0.0, 0.5, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[1, 16, 20, 22]",kuppe
24,mofa,lr_logfc,2,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[1, 2, 3, 4, 7, 9, 10, 11, 12, 13, 14, 16, 17,...","[0, 5, 6, 8, 15]",kuppe
