# Process Carraro et al.

## Set up Env

In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

In [2]:
import liana as li

In [3]:
from prep_utils import filter_samples, filter_celltypes

In [4]:
# Dataset name; used further down when building file names.
dataset = 'carraro'

# Keys handed to the filtering, LIANA and classification helpers
# (presumably column names in `adata.obs` — TODO confirm against the data).
groupby, sample_key = 'major', 'orig.ident'
condition_key, batch_key = 'type', 'lab'

# Sample-level thresholds forwarded to `filter_samples`.
min_cells_per_sample = 700
sample_zcounts_min, sample_zcounts_max = -2, 3

# Cell-type-level thresholds forwarded to `filter_celltypes`.
min_samples = 5  # min number of samples that pass the threshold per cell type
min_cells = 10  # min number of cells per cell type

### Load data

In [None]:
# Load the raw dataset. The file name is derived from `dataset` so it stays in
# sync with the other dataset-derived paths used in this notebook.
adata = sc.read_h5ad(os.path.join('data', f"{dataset}.h5ad"))
adata

### Filter samples, cell types, and genes

In [None]:
# `filter_samples` is a project helper from `prep_utils`; presumably it drops
# samples that fail the per-sample thresholds set in the config cell — TODO confirm.
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

In [None]:
# `filter_celltypes` is a project helper from `prep_utils`; presumably it keeps
# cell types with at least `min_cells` cells in at least `min_samples` samples
# — TODO confirm against the helper.
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [None]:
# Remove genes detected in fewer than 30 cells (normalization is done in a later cell)
sc.pp.filter_genes(adata, min_cells=30)
adata

In [None]:
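# Batch correction with ComBat is currently disabled.
# NOTE(review): `sc.pp.combat` corrects `adata` in place and returns None by default,
# so if this is re-enabled, call `sc.pp.combat(adata, key=batch_key)` without
# assigning the result to `adata.X`.
# adata.X = sc.pp.combat(adata, key=batch_key)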

### Normalize

In [None]:
# Scale every cell to a total of 10,000 counts, then apply log(1 + x)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
# Run LIANA's rank_aggregate method separately for each sample.
# NOTE(review): n_perms=None presumably skips permutation-based scores — confirm in the LIANA docs.
li.mt.rank_aggregate.by_sample(
    adata,
    sample_key=sample_key,
    groupby=groupby,
    use_raw=False,
    n_perms=None,
    verbose=True,
)

In [None]:
# Persist the processed object. The name is built from `dataset` so it matches the
# f"{dataset}_processed.h5ad" pattern used when the file is read back for classification.
adata.write_h5ad(os.path.join('data', f"{dataset}_processed.h5ad"))

## Classify

In [5]:
from classify_utils import run_classifier, run_mofatalk, run_tensor_c2c, NestedDict
from sklearn.model_selection import StratifiedKFold



In [6]:
# Reload the processed dataset; backed='r' opens the file in read-only backed mode
adata = sc.read_h5ad(os.path.join('data', f"{dataset}_processed.h5ad"), backed='r')

In [7]:
# Table of the methods available in LIANA together with their score names
methods = li.mt.show_methods()
# in case a method is missing Magnitude Score, use Specificity Score
methods['score_key'] = methods["Magnitude Score"].fillna(methods["Specificity Score"])
# remove Geometric Mean method
methods = methods[methods['Method Name'] != 'Geometric Mean']
# drop duplicated scores (expr_prod for NATMI & Connectome): deduplicate on the score
# alone, because the two methods have different names and a (name, score) subset
# would keep both rows and the same score would be evaluated twice
methods = methods.drop_duplicates(subset=['score_key'])
methods = methods[['Method Name', 'score_key']]

In [8]:
# Fresh result containers in `adata.uns` (presumably filled by the run_* helpers — TODO confirm)
for _res_key in ('mofa_res', 'tensor_res'):
    adata.uns[_res_key] = NestedDict()

# Empty AUC table with one column per recorded field
auc_columns = [
    'reduction_name',
    'score_key',
    'fold',
    'auc',
    'tpr',
    'fpr',
    'train_split',
    'test_split',
]
adata.uns['auc'] = pd.DataFrame(columns=auc_columns)

# 5-fold stratified cross-validation, shuffled with a fixed seed for reproducibility
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=0)

In [9]:
# NOTE(review): only the first two score keys are evaluated here; confirm whether
# this truncation is intentional or a leftover from a quick test run.
for score_key in methods['score_key'].iloc[:2]:
    print(f"Creating views with: {score_key}")

    # Arguments shared by both dimensionality-reduction helpers
    view_kwargs = dict(
        adata=adata,
        score_key=score_key,
        sample_key=sample_key,
        condition_key=condition_key,
    )

    run_mofatalk(batch_key=batch_key, **view_kwargs)

    run_tensor_c2c(**view_kwargs)

    run_classifier(adata=adata, skf=skf, score_key=score_key)


Creating views with: lr_means


100%|██████████| 16/16 [00:00<00:00, 184.99it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='Secretory&Ciliated' group='group1' with N=22 samples and D=199 features...
Loaded view='Secretory&Secretory' group='group1' with N=22 samples and D=253 features...
Loaded view='Secretory&Basal' group='group1' with N=22 samples and D=290 features...
Loaded view='Basal&Basal' group='group1' with N=22 samples and D=387 features...
Loaded view='Basal&Secretory' group='g

100%|██████████| 22/22 [00:21<00:00,  1.04it/s]


Running Tensor Factorization
Creating views with: expr_prod


100%|██████████| 16/16 [00:00<00:00, 208.53it/s]



        #########################################################
        ###           __  __  ____  ______                    ### 
        ###          |  \/  |/ __ \|  ____/\    _             ### 
        ###          | \  / | |  | | |__ /  \ _| |_           ### 
        ###          | |\/| | |  | |  __/ /\ \_   _|          ###
        ###          | |  | | |__| | | / ____ \|_|            ###
        ###          |_|  |_|\____/|_|/_/    \_\              ###
        ###                                                   ### 
        ######################################################### 
       
 
        
Loaded view='Secretory&Ciliated' group='group1' with N=22 samples and D=199 features...
Loaded view='Secretory&Secretory' group='group1' with N=22 samples and D=253 features...
Loaded view='Secretory&Basal' group='group1' with N=22 samples and D=290 features...
Loaded view='Basal&Basal' group='group1' with N=22 samples and D=387 features...
Loaded view='Basal&Secretory' group='g

100%|██████████| 22/22 [00:22<00:00,  1.01s/it]


Running Tensor Factorization


In [11]:
# Tag every row of the AUC table with the dataset name
adata.uns['auc']['dataset'] = dataset

In [12]:
# Display the AUC table ordered from highest to lowest AUC
adata.uns['auc'].sort_values('auc', ascending=False)

Unnamed: 0,reduction_name,score_key,fold,auc,tpr,fpr,train_split,test_split,dataset
10,mofa,expr_prod,0,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 4, 6, 8, 9, 10, 12, 14, 15, 16, 1...","[5, 7, 11, 13, 19]",carraro
11,tensor,expr_prod,0,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 4, 6, 8, 9, 10, 12, 14, 15, 16, 1...","[5, 7, 11, 13, 19]",carraro
2,mofa,lr_means,1,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[2, 4, 10, 18, 20]",carraro
3,tensor,lr_means,1,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[2, 4, 10, 18, 20]",carraro
4,mofa,lr_means,2,1.0,"[0.0, 0.5, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[15, 16, 17, 21]",carraro
5,tensor,lr_means,2,1.0,"[0.0, 0.5, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[15, 16, 17, 21]",carraro
13,tensor,expr_prod,1,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[2, 4, 10, 18, 20]",carraro
8,mofa,lr_means,4,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 4, 5, 7, 10, 11, 13, 14, 15, 16, ...","[6, 8, 9, 12]",carraro
9,tensor,lr_means,4,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 2, 3, 4, 5, 7, 10, 11, 13, 14, 15, 16, ...","[6, 8, 9, 12]",carraro
12,mofa,expr_prod,1,1.0,"[0.0, 0.3333333333333333, 1.0, 1.0]","[0.0, 0.0, 0.0, 1.0]","[0, 1, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...","[2, 4, 10, 18, 20]",carraro


In [14]:
# Write the AUC table to data/results/<dataset>.csv. The output folder is created
# first so the export does not fail on a fresh checkout where it is missing.
results_dir = os.path.join('data', 'results')
os.makedirs(results_dir, exist_ok=True)
adata.uns['auc'].to_csv(os.path.join(results_dir, f'{dataset}.csv'), index=False)