# Process Carraro et al.

Note: This dataset has a severe batch effect (by lab); it should simply be discarded.

## Set up Env

In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

In [2]:
from prep_utils import filter_samples, filter_celltypes, check_group_balance

In [3]:
# --- Dataset identity and AnnData column names ---
dataset = "carraro"        # basename of the .h5ad files read and written below
groupby = "major"          # obs column used as the cell-type grouping
sample_key = "orig.ident"  # obs column identifying each sample
condition_key = "type"     # obs column holding the condition label
batch_key = "lab"          # obs column identifying the batch (lab)

# --- Sample-level filtering thresholds (passed to filter_samples) ---
min_cells_per_sample = 700
sample_zcounts_min, sample_zcounts_max = -2, 3

# --- Cell-type-level filtering thresholds (passed to filter_celltypes) ---
# NOTE: min_cells is also reused as the threshold for sc.pp.filter_genes.
min_samples = 5  # min number of samples that pass the threshold per cell type
min_cells = 20   # min number of cells per cell type

## Preprocess

### Load data

In [4]:
# Read the raw AnnData object for this dataset from the local data directory
# and display its summary.
raw_path = os.path.join('data', f"{dataset}.h5ad")
adata = sc.read_h5ad(raw_path)
adata

AnnData object with n_obs × n_vars = 40709 × 31229
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'lab', 'type', 'mutation', 'percent.mt', 'percent.rp', 'major', 'minor', 'ident'
    uns: 'X_name'

In [None]:
# Collapse obs to the unique (sample, condition) pairs, then count how many
# distinct values each of the two columns takes.
sample_condition_pairs = adata.obs[[sample_key, condition_key]].drop_duplicates()
sample_condition_pairs.nunique()

### Filter samples, cell types, and genes

In [None]:
# Filter samples with the thresholds from the config cell.
# NOTE(review): filter_samples comes from prep_utils; presumably it drops
# samples with too few cells or with z-scored counts outside
# [sample_zcounts_min, sample_zcounts_max] — confirm against the helper.
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

### Check Group balance

In [None]:
# Check (and, per the printed log, rebalance) the condition groups; the helper
# returns the AnnData object that replaces `adata`.
# NOTE(review): the saved output below reports a 'Status' column, while
# condition_key here is 'type' — the output may be stale; re-run to confirm.
adata = check_group_balance(adata, condition_key, sample_key)

Groups are imbalanced!
Status
Control    10
ILD        18
Name: Status, dtype: int64
Balancing groups...


In [None]:
# Filter cell types using the min_cells / min_samples thresholds from the
# config cell (see the comments there for their meaning).
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [None]:
# Drop genes detected in fewer than `min_cells` cells. The filter modifies
# `adata` in place (inplace=True is scanpy's default, spelled out here).
sc.pp.filter_genes(adata, min_cells=min_cells, inplace=True)
adata

### Normalize

In [None]:
# Keep a copy of the current matrix in the 'counts' layer BEFORE normalizing:
# the two scanpy calls below operate on adata.X in place.
# NOTE: this cell is not idempotent — re-running it would overwrite 'counts'
# with already-normalized values and normalize X a second time.
adata.layers['counts'] = adata.X.copy()
# Scale each cell to a total of 1e4, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### Add Metadata

In [None]:
# Record the column-name configuration inside the object so it travels with
# the saved file. Entries are assigned one by one, in the same order as before.
uns_metadata = {
    'sample_key': sample_key,
    'batch_key': batch_key,
    'condition_key': condition_key,
    'groupby': groupby,
}
for uns_name, uns_value in uns_metadata.items():
    adata.uns[uns_name] = uns_value


In [None]:
# Sanity check: all four metadata entries must be present in adata.uns.
required_uns_keys = {'sample_key', 'batch_key', 'condition_key', 'groupby'}
assert required_uns_keys.issubset(adata.uns_keys())

## Run LIANA

In [None]:
import liana as li

In [None]:
# Run LIANA's rank_aggregate method separately for each sample, grouping cells
# by `groupby` and reading from adata.X (use_raw=False).
# NOTE(review): n_perms=None presumably skips the permutation step — confirm
# against the LIANA documentation.
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    use_raw=False,
    sample_key=sample_key,
    verbose=True,
    n_perms=None,
)

### Write

In [None]:
# Persist the processed object so the later sections can reload it from disk.
# Create data/interim up front so the write does not fail on a fresh checkout
# where the directory is missing; exist_ok makes this a no-op otherwise.
interim_dir = os.path.join('data', 'interim')
os.makedirs(interim_dir, exist_ok=True)
adata.write_h5ad(os.path.join(interim_dir, f"{dataset}_processed.h5ad"))

## Run Dimensionality Reductions

In [None]:
import os
import scanpy as sc
from classify_utils import dim_reduction_pipe

In [None]:
# Re-open the processed file written in the previous section. backed='r' keeps
# the file open read-only instead of loading it fully into memory.
processed_path = os.path.join('data', 'interim', f"{dataset}_processed.h5ad")
adata = sc.read_h5ad(processed_path, backed='r')

In [None]:
# Run the dimensionality-reduction pipeline from classify_utils.
# NOTE(review): the classifier section reads data/results/{dataset}_dimred.h5ad,
# so this call presumably writes that file — confirm against classify_utils.
dim_reduction_pipe(adata, dataset)

## Run Classifier

In [5]:
import os
import scanpy as sc
from classify_utils import run_classifier

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Open the dimensionality-reduction results in backed read-only mode.
dimred_path = os.path.join('data', 'results', f"{dataset}_dimred.h5ad")
adata = sc.read_h5ad(dimred_path, backed='r')

In [7]:
# Run the classifier from classify_utils on the loaded object.
# The lines it prints ("fold: [indices], [indices]") match the train_split /
# test_split columns of adata.uns['evaluate'] displayed further down.
run_classifier(adata, dataset)

0: [ 0  1  2  3  5  6  7  8 10 14 15 17 20 21], [ 4  9 11 12 13 16 18 19]
1: [ 0  4  5  7  9 10 11 12 13 14 16 17 18 19 20], [ 1  2  3  6  8 15 21]
2: [ 1  2  3  4  6  8  9 11 12 13 15 16 18 19 21], [ 0  5  7 10 14 17 20]
0: [ 0  1  3  4  5  6  9 13 15 16 17 18 19 20], [ 2  7  8 10 11 12 14 21]
1: [ 0  1  2  4  5  7  8  9 10 11 12 14 15 16 21], [ 3  6 13 17 18 19 20]
2: [ 2  3  6  7  8 10 11 12 13 14 17 18 19 20 21], [ 0  1  4  5  9 15 16]




0: [ 0  2  4  5  7  8 11 15 16 17 18 19 20 21], [ 1  3  6  9 10 12 13 14]
1: [ 1  3  4  5  6  9 10 11 12 13 14 15 16 18 20], [ 0  2  7  8 17 19 21]
2: [ 0  1  2  3  6  7  8  9 10 12 13 14 17 19 21], [ 4  5 11 15 16 18 20]
0: [ 0  1  2  5  6  7  9 13 14 15 16 17 19 21], [ 3  4  8 10 11 12 18 20]
1: [ 0  2  3  4  7  8  9 10 11 12 13 14 17 18 20], [ 1  5  6 15 16 19 21]
2: [ 1  3  4  5  6  8 10 11 12 15 16 18 19 20 21], [ 0  2  7  9 13 14 17]
0: [ 0  1  5  6  9 10 11 12 14 15 16 17 18 21], [ 2  3  4  7  8 13 19 20]
1: [ 0  1  2  3  4  6  7  8  9 10 11 13 17 19 20], [ 5 12 14 15 16 18 21]
2: [ 2  3  4  5  7  8 12 13 14 15 16 18 19 20 21], [ 0  1  6  9 10 11 17]


In [8]:
# Display the per-fold evaluation table (auroc, f1_score, oob_score, ...).
# NOTE(review): presumably stored in adata.uns by run_classifier — confirm.
adata.uns['evaluate']

Unnamed: 0,reduction_name,score_key,state,fold,auroc,tpr,fpr,f1_score,oob_score,train_split,test_split,test_classes,dataset
0,mofa,expr_prod,0,0,0.666667,"[0.0, 0.0, 1.0, 1.0]","[0.0, 0.3333333333333333, 0.3333333333333333, ...",0.708333,0.571429,"[0, 1, 2, 3, 5, 6, 7, 8, 10, 14, 15, 17, 20, 21]","[4, 9, 11, 12, 13, 16, 18, 19]","[1, 0, 0, 1, 1, 0, 1, 1]",carraro
1,tensor,expr_prod,0,0,0.666667,"[0.0, 0.16666666666666666, 0.3333333333333333,...","[0.0, 0.0, 0.0, 0.5, 0.5, 1.0]",0.642857,0.357143,"[0, 1, 2, 3, 5, 6, 7, 8, 10, 14, 15, 17, 20, 21]","[4, 9, 11, 12, 13, 16, 18, 19]","[1, 0, 0, 1, 1, 0, 1, 1]",carraro
2,mofa,lr_logfc,0,0,0.600000,"[0.0, 0.2, 0.4, 0.4, 0.6, 0.6, 0.8, 0.8, 1.0]","[0.0, 0.0, 0.0, 0.3333333333333333, 0.33333333...",0.480769,0.357143,"[0, 1, 2, 3, 5, 6, 7, 8, 10, 14, 15, 17, 20, 21]","[4, 9, 11, 12, 13, 16, 18, 19]","[1, 0, 0, 1, 1, 0, 1, 1]",carraro
3,tensor,lr_logfc,0,0,0.916667,"[0.0, 0.16666666666666666, 0.8333333333333334,...","[0.0, 0.0, 0.0, 0.5, 0.5, 1.0]",0.642857,0.642857,"[0, 1, 2, 3, 5, 6, 7, 8, 10, 14, 15, 17, 20, 21]","[4, 9, 11, 12, 13, 16, 18, 19]","[1, 0, 0, 1, 1, 0, 1, 1]",carraro
4,mofa,lr_means,0,0,0.466667,"[0.0, 0.0, 0.6, 0.6, 0.8, 0.8, 1.0]","[0.0, 0.3333333333333333, 0.3333333333333333, ...",0.416667,0.714286,"[0, 1, 2, 3, 5, 6, 7, 8, 10, 14, 15, 17, 20, 21]","[4, 9, 11, 12, 13, 16, 18, 19]","[1, 0, 0, 1, 1, 0, 1, 1]",carraro
...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,tensor,lr_probs,4,2,0.125000,"[0.0, 0.0, 0.0, 0.3333333333333333, 0.66666666...","[0.0, 0.25, 0.75, 0.75, 1.0, 1.0]",0.285714,0.733333,"[2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 16, 18, 19,...","[0, 1, 6, 9, 10, 11, 17]","[0, 1, 1, 0, 0, 0, 1]",carraro
176,mofa,lrscore,4,2,0.250000,"[0.0, 0.0, 0.0, 1.0, 1.0]","[0.0, 0.5, 0.75, 0.75, 1.0]",0.257143,0.533333,"[2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 16, 18, 19,...","[0, 1, 6, 9, 10, 11, 17]","[0, 1, 1, 0, 0, 0, 1]",carraro
177,tensor,lrscore,4,2,0.166667,"[0.0, 0.0, 0.0, 0.3333333333333333, 0.33333333...","[0.0, 0.25, 0.5, 0.5, 1.0, 1.0]",0.257143,0.600000,"[2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 16, 18, 19,...","[0, 1, 6, 9, 10, 11, 17]","[0, 1, 1, 0, 0, 0, 1]",carraro
178,mofa,magnitude_rank,4,2,0.833333,"[0.0, 0.3333333333333333, 0.6666666666666666, ...","[0.0, 0.0, 0.0, 0.5, 0.5, 1.0]",0.514286,0.533333,"[2, 3, 4, 5, 7, 8, 12, 13, 14, 15, 16, 18, 19,...","[0, 1, 6, 9, 10, 11, 17]","[0, 1, 1, 0, 0, 0, 1]",carraro


In [None]:
# Display the 'auc' table ordered by score, reduction method, and fold.
# NOTE(review): the executed cell above reads adata.uns['evaluate'], and this
# cell has not been run — confirm that an 'auc' entry actually exists.
auc_table = adata.uns['auc']
auc_table.sort_values(by=['score_key', 'reduction_name', 'fold'])