# Process Habermann et al.

## Set up Env

In [1]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix

In [2]:
import liana as li

In [3]:
from prep_utils import filter_samples, filter_celltypes, map_gene_symbols, check_group_balance

In [4]:
# --- Dataset and annotation columns ---
dataset = "habermann"  # base name of the .h5ad file read from data/
condition_key = "Status"  # obs column holding the condition label
sample_key = "Sample_Name"  # obs column identifying each sample
batch_key = "Sample_Source"  # obs column recorded as the batch in adata.uns
groupby = "celltype"  # obs column holding the cell-type label

# --- Sample-level QC thresholds (passed to filter_samples) ---
min_cells_per_sample = 700
sample_zcounts_min = -2
sample_zcounts_max = 3

# --- Cell-type filtering thresholds (min_cells is also reused for gene filtering) ---
min_samples = 5  # min number of samples that pass the threshold per cell type
min_cells = 20  # min number of cells per cell type

## Preprocess

### Load data

In [5]:
# Open the dataset in backed read-only mode ('r'); the AnnData object stays
# attached to the file on disk until it is explicitly loaded into memory.
adata = sc.read_h5ad(
    os.path.join('data', f"{dataset}.h5ad"),
    backed='r',
)
adata

AnnData object with n_obs × n_vars = 114396 × 33694 backed at 'data/habermann.h5ad'
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Diagnosis', 'Sample_Name', 'Sample_Source', 'Status', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'population', 'celltype', 'ident'
    uns: 'X_name'

In [6]:
# Number of distinct samples and conditions among the unique
# (sample, condition) pairs in the cell metadata.
(
    adata.obs[[sample_key, condition_key]]
    .drop_duplicates()
    .nunique()
)

Sample_Name    30
Status          2
dtype: int64

#### Filter samples according to QC

In [7]:
# QC-filter samples; the helper prints the number of remaining samples per condition.
# NOTE(review): the thresholds are presumably a minimum cell count per sample and
# z-score bounds on per-sample counts — confirm in prep_utils.filter_samples.
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

         Sample_Name
Status              
Control           10
ILD               18


### Check Group balance

In [None]:
# Materialize the data in memory before any step that modifies it.
# The file was opened with backed='r', so the object can still be disk-backed
# (and read-only) after sample filtering even when it is not a view, e.g. if
# no sample was dropped. Checking only `is_view` would skip the load in that
# case and the in-place normalization further down would fail.
if adata.isbacked or adata.is_view:
    adata = adata.to_memory()

In [8]:
# Compare the number of samples per condition and rebalance when they differ;
# in the recorded run this reduced ILD from 18 to 10 samples to match Control.
# NOTE(review): how the removed samples are chosen is defined in
# prep_utils.check_group_balance — confirm the selection is reproducible.
adata = check_group_balance(adata, condition_key, sample_key)

Groups are imbalanced!
Status
Control    10
ILD        18
Name: Status, dtype: int64
Balancing groups...


In [9]:
# Verify the balancing: count the unique (condition, sample) pairs per condition.
# The grouping key is the full obs column, which pandas aligns on the index of
# the de-duplicated frame, so both columns appear in the resulting counts.
(
    adata.obs[[condition_key, sample_key]]
    .drop_duplicates()
    .groupby(adata.obs[condition_key])
    .count()
)

Unnamed: 0_level_0,Status,Sample_Name
Status,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,10,10
ILD,10,10


### Filter cell types and genes

In [None]:
# Filter cell types using the configured thresholds: `min_cells` cells per cell
# type in at least `min_samples` samples (per the parameter comments in the
# configuration cell).
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [None]:
# Drop genes detected in fewer than `min_cells` cells; this reuses the same
# threshold value as the cell-type filter. The trailing expression displays
# the filtered object.
sc.pp.filter_genes(
    adata,
    min_cells=min_cells,
)
adata

### Normalize

In [None]:
# Keep a copy of X as it is before normalization in the 'counts' layer;
# the two scanpy calls below overwrite X.
adata.layers['counts'] = adata.X.copy()
# Scale every cell to a total of 1e4, then apply log(1 + x).
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### Add Metadata

In [None]:
# Record which obs columns play which role, so the information travels with
# the saved object. Entries are inserted in the order listed.
for _uns_name, _uns_value in (
    ('sample_key', sample_key),
    ('batch_key', batch_key),
    ('condition_key', condition_key),
    ('groupby', groupby),
):
    adata.uns[_uns_name] = _uns_value


In [None]:
# Sanity check: all four metadata entries must be present in adata.uns.
assert {'sample_key', 'batch_key', 'condition_key', 'groupby'}.issubset(adata.uns_keys())

## Run LIANA

In [None]:
import liana as li

In [None]:
# Run LIANA's rank aggregate separately for every sample, grouping cells by
# cell type.
# NOTE(review): use_raw=False presumably makes LIANA read adata.X rather than
# adata.raw, and n_perms=None presumably skips permutation testing — confirm
# both against the LIANA documentation.
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    sample_key=sample_key,
    use_raw=False,
    n_perms=None,
    verbose=True,
)

### Write

In [None]:
# Make sure the output directory exists (it may be missing on a fresh
# checkout) so the write does not fail after the long LIANA run.
os.makedirs(os.path.join('data', 'interim'), exist_ok=True)
# Persist the processed object for the downstream sections.
adata.write_h5ad(os.path.join('data', 'interim', f"{dataset}_processed.h5ad"))

## Run Dimensionality Reductions

In [None]:
import os
import scanpy as sc
from classify_utils import dim_reduction_pipe

In [None]:
# Re-open the processed object written in the previous section, in backed
# read-only mode.
adata = sc.read_h5ad(
    os.path.join('data', 'interim', f"{dataset}_processed.h5ad"),
    backed='r',
)

In [None]:
# Run the dimensionality-reduction pipeline from classify_utils on the processed data.
# NOTE(review): presumably writes data/results/{dataset}_dimred.h5ad, which the
# classifier section loads — confirm in classify_utils.dim_reduction_pipe.
dim_reduction_pipe(adata, dataset)

## Run Classifier

In [None]:
import os
import scanpy as sc
from classify_utils import run_classifier

In [None]:
# Open the dimensionality-reduction results in backed read-only mode.
adata = sc.read_h5ad(
    os.path.join('data', 'results', f"{dataset}_dimred.h5ad"),
    backed='r',
)

In [None]:
# Run the classifier from classify_utils on the dimensionality-reduced data.
# NOTE(review): presumably stores its results in adata.uns ('evaluate', 'auc'),
# which the following cells display — confirm in classify_utils.run_classifier.
run_classifier(adata, dataset)

In [None]:
# Display the entry stored under adata.uns['evaluate'].
# NOTE(review): presumably written by run_classifier — confirm.
adata.uns['evaluate']

In [None]:
# Display the table stored under adata.uns['auc'], ordered by score key,
# reduction name and fold.
(
    adata.uns['auc']
    .sort_values(by=['score_key', 'reduction_name', 'fold'])
)