# Process Kuppe et al. data

## Set up Env

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix, issparse

In [2]:
import liana as li

In [3]:
# Short identifier of the dataset handled by this notebook.
# NOTE(review): later cells build file names from `dataset`, which is not assigned in
# any visible cell — confirm whether it is injected or should be `dataset_name`.
dataset_name = 'kuppe'

In [4]:
from prep_utils import filter_samples, filter_celltypes, check_group_balance

### Load data

In [None]:
# Load the input AnnData for the dataset named in `dataset_name`, so the file name is
# not hard-coded a second time (resolves to data/kuppe.h5ad as before).
# backed='r' opens the file read-only and keeps the matrix on disk.
# NOTE(review): later cells assign to `adata.X` and filter in place — confirm that
# this works with a read-only backed object for this dataset.
adata = sc.read_h5ad(os.path.join('data', f"{dataset_name}.h5ad"), backed='r')

In [None]:
# Optionally restrict the data to the conditions listed in `conditions_to_keep`.
if conditions_to_keep is not None:
    # Vectorised membership test on the condition column instead of a Python-level
    # `in` check per row; the result is still a boolean NumPy mask.
    msk = adata.obs[condition_key].isin(conditions_to_keep).to_numpy()
    adata = adata[msk]

In [None]:
# Swap in the object built from `adata.raw` when `use_raw` is truthy; otherwise keep
# `adata` unchanged.
adata = adata.raw.to_adata() if use_raw else adata

In [None]:
# Store the matrix as a SciPy CSR sparse matrix if it is not already sparse.
if not issparse(adata.X):
    adata.X = csr_matrix(adata.X)

In [None]:
# change to gene symbols
# When `change_var_to` is set, the `adata.var` column of that name replaces the index.
# NOTE(review): nothing here checks that the new index values are unique — confirm.
if change_var_to is not None:
    adata.var.index = adata.var[change_var_to]

Filter samples

In [None]:
# Apply `filter_samples` (from prep_utils) with the per-sample thresholds.
# NOTE(review): the filtering criteria live in prep_utils and are not visible here.
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

### Check Group balance

In [None]:
# `check_group_balance` (from prep_utils) returns the AnnData used from here on.
# NOTE(review): its exact effect is not visible in this notebook — see prep_utils.
adata = check_group_balance(adata, condition_key, sample_key)

Filter cell types and genes

In [None]:
# Apply `filter_celltypes` (from prep_utils) using the `min_cells` / `min_samples`
# thresholds for the `groupby` annotation.
# NOTE(review): the exact rule is defined in prep_utils — confirm there.
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [None]:
# Remove genes expressed in fewer than `min_cells` cells.
# (Normalization is not done here; it happens in the "Normalize" section below.)
sc.pp.filter_genes(adata, min_cells=min_cells)
# Bare expression so the notebook displays the object's summary.
adata

### Normalize

In [None]:
# Keep a copy of the current matrix in the 'counts' layer before X is transformed.
adata.layers['counts'] = adata.X.copy()
# Scale each cell to a total of 1e4, then apply a log1p transform to X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### Add Metadata

In [None]:
# Record the key names used throughout this notebook in `adata.uns`, so they stay
# attached to the object.
adata.uns['sample_key'] = sample_key
adata.uns['batch_key'] = batch_key
adata.uns['condition_key'] = condition_key
adata.uns['groupby'] = groupby


In [None]:
# Sanity check: all four metadata entries must be present in `adata.uns`.
assert {'sample_key', 'batch_key', 'condition_key', 'groupby'}.issubset(adata.uns_keys())

## Run LIANA

In [None]:
import liana as li

In [None]:
# Run LIANA's rank_aggregate once per sample, grouping cells by `groupby`.
# use_raw=False makes it work on `adata.X` rather than `adata.raw`.
# NOTE(review): n_perms=None presumably skips permutation testing — confirm in the
# LIANA documentation.
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    sample_key=sample_key,
    use_raw=False,
    n_perms=None,
    verbose=True,
)

### Write

In [None]:
# Save the processed object to data/interim/<dataset>_processed.h5ad.
# NOTE(review): `dataset` is not assigned in any visible cell (only `dataset_name`
# is) — confirm where it comes from.
adata.write_h5ad(os.path.join('data', 'interim', f"{dataset}_processed.h5ad"))

## Run Dimensionality Reductions

In [None]:
import os
import scanpy as sc
from classify_utils import dim_reduction_pipe

In [None]:
# Re-open the processed file from data/interim in read-only backed mode.
adata = sc.read_h5ad(
    os.path.join('data', 'interim', f"{dataset}_processed.h5ad"),
    backed='r',
)

In [None]:
# Run the dimensionality-reduction pipeline from classify_utils on the loaded object.
# NOTE(review): use_gpu=True is hard-coded; whether the helper needs a GPU, and where
# it writes its output, is not visible here — see classify_utils.
dim_reduction_pipe(adata, dataset, use_gpu=True)

## Run Classifier

In [None]:
import os
import scanpy as sc
from classify_utils import run_classifier

In [None]:
# Open the dimensionality-reduction output from data/results in read-only backed mode.
adata = sc.read_h5ad(
    os.path.join('data', 'results', f"{dataset}_dimred.h5ad"),
    backed='r',
)

In [None]:
# Run the classifier helper from classify_utils on the loaded object.
# NOTE(review): presumably this fills `adata.uns['evaluate']`, which the next cells
# read — confirm in classify_utils.
run_classifier(adata, dataset)

In [None]:
# Display the 'evaluate' entry stored in `adata.uns`.
adata.uns['evaluate']

In [None]:
# Display the evaluation table ordered by score key, then reduction name, then fold.
adata.uns['evaluate'].sort_values(
    by=['score_key', 'reduction_name', 'fold'],
)

##### Abbreviate Cell types

In [None]:
# cell_type_abbreviations = {'neuronal receptor cell': 'NRC',
#     'mast cell': 'MC',
#     'cardiac muscle myoblast': 'CMM',
#     'smooth muscle myoblast': 'SMM',
#     'pericyte': 'PC',
#     'lymphoid lineage restricted progenitor cell': 'LLRPC',
#     'immature innate lymphoid cell': 'IILC',
#     'fibroblast of cardiac tissue': 'FCT',
#     'cardiac endothelial cell': 'CEC',
#     'adipocyte of epicardial fat of left ventricle': 'AEFLV',
#     'native cell': 'NC'
# }
# # use the replace() method to recode the cell_type column
# adata.obs[groupby] = adata.obs[groupby].replace(cell_type_abbreviations)