# Process Kuppe et al. data

## Set up Env

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix, issparse

In [2]:
import liana as li

Params

In [3]:
# TODO (original note): move these dataset-specific settings to a common CSV.

# --- dataset identity and the obs columns used throughout the notebook ---
dataset = 'kuppe'
sample_key = 'sample'
groupby = 'cell_type'
batch_key = 'sex'
condition_key = 'patient_group'

# --- sample-level thresholds handed to filter_samples ---
# NOTE(review): presumably a minimum cell count per sample and z-score bounds
# on per-sample counts — confirm against prep_utils.filter_samples.
min_cells_per_sample = 700
sample_zcounts_min, sample_zcounts_max = -2, 3

# --- cell-type / gene filtering parameters ---
# min number of cells per cell type; the same value is also passed as the
# `min_cells` threshold of sc.pp.filter_genes further down.
min_cells = 20
# min number of samples that pass the threshold per cell type
min_samples = 5

# --- loading options ---
use_raw = True
change_var_to = 'feature_name'
conditions_to_keep = ['ischemic', 'myogenic']

In [4]:
from prep_utils import filter_samples, filter_celltypes, check_group_balance

### Load data

In [5]:
# Open the dataset's .h5ad file from the local `data` directory.
# The file name is derived from `dataset` (instead of a hard-coded literal) so
# the input path stays consistent with the f"{dataset}_..." output paths used
# later in this notebook.
# backed='r' opens the file in read-only backed mode rather than loading it
# fully into memory.
adata = sc.read_h5ad(os.path.join('data', f"{dataset}.h5ad"), backed='r')

In [6]:
# Keep only the cells whose `condition_key` label is listed in
# `conditions_to_keep`; setting `conditions_to_keep = None` skips the filter.
if conditions_to_keep is not None:
    # Vectorized membership test: avoids a Python-level loop over every cell
    # and always yields a boolean mask, even when `obs` is empty.
    msk = adata.obs[condition_key].isin(conditions_to_keep).values
    adata = adata[msk]

In [7]:
# When `use_raw` is set, replace `adata` with the AnnData built from its
# `.raw` attribute.
# NOTE(review): assumes `adata.raw` is populated (presumably with the raw
# counts) — confirm before reusing this cell for other datasets.
if use_raw:
    adata = adata.raw.to_adata()

In [8]:
# Convert X to a SciPy CSR matrix when it is not already a SciPy sparse
# matrix; an X that is already sparse (in any format) is left as is.
if not issparse(adata.X):
    adata.X = csr_matrix(adata.X)

In [9]:
# change to gene symbols: re-index `adata.var` by its `change_var_to` column
# ('feature_name' in the parameter cell); `change_var_to = None` keeps the
# existing index.
# NOTE(review): presumably the symbols may contain duplicates — confirm
# whether the index needs to be made unique.
if change_var_to is not None:
    adata.var.index = adata.var[change_var_to]

Filter samples

In [10]:
# Sample-level filtering through the project helper, driven by the thresholds
# from the parameter cell.
# NOTE(review): the exact criteria live in prep_utils.filter_samples.
adata = filter_samples(
    adata,
    sample_key,
    condition_key,
    min_cells_per_sample,
    sample_zcounts_max,
    sample_zcounts_min,
)

               sample
patient_group        
ischemic           10
myogenic           13


### Check Group balance

In [11]:
# Run the project helper on the condition and sample columns and keep the
# AnnData it returns.
# NOTE(review): presumably this may subset samples when groups are
# unbalanced — confirm in prep_utils.check_group_balance.
adata = check_group_balance(adata, condition_key, sample_key)

Groups are balanced!


Filter cell types and genes

In [12]:
# Cell-type filtering through the project helper, using `min_cells` and
# `min_samples` from the parameter cell.
adata = filter_celltypes(
    adata=adata,
    groupby=groupby,
    sample_key=sample_key,
    min_cells=min_cells,
    min_samples=min_samples,
)

In [13]:
# Remove genes expressed in fewer than `min_cells` cells, then display the
# filtered AnnData. (Normalization is not done here; it happens in the
# "Normalize" cell further down.)
sc.pp.filter_genes(adata, min_cells=min_cells)
adata



AnnData object with n_obs × n_vars = 164831 × 26806
    obs: 'sample', 'n_counts', 'n_genes', 'percent_mito', 'doublet_score', 'dissociation_score', 'cell_type_original', 'patient_region_id', 'donor_id', 'patient_group', 'major_labl', 'final_cluster', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_name', 'feature_reference', 'feature_biotype', 'n_cells'
    uns: 'X_approximate_distribution', 'batch_condition', 'cell_type_colors', 'cell_type_original_colors', 'default_embedding', 'schema_version', 'title'
    obsm: 'X_harmony', 'X_pca', 'X_umap'

In [15]:
# Remove genes expressed in few cells
# NOTE(review): this repeats the filter_genes call of the previous cell with
# the same threshold; the recorded output shows the same 164831 × 26806 shape,
# so this cell looks redundant — confirm before removing.
sc.pp.filter_genes(adata, min_cells=min_cells)
adata

AnnData object with n_obs × n_vars = 164831 × 26806
    obs: 'sample', 'n_counts', 'n_genes', 'percent_mito', 'doublet_score', 'dissociation_score', 'cell_type_original', 'patient_region_id', 'donor_id', 'patient_group', 'major_labl', 'final_cluster', 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_name', 'feature_reference', 'feature_biotype', 'n_cells'
    uns: 'X_approximate_distribution', 'batch_condition', 'cell_type_colors', 'cell_type_original_colors', 'default_embedding', 'schema_version', 'title'
    obsm: 'X_harmony', 'X_pca', 'X_umap'

### Normalize

In [17]:
# Keep a copy of the current X (presumably the raw counts) in the 'counts'
# layer, because the next two calls overwrite X in place.
adata.layers['counts'] = adata.X.copy()
# Scale every cell to a total of 1e4, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

### Add Metadata

In [18]:
# Record which obs columns play which role, so the keys travel with the
# AnnData object in `uns`.
for uns_name, uns_value in (
    ('sample_key', sample_key),
    ('batch_key', batch_key),
    ('condition_key', condition_key),
    ('groupby', groupby),
):
    adata.uns[uns_name] = uns_value


In [19]:
# Sanity check: all four role keys must be present in `adata.uns`.
assert {'sample_key', 'batch_key', 'condition_key', 'groupby'}.issubset(adata.uns_keys())

## Run LIANA

In [20]:
import liana as li

In [21]:
# Run LIANA's rank_aggregate once per sample, grouping cells by `groupby`.
# use_raw=False makes it read from `adata.X` rather than `adata.raw`.
# NOTE(review): n_perms=None presumably disables the permutation step —
# confirm in the LIANA documentation.
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    use_raw=False,
    sample_key=sample_key,
    verbose=True,
    n_perms=None,
)

Now running: CK376: 100%|██████████| 23/23 [05:28<00:00, 14.28s/it]


### Write

In [22]:
# Persist the processed object to data/interim/<dataset>_processed.h5ad.
# The output directory is created first so that the write cannot fail on a
# missing folder after the long LIANA run above.
interim_dir = os.path.join('data', 'interim')
os.makedirs(interim_dir, exist_ok=True)
adata.write_h5ad(os.path.join(interim_dir, f"{dataset}_processed.h5ad"))

## Run Dimensionality Reductions

In [25]:
import os
import scanpy as sc
from classify_utils import dim_reduction_pipe



In [None]:
# Re-open the processed file written in the "Write" step, in read-only backed
# mode.
# NOTE(review): `dataset` is defined in the parameter cell at the top; it must
# be set again if this section is run from a fresh kernel.
adata = sc.read_h5ad(os.path.join('data', 'interim', f"{dataset}_processed.h5ad"), backed='r')

In [None]:
# Run the project's dimensionality-reduction pipeline for this dataset.
# NOTE(review): use_gpu=True presumably requires a GPU, and the helper
# presumably writes data/results/<dataset>_dimred.h5ad, which the classifier
# section reads — confirm in classify_utils.dim_reduction_pipe.
dim_reduction_pipe(adata, dataset, use_gpu=True)

## Run Classifier

In [None]:
import os
import scanpy as sc
from classify_utils import run_classifier

In [None]:
# Open data/results/<dataset>_dimred.h5ad in read-only backed mode.
# NOTE(review): with backed='r', results that later cells add to `adata.uns`
# presumably stay in memory only and are not saved to this file — confirm.
adata = sc.read_h5ad(os.path.join('data', 'results', f"{dataset}_dimred.h5ad"), backed='r')

In [None]:
# Run the project's classifier on the reduced data.
# NOTE(review): presumably this fills adata.uns['evaluate'] and
# adata.uns['auc'], which the next cells display — confirm in
# classify_utils.run_classifier.
run_classifier(adata, dataset)

In [None]:
# Display the 'evaluate' entry stored in `adata.uns`.
adata.uns['evaluate']

In [None]:
# Display the 'auc' table from `adata.uns`, ordered by score key, then
# reduction name, then fold.
adata.uns['auc'].sort_values(by=['score_key', 'reduction_name', 'fold'])

##### Abbreviate Cell types

In [None]:
# cell_type_abbreviations = {'neuronal receptor cell': 'NRC',
#     'mast cell': 'MC',
#     'cardiac muscle myoblast': 'CMM',
#     'smooth muscle myoblast': 'SMM',
#     'pericyte': 'PC',
#     'lymphoid lineage restricted progenitor cell': 'LLRPC',
#     'immature innate lymphoid cell': 'IILC',
#     'fibroblast of cardiac tissue': 'FCT',
#     'cardiac endothelial cell': 'CEC',
#     'adipocyte of epicardial fat of left ventricle': 'AEFLV',
#     'native cell': 'NC'
# }
# # use the replace() method to recode the cell_type column
# adata.obs[groupby] = adata.obs[groupby].replace(cell_type_abbreviations)