# Process Full Dataset

In [1]:
import os
import sys
import numpy as np
import pandas as pd

import scanpy as sc
import liana as li

# Make the repository root and its `classification` package importable.
# Each entry is pushed to the front of sys.path, so the entry inserted
# last (`classification`) ends up with the highest priority.
_here = os.path.dirname('.')
for _parts in (('..',), ('..', 'classification')):
    sys.path.insert(0, os.path.abspath(os.path.join(_here, *_parts)))
from classification.processer import DatasetHandler
from classification.prep_utils import filter_samples




In [2]:
# Dataset params
# `dataset_name` is the single source of truth for which dataset is processed;
# the handler and the per-dataset parameter lookup are both derived from it.
dataset_name = 'kuppe'
ds = DatasetHandler(dataset_name)
dataset_info = ds.dataset_params[dataset_name]
defaults = ds.dataset_params['defaults']


def _param(key):
    """Return the dataset-specific value for ``key``, else the shared default.

    ``defaults[key]`` is evaluated unconditionally, so a key that is missing
    from ``defaults`` raises ``KeyError`` even when the dataset overrides it.
    """
    return dataset_info.get(key, defaults[key])


groupby = 'cell_type_original' # NOTE: the same cell types but readable
sample_key = _param('sample_key')
condition_key = _param('condition_key')
min_cells_per_sample = _param('min_cells_per_sample')
sample_zcounts_max = _param('sample_zcounts_max')
sample_zcounts_min = _param('sample_zcounts_min')
min_cells = _param('min_cells')
min_samples = _param('min_samples')
use_raw = _param('use_raw')
change_var_to = _param('change_var_to')
conditions_to_keep = _param('conditions_to_keep')
n_factors = _param('n_factors')

In [3]:
# Load the two tables consumed by the LIANA step further down: `resource`
# (its 'ligand' and 'receptor' columns are used) and `cell_pairs`
# (passed on as `groupby_pairs`).
lr_pairs_path = os.path.join('results', 'lr_pairs.csv')
cell_pairs_path = os.path.join('results', 'cell_pairs.csv')
resource = pd.read_csv(lr_pairs_path)
cell_pairs = pd.read_csv(cell_pairs_path)

In [4]:
# Two-letter codes for the cell-type labels, keyed by the full label.
exact_abbreviations = dict([
    ('Adipocyte', 'AD'),
    ('Cardiomyocyte', 'CM'),
    ('Endothelial', 'EN'),
    ('Fibroblast', 'FB'),
    ('Pericyte', 'PC'),
    ('Proliferating', 'PR'),
    ('Vascular_SMCs', 'VM'),
    ('Neuronal', 'NE'),
    ('Myeloid', 'MY'),
    ('Mast', 'MA'),
    ('Lymphoid', 'LY'),
    ('Cycling cells', 'CC'),
])

Filter

In [5]:
# Open the dataset in backed mode first; the expression matrix is only pulled
# into memory further down, via `to_memory()`.
adata = sc.read_h5ad(os.path.join('..', 'classification', 'data', f'{dataset_name}.h5ad'), backed='r')

# Number of distinct samples per condition *before* sample filtering.
# A bare expression in the middle of a cell is never displayed, so print it.
print('Samples per condition before filtering:')
print(adata.obs[[condition_key, sample_key]].drop_duplicates().groupby(condition_key).count())

# Harmonise cell-type labels with the names used by the abbreviation table.
# The result is assigned back explicitly: calling `.replace(..., inplace=True)`
# on `adata.obs[groupby]` is a chained in-place update that pandas warns about
# and that does not modify `adata.obs` under Copy-on-Write.
remap = {'vSMCs':'Vascular_SMCs'}
adata.obs[groupby] = adata.obs[groupby].replace(remap)

# Load everything into memory and continue with the matrix stored in `.raw`.
# NOTE(review): the `use_raw` dataset parameter is not consulted here; `.raw`
# is always used — confirm this is intended for every dataset.
adata = adata.to_memory().raw.to_adata()

# Sample-level QC using the thresholds from the dataset parameters.
adata = filter_samples(
    adata,
    sample_key=sample_key,
    condition_key=condition_key,
    min_cells_per_sample=min_cells_per_sample,
    sample_zcounts_max=sample_zcounts_max,
    sample_zcounts_min=sample_zcounts_min,
)


               sample
patient_group        
fibrotic            5
ischemic           10
myogenic           13


In [6]:
# Gene-level filter using the `min_cells` threshold from the dataset parameters.
sc.pp.filter_genes(adata, min_cells=min_cells)
# Re-index the genes by the `change_var_to` column of `adata.var`.
# NOTE(review): assumes `change_var_to` names an existing column with unique,
# non-null values — confirm against the dataset parameters.
adata.var.index = adata.var[change_var_to]
# Keep a copy of X in the 'counts' layer before X is transformed below
# (presumably raw counts, since `adata` was rebuilt from `.raw` — confirm).
adata.layers['counts'] = adata.X.copy()
# Normalize each cell to a total of 1e4, then apply log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)



In [7]:
# Abbreviate: swap full cell-type labels for their two-letter codes.
# Labels without an entry in `exact_abbreviations` are left unchanged.
cell_type_labels = adata.obs[groupby]
adata.obs[groupby] = cell_type_labels.replace(exact_abbreviations)

In [8]:
# Store the cell-type labels as plain strings.
abbreviated_labels = adata.obs[groupby]
adata.obs[groupby] = abbreviated_labels.astype('str')

LIANA

In [9]:
# Run LIANA's rank_aggregate once per sample, restricted to the cell-type
# pairs in `cell_pairs`. Nothing is written to disk by this call; the return
# value is not captured, so results are presumably stored on `adata` itself
# (confirm the key against the liana documentation).
# NOTE: would only work with this -> make sure it simply selects ligand-receptor columns
lr_resource = resource[['ligand', 'receptor']]
li.mt.rank_aggregate.by_sample(
    adata,
    groupby=groupby,
    use_raw=False,
    sample_key=sample_key,
    expr_prop=0.1,
    verbose=True,
    n_perms=None,
    resource=lr_resource,
    groupby_pairs=cell_pairs,
)

Now running: CK376: 100%|██████████| 28/28 [02:57<00:00,  6.35s/it]


In [11]:
# Persist the processed AnnData under `results/`. The file name is derived
# from `dataset_name` so it follows the dataset chosen in the parameter cell
# instead of a second hard-coded 'kuppe'.
adata.write_h5ad(os.path.join('results', f'{dataset_name}_processed.h5ad'))