# Process Full Dataset

In [1]:
import os
import sys
import numpy as np
import pandas as pd

import scanpy as sc
import liana as li
from utils import filter_samples


In [2]:
groupby = 'cell_type_original' # NOTE: the same cell types but readable
sample_key = 'sample'
condition_key = 'patient_group'
min_cells_per_sample = 1000
sample_zcounts_max = 3
sample_zcounts_min = -2
min_cells = 20
min_samples = 5
change_var_to = 'feature_name'

In [3]:
resource = pd.read_csv(os.path.join('results', 'lr_pairs.csv'))
cell_pairs = pd.read_csv(os.path.join('results', 'cell_pairs.csv'))

In [4]:
exact_abbreviations = {
    'Adipocyte': 'AD',
    'Cardiomyocyte': 'CM',
    'Endothelial': 'EN',
    'Fibroblast': 'FB',
    'Pericyte': 'PC',
    'Proliferating': 'PR',
    'Vascular_SMCs': 'VM',
    'Neuronal': 'NE',
    'Myeloid': 'MY',
    'Mast': 'MA',
    'Lymphoid': 'LY',
    'Cycling cells': 'CC'
}

Filter

In [5]:
adata = sc.read_h5ad(os.path.join('..' ,'classification', 'data', 'kuppe.h5ad'), backed='r')
adata.obs[[condition_key, sample_key]].drop_duplicates().groupby(condition_key).count()
# remap
remap = {'vSMCs':'Vascular_SMCs'}
adata.obs[groupby].replace(remap, inplace=True)
adata.obs[groupby]
adata = adata.to_memory().raw.to_adata()
adata = filter_samples(adata,
                        sample_key = sample_key,
                        condition_key = condition_key,
                        min_cells_per_sample= min_cells_per_sample,
                        sample_zcounts_max = sample_zcounts_max,
                        sample_zcounts_min = sample_zcounts_min)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




               sample
patient_group        
fibrotic            5
ischemic           10
myogenic           13




In [6]:
sc.pp.filter_genes(adata, min_cells=min_cells)
adata.var.index = adata.var[change_var_to]
adata.layers['counts'] = adata.X.copy()
# Normalize
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)



In [7]:
# abbrevaite
adata.obs[groupby] = adata.obs[groupby].replace(exact_abbreviations)



In [8]:
adata.obs[groupby] = adata.obs[groupby].astype('str')

LIANA

In [9]:
# write to file
li.mt.rank_aggregate.by_sample(adata, 
                               groupby=groupby,
                               use_raw=False,
                               sample_key=sample_key,
                               expr_prop=0.05,
                               verbose=True,
                               n_perms=None,
                               resource=resource[['ligand', 'receptor']], # TODO: would only work with this -> make sure it simply selects ligand-receptor columns
                               groupby_pairs=cell_pairs,
                               )

AnnData expects .var.index to contain strings, but got values like:
    ['CSTF2T', 'ALDH1A2', 'SNRPG', 'ACADSB', 'LILRA6']

    Inferred to be: categorical

AnnData expects .var.index to contain strings, but got values like:
    ['CSTF2T', 'ALDH1A2', 'SNRPG', 'ACADSB', 'LILRA6']

    Inferred to be: categorical

AnnData expects .var.index to contain strings, but got values like:
    ['CSTF2T', 'ALDH1A2', 'SNRPG', 'ACADSB', 'LILRA6']

    Inferred to be: categorical

AnnData expects .var.index to contain strings, but got values like:
    ['CSTF2T', 'ALDH1A2', 'SNRPG', 'ACADSB', 'LILRA6']

    Inferred to be: categorical

AnnData expects .var.index to contain strings, but got values like:
    ['CSTF2T', 'ALDH1A2', 'SNRPG', 'ACADSB', 'LILRA6']

    Inferred to be: categorical

AnnData expects .var.index to contain strings, but got values like:
    ['CSTF2T', 'ALDH1A2', 'SNRPG', 'ACADSB', 'LILRA6']

    Inferred to be: categorical

AnnData expects .var.index to contain strings, but got val

In [10]:
adata.write_h5ad(os.path.join('results', 'kuppe_processed.h5ad'))

... storing 'cell_type_original' as categorical


In [11]:
adata.uns['liana_res'].to_csv(os.path.join('..', '..', 'figures', 'source', 'SuppDataFig3_LRs.csv'))