In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

Load anndata

In [2]:
adata = sc.read_h5ad("test.h5ad")
# lognorm should be the default (expected)
adata.X = adata.layers['logcounts']

Parameters

In [3]:
groupby='label'

In [4]:
de_method='wilcoxon'

In [5]:
resource_name = 'consensus'

In [6]:
complex_policy = 'min'

In [7]:
complex_cols = ['ligand_means' , 'receptor_means'] # specific for every method

In [8]:
add_cols = ['ligand', 'receptor'] # additional columns to return

In [9]:
key_cols = ['source', 'target', 'ligand_complex', 'receptor_complex']

In [10]:
verbose = False

In [11]:
# need to figure out when to define this
relevant_cols = key_cols + complex_cols + add_cols

Run LIANA PIPE

In [None]:
from liana import liana_pipe

In [None]:
lr_res = liana_pipe(adata=adata, 
                    groupby=groupby, 
                    resource_name=resource_name,
                    de_method=de_method,
                    verbose=verbose,
                    _key_cols=key_cols,
                    _complex_cols=complex_cols,
                    _add_cols=add_cols,
                    complex_policy=complex_policy,
                    resource=None)

#### LIANA PIPE LINE BY LINE

In [12]:
from liana.utils.pre import check_mat, check_if_covered, format_vars, filter_resource
from liana.resource import select_resource
from liana.resource.select_resource import explode_complexes
from liana.steady.liana_pipe import _get_lr
from liana.utils.reassemble_complexes import reassemble_complexes
from scipy.sparse import csr_matrix
from statsmodels.distributions.empirical_distribution import ECDF

While building the unit tests, I need to double-check that every step makes sense.

For example, are there any duplicated rows for the key columns (source, target, ligand_complex, receptor_complex)?

Also double-check that the correct subunit is kept for each complex, etc.

In [13]:
adata.X = check_mat(adata.X, True)

Converting mat to CSR format


In [14]:
# Define idents col name
# Use item assignment: attribute assignment (`adata.obs.label = ...`) can only
# overwrite a column that already exists and never creates a new one, so it
# would silently do nothing useful whenever `groupby` is not already 'label'.
adata.obs['label'] = adata.obs[groupby]

Load resource

In [15]:
resource = select_resource(resource_name='consensus')

In [16]:
# Decomplexify
resource = explode_complexes(resource)

In [17]:
# Filter Resource
resource = filter_resource(resource, adata.var_names)

Create entities

In [18]:
entities = np.union1d(np.unique(resource["ligand"]), np.unique(resource["receptor"]))

Check overlap between resource and adata

In [19]:
# Compare the resource entities against the gene names of the dataset.
# `adata.var_keys` is a method (uncalled here) rather than the gene names;
# `adata.var_names` is what the preceding `filter_resource` call uses as well.
check_if_covered(entities, adata.var_names)

Get global mean for SCA before filtering

In [20]:
# NOTE(review): `add_cols` as configured in the parameter cells is
# ['ligand', 'receptor'], so this branch is skipped and `adata.uns['mat_mean']`
# is never set, while `_sca_score` further down reads `x.mat_mean`.
if 'mat_mean' in add_cols: # SHOULD BE METHOD NAME!
    # mean taken over the whole expression matrix (no axis given)
    adata.uns['mat_mean'] = np.mean(adata.X)

In [21]:
# Filter to only include the relevant genes
adata = adata[:,np.intersect1d(entities, adata.var.index)]
adata

View of AnnData object with n_obs × n_vars = 90 × 375
    obs: 'label'
    uns: 'X_name', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'seurat_annotations'
    layers: 'logcounts'

Get Stats

In [22]:
lr_res = _get_lr(adata, resource, key_cols + complex_cols + add_cols, de_method)

  self.data[key] = value
  next(self.gen)


In [23]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,ligand,receptor
0,B,B,LGALS9,PTPRC,0.374320,0.761533,LGALS9,PTPRC
1,B,B,CD22,PTPRC,0.554265,0.761533,CD22,PTPRC
2,B,B,LGALS1,PTPRC,0.611330,0.761533,LGALS1,PTPRC
3,B,B,LGALS9,CD44,0.374320,0.655391,LGALS9,CD44
4,B,B,ADAM10,CD44,0.119797,0.655391,ADAM10,CD44
...,...,...,...,...,...,...,...,...
505,NK,NK,CALM1,KCNN4,2.516959,0.000000,CALM1,KCNN4
506,NK,NK,PTPN6,CD300LF,0.652221,0.000000,PTPN6,CD300LF
507,NK,NK,NUCB2,ERAP1,0.156951,0.162055,NUCB2,ERAP1
508,NK,NK,SOCS2,EPOR,0.230338,0.000000,SOCS2,EPOR


Recomplexify

In [None]:
# If I want to implement anything else but the min as complex policy, I would need to change this:
# temp = temp[[x=="INHBA_INHBB" for x in temp['ligand_complex']]].sort_values('receptor_complex')
# temp = temp[[x=="ACVR1B_ACVR2A" for x in temp['receptor_complex']]]

In [24]:
lr_res = reassemble_complexes(lr_res, key_cols, complex_cols, complex_policy)

  lr_res = lr_res.obj.merge(cols_dict[col], left_on=left_on,


### SCORES

In [None]:
class ScoreClass:
    """Plain container describing one scoring method.

    Every constructor argument is stored unchanged on the instance under the
    same name: `method_name`, `method_cols`, `score_fun`, `magnitude`,
    `specificity`, `permute` and `reference`.
    """

    def __init__(self, method_name, method_cols, score_fun, magnitude, specificity, permute, reference):
        # identification of the method and the columns it works with
        self.method_name, self.method_cols = method_name, method_cols
        # callable (or placeholder) that computes the score
        self.score_fun = score_fun
        # names of the magnitude / specificity scores reported by the method
        self.magnitude, self.specificity = magnitude, specificity
        # remaining settings, kept as given
        self.permute, self.reference = permute, reference

    def description(self):
        """Print a one-line summary of the method's specificity and magnitude scores."""
        summary = f"{self.method_name} uses {self.specificity} and {self.magnitude}"
        print(summary)

In [None]:
# ScoreClass(method_name='cellphonedb',
#           method_cols = ['ligand_means', 'receptor_means'],
#           score_fun = )

CellPhoneDB re-implement

In [79]:
# NOTE(review): `_get_means_perms` is defined in a later cell of this notebook,
# so on a fresh kernel this cell raises NameError unless that definition has
# been executed first.
perms, ligand_pos, receptor_pos, labels_pos = _get_means_perms(adata=adata, lr_res=lr_res, n_perms=1000, seed=69)

In [81]:
# NOTE(review): `_build_ecdf` is defined in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError unless that definition has been
# executed first.
# Each row yields two values, expanded into the `lr_means` and `pvals` columns.
lr_res[['lr_means', 'pvals']] = lr_res.apply(_build_ecdf, axis=1, result_type="expand")

In [25]:
def _simple_mean(x, y): return(x + y)/2

def _build_ecdf(x):
    if((x.ligand_means==0) | (x.receptor_means==0)): return 1

    # Permutations lr mean
    ligand_perms = perms[:, labels_pos[x.source], ligand_pos[x.ligand]]
    receptor_perms = perms[:, labels_pos[x.target], receptor_pos[x.receptor]]
    lr_perms = _simple_mean(ligand_perms, receptor_perms)
    
    # actual lr_mean
    lr_mean = _simple_mean(x.ligand_means, x.receptor_means)
    
    return (lr_mean, (1 - ECDF(lr_perms)(lr_mean)))

Permutations (independent function)

In [84]:
def _get_means_perms(adata, lr_res, n_perms, seed):
    # initialize rng
    rng = np.random.default_rng(seed=seed)
    
    # define labels and dict
    labels = adata.obs.label.cat.categories
    labels_dict = {label:adata.obs.label.isin([label]) for label in labels}
    
    # indexes to be shuffled
    idx = np.arange(adata.X.shape[0])
    
    # Perm should be a cube /w dims (nperms x idents x ngenes)
    perms = np.zeros((n_perms, labels.shape[0], adata.shape[1]))
    
    # Assign permuted matrix
    for perm in range(n_perms):
        perm_idx = rng.permutation(idx)
        perm_mat = adata.X[perm_idx].copy()
        # populate matrix /w permuted means
        for cind in range(labels.shape[0]):
            perms[perm, cind] = perm_mat[labels_dict[labels[cind]]].mean(0)
    
    # Get indeces for each gene and label in the permutations
    ligand_pos = { entity:np.where(adata.var_names==entity)[0][0] for entity in lr_res['ligand'] }
    receptor_pos = { entity:np.where(adata.var_names==entity)[0][0] for entity in lr_res['receptor'] }
    labels_pos = { labels[pos]:pos for pos in range(labels.shape[0]) }
    
    return perms, ligand_pos, receptor_pos, labels_pos

In [82]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,ligand,receptor,lr_means,pvals
0,B,B,LGALS9,PTPRC,0.374320,0.761533,LGALS9,PTPRC,0.567927,0.890
1,B,B,CD22,PTPRC,0.554265,0.761533,CD22,PTPRC,0.657899,0.647
2,B,B,LGALS1,PTPRC,0.611330,0.761533,LGALS1,PTPRC,0.686431,1.000
3,B,B,LGALS9,CD44,0.374320,0.655391,LGALS9,CD44,0.514855,0.163
4,B,B,ADAM10,CD44,0.119797,0.655391,ADAM10,CD44,0.387594,0.613
...,...,...,...,...,...,...,...,...,...,...
4282,NK,NK,CALM1,KCNN4,2.516959,0.000000,CALM1,KCNN4,1.000000,1.000
4283,NK,NK,PTPN6,CD300LF,0.652221,0.000000,PTPN6,CD300LF,1.000000,1.000
4284,NK,NK,NUCB2,ERAP1,0.156951,0.162055,NUCB2,ERAP1,0.159503,0.066
4285,NK,NK,SOCS2,EPOR,0.230338,0.000000,SOCS2,EPOR,1.000000,1.000


Calculate means_sums for NATMI

In [None]:
def _sum_means(lr_res, what, on):
    return lr_res.join(lr_res.groupby(on)[what].sum(), on=on, rsuffix='_sums')

In [None]:
lr_res = _sum_means(lr_res, what='ligand_means', on=['ligand_complex', 'receptor_complex', 'target'])
lr_res = _sum_means(lr_res, what='receptor_means', on=['ligand_complex', 'receptor_complex', 'source'])

In [None]:
lr_res

NATMI scoring function

In [None]:
def _natmi_score(x):
    lig = (x.ligand_means / x.ligand_means_sums)
    rec = (x.receptor_means / x.receptor_means_sums)
    return  lig * rec

In [None]:
lr_res['edge_specificity'] = lr_res.apply(_natmi_score, axis=1)

In [None]:
lr_res.sort_values('edge_specificity', ascending=False)

SCA re-implement

In [None]:
def _sca_score(x):
    lr_sqrt = np.sqrt(x.ligand_means) * np.sqrt(x.receptor_means)
    return lr_sqrt / (lr_sqrt + x.mat_mean)

In [None]:
lr_res['lrscore'] = lr_res.apply(_sca_score, axis=1)

In [None]:
lr_res.sort_values('lrscore', ascending=False)

In [None]:
lr_res[(lr_res.ligand_complex=='TGFB1') & (lr_res.receptor_complex=='ACVR1_TGFBR1_TGFBR2')].sort_values('lrscore', ascending=False)

logFC re-implement

In [None]:
lr_res['logfc'] = lr_res[['ligand_logfoldchanges', 'receptor_logfoldchanges']].mean(1)

In [None]:
lr_res.sort_values(by='logfc', key=abs, ascending=False)

Re-implement Connectome

In [None]:
lr_res['edge_weight'] = lr_res[['ligand_zscores', 'receptor_zscores']].mean(1)