In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

Load anndata

In [2]:
adata = sc.read_h5ad("test.h5ad")
# lognorm should be the default (expected)
adata.X = adata.layers['logcounts']

Parameters

In [None]:
groupby='label'

In [None]:
de_method='wilcoxon'

In [None]:
resource_name = 'consensus'

In [None]:
complex_policy = 'min'

In [None]:
complex_cols = ['ligand_means' , 'receptor_means'] # specific for every method

In [None]:
add_cols = ['ligand', 'receptor'] # additional columns to return

In [None]:
key_cols = ['source', 'target', 'ligand_complex', 'receptor_complex']

In [None]:
verbose = False

In [None]:
# need to figure out when to define this
relevant_cols = key_cols + complex_cols + add_cols

In [None]:
# only relevant if perms are done
n_perms = 1000
seed = 69

Run LIANA PIPE

In [3]:
from liana import liana_pipe

In [None]:
lr_res = liana_pipe(adata=adata, 
                    groupby=groupby, 
                    resource_name=resource_name,
                    de_method=de_method,
                    n_perms=n_perms,
                    seed=seed,
                    verbose=verbose,
                    _key_cols=key_cols,
                    _complex_cols=complex_cols,
                    _add_cols=add_cols,
                    resource=None)

In [None]:
lr_res

#### LIANA PIPE LINE BY LINE

In [None]:
from liana.utils.pre import check_mat, check_if_covered, format_vars, filter_resource
from liana.resource import select_resource
from liana.resource.select_resource import explode_complexes
from liana.steady.liana_pipe import _get_lr
from liana.utils.reassemble_complexes import reassemble_complexes
from scipy.sparse import csr_matrix
from liana.scores import get_means_perms

I need to double check if every step makes sense, as I build unit tests!!!

For example, are there any duplicated rows (source, target, ligand_complex, receptor_complex)

Double check again if correct subunit is kept, etc

In [None]:
adata.X = check_mat(adata.X, True)

In [None]:
# Define idents col name
adata.obs.label = adata.obs[groupby]

Load resource

In [None]:
resource = select_resource(resource_name='consensus')

In [None]:
# Decomplexify
resource = explode_complexes(resource)

In [None]:
# Filter Resource
resource = filter_resource(resource, adata.var_names)

Create entities

In [None]:
entities = np.union1d(np.unique(resource["ligand"]), np.unique(resource["receptor"]))

Check overlap between resource and adata

In [None]:
check_if_covered(entities, adata.var_keys)

Get global mean for SCA before filtering

In [None]:
if 'mat_mean' in add_cols: # SHOULD BE METHOD NAME!
    adata.uns['mat_mean'] = np.mean(adata.X)

In [None]:
# Filter to only include the relevant genes
adata = adata[:,np.intersect1d(entities, adata.var.index)]
adata

Get Stats

In [None]:
lr_res = _get_lr(adata, resource, key_cols + complex_cols + add_cols, de_method)

In [None]:
lr_res

Recomplexify

In [None]:
# If I want to implement anything else but the min as complex policy, I would need to change this:
# temp = temp[[x=="INHBA_INHBB" for x in temp['ligand_complex']]].sort_values('receptor_complex')
# temp = temp[[x=="ACVR1B_ACVR2A" for x in temp['receptor_complex']]]

In [None]:
lr_res = reassemble_complexes(lr_res, key_cols, complex_cols, complex_policy)

### SCORES

CellPhoneDB re-implement

In [9]:
from statsmodels.distributions.empirical_distribution import ECDF

In [10]:
def _simple_mean(x, y): return(x + y)/2

def _cpdb_score(x, perms, ligand_pos, receptor_pos, labels_pos):
    if((x.ligand_means==0) | (x.receptor_means==0)): return 1

    # Permutations lr mean
    ligand_perms = perms[:, labels_pos[x.source], ligand_pos[x.ligand]]
    receptor_perms = perms[:, labels_pos[x.target], receptor_pos[x.receptor]]
    lr_perms = _simple_mean(ligand_perms, receptor_perms)
    
    # actual lr_mean
    lr_mean = _simple_mean(x.ligand_means, x.receptor_means)
    
    return (lr_mean, (1 - ECDF(lr_perms)(lr_mean)))

SCORE CLASS

In [11]:
class ScoreClass:
    def __init__(self, method_name, complex_cols, add_cols, fun, magnitude, specificity, permute, reference):
        self.method_name = method_name # method name
        self.complex_cols = complex_cols # complex-relevant columns
        self.add_colls = add_cols # additional columns
        self.fun = fun # Function to run
        self.magnitude = magnitude # Name of the col
        self.specificity = specificity # Name of the col
        self.permute = permute # True/False
        self.reference = reference # Publication
    
    # describe self
    def describe(self):
        print(f"{self.method_name} uses `{self.magnitude}` and `{self.specificity}` as measures of expression strength and interaction specificity, respectively")
    
    def reference(self):
        print(reference)

In [12]:
cpdb_score = ScoreClass(method_name = "cellphonedb",
                        complex_cols = ['ligand_means', 'receptor_means'], 
                        add_cols = ['ligand', 'receptor'],
                        fun = _cpdb_score,
                        magnitude = 'lr_means',
                        specificity = 'pvals',
                        permute = True,
                        reference = 'Efremova et al., 2020')

In [13]:
def cellphonedb(adata, groupby, resource_name='consensus', resource=None, n_perms=1000, seed=69, de_method='wilcoxon', verbose=False):
    
    adata.uns['liana_res'] = liana_pipe(adata=adata,
                                        groupby=groupby,
                                        resource_name=resource_name,
                                        resource=resource,
                                        de_method=de_method,
                                        verbose=verbose,
                                        _complex_cols=cpdb_score.complex_cols,
                                        _add_cols=cpdb_score.add_colls,
                                        _score = cpdb_score,
                                        n_perms = n_perms,
                                        seed=seed,
                                       )
    
    return adata

In [22]:
%%time
adata = cellphonedb(adata, groupby='label', n_perms=1000)

CPU times: user 3.79 s, sys: 54.3 ms, total: 3.84 s
Wall time: 3.73 s


try w/ real data

In [30]:
adata = sc.datasets.pbmc3k_processed()
labels = adata.obs.louvain
cells = adata.obs_names

In [31]:
adata = sc.datasets.pbmc3k()

In [41]:
adata = adata[[x in cells for x in adata.obs_names]]

In [43]:
adata.obs['label'] = labels

  adata.obs['label'] = labels


In [45]:
adata = cellphonedb(adata, groupby='label', n_perms=1000)

ValueError: Data matrix has wrong shape (2638, 16579), need to be (2638, 32738).

In [None]:
# ScoreClass(method_name='cellphonedb',
#           method_cols = ['ligand_means', 'receptor_means'],
#           score_fun = )

In [None]:
if cpdb_score.permute:
    perms, ligand_pos, receptor_pos, labels_pos = get_means_perms(adata=adata, lr_res=lr_res, n_perms=1000, seed=69)

In [None]:
%%time
lr_res[[cpdb_score.magnitude, cpdb_score.specificity]] = lr_res.apply(cpdb_score.fun, axis=1, result_type="expand", perms=perms, ligand_pos=ligand_pos, receptor_pos=receptor_pos, labels_pos=labels_pos)

In [None]:
lr_res

In [None]:
lr_res

Vectorize this thing

In [None]:
def _cpdb_score(x):
    if((x.ligand_means==0) | (x.receptor_means==0)): return 1

    # Permutations lr mean
    ligand_perms = perms[:, labels_pos[x.source], ligand_pos[x.ligand]]
    receptor_perms = perms[:, labels_pos[x.target], receptor_pos[x.receptor]]
    lr_perms = _simple_mean(ligand_perms, receptor_perms)
    
    # actual lr_mean
    lr_mean = _simple_mean(x.ligand_means, x.receptor_means)
    
    return (lr_mean, (1 - ECDF(lr_perms)(lr_mean)))

Calculate means_sums for NATMI

In [None]:
def _sum_means(lr_res, what, on):
    return lr_res.join(lr_res.groupby(on)[what].sum(), on=on, rsuffix='_sums')

In [None]:
lr_res = _sum_means(lr_res, what='ligand_means', on=['ligand_complex', 'receptor_complex', 'target'])
lr_res = _sum_means(lr_res, what='receptor_means', on=['ligand_complex', 'receptor_complex', 'source'])

In [None]:
lr_res

NATMI fun

Both magnitude and specificity! Returned to unpack!!!

In [None]:
def _natmi_score(x):
    lig = (x.ligand_means / x.ligand_means_sums)
    rec = (x.receptor_means / x.receptor_means_sums)
    return  lig * rec

In [None]:
lr_res['edge_specificity'] = lr_res.apply(_natmi_score, axis=1)

In [None]:
lr_res.sort_values('edge_specificity', ascending=False)

SCA re-implement

return Specificity as None

In [None]:
def _sca_score(x):
    lr_sqrt = np.sqrt(x.ligand_means) * np.sqrt(x.receptor_means)
    return lr_sqrt / (lr_sqrt + x.mat_mean)

In [None]:
lr_res['lrscore'] = lr_res.apply(_sca_score, axis=1)

In [None]:
lr_res.sort_values('lrscore', ascending=False)

In [None]:
lr_res[(lr_res.ligand_complex=='TGFB1') & (lr_res.receptor_complex=='ACVR1_TGFBR1_TGFBR2')].sort_values('lrscore', ascending=False)

logFC re-implement

Magnitude as None

In [None]:
lr_res['logfc'] = lr_res[['ligand_logfoldchanges', 'receptor_logfoldchanges']].mean(1)

In [None]:
lr_res.sort_values(by='logfc', key=abs, ascending=False)

Re-implement Connectome

Both Specificity and Magnitude

In [None]:
lr_res['edge_weight'] = lr_res[['ligand_zscores', 'receptor_zscores']].mean(1)