In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from statsmodels.distributions.empirical_distribution import ECDF

Load anndata and resource

In [2]:
# adata = sc.datasets.pbmc3k()
# adata.X

In [3]:
adata = sc.read_h5ad("../test.h5ad")

In [4]:
adata.X = csr_matrix(adata.X.astype(np.int64))

In [5]:
adata.X

<90x10249 sparse matrix of type '<class 'numpy.int64'>'
	with 103311 stored elements in Compressed Sparse Row format>

Process resource

In [6]:
resource = pd.read_csv("../consensus.csv", index_col=False)
resource = resource[['source_genesymbol', 'target_genesymbol']]
resource = resource.rename(columns={'source_genesymbol':'ligand',
                                    'target_genesymbol':'receptor'})

In [7]:
resource = pd.read_csv("../consensus.csv", index_col=False)
resource = resource[['source_genesymbol', 'target_genesymbol']]
resource = resource.rename(columns={'source_genesymbol':'ligand',
                                    'target_genesymbol':'receptor'})
resource['interaction'] = resource['ligand'] + '|' + resource['receptor']

# Decomplexify
resource = (resource.set_index('interaction')
            .apply(lambda x: x.str.split('_'))
            .explode(['receptor'])
            .explode('ligand')
            .reset_index()
           )

resource[['ligand_complex','receptor_complex']] = resource['interaction'].str.split('|',expand=True)
resource


Unnamed: 0,interaction,ligand,receptor,ligand_complex,receptor_complex
0,LGALS9|PTPRC,LGALS9,PTPRC,LGALS9,PTPRC
1,LGALS9|MET,LGALS9,MET,LGALS9,MET
2,LGALS9|CD44,LGALS9,CD44,LGALS9,CD44
3,LGALS9|LRP1,LGALS9,LRP1,LGALS9,LRP1
4,LGALS9|CD47,LGALS9,CD47,LGALS9,CD47
...,...,...,...,...,...
5853,BMP2|ACTR2,BMP2,ACTR2,BMP2,ACTR2
5854,BMP15|ACTR2,BMP15,ACTR2,BMP15,ACTR2
5855,CSF1|CSF3R,CSF1,CSF3R,CSF1,CSF3R
5856,IL36G|IFNAR1,IL36G,IFNAR1,IL36G,IFNAR1


In [8]:
ligands = np.unique(resource["ligand"])
receptors = np.unique(resource["receptor"])
entities = np.union1d(ligands, receptors)

In [9]:
labels = adata.obs.label.cat.categories

In [10]:
labels

Index(['B', 'CD8 T', 'NK'], dtype='object')

Process adata

In [11]:
# adata.layers['counts'] = adata.X

In [12]:
# lognorm should be the default (expected)
adata.X = adata.layers['logcounts']
adata.layers['scaled'] = sc.pp.scale(adata, copy=True).X

In [13]:
# Get global mean for SCA before filtering
mat_mean = np.mean(adata.X)

In [14]:
# Filter to only include the relevant genes
adata = adata[:,np.intersect1d(entities, adata.var.index)]
adata

View of AnnData object with n_obs × n_vars = 90 × 553
    obs: 'label'
    uns: 'X_name', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'seurat_annotations'
    layers: 'logcounts', 'scaled'

In [15]:
resource = resource[(np.isin(resource.ligand, adata.var_names)) & (np.isin(resource.receptor, adata.var_names))]

In [16]:
# Only keep interactions /w complexes for which all subunits are present
temp_res = resource[['_' in x for x in resource['interaction']]].copy()
temp_res['interaction'] = temp_res['ligand_complex'] + '_' + resource['receptor_complex']
# Get those not with all subunits
temp_res = temp_res[np.logical_not([ all([x in adata.var_names for x in entity.split('_')]) for entity in temp_res.interaction])]
temp_res['interaction'] = temp_res['ligand_complex'] + '|' + resource['receptor_complex']

In [17]:
# filter them
resource = resource[~resource.interaction.isin(temp_res.interaction)]

### Stats

In [18]:
sc.tl.rank_genes_groups(adata, 'label', method='wilcoxon')

  self.data[key] = value
  next(self.gen)


In [19]:
dedict = {label:sc.get.rank_genes_groups_df(adata, label).assign(label=label).sort_values('names') for label in labels}

Calculate Mean, Sum and z-scores by group

In [20]:
# check if all is gucci
list(adata.var_names) == list(dedict['B']['names'])

True

In [21]:
for label in labels: 
    temp = adata[adata.obs.label.isin([label])].copy()
    # dedict[label]['sums'] = temp.X.sum(0)
    dedict[label]['means'] = temp.X.mean(0).A.flatten()
    dedict[label]['zscores'] = temp.layers['scaled'].mean(0)

Join Means

In [22]:
pairs = pd.DataFrame(np.array(np.meshgrid(labels, labels)).reshape(2, np.size(labels) * np.size(labels)).T).rename(columns={0: "source", 1: "target"})

In [23]:
def join_means(source, target):
    source_stats = dedict[source].copy()
    source_stats.columns = source_stats.columns.map(lambda x: 'ligand_' + str(x))
    source_stats = source_stats.rename(columns={'ligand_names':'ligand', 'ligand_label':'source'})
    
    target_stats = dedict[target].copy()
    target_stats.columns = target_stats.columns.map(lambda x: 'receptor_' + str(x))
    target_stats = target_stats.rename(columns={'receptor_names':'receptor', 'receptor_label':'target'})
    
    bound = resource.merge(source_stats).merge(target_stats)
    
    return bound

In [24]:
lr_res = pd.concat([join_means(source, target) for source, target in zip(pairs['source'], pairs['target'])])

In [25]:
lr_res['mat_mean'] = mat_mean

In [26]:
lr_res

Unnamed: 0,interaction,ligand,receptor,ligand_complex,receptor_complex,ligand_scores,ligand_logfoldchanges,ligand_pvals,ligand_pvals_adj,source,ligand_means,ligand_zscores,receptor_scores,receptor_logfoldchanges,receptor_pvals,receptor_pvals_adj,target,receptor_means,receptor_zscores,mat_mean
0,LGALS9|PTPRC,LGALS9,PTPRC,LGALS9,PTPRC,1.433668,2.440976,0.151667,0.823608,B,0.374320,0.377836,-3.440802,-1.272587,0.000580,0.008909,B,0.761533,-0.415538,0.199056
1,CD22|PTPRC,CD22,PTPRC,CD22,PTPRC,2.730388,3.604738,0.006326,0.063285,B,0.554265,0.618197,-3.440802,-1.272587,0.000580,0.008909,B,0.761533,-0.415538,0.199056
2,LGALS1|PTPRC,LGALS1,PTPRC,LGALS1,PTPRC,-2.935809,-2.024960,0.003327,0.036073,B,0.611330,-0.460597,-3.440802,-1.272587,0.000580,0.008909,B,0.761533,-0.415538,0.199056
3,LGALS9|CD44,LGALS9,CD44,LGALS9,CD44,1.433668,2.440976,0.151667,0.823608,B,0.374320,0.377836,-0.419401,-0.198308,0.674923,0.960181,B,0.655391,-0.051623,0.199056
4,ADAM10|CD44,ADAM10,CD44,ADAM10,CD44,0.089872,-0.020321,0.928389,0.974192,B,0.119797,-0.002707,-0.419401,-0.198308,0.674923,0.960181,B,0.655391,-0.051623,0.199056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,CALM1|KCNN4,CALM1,KCNN4,CALM1,KCNN4,3.560631,1.149412,0.000370,0.006200,NK,2.516959,0.526745,-0.385164,-25.897530,0.700116,0.969362,NK,0.000000,-0.183226,0.199056
506,PTPN6|CD300LF,PTPN6,CD300LF,PTPN6,CD300LF,-0.813125,-0.725591,0.416146,0.969362,NK,0.652221,-0.199005,-0.128388,-24.651196,0.897842,0.969362,NK,0.000000,-0.105409,0.199056
507,NUCB2|ERAP1,NUCB2,ERAP1,NUCB2,ERAP1,0.141227,0.482593,0.887691,0.969362,NK,0.156951,0.065837,0.654780,2.871128,0.512610,0.969362,NK,0.162055,0.282299,0.199056
508,SOCS2|EPOR,SOCS2,EPOR,SOCS2,EPOR,0.684737,1.819167,0.493510,0.969362,NK,0.230338,0.243138,-0.128388,-24.620756,0.897842,0.969362,NK,0.000000,-0.105409,0.199056


Recomplexify

In [27]:
# def recomplexify(lr_res, grps, complex_cols, complex_policy = 'min'):


# def _reduce_complexes():
    

In [28]:
complex_policy = 'min'

In [29]:
grps = ['source', 'target', 'ligand_complex', 'receptor_complex']

In [30]:
complex_cols = ['ligand_means' , 'receptor_means'] # specific for every method

In [31]:
add_cols = ['mat_mean', 'ligand', 'receptor']

In [32]:
# subset /w only means here - to be extended to all columns
lr_res = lr_res[ grps + complex_cols + add_cols ]

In [33]:
# temp = temp[[x=="INHBA_INHBB" for x in temp['ligand_complex']]].sort_values('receptor_complex')
# temp = temp[[x=="ACVR1B_ACVR2A" for x in temp['receptor_complex']]]

In [34]:
lr_res = lr_res.groupby(grps)

In [35]:
lr_res.obj

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand,receptor
0,B,B,LGALS9,PTPRC,0.374320,0.761533,0.199056,LGALS9,PTPRC
1,B,B,CD22,PTPRC,0.554265,0.761533,0.199056,CD22,PTPRC
2,B,B,LGALS1,PTPRC,0.611330,0.761533,0.199056,LGALS1,PTPRC
3,B,B,LGALS9,CD44,0.374320,0.655391,0.199056,LGALS9,CD44
4,B,B,ADAM10,CD44,0.119797,0.655391,0.199056,ADAM10,CD44
...,...,...,...,...,...,...,...,...,...
505,NK,NK,CALM1,KCNN4,2.516959,0.000000,0.199056,CALM1,KCNN4
506,NK,NK,PTPN6,CD300LF,0.652221,0.000000,0.199056,PTPN6,CD300LF
507,NK,NK,NUCB2,ERAP1,0.156951,0.162055,0.199056,NUCB2,ERAP1
508,NK,NK,SOCS2,EPOR,0.230338,0.000000,0.199056,SOCS2,EPOR


In [36]:
# Functions to be used to reduce the complexes
aggs = set([complex_policy, 'min']) # set to remove if both are min

In [37]:
complex_cols

['ligand_means', 'receptor_means']

In [38]:
cols_dict = {}

for col in complex_cols:
    cols_dict[col] = lr_res[col].agg(aggs).reset_index().copy().\
    rename(columns={agg:col.split('_')[0] + '_' + agg for agg in aggs})

In [39]:
lr_res = lr_res.obj.copy()

In [40]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand,receptor
0,B,B,LGALS9,PTPRC,0.374320,0.761533,0.199056,LGALS9,PTPRC
1,B,B,CD22,PTPRC,0.554265,0.761533,0.199056,CD22,PTPRC
2,B,B,LGALS1,PTPRC,0.611330,0.761533,0.199056,LGALS1,PTPRC
3,B,B,LGALS9,CD44,0.374320,0.655391,0.199056,LGALS9,CD44
4,B,B,ADAM10,CD44,0.119797,0.655391,0.199056,ADAM10,CD44
...,...,...,...,...,...,...,...,...,...
505,NK,NK,CALM1,KCNN4,2.516959,0.000000,0.199056,CALM1,KCNN4
506,NK,NK,PTPN6,CD300LF,0.652221,0.000000,0.199056,PTPN6,CD300LF
507,NK,NK,NUCB2,ERAP1,0.156951,0.162055,0.199056,NUCB2,ERAP1
508,NK,NK,SOCS2,EPOR,0.230338,0.000000,0.199056,SOCS2,EPOR


In [41]:
for col in complex_cols:
    # left is lr_res /w the actual column name
    left_on = grps + [col]
    # right is the min subunit for that column
    join_key = col.split('_')[0] + '_min' # ligand_min or receptor_min
    right_on = grps + [join_key]
    
    # Here, I join the min value and keep only those rows that match
    lr_res = lr_res.merge(cols_dict[col], left_on=left_on, right_on=right_on).drop(join_key, 1)


  lr_res = lr_res.merge(cols_dict[col], left_on=left_on, right_on=right_on).drop(join_key, 1)


In [42]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand,receptor
0,B,B,LGALS9,PTPRC,0.374320,0.761533,0.199056,LGALS9,PTPRC
1,B,B,CD22,PTPRC,0.554265,0.761533,0.199056,CD22,PTPRC
2,B,B,LGALS1,PTPRC,0.611330,0.761533,0.199056,LGALS1,PTPRC
3,B,B,LGALS9,CD44,0.374320,0.655391,0.199056,LGALS9,CD44
4,B,B,ADAM10,CD44,0.119797,0.655391,0.199056,ADAM10,CD44
...,...,...,...,...,...,...,...,...,...
4282,NK,NK,CALM1,KCNN4,2.516959,0.000000,0.199056,CALM1,KCNN4
4283,NK,NK,PTPN6,CD300LF,0.652221,0.000000,0.199056,PTPN6,CD300LF
4284,NK,NK,NUCB2,ERAP1,0.156951,0.162055,0.199056,NUCB2,ERAP1
4285,NK,NK,SOCS2,EPOR,0.230338,0.000000,0.199056,SOCS2,EPOR


In [43]:
# More than those in LIANA - why? Duplicates?

CellPhoneDB re-implement

In [44]:
lr_res = lr_res.iloc[:, 0:9].copy()

In [45]:
lr_res['lr_mean'] = lr_res[['ligand_means', 'receptor_means']].mean(1)

In [46]:
perms = {}
ad = adata.copy()

In [47]:
idx = np.arange(adata.X.shape[0])
rng = np.random.default_rng(seed=69)

In [171]:
n_perms = 1000

In [172]:
# Perm should be a cube of nperms x ngenes x #celltypes
perms = np.zeros((n_perms, labels.shape[0], adata.shape[1]))

In [173]:
labels_dict = {label:adata.obs.label.isin([label]) for label in labels}

In [174]:
labels[0]

'B'

In [175]:
# Assign Perms
for perm in range(n_perms):
    perm_idx = rng.permutation(idx)
    perm_mat = adata.X[perm_idx].copy()
    
    # populate matrix /w permuted means
    for cind in range(labels.shape[0]):
        perms[perm, cind] = perm_mat[labels_dict[labels[cind]]].mean(0)

In [176]:
# if working /w complexes
# ligand_pos = { entity:[ np.where(adata.var_names==x)[0][0] for x in entity.split('_') ] for entity in lr_res['ligand'] }
# receptor_pos = { entity:[ np.where(adata.var_names==x)[0][0] for x in entity.split('_') ] for entity in lr_res['receptor']}

In [177]:
ligand_pos = { entity:np.where(adata.var_names==entity)[0][0] for entity in lr_res['ligand'] }
receptor_pos = { entity:np.where(adata.var_names==entity)[0][0] for entity in lr_res['receptor'] }

In [178]:
labels_pos = { labels[pos]:pos for pos in range(labels.shape[0]) }

In [189]:
simple_mean = lambda x, y: (x + y)/2

def _build_ecdf(x):
    ligand_perms = perms[:, labels_pos[x.source], ligand_pos[x.ligand]]
    receptor_perms = perms[:, labels_pos[x.target], receptor_pos[x.receptor]]
    
    return ECDF(simple_mean(ligand_perms, receptor_perms))(x.lr_mean)

In [190]:
lr_res['p_val'] = lr_res.apply(_build_ecdf, axis=1)

In [191]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand,receptor,lr_mean
0,B,B,LGALS9,PTPRC,0.374320,0.761533,0.199056,LGALS9,PTPRC,0.567927
1,B,B,CD22,PTPRC,0.554265,0.761533,0.199056,CD22,PTPRC,0.657899
2,B,B,LGALS1,PTPRC,0.611330,0.761533,0.199056,LGALS1,PTPRC,0.686432
3,B,B,LGALS9,CD44,0.374320,0.655391,0.199056,LGALS9,CD44,0.514855
4,B,B,ADAM10,CD44,0.119797,0.655391,0.199056,ADAM10,CD44,0.387594
...,...,...,...,...,...,...,...,...,...,...
4282,NK,NK,CALM1,KCNN4,2.516959,0.000000,0.199056,CALM1,KCNN4,1.258479
4283,NK,NK,PTPN6,CD300LF,0.652221,0.000000,0.199056,PTPN6,CD300LF,0.326110
4284,NK,NK,NUCB2,ERAP1,0.156951,0.162055,0.199056,NUCB2,ERAP1,0.159503
4285,NK,NK,SOCS2,EPOR,0.230338,0.000000,0.199056,SOCS2,EPOR,0.115169


In [192]:
ECDF(x[1], side='right')(0.657899)

AxisError: axis -1 is out of bounds for array of dimension 0

In [80]:
l_cind = labels_pos['B']
l_gind = ligand_pos['LGALS9']
r_cind = labels_pos['B']
r_gind = receptor_pos['PTPRC']

In [103]:
ecdf = ECDF([ np.mean(np.array(perms[perm, l_cind, l_gind], perms[perm, r_cind, r_gind])) for perm in range(n_perms)])

In [133]:
perms[:, 1, 1]

array([0.        , 0.05558536, 0.        , 0.        , 0.        ,
       0.05558536, 0.05558536, 0.05558536, 0.        , 0.05558536,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.05558536, 0.        , 0.        , 0.        ,
       0.        , 0.05558536, 0.        , 0.        , 0.05558536,
       0.        , 0.        , 0.05558536, 0.        , 0.05558536,
       0.05558536, 0.05558536, 0.        , 0.        , 0.05558536,
       0.        , 0.        , 0.        , 0.05558536, 0.05558536,
       0.        , 0.05558536, 0.        , 0.        , 0.05558536,
       0.05558536, 0.        , 0.        , 0.        , 0.        ,
       0.05558536, 0.        , 0.        , 0.05558536, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.05558536, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [110]:
1 - ecdf(0.57)

0.0

In [82]:
ecdf(6)

0.018000000000000002

In [None]:
For each entitity
    if len==0:
        get perms 
        mean lrs
    else:
        get perms of all units
        mean complexes
        mean lrs
    
    build lr ecdf
    estimate quantile

In [None]:
# Need index for gene and cluster

In [137]:
# 
x = perms[:, 2, 3]

In [138]:
y = perms[:, 2, 4]

In [146]:
fff = lambda x, y: (x + y)/2

In [147]:
fff(x,y)

array([0.1312128 , 0.1568791 , 0.21188381, 0.12077259, 0.15260656,
       0.15726257, 0.18429707, 0.10034341, 0.1845532 , 0.12205499,
       0.25609324, 0.21325523, 0.25137904, 0.14172405, 0.13633916,
       0.16730745, 0.17556587, 0.17443054, 0.13275182, 0.2298011 ,
       0.19006831, 0.25452776, 0.22986533, 0.15632454, 0.20401162,
       0.23902453, 0.16384661, 0.19606207, 0.14813   , 0.30211908,
       0.12326709, 0.12691608, 0.16006652, 0.20631939, 0.09543464,
       0.25064391, 0.1790577 , 0.18677406, 0.18028195, 0.2132177 ,
       0.19737856, 0.21813779, 0.14635746, 0.1044095 , 0.14530621,
       0.28101332, 0.21568239, 0.24288333, 0.1494697 , 0.17016404,
       0.23392531, 0.16187163, 0.20798502, 0.1426579 , 0.2144905 ,
       0.15175229, 0.19303794, 0.23505113, 0.19037419, 0.09756741,
       0.16208322, 0.1660434 , 0.20018185, 0.19748382, 0.20049691,
       0.20110026, 0.25515276, 0.20403454, 0.18367359, 0.15385273,
       0.14635903, 0.13834185, 0.22858047, 0.13893627, 0.17981

In [None]:
perms[5]['B'].A[0][receptor_pos[0][0]['PTPRC']]

In [None]:
pos_dict[1]

In [None]:
lr_res

In [None]:
perms[0]x

In [None]:
np.where(adata.var_names=='ACVR1B')[0][0]

In [None]:
# Create a numpy array from a list of numbers
arr = np.array([11, 12, 13, 14, 15, 16, 17, 15, 11, 12, 14, 15, 16, 17])
# Get the index of elements with value less than 16 and greater than 12
result = np.where((arr > 12) & (arr < 16))

In [None]:
result

In [None]:
perms[0]['B']

In [None]:
entities

In [None]:
perms[0]

In [None]:
adata.var_names

In [None]:
perms[0]['B']

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

In [None]:
ec = ECDF(perms[perm]['a'])

Calculate means_sums for NATMI

In [None]:
def _sum_means(lr_res, what, on):
    return lr_res.join(lr_res.groupby(on)[what].sum(), on=on, rsuffix='_sums')

In [None]:
lr_res = _sum_means(lr_res, what='ligand_means', on=['ligand_complex', 'receptor_complex', 'target'])
lr_res = _sum_means(lr_res, what='receptor_means', on=['ligand_complex', 'receptor_complex', 'source'])

In [None]:
lr_res

NATMI fun

In [None]:
def _natmi_score(x):
    lig = (x.ligand_means / x.ligand_means_sums)
    rec = (x.receptor_means / x.receptor_means_sums)
    return  lig * rec

In [None]:
lr_res['edge_specificity'] = lr_res.apply(_natmi_score, axis=1)

In [None]:
lr_res.sort_values('edge_specificity', ascending=False)

SCA re-implement

In [None]:
def _sca_score(x):
    lr_sqrt = np.sqrt(x.ligand_means) * np.sqrt(x.receptor_means)
    return lr_sqrt / (lr_sqrt + x.mat_mean)

In [None]:
lr_res['lrscore'] = lr_res.apply(_sca_score, axis=1)

In [None]:
lr_res.sort_values('lrscore', ascending=False)

In [None]:
lr_res[(lr_res.ligand_complex=='TGFB1') & (lr_res.receptor_complex=='ACVR1_TGFBR1_TGFBR2')].sort_values('lrscore', ascending=False)

logFC re-implement

In [None]:
lr_res['logfc'] = lr_res[['ligand_logfoldchanges', 'receptor_logfoldchanges']].mean(1)

In [None]:
lr_res.sort_values(by='logfc', key=abs, ascending=False)

Re-implement Connectome

In [None]:
lr_res['edge_weight'] = lr_res[['ligand_zscores', 'receptor_zscores']].mean(1)

Custom min0 function:


In [None]:
# Importing reduce for 
# rolling computations
from functools import reduce
  
# define a Custom aggregation 
# function for finding total
def mean0(series):
      return reduce(lambda x, y:0  if 0 in (x, y) else (x + y), series)