In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import decoupler as dc

Load anndata and resource

In [2]:
adata = sc.read_h5ad("../test.h5ad")

In [3]:
resource = pd.read_csv("../consensus.csv", index_col=False)
resource = resource[['source_genesymbol', 'target_genesymbol']]
resource = resource.rename(columns={'source_genesymbol':'ligand',
                                    'target_genesymbol':'receptor'})
resource['interaction'] = resource['ligand'] + '|' + resource['receptor']

# Decomplexify
resource = (resource.set_index('interaction')
            .apply(lambda x: x.str.split('_'))
            .explode(['receptor'])
            .explode('ligand')
            .reset_index()
           )

In [4]:
resource[['ligand_complex','receptor_complex']] = resource['interaction'].str.split('|',expand=True)

In [5]:
ligands = np.unique(resource["ligand"])
receptors = np.unique(resource["receptor"])
entities = np.union1d(ligands, receptors)

In [6]:
labels = adata.obs.label.cat.categories

Process adata

In [7]:
adata.layers['counts'] = adata.X

In [8]:
# lognorm should be the default (expected)
adata.X = adata.layers['logcounts']
adata.layers['scaled'] = sc.pp.scale(adata, copy=True).X

In [9]:
# Get global mean for SCA before filtering
mat_mean = np.mean(adata.X)

In [10]:
# Filter to only include the relevant genes
adata = adata[:,np.intersect1d(entities, adata.var.index)]

In [11]:
sc.tl.rank_genes_groups(adata, 'label', method='wilcoxon')

  self.data[key] = value
  next(self.gen)


In [12]:
dedict = {label:sc.get.rank_genes_groups_df(adata, label).assign(label=label).sort_values('names') for label in labels}

Calculate Mean, Sum and z-scores by group

In [13]:
# check if all is gucci
list(adata.var_names) == list(dedict['a']['names'])

True

In [14]:
for label in labels: 
    temp = adata[adata.obs.label.isin([label])]
    # dedict[label]['sums'] = temp.X.sum(0)
    dedict[label]['means'] = temp.X.mean(0)
    dedict[label]['zscores'] = temp.layers['scaled'].mean(0)

Join Means

In [15]:
pairs = pd.DataFrame(np.array(np.meshgrid(labels, labels)).reshape(2, np.size(labels) * np.size(labels)).T).rename(columns={0: "source", 1: "target"})

In [16]:
def join_means(source, target):
    source_stats = dedict[source].copy()
    source_stats.columns = source_stats.columns.map(lambda x: 'ligand_' + str(x))
    source_stats = source_stats.rename(columns={'ligand_names':'ligand', 'ligand_label':'source'})
    
    target_stats = dedict[target].copy()
    target_stats.columns = target_stats.columns.map(lambda x: 'receptor_' + str(x))
    target_stats = target_stats.rename(columns={'receptor_names':'receptor', 'receptor_label':'target'})
    
    bound = resource.merge(source_stats).merge(target_stats)
    
    return bound

In [17]:
lr_res = pd.concat([join_means(source, target) for source, target in zip(pairs['source'], pairs['target'])])

In [18]:
lr_res['mat_mean'] = mat_mean

Recomplexify

In [19]:
# def recomplexify(lr_res, grps, complex_cols, complex_policy = 'min'):


# def _reduce_complexes():
    

In [20]:
complex_policy = 'min'

In [21]:
grps = ['source', 'target', 'ligand_complex', 'receptor_complex']

In [22]:
complex_cols = ['ligand_means' , 'receptor_means'] # specific for every method

In [23]:
add_cols = ['mat_mean']

In [24]:
# subset /w only means here - to be extended to all columns
lr_res = lr_res[ grps + complex_cols + add_cols ]

In [25]:
# temp = temp[[x=="INHBA_INHBB" for x in temp['ligand_complex']]].sort_values('receptor_complex')
# temp = temp[[x=="ACVR1B_ACVR2A" for x in temp['receptor_complex']]]

In [26]:
lr_res = lr_res.groupby(grps)

In [27]:
lr_res.obj

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean
0,a,a,NRG2,MOG,3.562451,3.140472,3.40071
1,a,a,ADAM12,ITGB1,3.504846,3.177643,3.40071
2,a,a,CD14,ITGB1,3.399783,3.177643,3.40071
3,a,a,TGFB3,ITGB1,3.722674,3.177643,3.40071
4,a,a,CDH1,ITGA2_ITGB1,3.244582,3.177643,3.40071
...,...,...,...,...,...,...,...
1690,c,c,SERPINA7,SLC16A2,3.194547,2.888214,3.40071
1691,c,c,IGSF10,MILR1,3.302574,3.602220,3.40071
1692,c,c,BTN1A1,NEGR1,3.457612,3.259617,3.40071
1693,c,c,PI16,MFAP3L,3.591045,3.378227,3.40071


In [28]:
# Functions to be used to reduce the complexes
aggs = set([complex_policy, 'min']) # set to remove if both are min

In [29]:
complex_cols

['ligand_means', 'receptor_means']

In [30]:
cols_dict = {}

for col in complex_cols:
    cols_dict[col] = lr_res[col].agg(aggs).reset_index().copy().\
    rename(columns={agg:col.split('_')[0] + '_' + agg for agg in aggs})

In [31]:
lr_res = lr_res.obj.copy()

In [32]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean
0,a,a,NRG2,MOG,3.562451,3.140472,3.40071
1,a,a,ADAM12,ITGB1,3.504846,3.177643,3.40071
2,a,a,CD14,ITGB1,3.399783,3.177643,3.40071
3,a,a,TGFB3,ITGB1,3.722674,3.177643,3.40071
4,a,a,CDH1,ITGA2_ITGB1,3.244582,3.177643,3.40071
...,...,...,...,...,...,...,...
1690,c,c,SERPINA7,SLC16A2,3.194547,2.888214,3.40071
1691,c,c,IGSF10,MILR1,3.302574,3.602220,3.40071
1692,c,c,BTN1A1,NEGR1,3.457612,3.259617,3.40071
1693,c,c,PI16,MFAP3L,3.591045,3.378227,3.40071


In [33]:
for col in complex_cols:
    # left is lr_res /w the actual column name
    left_on = grps + [col]
    # right is the min subunit for that column
    join_key = col.split('_')[0] + '_min' # ligand_min or receptor_min
    right_on = grps + [join_key]
    
    # Here, I join the min value and keep only those rows that match
    lr_res = lr_res.merge(cols_dict[col], left_on=left_on, right_on=right_on).drop(join_key, 1)


  lr_res = lr_res.merge(cols_dict[col], left_on=left_on, right_on=right_on).drop(join_key, 1)


In [34]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean
0,a,a,NRG2,MOG,3.562451,3.140472,3.40071
1,a,a,ADAM12,ITGB1,3.504846,3.177643,3.40071
2,a,a,CD14,ITGB1,3.399783,3.177643,3.40071
3,a,a,TGFB3,ITGB1,3.722674,3.177643,3.40071
4,a,a,CDH1,ITGA2_ITGB1,3.244582,3.177643,3.40071
...,...,...,...,...,...,...,...
13387,c,c,SERPINA7,SLC16A2,3.194547,2.888214,3.40071
13388,c,c,IGSF10,MILR1,3.302574,3.602220,3.40071
13389,c,c,BTN1A1,NEGR1,3.457612,3.259617,3.40071
13390,c,c,PI16,MFAP3L,3.591045,3.378227,3.40071


In [None]:
# More than those in LIANA - why? Duplicates?

Calculate means_sums for NATMI

In [36]:
def _sum_means(lr_res, what, on):
    return lr_res.join(lr_res.groupby(on)[what].sum(), on=on, rsuffix='_sums')

In [37]:
lr_res = _sum_means(lr_res, what='ligand_means', on=['ligand_complex', 'receptor_complex', 'target'])
lr_res = _sum_means(lr_res, what='receptor_means', on=['ligand_complex', 'receptor_complex', 'source'])

In [38]:
lr_res

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand_means_sums,receptor_means_sums
0,a,a,NRG2,MOG,3.562451,3.140472,3.40071,10.612130,9.533417
1,a,a,ADAM12,ITGB1,3.504846,3.177643,3.40071,10.488648,9.569735
2,a,a,CD14,ITGB1,3.399783,3.177643,3.40071,10.030676,9.569735
3,a,a,TGFB3,ITGB1,3.722674,3.177643,3.40071,11.157440,9.569735
4,a,a,CDH1,ITGA2_ITGB1,3.244582,3.177643,3.40071,9.742043,9.569735
...,...,...,...,...,...,...,...,...,...
13387,c,c,SERPINA7,SLC16A2,3.194547,2.888214,3.40071,9.985497,8.948533
13388,c,c,IGSF10,MILR1,3.302574,3.602220,3.40071,9.905056,10.712031
13389,c,c,BTN1A1,NEGR1,3.457612,3.259617,3.40071,10.340106,9.940911
13390,c,c,PI16,MFAP3L,3.591045,3.378227,3.40071,10.632597,10.195775


NATMI fun

In [39]:
def _natmi_score(x):
    lig = (x.ligand_means / x.ligand_means_sums)
    rec = (x.receptor_means / x.receptor_means_sums)
    return  lig * rec

In [40]:
lr_res['edge_specificity'] = lr_res.apply(_natmi_score, axis=1)

In [41]:
lr_res.sort_values('edge_specificity', ascending=False)

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand_means_sums,receptor_means_sums,edge_specificity
334,a,a,GAL,GRM7,3.304187,3.672407,3.40071,9.358094,10.347617,0.125311
1817,b,a,CXCL1,GRM7,3.532443,3.672407,3.40071,10.074673,10.347617,0.124438
1834,b,a,CXCL16,GRM7,3.485066,3.672407,3.40071,10.011744,10.347617,0.123541
12447,c,c,CALR,ITGA2B,3.410261,3.465863,3.40071,9.735105,9.847004,0.123298
3530,c,a,ICAM5,ITGAL,3.433989,3.758787,3.40071,9.852126,10.681003,0.122660
...,...,...,...,...,...,...,...,...,...,...
1150,a,a,POMC,VIPR1,3.165336,3.242315,3.40071,9.821971,10.442572,0.100062
4129,c,a,LHB,VIPR1,3.158441,3.242315,3.40071,9.801594,10.442572,0.100052
5867,a,b,IL26,IL10RB_IL20RA,3.004146,3.492787,3.40071,9.800839,10.711255,0.099952
4131,c,a,GHRH,VIPR1,3.274156,3.242315,3.40071,10.207939,10.442572,0.099588


SCA re-implement

In [42]:
def _sca_score(x):
    lr_sqrt = np.sqrt(x.ligand_means) * np.sqrt(x.receptor_means)
    return lr_sqrt / (lr_sqrt + x.mat_mean)

In [45]:
lr_res['lrscore'] = lr_res.apply(_sca_score, axis=1)

In [46]:
lr_res.sort_values('lrscore', ascending=False)

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand_means_sums,receptor_means_sums,edge_specificity,lrscore
3827,c,a,PTGS2,CAV1,3.754951,3.820120,3.40071,11.033551,11.153421,0.526898,0.526898
851,a,a,PTGS2,CAV1,3.685490,3.820120,3.40071,11.033551,11.153421,0.524570,0.524570
12755,c,c,PTGS2,CAV1,3.754951,3.729842,3.40071,11.033551,11.153421,0.523916,0.523916
6580,b,b,COL9A1,ITGAV_ITGB8,3.750841,3.705049,3.40071,11.031500,10.774969,0.522947,0.522947
847,a,a,CCN1,CAV1,3.629694,3.820120,3.40071,10.802444,11.153421,0.522667,0.522667
...,...,...,...,...,...,...,...,...,...,...,...
12592,c,c,NECTIN3,NECTIN2,2.989327,3.045395,3.40071,9.444312,9.261166,0.470124,0.470124
9613,a,c,SELPLG,SIGLEC5,3.157058,2.832940,3.40071,9.699244,8.977784,0.467918,0.467918
2952,b,a,ALB,B2M_FCGRT,3.137545,2.846811,3.40071,9.875736,8.785906,0.467754,0.467754
13011,c,c,CGA,ADRB2,2.892203,3.087798,3.40071,9.065883,9.519078,0.467733,0.467733


In [49]:
lr_res[(lr_res.ligand_complex=='POSTN') & (lr_res.receptor_complex=='ITGAV_ITGB3')].sort_values('lrscore', ascending=False)

Unnamed: 0,source,target,ligand_complex,receptor_complex,ligand_means,receptor_means,mat_mean,ligand_means_sums,receptor_means_sums,edge_specificity,lrscore
12359,c,c,POSTN,ITGAV_ITGB3,3.621043,3.499646,3.40071,10.41095,10.014751,0.51143,0.51143
10871,b,c,POSTN,ITGAV_ITGB3,3.398673,3.499646,3.40071,10.41095,10.014751,0.50351,0.50351
9383,a,c,POSTN,ITGAV_ITGB3,3.391233,3.499646,3.40071,10.41095,10.014751,0.503236,0.503236
7895,c,b,POSTN,ITGAV_ITGB3,3.621043,3.269325,3.40071,10.41095,10.014751,0.502922,0.502922
3431,c,a,POSTN,ITGAV_ITGB3,3.621043,3.24578,3.40071,10.41095,10.014751,0.502019,0.502019
6407,b,b,POSTN,ITGAV_ITGB3,3.398673,3.269325,3.40071,10.41095,10.014751,0.495,0.495
4919,a,b,POSTN,ITGAV_ITGB3,3.391233,3.269325,3.40071,10.41095,10.014751,0.494726,0.494726
1943,b,a,POSTN,ITGAV_ITGB3,3.398673,3.24578,3.40071,10.41095,10.014751,0.494097,0.494097
455,a,a,POSTN,ITGAV_ITGB3,3.391233,3.24578,3.40071,10.41095,10.014751,0.493823,0.493823


logFC re-implement

In [44]:
lr_res['logfc'] = lr_res[['ligand_logfoldchanges', 'receptor_logfoldchanges']].mean(1)

KeyError: "None of [Index(['ligand_logfoldchanges', 'receptor_logfoldchanges'], dtype='object')] are in the [columns]"

In [None]:
lr_res.sort_values(by='logfc', key=abs, ascending=False)

Custom min0 function:


In [None]:
# Importing reduce for 
# rolling computations
from functools import reduce
  
# define a Custom aggregation 
# function for finding total
def mean0(series):
      return reduce(lambda x, y:0  if 0 in (x, y) else (x + y), series)