## Question: *when are molecules likely to have similar phenotypes?* 


In [144]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# NOTE(review): absolute, user-specific path. Every relative "data/..." path
# used below is resolved against this directory, so the notebook only runs on
# this machine as written; consider a configurable project root instead.
%cd /Users/sabrieyuboglu/Documents/sabri/research/projects/milieu/milieu

import numpy as np

from scipy.sparse import csr_matrix
from scipy.stats import pearsonr, spearmanr, ttest_ind, ttest_rel
import seaborn as sns
import matplotlib.pyplot as plt

from milieu.data.associations import load_diseases, build_disease_matrix
from milieu.data.network_matrices import load_network_matrices
from milieu.data.network import Network

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/Users/sabrieyuboglu/Documents/sabri/research/projects/milieu/milieu


In [166]:
# Build the Network object from the HuRI edge list for species 9606 (path is
# relative to the working directory set by %cd above).
network = Network("data/networks/species_9606/huri/network.txt")

## Compute Jaccard similarity between proteins

In [167]:
# Load each association source: the two Gene Ontology branches (process and
# function), DisGeNET diseases and DrugBank drugs.
processes = load_diseases(
    "data/associations/gene_ontology/species_9606/go_process/associations.csv"
)
functions = load_diseases(
    "data/associations/gene_ontology/species_9606/go_function/associations.csv"
)
diseases = load_diseases("data/associations/disgenet/associations.csv")
drugs = load_diseases("data/associations/drugbank/associations.csv")

# Merge everything into a single lookup. Sources are applied in the order
# listed, so on a key clash the later source overwrites the earlier one.
associations = {}
for source in (diseases, functions, processes, drugs):
    associations.update(source)

In [168]:
# Build the association matrix for the merged associations on this network;
# the second return value is discarded. The shape recorded below is
# (4914, 8992) -- presumably one row per association set and one column per
# network protein; TODO confirm against build_disease_matrix.
association_matrix, _  = build_disease_matrix(associations, network)

In [175]:
# Sanity check on the dimensions of the association matrix.
association_matrix.shape

(4914, 8992)

In [169]:
def compute_jaccard(matrix):
    """
    Computes the pairwise Jaccard similarity between the rows of a binary
    membership matrix, i.e. |A and B| / |A or B| for every pair of sets.

    :param matrix: (nd.array or scipy sparse matrix) an NxD 0/1 (or boolean)
        matrix where N is the # of sets and D is the number of distinct
        elements a set can contain; entry [i, d] is nonzero when element d
        belongs to set i. Entries are assumed to be binary; they are not
        binarized here.
    :return: (nd.array) an NxN float64 matrix whose [i, j] entry is the
        Jaccard similarity between set i and set j. Pairs whose union is
        empty (both sets empty) get similarity 0.
    """
    # Work in float64 so boolean/integer inputs yield true counts instead of
    # boolean products or in-place casting errors.
    sets = csr_matrix(matrix, dtype=np.float64)

    # (S S^T)[i, j] counts the elements shared by set i and set j.
    intersection = sets.dot(sets.T).toarray()

    # |A or B| = |A| + |B| - |A and B|, broadcast over all pairs.
    sizes = np.asarray(sets.sum(axis=1)).ravel()
    union = sizes[:, np.newaxis] + sizes[np.newaxis, :] - intersection

    # Divide only where the union is non-empty; the remaining entries keep
    # their initial value of 0, which avoids 0/0 warnings and NaN clean-up.
    jaccard = np.zeros_like(intersection)
    np.divide(intersection, union, out=jaccard, where=union != 0)
    return jaccard


In [172]:
# compute_jaccard treats each row as one set, so the association matrix is
# transposed first: the result has one row/column per column of
# association_matrix (the test cell below indexes both with the same i, j).
association_jaccard = compute_jaccard(association_matrix.T)
# Zero out every entry's similarity with itself (modifies the array in place).
np.fill_diagonal(association_jaccard, 0)

In [173]:
# Test jaccard: spot-check the vectorised Jaccard matrix against scikit-learn's
# reference implementation on randomly drawn pairs of columns.
from sklearn.metrics import jaccard_score

# Fixed seed so a failing pair can be reproduced on re-run.
rng = np.random.RandomState(0)
n_items = association_jaccard.shape[0]

for _ in range(10000):
    i = rng.randint(0, n_items)
    j = rng.randint(0, n_items)
    if i == j:
        # The diagonal was deliberately zeroed, so it cannot match sklearn.
        continue

    computed = association_jaccard[i, j]
    expected = jaccard_score(association_matrix[:, i], association_matrix[:, j])

    # Compare with a tolerance: the two implementations may use different
    # float precision or operation order, so exact equality is too strict.
    if not np.isclose(computed, expected):
        raise ValueError(
            "Failed: jaccard mismatch for pair ({}, {}): computed={!r}, expected={!r}"
            .format(i, j, computed, expected)
        )

print("passed")

ValueError: Failed

In [46]:
# Load the "mi" (mutual interactor) matrix for the HuRI network.
# load_network_matrices takes a {name: path} mapping and the result is indexed
# by the same name. Presumably this is a square matrix of pairwise scores
# aligned with the network's node ordering -- TODO confirm.
mi_matrix = load_network_matrices({"mi": "data/networks/species_9606/huri/mutual_interactor"}, 
                                  network=network)["mi"]

In [47]:
# Pearson correlation between mutual-interactor score and Jaccard similarity
# over all unordered pairs: the strict upper triangle (k=1) skips the diagonal
# and counts each pair once. Displays (correlation, p-value).
pearsonr(mi_matrix[np.triu_indices(mi_matrix.shape[0], k=1)], 
         association_jaccard[np.triu_indices(association_jaccard.shape[0], k=1)])

(0.02120972491532083, 0.0)

In [48]:
# Direct-interactor (DI) score: each adjacency entry divided by its column sum
# and then by its row sum, i.e. di[i, j] = A[i, j] / (colsum[j] * rowsum[i]).
# NOTE(review): a node with zero row/column sum would produce 0/0 = NaN here,
# which would propagate into the percentile/argpartition cells below --
# confirm the network has no such nodes.
x = network.adj_matrix
di_matrix = x / x.sum(axis=0, keepdims = True) / x.sum(axis=1, keepdims=True)

In [49]:
def _strict_upper_triangle(square):
    """Select the entries of `square` that lie strictly above its main diagonal."""
    return square[np.triu_indices(square.shape[0], k=1)]


# Flatten every pairwise matrix the same way so that position p in each of the
# resulting arrays refers to the same unordered pair (assuming the matrices
# share one node ordering).
mi_values = _strict_upper_triangle(mi_matrix)
di_values = _strict_upper_triangle(di_matrix)
adj_values = _strict_upper_triangle(network.adj_matrix)
jaccard_values = _strict_upper_triangle(association_jaccard)

In [50]:
# Claim: we find that molecules with high mutual interactor scores are more
# similar than molecules with high direct interactor scores.
# For each score (mutual interactor first, then direct interactor), print the
# mean Jaccard similarity of the pairs at or above that score's 99.9th
# percentile.
for scores in (mi_values, di_values):
    cutoff = np.percentile(scores, 99.9)
    print(jaccard_values[scores >= cutoff].mean())


0.011688996860919494
0.0050902866818882675


In [101]:
# k = sum of the strict-upper-triangle adjacency entries (presumably the
# number of non-self interactions -- confirm adj_matrix is binary).
k = adj_values.sum().astype(int)
# Compare the Jaccard values of the k highest-MI pairs with those of the k
# highest-DI pairs.
# NOTE(review): ttest_rel is a paired test, but element p of each sample here
# is just the p-th entry in that score's own sort order, not the same pair of
# molecules, so the pairing is arbitrary. The independent-samples test
# (ttest_ind) in the next cell looks like the appropriate one -- confirm.
ttest_rel(jaccard_values[mi_values.argsort()[-k:]],
          jaccard_values[di_values.argsort()[-k:]])

Ttest_relResult(statistic=16.27395083226033, pvalue=2.0066307152972687e-59)

In [57]:
# Independent-samples t-test on the Jaccard similarity of the k top-scoring
# pairs under each score, with k taken from the summed adjacency values.
k = adj_values.sum().astype(int)

# argpartition places the k largest scores in the last k slots without a
# full sort; only membership in the top k matters here, not the order.
top_mi_pairs = np.argpartition(mi_values, -k)[-k:]
top_di_pairs = np.argpartition(di_values, -k)[-k:]

ttest_ind(jaccard_values[top_mi_pairs], jaccard_values[top_di_pairs])

Ttest_indResult(statistic=16.05568064757644, pvalue=5.9682123171391184e-58)

In [56]:
# Mean Jaccard similarity among the k top-scoring pairs, printed first for the
# mutual interactor score and then for the direct interactor score.
k = adj_values.sum().astype(int)
for scores in (mi_values, di_values):
    top_pairs = np.argpartition(scores, -k)[-k:]
    print(jaccard_values[top_pairs].mean())

0.008811414816040475
0.0037057968986399605


In [37]:
# Mean adjacency value among the k pairs with the highest Jaccard similarity.
# NOTE(review): `k` is not defined in this cell; it comes from whichever cell
# last assigned it. This cell's execution count (37) precedes the visible
# assignments of k, so the recorded output may reflect a different k -- re-run
# after a kernel restart to confirm.
adj_values[np.argpartition(jaccard_values, -k)[-k:]].mean()

0.042

In [96]:
# Number of unordered pairs whose Jaccard similarity exceeds 0.2.
(jaccard_values > 0.2).sum()

38750

In [99]:
# Number of interactions yielded by the network's get_interactions().
len(list(network.get_interactions()))

62841

In [100]:
# Sum of the strict-upper-triangle adjacency entries (the value used as k in
# the cells above). The recorded result is smaller than the interaction count
# in the previous cell; presumably the difference is self-interactions on the
# diagonal and/or duplicate edges -- TODO confirm.
adj_values.sum()

62084.0