# Milieu

Milieu is a disease protein discovery algorithm based on the hypothesis that proteins associated with the same disease share mutual interactors in the protein-protein interaction network.   

In [42]:
%load_ext autoreload
%autoreload 2

import os

import networkx as nx

from milieu.data.network import PPINetwork
from milieu.data.associations import load_diseases
from milieu.milieu import MilieuDataset, Milieu
from milieu.paper.figures.network_vis import show_network

# Run from the milieu project root so that the relative "data/..." and
# "experiments/..." paths used in later cells resolve. Set the MILIEU_ROOT
# environment variable to point at your own checkout (for example
# "/dfs/scratch0/sabri/milieu"); when it is unset, the original author's
# local path is used as the fallback.
os.chdir(os.environ.get(
    "MILIEU_ROOT",
    "/Users/sabrieyuboglu/Documents/sabri/research/projects/milieu/milieu",
))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the PPI Network

We use the protein-protein interaction network compiled by Menche *et al.*[1]. The network consists of 342,353 interactions between 21,557 proteins.
This network is stored in `data/networks` as `bio-pathways-network.txt`. See Methods for a more detailed description of the network. 
You can also find two other protein-protein interaction networks `string-network.txt` and `bio-grid-network.txt`. See Supplementary Note 3 for a detailed description.

In [43]:
# Load the protein-protein interaction network from its file under
# data/networks (the path is relative to the current working directory).
network = PPINetwork("data/networks/bio-pathways-network.txt")

## Build the *Milieu* Model

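We configure the model with a `params` dictionary that specifies the device, batching, number of training epochs, optimizer, and evaluation metrics.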

In [49]:
# Settings dictionary for the Milieu model and its training loop.
params = dict(
    # Hardware selection. Presumably "device" is only consulted when "cuda"
    # is enabled -- TODO confirm against the Milieu implementation.
    cuda=False,
    device=2,
    # Batching and training length.
    batch_size=200,
    num_workers=4,
    num_epochs=10,
    # Optimizer, referenced by class name, plus its keyword arguments.
    optim_class="Adam",
    optim_args=dict(lr=0.01, weight_decay=0.0),
    # Metrics to report; each entry names the metric, the function that
    # computes it, and that function's arguments.
    metric_configs=[
        dict(name="recall_at_25", fn="batch_recall_at", args=dict(k=25)),
    ],
)

In [50]:
# Instantiate the model on the loaded network with the settings in `params`.
# Construction prints its setup steps (parameters, model, optimizer).
milieu = Milieu(network, params)

Milieu
Setting parameters...
Building model...
Building optimizer...
Done.


## Train the Model
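*Milieu* is trained on a large set of known disease-protein associations. In this notebook we use the GO function association sets in `data/go_associations/go_function/associations.csv`, training on the first 90% of the sets and holding out the remaining 10% for validation.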

In [51]:
# Load the association sets and keep only the values of the returned mapping.
diseases = [
    *load_diseases(
        "data/go_associations/go_function/associations.csv",
        exclude_splits=["none"],
    ).values()
]

# Split in loaded order (no shuffling here): the first 90% of the sets are
# used for training and the remainder for validation.
num_train = int(len(diseases) * 0.9)
train_diseases, valid_diseases = diseases[:num_train], diseases[num_train:]

# Wrap each subset in a dataset tied to the same network (train first).
train_dataset, valid_dataset = (
    MilieuDataset(network, diseases=subset)
    for subset in (train_diseases, valid_diseases)
)

In [52]:
# Train the model, running a validation pass after each training epoch.
# As the last expression in the cell, the return value is displayed: a pair
# of per-epoch metric lists (presumably training metrics first, then
# validation metrics -- TODO confirm against train_model).
milieu.train_model(train_dataset, valid_dataset)

Starting training for 10 epoch(s)
Epoch 1 of 10
Training


100%|██████████| 3/3 [00:07<00:00,  2.55s/it, loss=1.394]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.19it/s]

Epoch 2 of 10
Training



100%|██████████| 3/3 [00:08<00:00,  2.74s/it, loss=1.388]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.10it/s]

Epoch 3 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.69s/it, loss=1.384]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.26it/s]

Epoch 4 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.77s/it, loss=1.381]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.24it/s]

Epoch 5 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.55s/it, loss=1.380]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.29it/s]

Epoch 6 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.63s/it, loss=1.379]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.31it/s]

Epoch 7 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.55s/it, loss=1.376]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.30it/s]

Epoch 8 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.55s/it, loss=1.374]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.12it/s]

Epoch 9 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.71s/it, loss=1.371]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.29it/s]

Epoch 10 of 10
Training



100%|██████████| 3/3 [00:07<00:00,  2.54s/it, loss=1.368]

Validation



100%|██████████| 1/1 [00:00<00:00,  1.29it/s]


([{'recall_at_25': 0.05539358600583091},
  {'recall_at_25': 0.17060255371990066},
  {'recall_at_25': 0.1799940476190476},
  {'recall_at_25': 0.17862846884785658},
  {'recall_at_25': 0.16781490929705214},
  {'recall_at_25': 0.15434363189720332},
  {'recall_at_25': 0.16298665100961018},
  {'recall_at_25': 0.17565523431594857},
  {'recall_at_25': 0.17982823129251702},
  {'recall_at_25': 0.20525925925925925}],
 [defaultdict(list, {'recall_at_25': [0.231]}),
  defaultdict(list, {'recall_at_25': [0.2561904761904762]}),
  defaultdict(list, {'recall_at_25': [0.2053809523809524]}),
  defaultdict(list, {'recall_at_25': [0.234047619047619]}),
  defaultdict(list, {'recall_at_25': [0.2187142857142857]}),
  defaultdict(list, {'recall_at_25': [0.17219047619047614]}),
  defaultdict(list, {'recall_at_25': [0.19904761904761906]}),
  defaultdict(list, {'recall_at_25': [0.1367142857142857]}),
  defaultdict(list, {'recall_at_25': [0.20219047619047617]}),
  defaultdict(list, {'recall_at_25': [0.218190476190

## Predict Novel Associations

In [32]:
# Protein/gene symbols, presumably those associated with cholecystitis (going
# by the variable name). NOTE(review): no later cell visible in this notebook
# references this list -- confirm whether it is still needed.
cholecystitis_proteins = [
    "ENG", "ALDOA", "GDF2", "GPI",
    "HK1", "SMAD4", "ARSA", "ABCB4",
    "PKLR", "BPGM", "TPI1", "ACVRL1",
]

In [53]:
# Load the associations again, this time keeping the returned mapping itself
# so that a single entry can be looked up by its id.
# NOTE(review): this rebinds `diseases`, which the training section bound to a
# list of this mapping's values -- a distinct name would avoid the type change.
diseases = load_diseases("data/go_associations/go_function/associations.csv", exclude_splits=["none"])

In [54]:
# Select one association set by its GO id, then show how many proteins it
# contains (the last expression is the cell's displayed output).
function = diseases["GO:0009975"] 
len(function.proteins)

11

In [60]:
# Show the name of the selected association set.
function.name

'cyclase activity'

In [55]:
# Ask the trained model for its top 5 predictions given the set's proteins as
# seeds. The result is transposed with zip(*...) and only the first field of
# each prediction is kept (presumably the protein id -- TODO confirm the
# layout of the entries returned by discover).
predicted_proteins = list(
    zip(*milieu.discover(entrez_ids=function.proteins, top_k=5))
)[0]

In [58]:
# Build the network visualization for the selected set's proteins together
# with the predicted proteins. The save path is derived from the set's id;
# presumably show_network also writes the visualization there -- TODO confirm.
cy_vis = show_network(
    network,
    function.proteins,
    predicted_proteins,
    id_format="entrez",
    model=milieu,
    show_seed_mi=True,
    excluded_interactions=[("mutual_interactor", "mutual_interactor")],
    save_path=f"experiments/network_vis/function/{function.id}_cy.json",
)

In [59]:
# Display the Cytoscape visualization object returned by show_network.
cy_vis

Cytoscape(data={'elements': {'nodes': [{'data': {'role': 'seed', 'id': '14273', 'entrez': '55811', 'genbank': …

1. Menche, J. et al. Uncovering disease-disease relationships through the incomplete interactome. Science 347, 1257601 (2015).