In [None]:
# import torch
# TORCH = torch.__version__
# CUDA = torch.version.cuda if torch.version.cuda is not None else "cpu"
# !pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --quiet
# !pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --quiet
# !pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --quiet
# !pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-${TORCH}+${CUDA}.html --quiet
# !pip install torch-geometric --quiet
# !pip install "gif[plotly]" --quiet

In [None]:
# Install PyTorch Geometric and its compiled companion packages.
# The wheel index URLs are pinned to torch 1.7.0 + CUDA 10.2 and torch-geometric is pinned to 1.6.3.
# NOTE(review): presumably these pins must match the torch/CUDA build of the runtime - confirm before re-running elsewhere.
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html --quiet
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html --quiet
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html --quiet
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html --quiet
!pip install torch-geometric==1.6.3 --quiet

In [None]:
import os
import pandas as pd
import numpy as np

import networkx as nx

import torch

from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec
from torch_geometric.utils import degree

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.manifold import TSNE

import plotly.graph_objects as go

import json
import pickle

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the four CSV tables of the dataset from ../input/dpi-dataset.
# Columns read by later cells:
#   combo   -> "STITCH 1", "STITCH 2", "Polypharmacy Side Effect"
#   ppi     -> "Gene 1", "Gene 2"
#   targets -> "STITCH", "Gene"
# NOTE(review): bio_decagon_targets_all is not referenced anywhere else in the visible notebook.
bio_decagon_combo = pd.read_csv("../input/dpi-dataset/bio-decagon-combo.csv")
bio_decagon_ppi = pd.read_csv("../input/dpi-dataset/bio-decagon-ppi.csv")
bio_decagon_targets_all = pd.read_csv("../input/dpi-dataset/bio-decagon-targets-all.csv")
bio_decagon_targets = pd.read_csv("../input/dpi-dataset/bio-decagon-targets.csv")

In [None]:
# Build the relation-name -> integer-id lookup.
# Each polypharmacy side effect gets the group number assigned by groupby().ngroup();
# three extra relations (ppi, target, targeted_by) are appended after the last side-effect id.
side_effect_ids = np.array(bio_decagon_combo.groupby(["Polypharmacy Side Effect"]).ngroup())
side_effect_names = np.array(bio_decagon_combo["Polypharmacy Side Effect"])

# One (id, name) row per combo record, ordered by id so the dict is filled in ascending id order.
id_name_pairs = np.stack((side_effect_ids, side_effect_names), axis=1)
id_name_pairs = id_name_pairs[id_name_pairs[:, 0].argsort()]
edge_type_mapping = {name: idx for idx, name in id_name_pairs}

# The last inserted key carries the largest side-effect id; continue numbering from there.
next_relation_id = edge_type_mapping[list(edge_type_mapping.keys())[-1]] + 1
for extra_relation in ("ppi", "target", "targeted_by"):
    edge_type_mapping[extra_relation] = next_relation_id
    next_relation_id += 1

In [None]:
# Assign consecutive node ids (starting at 0) to every distinct drug that appears
# in either column of the combo table, in np.unique's sorted order.
all_drug_ids = np.append(
    np.array(bio_decagon_combo["STITCH 1"]),
    np.array(bio_decagon_combo["STITCH 2"]),
)
drug_nodes_mapping = {drug: node_id for node_id, drug in enumerate(np.unique(all_drug_ids))}

In [None]:
# Assign node ids to every distinct protein that appears in either column of the PPI table.
# Protein ids continue right after the last drug id so both node types share one id space.
all_protein_ids = np.append(
    np.array(bio_decagon_ppi["Gene 1"]),
    np.array(bio_decagon_ppi["Gene 2"]),
)

# Id of the last drug inserted into drug_nodes_mapping, plus one.
first_protein_id = list(drug_nodes_mapping.values())[-1] + 1

protein_nodes_mapping = {
    uniprot: local_id + first_protein_id
    for local_id, uniprot in enumerate(np.unique(all_protein_ids))
}

In [None]:
# Directed graph holding every drug and protein node, tagged with its node type.
# Drugs are added first, then proteins, in the order of the two id mappings.
G = nx.DiGraph()
G.add_nodes_from(drug_nodes_mapping.values(), type="Drug")
G.add_nodes_from(protein_nodes_mapping.values(), type="Protein")

In [None]:
# Parallel edge lists: entry k of each list describes the k-th directed edge.
row, col = [], []
edge_meta_type, edge_type = [], []


def _record_edge(src, dst, relation, meta):
    """Append one directed edge (src -> dst) to the four parallel edge lists."""
    row.append(src)
    col.append(dst)
    edge_type.append(relation)
    edge_meta_type.append(meta)


# Drug-drug edges (meta type 1): every combo record is stored in both directions.
for stitch_a, stitch_b, side_effect in zip(
    bio_decagon_combo["STITCH 1"],
    bio_decagon_combo["STITCH 2"],
    bio_decagon_combo["Polypharmacy Side Effect"],
):
    drug_a = drug_nodes_mapping[stitch_a]
    drug_b = drug_nodes_mapping[stitch_b]
    relation = edge_type_mapping[side_effect]

    _record_edge(drug_a, drug_b, relation, 1)
    _record_edge(drug_b, drug_a, relation, 1)

    G.add_edge(drug_a, drug_b, relation=relation)
    G.add_edge(drug_b, drug_a, relation=relation)

# Protein-protein edges (meta type 2): also stored in both directions.
ppi_relation = edge_type_mapping["ppi"]
for gene_a, gene_b in zip(bio_decagon_ppi["Gene 1"], bio_decagon_ppi["Gene 2"]):
    protein_a = protein_nodes_mapping[gene_a]
    protein_b = protein_nodes_mapping[gene_b]

    _record_edge(protein_a, protein_b, ppi_relation, 2)
    _record_edge(protein_b, protein_a, ppi_relation, 2)

    G.add_edge(protein_a, protein_b, relation=ppi_relation)
    G.add_edge(protein_b, protein_a, relation=ppi_relation)

# Drug-protein edges: drug -> protein is "target" (meta type 3),
# protein -> drug is "targeted_by" (meta type 4).
target_relation = edge_type_mapping["target"]
targeted_by_relation = edge_type_mapping["targeted_by"]
for stitch_id, gene in zip(bio_decagon_targets["STITCH"], bio_decagon_targets["Gene"]):
    if gene not in protein_nodes_mapping:
        continue  # about 10 proteins are in targets, but not in ppi network, so we are skipping them
    drug = drug_nodes_mapping[stitch_id]
    protein = protein_nodes_mapping[gene]

    _record_edge(drug, protein, target_relation, 3)
    _record_edge(protein, drug, targeted_by_relation, 4)

    G.add_edge(protein, drug, relation=targeted_by_relation)
    G.add_edge(drug, protein, relation=target_relation)

In [None]:
def random_init(feat_len, distribution):
    """
    Draw a random Gaussian feature vector for every node of the global graph G.

    feat_len: length of the vector drawn for each node.
    distribution: 4-tuple (mu_uniprot, sigma_uniprot, mu_drug, sigma_drug); the first pair
        parameterizes the normal distribution for "Protein" nodes, the second for "Drug" nodes.

    Returns a list with one array per node, in G.nodes() order.
    Raises Exception when a node's "type" attribute is neither "Protein" nor "Drug".
    """
    mu_uniprot, sigma_uniprot, mu_drug, sigma_drug = distribution
    params_by_type = {
        "Protein": (mu_uniprot, sigma_uniprot),
        "Drug": (mu_drug, sigma_drug),
    }

    features = []
    for node in G.nodes():
        node_type = G.nodes[node]["type"]
        if node_type not in params_by_type:
            raise Exception(f"""{node_type}, This node type doesnt exist!""")
        mu, sigma = params_by_type[node_type]
        features.append(np.random.normal(mu, sigma, feat_len))

    return features

In [None]:
def make_init_features(cheat=False, random=False, feat_len=10, distribution=(0, 1, 5, 2)):
    """
    Make the initial feature for each node of the global graph G.

    cheat (default False): when truthy, the feature is (node type, number of G.neighbors(node)),
        i.e. the node "knows" its total neighbor count. Otherwise only the neighbors whose
        type is "Protein" are counted.
    random (default False): when truthy, return random Gaussian vectors from random_init
        instead; `cheat` is then ignored.
    feat_len (default 10): length of each random vector. Only used when random is truthy.
    distribution (default mu_uniprot:0, sigma_uniprot:1, mu_drug:5, sigma_drug:2): parameters
        of the normal distributions used by random_init. Only used when random is truthy.

    Returns a list with one entry per node in G.nodes() order: either a (type, count) tuple
    or, in random mode, a numpy vector.
    """
    if random:
        return random_init(feat_len, distribution)

    features = []
    for node in G.nodes():
        node_type = G.nodes[node]["type"]

        if cheat:
            # Total number of neighbors, regardless of their type.
            neighbor_count = sum(1 for _ in G.neighbors(node))
        else:
            # Only protein neighbors are counted.
            neighbor_count = sum(
                1 for neighbor in G.neighbors(node) if G.nodes[neighbor]["type"] == "Protein"
            )

        features.append((node_type, neighbor_count))

    return features

In [None]:
# One-hot encode the (node type, neighbor count) tuples; each of the two tuple positions
# is treated as a categorical column by OneHotEncoder.
cheat_encoder = OneHotEncoder(handle_unknown='ignore')
cheat_features = cheat_encoder.fit_transform(make_init_features(cheat=True)).toarray()
true_encoder = OneHotEncoder(handle_unknown='ignore')
true_features = true_encoder.fit_transform(make_init_features(cheat=False)).toarray()

# Seed NumPy's global RNG so the Gaussian features (written to disk at the end of the
# notebook) are identical on every Restart & Run All.
np.random.seed(0)
random_features = np.array(make_init_features(random=True, feat_len=10, distribution=(-1, 0.5, 1, 0.5)))

In [None]:
# Display the shapes of the three candidate feature matrices.
cheat_features.shape, true_features.shape, random_features.shape

In [None]:
# Choose which feature matrix is turned into the node feature tensor `x` in the next cell.
layer_feat = true_features

In [None]:
# Node labels: 0 for every drug node, followed by 1 for every protein node.
y = np.array([0] * len(drug_nodes_mapping) + [1] * len(protein_nodes_mapping))

# Boolean masks with one entry per edge (len(row)), all False until the split cell fills them.
train_mask, val_mask, test_mask = (torch.tensor([False] * len(row)) for _ in range(3))

# Node feature tensor built from the feature matrix selected above.
x = torch.tensor(layer_feat)

In [None]:
# Per-edge train/val/test split, stratified by meta edge type.
edges_ = np.stack((row, col)).T
edge_meta_type = np.array(edge_meta_type)

# Hold out 10% of all edges for testing; both index arrays refer to rows of edges_.
train_val, test = next(StratifiedShuffleSplit(test_size=0.1, random_state=0).split(edges_, edge_meta_type))

# The second split returns positions *within* edges_[train_val], not within edges_.
# They must be mapped back through train_val, otherwise the train/val masks would be
# written at the wrong positions and overlap the test edges.
# 0.111 of the remaining 90% is roughly 10% of all edges.
train_local, val_local = next(
    StratifiedShuffleSplit(test_size=0.111, random_state=0).split(edges_[train_val], edge_meta_type[train_val])
)
train, val = train_val[train_local], train_val[val_local]

# Reset first so that re-running this cell cannot leave stale entries behind.
for _mask in (train_mask, val_mask, test_mask):
    _mask[:] = False

train_mask[train] = True
val_mask[val] = True
test_mask[test] = True

# The three masks must partition the edge set.
assert not (train_mask & val_mask).any()
assert not (train_mask & test_mask).any()
assert not (val_mask & test_mask).any()
assert int(train_mask.sum() + val_mask.sum() + test_mask.sum()) == len(edges_)

In [None]:
# Convert the Python/NumPy edge and label containers to tensors.
edge_index = torch.tensor(np.stack((row, col)))  # first row: sources, second row: destinations
row = torch.tensor(row)
col = torch.tensor(col)

edge_type = torch.tensor(edge_type)
edge_meta_type = torch.tensor(edge_meta_type)
y = torch.tensor(y)

num_nodes = y.size(0)
num_relations = torch.unique(edge_type).size(0)

# Edge attribute: reciprocal of the number of edges that share the edge's destination node.
destination_degree = degree(edge_index[1], num_nodes)
edge_attr = 1.0 / destination_degree[edge_index[1]]

- `edge_index` stores all the edges in the dataset in the form of a 2-D tensor. Each column represents an edge formed by two nodes and the number of columns indicate the total number of edges in the dataset. For example, the first column in `edge_index` is [0, 9052], which represents an edge between node 0 and node 9052.
- `edge_attr` contains edge attributes calculated using `1.0 / torch_geometric.utils.degree(col, num_nodes)[col]`. This attribute is used by the GraphSAINT sampler. Please see [this](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/graph_saint.py) and [this](https://pytorch-geometric.readthedocs.io/en/latest/modules/utils.html) for reference. 
- `edge_meta_type` identifies the meta edge type of each edge in `edge_index`. Because drug–protein edges are directional, we use meta edge types to make negative sampling easier. There are 4 meta edge types. `1` represents edges between a drug and a drug, where a drug is both the starting node and the ending node. `2` represents edges between proteins and proteins. `3` represents edges between a drug and a protein, where the drug is the starting node and the protein is the ending node. `4` represents edges between a protein and a drug, where the protein is the starting node and the drug is the ending node.
- `edge_type` stores the edge type for each edge in `edge_index`. See `edge_type_mapping`.
- `x` stores the input embeddings/attributes of each node, with a dimension of 128. The main reason to use these embeddings is to decrease the input dimension for each node from 25455 to the order of hundreds ($\approx 10^2$). Naively, one-hot-encoded embeddings are used to represent each node. Alternatively, one can use random Gaussian vectors as input embeddings/attributes. In applications where side feature information about nodes is available, `x` can be used to integrate that information into the model.
- `y` stores the node type, where `0` represents a drug and `1` represents a protein.

In [None]:
# Bundle node features, labels, edges and the per-edge split masks into one Data object.
# The non-standard per-edge fields are passed as extra keyword attributes.
extra_edge_fields = dict(
    edge_type=edge_type,
    edge_meta_type=edge_meta_type,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
)
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, **extra_edge_fields)

### Node2Vec Initial Embeddings

In [None]:
# Node2Vec hyper-parameters, collected in one place so they are easy to tune.
node2vec_hparams = dict(
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
)
node2vec_model = Node2Vec(data.edge_index, **node2vec_hparams).to(device)

# Random-walk batch loader and the sparse optimizer (the model is built with sparse=True).
loader = node2vec_model.loader(batch_size=2048, shuffle=True, num_workers=4)
optimizer = torch.optim.SparseAdam(list(node2vec_model.parameters()), lr=0.01)


def train():
    """Run one pass over `loader` and return the mean per-batch Node2Vec loss."""
    node2vec_model.train()
    batch_losses = []
    for positive_walks, negative_walks in loader:
        optimizer.zero_grad()
        batch_loss = node2vec_model.loss(positive_walks.to(device), negative_walks.to(device))
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)


@torch.no_grad()
def test():
    """
    Evaluate the current embeddings on node-type classification (drug vs. protein).

    Calls node2vec_model.test on a stratified 90/10 split of the *nodes* and returns
    the accuracy it reports.

    Two defects of the previous version are fixed here:
    - it called the undefined name `model` instead of `node2vec_model`;
    - it indexed the per-node embeddings and labels with data.train_mask / data.test_mask,
      which hold one entry per edge, not per node.
    """
    node2vec_model.eval()
    z = node2vec_model()  # called without a batch to obtain the embeddings of all nodes

    # Deterministic node-level split, stratified by node type.
    labels = data.y.cpu().numpy()
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    train_idx, test_idx = next(splitter.split(np.zeros((len(labels), 1)), labels))
    train_idx = torch.from_numpy(train_idx)
    test_idx = torch.from_numpy(test_idx)

    acc = node2vec_model.test(
        z[train_idx.to(z.device)],
        data.y[train_idx.to(data.y.device)],
        z[test_idx.to(z.device)],
        data.y[test_idx.to(data.y.device)],
        max_iter=150,
    )
    return acc

node2vec_path = "../input/node2vec-embedding/node2vec_embeddings.h5"

# Train from scratch only when no saved weights exist at node2vec_path.
if not os.path.isfile(node2vec_path):
    for epoch in range(1, 60):  # epochs 1..59
        loss = train()
        # Evaluate only on the epochs that are reported; the accuracy is not used otherwise.
        if epoch % 20 == 0:
            acc = test()
            print(f"Epoch: {epoch:02d}, Loss: {loss:.4f}, Acc: {acc:.4f}")

In [None]:
# To persist freshly trained weights:
# torch.save(node2vec_model.state_dict(), "./node2vec_ebeddings.h5")

# Load the saved weights only when the file exists. When it does not, the cell above has
# just trained the model and an unconditional load would raise FileNotFoundError.
if os.path.isfile(node2vec_path):
    node2vec_model.load_state_dict(torch.load(node2vec_path, map_location=device))

In [None]:
@torch.no_grad()
def plot_points():
    """
    Project the Node2Vec embeddings of all nodes to 2-D with t-SNE and return a plotly
    figure with one scatter trace for drug nodes (label 0) and one for protein nodes (label 1).
    """
    node2vec_model.eval()
    embeddings = node2vec_model(torch.arange(data.num_nodes, device=device))
    points = TSNE(n_components=2).fit_transform(embeddings.cpu().numpy())
    labels = data.y.cpu().numpy()

    fig = go.Figure()
    for label, trace_name in ((0, "drug"), (1, "protein")):
        selected = labels == label
        fig.add_trace(
            go.Scatter(x=points[selected, 0], y=points[selected, 1], mode="markers", name=trace_name)
        )

    hidden_axis = dict(visible=False)
    fig.update_layout(autosize=False, width=700, height=700, xaxis=hidden_axis, yaxis=dict(hidden_axis))
    return fig

# Build and render the t-SNE scatter plot of the Node2Vec embeddings.
fig = plot_points()
fig.show()

In [None]:
# Look up the learned embedding of every node and store it, detached and on the CPU,
# as the node features of `data` (replacing the features set when `data` was built).
all_node_ids = torch.arange(data.num_nodes, device=device)
node2vec_ebeddings = node2vec_model(all_node_ids).detach().cpu()
data.x = node2vec_ebeddings

In [None]:
# Persist the assembled Data object, its individual tensors, the alternative feature
# matrices and the three id mappings to the current working directory.
torch.save(data, 'data.pt')

tensor_outputs = [
    ('edge_index.pt', edge_index),
    ('edge_meta_type.pt', edge_meta_type),
    ('edge_type.pt', edge_type),
    ('y.pt', y),
    ('edge_attr.pt', edge_attr),
    ('node2vec_ebeddings.pt', node2vec_ebeddings),
    ('cheat_features.pt', torch.tensor(cheat_features)),
    ('true_features.pt', torch.tensor(true_features)),
    ('random_features.pt', torch.tensor(random_features)),
]
for out_name, tensor in tensor_outputs:
    torch.save(tensor, out_name)

mapping_outputs = [
    ('edge_type_mapping.p', edge_type_mapping),
    ('drug_nodes_mapping.p', drug_nodes_mapping),
    ('protein_nodes_mapping.p', protein_nodes_mapping),
]
for out_name, mapping in mapping_outputs:
    with open(out_name, 'wb') as fp:
        pickle.dump(mapping, fp, protocol=pickle.HIGHEST_PROTOCOL)