# Link prediction

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import networkx as nx
import pandas as pd
import numpy as np


from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import CitationFull
from torch_geometric.datasets import HeterophilousGraphDataset
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import PPI

from torch_geometric.utils import to_networkx, from_networkx

import torch
from torch_geometric.transforms import RandomLinkSplit

In [26]:
# LOAD DATASET
dataset = Planetoid(root='/tmp/PubMed', name='PubMed') ## PUBMED
# dataset = CitationFull(root='/tmp/Cora', name='Cora')  ## CORA
# dataset = HeterophilousGraphDataset(root="./", name='amazon_ratings') ## AMAZON RATINGS
# dataset = PPI("./")
data = dataset[0]
G_BC = to_networkx(data, to_undirected=True)

In [34]:
labels = []
for i in range(len(data.y)):
    l = data.y[i].squeeze().numpy().tolist()
    labels.append(l)

group_dict = {i: labels[i] for i in range(len(labels))}

for user_id, groups in group_dict.items():
    nx.set_node_attributes(G_BC, {user_id: groups}, 'group_belonging')

print("Number of nodes:", G_BC.number_of_nodes())
print("Number of edges:", G_BC.number_of_edges())

Number of nodes: 19717
Number of edges: 44324


In [37]:
def predict_link(u, v, embeddings):
    """
    Computes the normalized probability for an existing link between two nodes u and v based on the input
    embeddings.
    :param u: a node in the graph
    :param v: a node in the graph
    :param embeddings: trained embeddings
    :return: sigmoid normalized probability for the existence of a link
    """
    embedding1 = embeddings[u]
    embedding2 = embeddings[v]
    
    # Compute inner product (dot product)
    dot_product = np.dot(embedding1, embedding2)

    # Normalize by sigmoid function
    link_probability = 1/(1 + np.exp(-dot_product))
    return link_probability


def link_predictions(embeddings, edges, y_true):
    """
    Computes the ROC-AUC score for a given set of test edges based on the trained embeddings.
    :param embeddings: a models trained embeddings
    :param edges: test edges
    :param y_true: the labels for edges (1=true, 0=false)
    :return: the ROC-AUC score from predictions
    """
    predictions = []
    for edge in edges:
        predictions.append(predict_link(edge[0], edge[1], embeddings))
    return roc_auc_score(y_true, predictions) 


def train_test_split_graph(G):
    """
    Splits a Graph into a test and train set randomly to 80-20. The test split is balanced with negative edges sampled from random vertex pairs that have no edges between them. 
    While removing edges randomly, it makes sure that no vertex is isolated.
    :param G: a networkx graph to be split
    :return: the train-test split as torch geometrics graphs
    """
    data = from_networkx(G)
    data.y = data.group_belonging
    data.x = torch.arange(G.number_of_nodes()).unsqueeze(1)
    
    transform = RandomLinkSplit(num_val=0, num_test=0.5, is_undirected=True, add_negative_train_samples=False)
    train_data, _, test_data = transform(data)
    return train_data, test_data

In [39]:
# LOAD EMBEDDING
from gensim.models.keyedvectors import KeyedVectors

PATH = '/Users/silviaarellanogarcia/Documents/MSc MACHINE LEARNING/Advanced Machine Learning/Project/embeddings_deepwalk/'
phi_model_YT = KeyedVectors.load_word2vec_format(PATH + 'model_pubmed_80_40.embedding')

In [40]:
# Get the embedding vectors
embeddings = phi_model_YT.vectors

In [41]:
embeddings

array([[ 0.20333679,  0.30727208,  0.06666798, ...,  0.03066145,
         0.11071127, -0.22095709],
       [ 0.16608925,  0.01392533, -0.19145992, ...,  0.05858718,
         0.268196  ,  0.05441923],
       [-0.12973866, -0.1092654 ,  0.10813908, ..., -0.13678004,
         0.14862283,  0.04702898],
       ...,
       [ 0.5484147 , -0.26757112,  0.2658604 , ..., -0.23587406,
         0.12957697, -0.09191801],
       [ 0.67656016,  0.40571168, -0.24498807, ...,  0.43837348,
        -0.07032686, -0.10148539],
       [ 0.748623  ,  0.11105119,  0.41657597, ...,  0.04104827,
         0.03264922,  0.17897221]], dtype=float32)

In [42]:
# LINK PREDICTION
train_data, test_data = train_test_split_graph(G_BC)

# Prepare edges
test_edges = test_data.edge_label_index.numpy().T
y_true = test_data.edge_label.numpy()

roc_auc = link_predictions(embeddings, test_edges, y_true)


In [43]:
roc_auc

0.49776102820359625