In [6]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import multiprocessing
import pandas as pd
from IPython.display import HTML, display
from stellargraph import datasets
import networkx as nx
from iteration_utilities import duplicates
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from stellargraph.data import EdgeSplitter
import matplotlib.pyplot as plt

from gensim.models import Word2Vec
from networkx.algorithms import community, tree
from stellargraph.data import BiasedRandomWalk


def cora_dataset():
    dataset = datasets.Cora()
    display(HTML(dataset.description))
    graph, _ = dataset.load(largest_connected_component_only=True, str_node_ids=True)
    return graph


def operator_hadamard(u, v):
    return u * v


def operator_l1(u, v):
    return np.abs(u - v)


def operator_l2(u, v):
    return (u - v) ** 2


def operator_avg(u, v):
    return (u + v) / 2.0



def load_data_and_construct_splits():
    graph = cora_dataset()
    return edge_splitter_graph_train_test(graph)


def train_link_classifier(
    examples_train,
    labels_train,
    embedding_train,
):
    results = []
    ops = [operator_hadamard, operator_l1, operator_l2, operator_avg]
    for op in ops:
        clf = train_link_prediction_model(
            examples_train, labels_train, embedding_train, op
        )
        results.append(
            evaluate_link_prediction_model(
                clf, examples_train, labels_train, embedding_train, op
            )
        )
    return results


def compute_best_results(results):
    best_result = max(results, key=lambda result: result["score"])
    print(f"Best result from '{best_result['binary_operator'].__name__}'")
    print(
        pd.DataFrame(
            [
                (result["binary_operator"].__name__, result["score"])
                for result in results
            ],
            columns=("name", "ROC AUC score"),
        ).set_index("name")
    )
    return best_result, best_result["binary_operator"].__name__


def evaluate_on_test_data(examples_test, labels_test, embedding_test, best_result):
    test_score = (
        best_result["classifier"],
        examples_test,
        labels_test,
        embedding_test,
        best_result["binary_operator"],
    )
    print(f"ROC AUC score on test set using " f"'{name}': {test_score}")


def visualise_link_embeddings(examples_test, embedding_test, labels_test, best_result):
    link_features = link_examples_to_features(
        examples_test, embedding_test, best_result["binary_operator"]
    )
    plot_link_features_projection(
        n_components=2, link_features=link_features, labels_test=labels_test
    )




def train_link_prediction_model(
    link_examples, link_labels, get_embedding, binary_operator
):
    clf = link_prediction_classifier()
    link_features = link_examples_to_features(
        link_examples, get_embedding, binary_operator
    )
    clf.fit(link_features, link_labels)
    return clf


def link_prediction_classifier(max_iter=2000):
    lr_clf = LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=max_iter)
    return Pipeline(steps=[("sc", StandardScaler()), ("clf", lr_clf)])


def evaluate_link_prediction_model(
    clf, link_examples_test, link_labels_test, get_embedding, binary_operator
):
    link_features_test = link_examples_to_features(
        link_examples_test, get_embedding, binary_operator
    )
    score = evaluate_roc_auc(clf, link_features_test, link_labels_test)
    return {
        "classifier": clf,
        "binary_operator": binary_operator,
        "score": score,
    }



def node2vec_embedding(graph, name, **kwargs):
    rw = BiasedRandomWalk(graph)
    walks = rw.run(
        graph.nodes(),
        n=kwargs["num_walks"],
        length=kwargs["walk_length"],
        p=kwargs["p"],
        q=kwargs["q"],
    )
   
    print(f"Number of random walks for '{name}': {len(walks)}")

    model = Word2Vec(
        walks,
        vector_size=kwargs["dimensions"],
        window=kwargs["window_size"],
        min_count=0,
        sg=1,
        workers=kwargs["workers"],
        epochs=kwargs["num_iter"],
    )

    def get_embedding(u):
        return model.wv[u]

    return get_embedding


def edge_splitter_graph_train_test(graph):
    edge_splitter_test = EdgeSplitter(graph)
    graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global"
    )

    print(graph_test.info())

    # Do the same process to compute a training subset from within the test graph
    edge_splitter_train = EdgeSplitter(graph_test, graph)
    graph_train, examples, labels = edge_splitter_train.train_test_split(
        p=0.1, method="global"
    )
    (
        examples_train,
        examples_model_selection,
        labels_train,
        labels_model_selection,
    ) = train_test_split(examples, labels, train_size=0.75, test_size=0.25)

    print(graph_train.info())

    return (
        graph_train,
        graph_test,
        examples_train,
        examples_test,
        examples_model_selection,
        labels_train,
        labels_test,
        labels_model_selection,
    )


def link_examples_to_features(link_examples, transform_node, binary_operator):
    return [
        binary_operator(transform_node(src), transform_node(dst))
        for src, dst in link_examples
    ]


def plot_link_features_projection(n_components, link_features, labels_test):
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(link_features)
    plt.figure(figsize=(16, 12))
    col = []
    for label in labels_test:
        if label == 1:
            col.append("red")
        else:
            col.append("blue")

    plt.scatter(
        X_transformed[:, 0],
        X_transformed[:, 1],
        c=col,
        alpha=0.5,
    )
    plt.show()



In [2]:

params = {
    "p": 1.0,
    "q": 1.0,
    "dimensions": 128,
    "num_walks": 10,
    "walk_length": 80,
    "window_size": 10,
    "num_iter": 1,
    "workers": multiprocessing.cpu_count(),
}



In [7]:
graph_train_test_labels = load_data_and_construct_splits()
(graph_train,
 graph_test,
 examples_train,
 examples_test,
 examples_model_selection,
 labels_train,
 labels_test,
 labels_model_selection,
) = graph_train_test_labels
embedding_train = node2vec_embedding(graph_train, "Train Graph", **params)
embedding_test = node2vec_embedding(graph_test, "Test Graph", **params)
results = train_link_classifier(examples_train, labels_train, embedding_train)
best_result, name = compute_best_results(results)
evaluate_on_test_data(examples_test, labels_test, embedding_test, best_result)
visualise_link_embeddings(
    examples_test, embedding_test, labels_test, best_result
)

** Sampled 520 positive and 520 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 2485, Edges: 4689

 Node types:
  paper: [2485]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [4689]
        Weights: all 1 (default)
        Features: none
** Sampled 468 positive and 468 negative edges. **
StellarGraph: Undirected multigraph
 Nodes: 2485, Edges: 4221

 Node types:
  paper: [2485]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [4221]
        Weights: all 1 (default)
        Features: none
Number of random walks for 'Train Graph': 24850
Number of random walks for 'Test Graph': 24850


NameError: name 'evaluate_roc_auc' is not defined