In [2]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import preprocessing

import numpy as np


from stellargraph.data import UnsupervisedSampler
from stellargraph.data import BiasedRandomWalk


from stellargraph import StellarGraph
from stellargraph import StellarDiGraph

import warnings 
import collections
from stellargraph import datasets
import matplotlib.pyplot as plt

In [3]:
import stellargraph as sg

In [4]:
import networkx as nx
from pathlib import Path

from scipy.sparse import csr_matrix

import os
import pickle
import numpy as np

def load_pickle_file(file_name, dir_path="./data/outputs"):
    file_path = Path(dir_path + "/" + file_name)
    f = open(file_path, 'rb')
    file = pickle.load(f)
    f.close()

    return (file)


G2 = load_pickle_file("Graph_er", dir_path="../data/outputs")
A2 = load_pickle_file("Adjacency_er", dir_path="../data/outputs")
T2 = load_pickle_file("Transition_er", dir_path="../data/outputs")


G = load_pickle_file("Graph_er_weighted_wfeatures", dir_path="../data/outputs")

  file = pickle.load(f)


In [5]:
G.nodes['/']

{'node_feature': array([1., 0., 0., ..., 0., 0., 0.])}

In [6]:
G_sg = StellarDiGraph.from_networkx(G, node_features = "node_feature")

In [7]:
G_sg._edge_weights('/','/search/all')

[0.24342119512247948]

In [8]:
G_sg._edge_weights('/search/all', '/')

[0.07841605329166496]

In [9]:
walk_number = 10
walk_length = 5

walker = BiasedRandomWalk(
    G_sg,
    n=walk_number,
    length=walk_length,
    p=10,  # defines probability, 1/p, of returning to source node
    q=10,  # defines probability, 1/q, for moving to a node away from the source node
)

In [10]:
unsupervised_samples = UnsupervisedSampler(G_sg, nodes=list(G_sg.nodes()), walker=walker)

In [11]:
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification

batch_size = 50
epochs = 4
num_samples = [10, 5]

generator = GraphSAGELinkGenerator(G_sg, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples)

In [12]:
layer_sizes = [50, 50]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.0, normalize="l2"
)

In [13]:
# Build the model and expose input and output sockets of graphsage, for node pair inputs:
x_inp, x_out = graphsage.in_out_tensors()

In [14]:
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [15]:
from tensorflow import keras

model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

  super(Adam, self).__init__(name, **kwargs)


In [16]:
history = model.fit(
    train_gen,
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [17]:
#         The UnsupervisedSampler is responsible for sampling walks in the given graph
#         and returning positive and negative samples w.r.t. those walks, on demand.

#         The positive samples are all the (target, context) pairs from the walks and the negative
#         samples are contexts generated for each target based on a sampling distribution.

#         By default, a UniformRandomWalk is used, but a custom `walker` can be specified instead. An
#         error will be raised if other parameters are specified along with a custom `walker`.


In [None]:
model.predict()

# Extract node embeddings

In [24]:
from stellargraph.mapper import GraphSAGENodeGenerator

In [None]:
G_sg.nodes()

In [26]:
help(GraphSAGENodeGenerator(G_sg, batch_size, num_samples).flow)

Help on method flow in module stellargraph.mapper.sampled_node_generators:

flow(node_ids, targets=None, shuffle=False, seed=None) method of stellargraph.mapper.sampled_node_generators.GraphSAGENodeGenerator instance
    Creates a generator/sequence object for training or evaluation
    with the supplied node ids and numeric targets.
    
    The node IDs are the nodes to train or inference on: the embeddings
    calculated for these nodes are passed to the downstream task. These
    are a subset of the nodes in the graph.
    
    The targets are an array of numeric targets corresponding to the
    supplied node_ids to be used by the downstream task. They should
    be given in the same order as the list of node IDs.
    If they are not specified (for example, for use in prediction),
    the targets will not be available to the downstream task.
    
    Note that the shuffle argument should be True for training and
    False for prediction.
    
    Args:
        node_ids: an iterable

In [21]:
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [30]:
# node_ids = node_subjects.index
node_gen = GraphSAGENodeGenerator(G_sg, batch_size, num_samples).flow(G_sg.nodes())

In [31]:
node_gen

<stellargraph.mapper.sequences.NodeSequence at 0x22b6d130550>

In [32]:
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)



In [34]:
node_embeddings.shape

(10631, 50)

# Performance

In [39]:
import pandas as pd

In [40]:
def calc_cosine_similarity_matrix(X, y):
    '''
    X is a matatrix of embeddings, with nodes in rows (i.e number of rows = number of nodes, 
    number of columns = number of latent dimensions).
    '''
    cosine_similarity = np.dot(X, y)/(np.linalg.norm(X, axis = 1)* np.linalg.norm(y))
    
    return( cosine_similarity )

In [48]:
def get_seed_page_index(G, seed_page):
    seed_node_index = np.where(np.array(G.nodes) == seed_page)[0]
    seed_node_index = seed_node_index.astype(int)[0]

    print( "Seed node:", list(G2.nodes)[seed_node_index] )
    
    return(seed_node_index)

In [42]:
def get_rankings_dict(G, embeddings, seeds, metric = "cosine"):
    
    rankings = dict()

    rankings["pages"] = list(G.nodes)

    for seed_page in seeds:

        seed_node_index = get_seed_page_index(G, seed_page)
        seed_node_embedding = embeddings[seed_node_index]
        
        if metric == "cosine":
            node_similarities = calc_cosine_similarity_matrix(embeddings, seed_node_embedding)
        else:
            node_similarities = np.linalg.norm(embeddings - seed_node_embedding)

        rankings[seed_page] = node_similarities


    return(rankings)


def get_rankings_df(G, embeddings, seeds, metric = "cosine"):

    emb_dict = get_rankings_dict(G, embeddings, seeds, metric= metric)
        
    emb_df = pd.DataFrame.from_dict(emb_dict)
    
    emb_df.set_index("pages", inplace = True)
    seed_cols = emb_df.columns
    
    emb_df["max"]= emb_df[seed_cols].max(axis = 1)
    emb_df["median"] = emb_df[seed_cols].median(axis = 1)
    emb_df["mean"] = emb_df[seed_cols].mean(axis = 1)
    emb_df["min"]= emb_df[seed_cols].min(axis = 1)
    
    return(emb_df)
    
    

In [44]:
labelled_data_1 = pd.read_csv('../data/labelled/pages_ranked_with_data_labelled.csv')
labelled_data_1 = labelled_data_1.loc[:,["page path", "label"]]

In [45]:
# THIS IS THE SAME FUNCTION AND IN N2V
def calc_median_difference_n2v(df, labelled_data, standardise = True, page_path = "pagePath"):
    '''df needs to be a result of calling rw.page_freq_path_freq_ranking()
    
    df needs to be ranked from top page to the worst page (i.e. index represents ranking).'''
    df.reset_index(inplace = True, drop = True)
    
    df_labels = df.merge(labelled_data_1, left_on = page_path, right_on = "page path")
    df_labels.reset_index(inplace = True, drop = False)
    df_labels.rename(columns = {"index": "rank"}, inplace = True)

    med_ranking_label1 = df_labels[df_labels["label"] == 1]["rank"].median()
    med_ranking_label0 = df_labels[df_labels["label"] == 0]["rank"].median()
    
    if standardise == True:
        score = (med_ranking_label0 - med_ranking_label1) / ( df_labels[df_labels["label"] == 1]["rank"].std() +
                                                            df_labels[df_labels["label"] == 0]["rank"].std())
    else:
        score = med_ranking_label0 - med_ranking_label1
    
    return( score )

In [53]:
seed_pages_used = [    
    '/find-a-job',
    '/universal-credit',
    '/government/collections/financial-support-for-businesses-during-coronavirus-covid-19']



In [49]:
df_rankings_cosine = get_rankings_df(G2, node_embeddings, seed_pages_used, metric = "cosine")
df_rankings_cosine = df_rankings_cosine.sort_values(by = "max", ascending = False).reset_index(drop = False)
df_rankings_cosine.head()


Seed node: /find-a-job
Seed node: /universal-credit
Seed node: /government/collections/financial-support-for-businesses-during-coronavirus-covid-19


Unnamed: 0,pages,/find-a-job,/universal-credit,/government/collections/financial-support-for-businesses-during-coronavirus-covid-19,max,median,mean,min
0,/government/collections/financial-support-for-...,0.083953,-0.048485,1.0,1.0,0.083953,0.345156,-0.048485
1,/find-a-job,1.0,0.401406,0.083953,1.0,0.401406,0.49512,0.083953
2,/universal-credit,0.401406,1.0,-0.048485,1.0,0.401406,0.450974,-0.048485
3,/get-state-pension,0.770952,0.37124,-0.068289,0.770952,0.37124,0.357968,-0.068289
4,/guidance/if-we-refuse-your-application-for-an...,0.744767,0.456866,0.13382,0.744767,0.456866,0.445151,0.13382


In [50]:
df_rankings_l2 = get_rankings_df(G2, node_embeddings, seed_pages_used, metric = "l2")
df_rankings_l2 = df_rankings_l2.sort_values(by = "min", ascending = True).reset_index(drop = False)
df_rankings_l2.head()

Seed node: /find-a-job
Seed node: /universal-credit
Seed node: /government/collections/financial-support-for-businesses-during-coronavirus-covid-19


Unnamed: 0,pages,/find-a-job,/universal-credit,/government/collections/financial-support-for-businesses-during-coronavirus-covid-19,max,median,mean,min
0,/,132.021332,135.375595,139.610992,139.610992,135.375595,135.669296,132.021332
1,/guidance/immigration-rules/immigration-rules-...,132.021332,135.375595,139.610992,139.610992,135.375595,135.669296,132.021332
2,/guidance/data-analyst,132.021332,135.375595,139.610992,139.610992,135.375595,135.669296,132.021332
3,/overseas-domestic-worker-visa/domestic-worker...,132.021332,135.375595,139.610992,139.610992,135.375595,135.669296,132.021332
4,/plug-in-car-van-grants,132.021332,135.375595,139.610992,139.610992,135.375595,135.669296,132.021332


In [51]:
calc_median_difference_n2v(df_rankings_cosine, labelled_data_1, 
                           standardise = True, page_path = "pages")

0.15228365706759867

In [52]:
calc_median_difference_n2v(df_rankings_l2, labelled_data_1, 
                           standardise = True, page_path = "pages")

-0.01604114333438033

# Explaining the code

Source: https://stellargraph.readthedocs.io/en/stable/demos/embeddings/graphsage-unsupervised-sampler-embeddings.html

Node embeddings are learnt by solving a simple classification task:
given a large set of “positive” `(target, context)` node pairs generated from random walks performed on the graph (i.e., node pairs that co-occur within a certain context window in random walks), and an equally large set of “negative” node pairs that are randomly selected from the graph according to a certain distribution, learn a binary classifier that predicts whether arbitrary node pairs are likely to co-occur in a random walk performed on the graph. 

Through learning this simple binary node-pair-classification task, the model automatically learns an inductive mapping from attributes of nodes and their neighbors to node embeddings in a high-dimensional vector space, which preserves structural and feature similarities of the nodes. 

Unlike embeddings obtained by algorithms such as Node2Vec, this mapping is inductive: given a new node (with attributes) and its links to other nodes in the graph (which was unseen during model training), we can evaluate its embeddings without having to re-train the model.

**Architecture**

The architecture of the node pair classifier is the following. Input node pairs (with node features) are fed, together with the graph structure, into a pair of identical GraphSAGE encoders, producing a pair of node embeddings. These embeddings are then fed into a node pair classification layer, which applies a binary operator to those node embeddings (e.g., concatenating them), and passes the resulting node pair embeddings through a linear transform followed by a binary activation (e.g., sigmoid), thus predicting a binary label for the node pair.

**Specifications**

The Unsupervised GraphSAGE requires a training sample that can be either provided as a list of (target, context) node pairs or it can be provided with an UnsupervisedSampler instance that takes care of generating positive and negative samples of node pairs on demand. In this demo we discuss the latter technique.


The UnsupervisedSampler class takes in a Stellargraph graph instance. The generator method in the UnsupervisedSampler is responsible for generating equal number of positive and negative node pair samples from the graph for training. The samples are generated by performing uniform random walks over the graph, using UniformRandomWalk object. Positive (target, context) node pairs are extracted from the walks, and for each positive pair a corresponding negative pair (target, node) is generated by randomly sampling node from the degree distribution of the graph. Once the batch_size number of samples is accumulated, the generator yields a list of positive and negative node pairs along with their respective 1/0 labels.

In [None]:
unsupervised_samples = 

Next, create the node pair generator for sampling and streaming the training data to the model. The node pair generator essentially “maps” pairs of nodes (target, context) to the input of GraphSAGE: it either takes minibatches of node pairs, or an UnsupervisedSampler instance which generates the minibatches of node pairs on demand. The generator samples 2-hop subgraphs with (target, context) head nodes extracted from those pairs, and feeds them, together with the corresponding binary labels indicating which pair represent positive or negative sample, to the input layer of the node pair classifier with GraphSAGE node encoder, for SGD updates of the model parameters.

In [None]:
generator = 

Final node pair classification layer that takes a pair of nodes’ embeddings produced by graphsage encoder, applies a binary operator to them to produce the corresponding node pair embedding (ip for inner product; other options for the binary operator can be seen by running a cell with ?link_classification in it), and passes it through a dense layer:

In [None]:
prediction = link_classification()