# Simple graph experiment to investigate inner workings

This experiment seeks to provide a better understanding of how, and possibly whether, the directed GraphSAGE algorithm works in its entirety.

Import NetworkX and StellarGraph:

In [14]:
import networkx as nx
import pandas as pd
import os

import stellargraph as sg
from stellargraph.data.explorer import DirectedBreadthFirstNeighbours
from stellargraph.mapper import DirectedGraphSAGENodeGenerator
from stellargraph.layer import DirectedGraphSAGE

from keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection

### Load a simple graph

Create the graph with directed edges

In [2]:
# Directed path 1 -> 2 -> 3, built straight from its edge list.
Gnx = nx.DiGraph([(1, 2), (2, 3)])

Create the features for the nodes

In [41]:
# One scalar feature ("f0") per node; the node id becomes the frame's index.
feature_names = ["f0"]
column_names = ["id", *feature_names]
node_data = pd.DataFrame(
    [[1, 1.0], [2, 0.0], [3, 2.0]],
    columns=column_names,
).set_index("id")
print(node_data)

     f0
id     
1   1.0
2   0.0
3   2.0


In [42]:
node_features = node_data[feature_names]
print(node_features)

     f0
id     
1   1.0
2   0.0
3   2.0


## Create the StellarGraph

In [43]:
G = sg.StellarDiGraph(Gnx, node_features=node_features)

In [44]:
print(G.info())

StellarDiGraph: Directed multigraph
 Nodes: 3, Edges: 2

 Node types:
  default: [3]
    Edge types: default-default->default

 Edge types:
    default-default->default: [2]



Let us use all nodes

In [45]:
nodes = list(G.nodes())
print(nodes)

[1, 2, 3]


Check on the features

In [46]:
features = G.get_feature_for_nodes(nodes)
print(features)

[[1.]
 [0.]
 [2.]]


Let us look at directed sampling

In [47]:
sampler = DirectedBreadthFirstNeighbours(G)

Examine 1-hop sampling:

In [48]:
# Draw one in-neighbour and one out-neighbour (a single hop) for every node.
res = sampler.run(nodes=nodes, in_size=[1], out_size=[1])
for node_sample in res:
    # Judging by the printed output, each sample reads
    # [head node], [in-neighbour], [out-neighbour], with None where the
    # node has no neighbour in that direction.
    print(node_sample)

[[1], [None], [2]]
[[2], [1], [3]]
[[3], [2], [None]]


Examine 2-hop sampling:

In [49]:
# Same sampling, but two hops deep with one neighbour per direction per hop.
res = sampler.run(nodes=nodes, in_size=[1, 1], out_size=[1, 1])
for node_sample in res:
    # Judging by the printed output, the seven slots are: head, in, out,
    # in-of-in, out-of-in, in-of-out, out-of-out (None = no such neighbour).
    print(node_sample)

[[1], [None], [2], [None], [None], [1], [3]]
[[2], [1], [3], [None], [2], [2], [None]]
[[3], [2], [None], [1], [3], [None], [None]]


Now examine the node generator:

In [50]:
generator = DirectedGraphSAGENodeGenerator(G, batch_size=len(nodes), in_samples=[1], out_samples=[1])

In [51]:
train_data = node_data.index
train_gen = generator.flow(train_data, shuffle=False)
print(train_gen)

<stellargraph.mapper.node_mappers.NodeSequence object at 0x14c8732b0>


Let us delve into the inner workings:

In [52]:
features = train_gen.generator.sample_features(nodes, train_gen._sampling_schema)

In [53]:
print(features)

[array([[[1.]],

       [[0.]],

       [[2.]]]), array([[[0.]],

       [[1.]],

       [[0.]]]), array([[[0.]],

       [[2.]],

       [[0.]]])]


******************************************************************************

Let us see what happens when we ignore in-nodes:

In [8]:
# Sampling configuration: no in-node samples, two out-node samples per entry.
batch_size = 3
in_samples = [0, 0]
out_samples = [2, 2]

A `DirectedGraphSAGENodeGenerator` object is required to send the node features in sampled subgraphs to Keras

In [9]:
generator = DirectedGraphSAGENodeGenerator(G, batch_size, in_samples, out_samples)

Using the `generator.flow()` method, we can create iterators over nodes that should be used to train, validate, or evaluate the model. For training we use all nodes.

In [11]:
# "id" was moved into the index of `node_data` by set_index("id"), so it is no
# longer a column and node_data["id"] raises KeyError. Read the node ids from
# the index instead, as the earlier generator cell does.
train_data = node_data.index
# shuffle=False keeps the nodes in their original order.
train_gen = generator.flow(train_data, shuffle=False)
print(train_gen)

<stellargraph.mapper.node_mappers.NodeSequence object at 0x11fbd5a20>


Now we can specify our machine learning model. We need a few more parameters for this:

 * the `layer_sizes` is a list of hidden feature sizes of each layer in the model. In this example we use 48-dimensional hidden node features at each layer, which corresponds to 16 weights for a node, 16 for the in-nodes (unused) and 16 for the out-nodes. This corresponds to the 32 dimensions used in example 1 (where we do not distinguish between in-nodes and out-nodes).
 * The `bias` and `dropout` are internal parameters of the model. 

In [None]:
graphsage_model = DirectedGraphSAGE(
    layer_sizes=[48, 48],
    generator=train_gen,
    bias=False,
    dropout=0.5,
)

Now we create a model to predict the target categories using a Keras softmax layer.

In [None]:
# Build the GraphSAGE layer stack, then put a softmax classification head on its output.
x_inp, x_out = graphsage_model.build()
# NOTE(review): `train_targets` is not assigned in any cell visible in this
# notebook — confirm where it is meant to come from before running this cell.
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

### Training the model

Now let's create the actual Keras model with the graph inputs `x_inp` provided by the `graphsage_model` and outputs being the predictions from the softmax layer.

In [None]:
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

Train the model, keeping track of its loss and accuracy on the training set, and its generalisation performance on the test set (we need to create another generator over the test data for this)

In [None]:
# NOTE(review): neither `test_data` nor `test_targets` is assigned in the cells
# visible in this notebook — confirm they are created before running this cell.
test_gen = generator.flow(test_data.index, test_targets)

In [None]:
history = model.fit_generator(
    train_gen,
    epochs=20,
    validation_data=test_gen,
    verbose=2,
    shuffle=False
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_history(history):
    """Plot every training metric recorded in ``history.history`` per epoch.

    One figure is shown per training metric. If a matching ``'val_' + metric``
    series was recorded, it is drawn on the same axes and labelled 'test'.

    Args:
        history: object with a ``history`` dict mapping metric names to
            per-epoch value lists (e.g. the return value of the fit call
            used in this notebook).
    """
    recorded = history.history
    # Pick training metrics by name rather than taking the first half of the
    # sorted keys. The halving trick assumed every metric has a 'val_' twin
    # and that every training name sorts before 'val_'; a name such as
    # 'weighted_acc' made it look up 'val_val_loss' and raise KeyError.
    train_metrics = sorted(m for m in recorded if not m.startswith('val_'))
    for m in train_metrics:
        # summarize history for metric m
        plt.plot(recorded[m])
        legend = ['train']
        val_key = 'val_' + m
        # Validation curves exist only when validation data was supplied.
        if val_key in recorded:
            plt.plot(recorded[val_key])
            legend.append('test')
        plt.title(m)
        plt.ylabel(m)
        plt.xlabel('epoch')
        plt.legend(legend, loc='best')
        plt.show()

In [None]:
plot_history(history)

Now that we have trained the model, we can evaluate it on the test set.

In [None]:
test_metrics = model.evaluate_generator(test_gen)
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))

### Making predictions with the model

Now let's get the predictions themselves for all nodes using another node iterator:

In [None]:
all_nodes = node_data.index
all_mapper = generator.flow(all_nodes)
all_predictions = model.predict_generator(all_mapper)

These predictions will be the output of the softmax layer, so to get the final categories we'll use the `inverse_transform` method of our target attribute specification to turn these values back into the original categories.

In [None]:
node_predictions = target_encoding.inverse_transform(all_predictions)

Let's have a look at a few:

In [None]:
results = pd.DataFrame(node_predictions, index=all_nodes).idxmax(axis=1)
df = pd.DataFrame({"Predicted": results, "True": node_data['subject']})
df.head(10)

Add the predictions to the graph, and save as graphml, e.g. for visualisation in [Gephi](https://gephi.org)

In [None]:
# Store the true and predicted label on each node. Attribute access goes
# through the `nodes` view because `Graph.node` was removed in NetworkX 2.4.
for nid, pred, true in zip(df.index, df["Predicted"], df["True"]):
    Gnx.nodes[nid]["subject"] = true
    # Keep only the text after the last "=" of the predicted column name.
    Gnx.nodes[nid]["PREDICTED_subject"] = pred.split("=")[-1]

Also add isTrain and isCorrect node attributes:

In [None]:
# Flag which nodes were used for training. Attribute access goes through the
# `nodes` view because `Graph.node` was removed in NetworkX 2.4.
for nid in train_data.index:
    Gnx.nodes[nid]["isTrain"] = True

# NOTE(review): `test_data` is not assigned in the cells visible in this
# notebook — confirm it exists before running this loop.
for nid in test_data.index:
    Gnx.nodes[nid]["isTrain"] = False

In [None]:
# Mark each node as correctly or incorrectly classified. Attribute access goes
# through the `nodes` view because `Graph.node` was removed in NetworkX 2.4.
for nid in Gnx.nodes():
    attrs = Gnx.nodes[nid]
    attrs["isCorrect"] = attrs["subject"] == attrs["PREDICTED_subject"]

## Node embeddings
Evaluate node embeddings as activations of the output of graphsage layer stack, and visualise them, coloring nodes by their subject label.

The GraphSAGE embeddings are the output of the GraphSAGE layers, namely the `x_out` variable. Let's create a new model with the same inputs as we used previously `x_inp` but now the output is the embeddings rather than the predicted class. Additionally note that the weights trained previously are kept in the new model.

In [None]:
embedding_model = Model(inputs=x_inp, outputs=x_out)

In [None]:
emb = embedding_model.predict_generator(all_mapper)
emb.shape

Project the embeddings to 2d using either TSNE or PCA transform, and visualise, coloring nodes by their subject label

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np

In [None]:
X = emb
y = np.argmax(target_encoding.transform(node_data[["subject"]].to_dict('records')), axis=1)

In [None]:
# Reduce the embeddings to two dimensions for plotting when they have more
# than two columns; otherwise wrap them in a DataFrame as they are.
if X.shape[1] > 2:
    # `transform` is only assigned in this branch.
    transform = TSNE #PCA 

    trans = transform(n_components=2)
    emb_transformed = pd.DataFrame(trans.fit_transform(X), index=node_data.index)
    emb_transformed['label'] = y
else:
    emb_transformed = pd.DataFrame(X, index=node_data.index)
    # NOTE(review): this maps string labels '0'/'1' to ints; if X is a plain
    # array the columns are presumably already 0 and 1, making this a no-op — confirm.
    emb_transformed = emb_transformed.rename(columns = {'0':0, '1':1})
    emb_transformed['label'] = y

In [None]:
alpha = 0.7

# `transform` is only bound when the embeddings were actually projected
# (X.shape[1] > 2 in the projection cell). Guard the lookup with the same
# condition so the title does not raise NameError for embeddings that were
# plotted as they are.
projection_name = transform.__name__ if X.shape[1] > 2 else "Direct"

# Scatter the two embedding coordinates, coloured by class label.
fig, ax = plt.subplots(figsize=(7,7))
ax.scatter(emb_transformed[0], emb_transformed[1], c=emb_transformed['label'].astype("category"), 
            cmap="jet", alpha=alpha)
ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
# NOTE(review): the title still names the cora dataset — confirm that is intended here.
plt.title('{} visualization of GraphSAGE embeddings for cora dataset'.format(projection_name))
plt.show()