In [1]:
import os 
import pandas as pd
import numpy as np
import pickle as pkl
import time

import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.mapper import FullBatchNodeGenerator
from stellargraph.layer import GCN
from stellargraph import StellarGraph

from stellargraph import datasets

from sklearn import model_selection

import networkx as nx

from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import matplotlib.pyplot as plt

## Import the data

In [2]:
# SECURITY NOTE: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
# Two objects are read back-to-back from the same pickle stream. The context
# manager guarantees the handle is closed even if one of the loads raises.
with open("./1140_25122020_global_adj_graph.pkl", "rb") as file:
    graph = pkl.load(file)
    label = pkl.load(file)

# Identity feature matrix: the row for the i-th node has a 1 in column i and 0
# elsewhere. Built in one vectorised call instead of one `.iloc` write per node.
node_count = len(graph.nodes)
identity = pd.DataFrame(
    np.eye(node_count, dtype=np.int64),
    index=list(graph.nodes),
    columns=list(range(node_count)),
)

In [3]:
# Wrap the NetworkX graph as a StellarGraph, attaching the identity matrix as
# the node features.
graph_SG = StellarGraph.from_networkx(
    graph,
    node_features=identity,
)

In [4]:
# Collapse each row of `label` to the column index of its largest entry,
# giving one integer class id per labelled row.
graph_labels = np.argmax(np.asarray(label), axis=1)

In [5]:
# Number of labelled rows (graph_labels is the 1-D result of argmax over axis 1).
graph_labels.shape[0]

2225

In [6]:
# Show the graph summary followed by the distinct class ids, one per line.
print(graph_SG.info(), np.unique(graph_labels), sep="\n")

StellarGraph: Undirected multigraph
 Nodes: 8577, Edges: 628557

 Node types:
  default: [8577]
    Features: float32 vector, length 8577
    Edge types: default-default->default

 Edge types:
    default-default->default: [628557]
        Weights: range=[0.000211188, 10.8606], mean=1.39254, std=1.98943
        Features: none
[0 1 2 3 4]


### FullBatchNodeGenerator


In [7]:
# Full-batch node generator over the whole graph, configured with the "gcn" method.
generator = FullBatchNodeGenerator(
    graph_SG,
    method="gcn",
)

Using GCN (local pooling) filters...


### Create the Keras graph classification model

We are now ready to create a `tf.keras` classification model using `StellarGraph`'s `GCN` class. The standard `tf.keras` `Dense` layers are imported but are not used in the model built below.

The input is the graph represented by its adjacency and node features matrices. The model built below consists of a single Graph Convolutional layer as in [2] with 5 units (one per class) and a `softmax` activation, with dropout disabled. The output of this layer is used directly as the per-node class prediction; there is no pooling layer and there are no additional fully connected layers. The model is compiled with the Adam optimizer (learning rate 0.005) and a sparse categorical cross-entropy loss.

![](graph_classification_architecture.png)

In [8]:
def create_graph_classification_model(generator):
    """Build and compile a single-layer GCN classifier.

    The network is one GCN layer with 5 units and a softmax activation
    (dropout disabled); that layer's output tensor is the model output.

    Args:
        generator: the StellarGraph generator handed to ``GCN`` to define
            the model's inputs.

    Returns:
        A compiled Keras ``Model`` using Adam (learning rate 0.005), sparse
        categorical cross-entropy loss, and the ``"acc"`` metric.
    """
    gcn = GCN(
        layer_sizes=[5],
        activations=["softmax"],
        generator=generator,
        dropout=0.0,
    )
    inputs, outputs = gcn.in_out_tensors()

    # The GCN output is used as-is; no extra layers are stacked on top of it.
    classifier = Model(inputs=inputs, outputs=outputs)
    classifier.compile(
        optimizer=Adam(0.005),
        loss=sparse_categorical_crossentropy,
        metrics=["acc"],
    )
    return classifier

### Train the model

We can now train the model using the model's `fit` method. First, we specify some important training parameters: the maximum number of training epochs, the number of folds for cross-validation, and the number of times to repeat cross-validation.

In [9]:
# Training / cross-validation settings:
#   epochs    - upper bound on the number of training epochs per fold
#   folds     - number of folds for k-fold cross validation
#   n_repeats - how many times the k-fold procedure is repeated
epochs, folds, n_repeats = 1000, 10, 1

In [10]:
# Early-stopping callback: watches `val_loss` with a patience of 25 epochs and
# restores the best weights seen when training stops.
es = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=25,
    restore_best_weights=True,
)

The function `train_fold` trains a model on a single fold of the data and returns the training history together with the accuracy on that fold's held-out data.

In [11]:
def train_fold(model, train_gen, test_gen, es, epochs):
    """Fit ``model`` on one fold and report its accuracy on the held-out data.

    Note that ``test_gen`` is used both as the validation data during fitting
    (which is what the ``es`` callback observes) and for the final evaluation.

    Args:
        model: compiled model exposing ``fit``, ``evaluate`` and ``metrics_names``.
        train_gen: training data passed to ``model.fit``.
        test_gen: held-out data used for validation and for evaluation.
        es: callback passed to ``model.fit`` (as the only callback).
        epochs: maximum number of epochs to train for.

    Returns:
        A ``(history, test_acc)`` tuple: the object returned by ``model.fit``
        and the value of the ``"acc"`` metric from ``model.evaluate``.
    """
    fit_history = model.fit(
        train_gen,
        epochs=epochs,
        validation_data=test_gen,
        verbose=0,
        callbacks=[es],
    )

    # Evaluate first, then look up which position in the result holds "acc".
    evaluation = model.evaluate(test_gen, verbose=0)
    acc_position = model.metrics_names.index("acc")

    return fit_history, evaluation[acc_position]

In [13]:
# Collects the held-out accuracy of every fold that gets trained.
test_accs = []

# `.split()` returns a one-shot generator; materialising it as a list means the
# training loop can be re-run without silently iterating over zero folds.
# A fixed `random_state` makes the fold assignment reproducible across runs.
# Only the labels are needed for stratification, so they are passed as both X and y.
stratified_folds = list(
    model_selection.RepeatedStratifiedKFold(
        n_splits=folds, n_repeats=n_repeats, random_state=42
    ).split(graph_labels, graph_labels)
)


In [None]:
# The fold indices are positions within `graph_labels`. Adding this offset turns
# them into node ilocs; the code treats the labelled nodes as occupying the last
# `label.shape[0]` positions of the graph's node ordering.
label_offset = len(graph.nodes) - label.shape[0]

for i, (train_index, test_index) in enumerate(stratified_folds):
    print(f"Training and evaluating on fold {i+1} out of {folds * n_repeats}...")

    train_ilocs = train_index + label_offset
    test_ilocs = test_index + label_offset

    # Print the iloc range covered by each split.
    print(f"{train_ilocs.min()} {train_ilocs.max()}")
    print(f"{test_ilocs.min()} {test_ilocs.max()}")

    train_gen = generator.flow(
        train_ilocs,
        targets=graph_labels[train_index],
        use_ilocs=True,
    )
    test_gen = generator.flow(
        test_ilocs,
        targets=graph_labels[test_index],
        use_ilocs=True,
    )

    # A new model is built for every fold.
    model = create_graph_classification_model(generator)

    start_time = time.time()
    history, acc = train_fold(model, train_gen, test_gen, es, epochs)
    elapsed = time.time() - start_time

    print("--- %s seconds ---" % elapsed)
    print("Acc: " + str(acc))
    test_accs.append(acc)

Training and evaluating on fold 1 out of 10...
6354 8576
6352 8560


In [None]:
# Build a fresh model with the same architecture. This rebinds `model`, so the
# name no longer refers to the network trained in the last fold.
model = create_graph_classification_model(generator)

In [None]:
# Print the Keras summary of the current `model`.
model.summary()

In [None]:
# Display the `history` attribute of the object returned by the most recent
# `train_fold` call, i.e. the last fold that was trained.
history.history

In [None]:
# Use fixed-point format specs: a bare precision such as `:.3` selects general
# format, which would render a mean of 100.0 as "1e+02".
print(
    f"Accuracy over all folds mean: {np.mean(test_accs)*100:.1f}% and std: {np.std(test_accs)*100:.2f}%"
)

Finally, we plot a histogram of the accuracy of all `n_repeats x folds` models trained (10 in total with the settings above).

In [None]:
# Histogram of the per-fold accuracies, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(test_accs)
ax.set_xlabel("Accuracy")
ax.set_ylabel("Count")