#### Import Required Modules

In [21]:
import numpy as np
import tensorflow as tf
import pandas as pd
import random

from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from tensorflow.keras import layers, losses, Model, regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

import matplotlib as plt
from model import *

## Loading & Preprocessing Data
#### Feature Extraction - Load data from facebook dataset

In [2]:
# load data
X_features, labels, edges = load_data()

# check loaded properly
num_classes = len(np.unique(labels))
num_nodes = X_features.shape[0]
num_features = X_features.shape[1]
num_edges = len(edges)/2

print("num of classes: " + str(num_classes))
print("num of nodes: " + str(num_nodes))
print("num of features: " + str(num_features))
print("num of edges: "+ str(num_edges))

# adjacency matrix
A = get_adj_matrix(labels, edges)

num of classes: 4
num of nodes: 22470
num of features: 128
num of edges: 171002.0


#### Normalise Adjacency Matrix

In [3]:
# normalise
A = normalise_adj(A)

#### Split Training, Validation, and Testing

In [4]:
## in py file ###
def split_index(data):
    """ Partitions the dataset into training, validation, and testing splits
        of 0.2 : 0.2 : 0.6 since semi-supervised.
    Parameters:
        data: data to be split
    Returns:
        Indices of Training set, Validation, and Test set
    """
    size = int(len(data)*0.2)
    indices = [i for i in range(len(data))]

    # training split
    train_set = random.sample(indices, k = size)
    
    # split remainder of set
    remainder = set(indices).difference(train_set)

    val_set = random.sample(remainder, k = size)
    test_set = list(set(remainder).difference(val_set))
    
    return train_set, val_set, test_set

In [5]:
# Get indices for splitting set
train_idx, val_idx, test_idx = split_index(labels)

# Apply mask
train_mask = np.zeros((num_nodes,), dtype = bool)
val_mask = np.zeros((num_nodes,), dtype = bool)
test_mask = np.zeros((num_nodes,), dtype = bool)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

since Python 3.9 and will be removed in a subsequent version.
  val_set = random.sample(remainder, k = size)


In [6]:
# One-hot encoding
def encode(labels):
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels) # returns encoded labels
    encoded_labels = to_categorical(labels)
    return encoded_labels, encoder.classes_

encoded_labels, classes = encode(labels)

## Building & Training GCN

In [29]:
def GCN_Model(num_features, num_classes, 
              num_channels = 16, dropout_rate = 0.5, 
              kernel_regulariser = None, num_input_channels = None):
    """ Creates a GCN Model
    Parameters:
        num_features: number of features
        num_classes: number of channels in output
        num_channels: number of channels in first GCN Layer
        dropout_rate: rate for Dropout Layers
        kernel_regulariser: regularisation applied to weights
        num_input_channels: number of input channels aka. node features
    """

    # Inputs
    x_input = Input((num_features,), dtype = tf.float32)
    node_input = Input((num_input_channels,), dtype = tf.float32, sparse = True)

    # Create layers
    dropout_L0 = Dropout(dropout_rate)(x_input)
    gcn_L0 = GCN_Layer(num_channels, activations.relu, kernel_regulariser)([dropout_L0,node_input])

    dropout_L1 = Dropout(dropout_rate)(gcn_L0)
    gcn_L1 = GCN_Layer(num_classes, activations.softmax)([dropout_L1, node_input])
    
    # Model
    model = Model(inputs = [x_input, node_input], outputs = gcn_L1)

    return model

class GCN_Layer(Layer):
    """ A GCN layer.
    *Input*
    - Node features, with shape ([batch], num_nodes, num_features)
    *Output*
    - Node features
    Parameters:
        num_channels: number of output channels
        activation: activation function
        use_bias: boolean, whether to add a bias vector to output
        kernel_initialiser: intialiser for weights
        bias_initialiser: initialiser for bias vector
        kernel_regulariser: regularisation applied to weights
        bias_regulariser: regularisation applied to bias vector
        activity_regulariser: regularisation applied to output
    """
    def __init__(self, 
        num_channels, 
        activation, 
        use_bias = False, 
        kernel_initialiser = 'glorot_uniform',
        bias_initaliser = 'zeros',
        kernel_regulariser = None,
        bias_regulariser = None,
        activity_regulariser = None, **kwargs):

        super(GCN_Layer, self).__init__(**kwargs) 
        self.activation = activations.get(activation) 
        self.use_bias = use_bias
        self.kernel_initialiser = initializers.get(kernel_initialiser)
        self.bias_initaliser = initializers.get(bias_initaliser)
        self.kernel_regulariser = regularizers.get(kernel_regulariser)
        self.bias_regulariser = regularizers.get(bias_regulariser)
        self.activity_regulariser = regularizers.get(activity_regulariser)
        self.num_channels = num_channels

    def build(self, input_shape): 
        assert len(input_shape)>= 2
        input_dim = input_shape[0][-1]

        # create weights of layer
        self.w = self.add_weight(shape = (input_dim, self.num_channels), 
            initializer = self.kernel_initialiser,
            name = "kernel",
            regularizer = self.kernel_regulariser)

    def call(self, inputs):
        x, a = inputs

        output = tf.keras.backend.dot(x, self.w)
        output = tf.keras.backend.dot(a, output)

        return self.activation(output)

    def config(self):
        return {"channels": self.num_channels}

In [30]:
# Parameters
channels = 16 #num for first layer
dropout = 0.5 #rate
l_rate = 1e-2 #learning rate
l2_reg = 2.5e-4 # regularisation rate
epochs = 200 #number of epochs

In [31]:
# Create and Compile
model = GCN_Model(num_features, num_classes, channels, 
                  dropout, l2(l2_reg), num_nodes)

model.compile(optimizer = Adam(learning_rate = l_rate), 
              loss = 'categorical_crossentropy', metrics = ['acc'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
dropout_7 (Dropout)             (None, 128)          0           input_9[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 22470)]      0                                            
__________________________________________________________________________________________________
gcn__layer_7 (GCN_Layer)        (None, 16)           2048        dropout_7[0][0]                  
                                                                 input_10[0][0]             

In [32]:
# Train
def train():
    validation_data = ([X_features, A], encoded_labels, val_mask)
    
    model.fit([X_features, A], 
              encoded_labels,
             sample_weight = train_mask,
             epochs = epochs,
             batch_size = num_nodes,
             validation_data = validation_data,
              shuffle = False)
    

In [None]:
train()

Epoch 1/200


## Results

In [None]:
X_test = X_features[test_mask]
A_test = A[test_mask, :][:,test_mask]
y_test = encoded_labels[tesk_mask]

# Evaluation
y_predictions = model.predict([X_test, A_test], 
                            batch_size = num_nodes)

report = classification_report(y_test, y_predictions, 
                               target_names = classes)

print(report)

## TSNE Plot

Each point is a node representing the facebook page. The colours represent the four possible categories.

In [None]:
## TEST SCRIPT ##
output = output of first gcn layer

tsne = TSNE(n_components = 2).fit_transform(output)
plt.figre(figsize = (10,10))

colour_map = np.argmax(encoded_labels, axis = 1)
for i in range(num_classes):
    indices = np.where(color_map == i)
    
    plt.scatter(tsne[indices[0], 0], tsne[indices[0],1], label = i)
    
plt.title('tSNE Plot')
plt.legend()
plt.show()