# Node classification with Graph Convolutional Network (GCN)

In [1]:
import numpy as np

import mygene
import h5py
import pickle
import argparse
import networkx as nx
import seaborn as sns

import pandas as pd
import os

import stellargraph as sg
from stellargraph.layer import GCN
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import sys

from stellargraph import StellarGraph
from stellargraph import datasets
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    ClusterNodeGenerator,
)

from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE
from stellargraph.utils import plot_history

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score
from IPython.display import display, HTML

import tensorflow as tf
from scipy.sparse import csr_matrix, lil_matrix
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras import Model
from keras.models import Sequential
from keras.layers import Dense

from imblearn.over_sampling import SMOTE

In [3]:
network = pd.read_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/data_final_v2/HPRD_network.tsv', sep='\t')

features = pd.read_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/data_final_v2/HPRD_features_complete.tsv', sep='\t', index_col='gene')

labels = pd.read_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/data_final_v2/HPRD_labels_semisupervised.tsv', sep='\t', index_col='gene')

In [4]:
# Mutations 0:48
# CNA 16:64
# DNA Methylation 0:32 e 16:48
# Gene Expression 0:16 e 16:48


#features.drop(features.iloc[:, 0:16], inplace = True, axis = 1)
#features.drop(features.iloc[:, 16:48], inplace = True, axis = 1)
#features

In [4]:
# Transformar as labels boleanas em 0/1
labels["label"].replace({False: 0, True: 1}, inplace=True)

# Transformar as labels vazias em -1
labels["label"] = labels.label.fillna(-1)

In [5]:
G = StellarGraph(edges=network, nodes=features)

print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 9438, Edges: 36844

 Node types:
  default: [9438]
    Features: float32 vector, length 68
    Edge types: default-default->default

 Edge types:
    default-default->default: [36844]
        Weights: all 1 (default)
        Features: none


In [6]:
series_classes = labels['label']

series_classes.value_counts(dropna = False).to_frame()

Unnamed: 0,label
0.0,4873
-1.0,3772
1.0,793


In [7]:
# Divisão treino/teste

labeled_data = labels[labels['label'] != -1]
labeled_data = labeled_data.sample(frac=1)

# Um conjunto de teste é utilizado em todos as redes, para isso foi selecionado posteriormente uma lista de genes que é comum à todas as redes
test_set = pd.read_csv('C:/Users/renan/Desktop/UFRGS/GNN/data/data_final_v2/test_set_final.tsv', sep='\t')

# Método para separar os dados de treino a partir do test_set
# É criada uma nova coluna comparando os genes do banco de teste pré-selecionado com os dados rotulados totais da rede

labeled_data['treino'] = labeled_data.index.isin(test_set['gene'])
labeled_data.treino.value_counts().to_frame()

# Sempre deve existir 1134 True, pois são genes que existem em todas as redes, logo, o restante é separado para treinamento

labeled_train_temp = labeled_data[labeled_data['treino'] == False]
labeled_test_temp = labeled_data[labeled_data['treino'] == True]

labeled_train = labeled_train_temp.drop(columns=['treino'])
labeled_test = labeled_test_temp.drop(columns=['treino'])

print("Train: ", len(labeled_train))
print("Test: ", len(labeled_test))
print("\nTotal: ", len(labeled_train)+len(labeled_test))

Train:  4532
Test:  1134

Total:  5666


In [8]:
# Difinição da função de custo Focal Loss

import dill

from keras import backend as K

def binary_focal_loss(gamma=2., alpha=.25):

    def binary_focal_loss_fixed(y_true, y_pred):
     
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        # Define epsilon so that the back-propagation will not result in NaN for 0 divisor case
        epsilon = K.epsilon()
        # Add the epsilon to prediction value
        # y_pred = y_pred + epsilon
        # Clip the prediciton value
        y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)
        # Calculate p_t
        p_t = tf.where(K.equal(y_true, 1), y_pred, 1 - y_pred)
        # Calculate alpha_t
        alpha_factor = K.ones_like(y_true) * alpha
        alpha_t = tf.where(K.equal(y_true, 1), alpha_factor, 1 - alpha_factor)
        # Calculate cross entropy
        cross_entropy = -K.log(p_t)
        weight = alpha_t * K.pow((1 - p_t), gamma)
        # Calculate focal loss
        loss = weight * cross_entropy
        # Sum the losses in mini_batch
        loss = K.mean(K.sum(loss, axis=1))
        return loss

    return binary_focal_loss_fixed

In [9]:
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(labeled_train)
test_targets = target_encoding.transform(labeled_test)



generator = FullBatchNodeGenerator(G, method="gcn")

Using GCN (local pooling) filters...


In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid


params = {'layer_sizes':[[64, 64], [128, 128], [256, 256]],
        'activations': [["ReLU", "ReLU"], ["PReLU", "PReLU"]],
        'dropout':[0.001, 0.01, 0.1],
        'learning_rate':[0.00001, 0.0001, 0.001],
        'epochs':[500],
        'gamma':[0, 0.5, 1, 2],
        'alpha':[0.25, 0.50, 0.75, 0.90]
        }


num_of_settings = len(list(ParameterGrid(params)))
print ("Grid Search: Trying {} different parameter settings...".format(num_of_settings))
param_num = 0

for param_set in list(ParameterGrid(params)):

  num_folds = 5

  auc_pr_per_fold = []
  loss_per_fold = []

  kfold = StratifiedKFold(n_splits=num_folds, shuffle=True)

  fold_no = 1
  for train, test in kfold.split(labeled_train.index, train_targets):

    train_gen = generator.flow(labeled_train.index[train], train_targets[train])

    gcn = GCN(
      layer_sizes=param_set['layer_sizes'], activations=param_set['activations'], generator=generator, dropout=param_set['dropout']
    )

    x_inp, x_out = gcn.in_out_tensors()

    predictions = layers.Dense(units=train_targets.shape[1], activation="sigmoid")(x_out)

    model = Model(inputs=x_inp, outputs=predictions)
    model.compile(
      optimizer=optimizers.Adam(learning_rate=param_set['learning_rate']),
      #loss=losses.binary_crossentropy,
      loss=[binary_focal_loss(gamma=param_set['gamma'], alpha=param_set['alpha'])],
      metrics=["acc", metrics.AUC(curve="ROC", name="auc_roc"), metrics.AUC(curve="PR", name="auc_pr")],
    )

    val_gen = generator.flow(labeled_train.index[test], train_targets[test])


    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    history = model.fit(
      train_gen,
      epochs=param_set['epochs'],
      validation_data=val_gen,
      verbose=2,
      shuffle=False,
    )

    # Generate generalization metrics
    #val_gen = generator.flow(labeled_train.index[test], train_targets[test])
    scores = model.evaluate(val_gen, verbose=0)
    print('\n')
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[3]} of {scores[3]*100}%')
    auc_pr_per_fold.append(scores[3] * 100)
    loss_per_fold.append(scores[0])

    # Save history to csv
    hist_df = pd.DataFrame(history.history)
    hist_csv_file = f'hprd_paramset{param_num}_fold{fold_no}.csv'
    with open(hist_csv_file, mode='w') as f:
      hist_df.to_csv(f)  
    f.close()

    # Increase fold number
    fold_no = fold_no + 1

  # Provide average scores
  print('------------------------------------------------------------------------')
  print('Score per fold')
  for i in range(0, len(auc_pr_per_fold)):
    print('------------------------------------------------------------------------')
    print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - AUC PR: {auc_pr_per_fold[i]}%')
  print('------------------------------------------------------------------------')
  print('Average scores for all folds:')
  print(f'> AUC PR: {np.mean(auc_pr_per_fold)} (+- {np.std(auc_pr_per_fold)})')
  print(f'> Loss: {np.mean(loss_per_fold)}')
  print('------------------------------------------------------------------------')

  param_num += 1



In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid


params = {'layer_sizes':[[64, 64], [128, 128], [256, 256]],
        'activations': [["ReLU", "ReLU"], ["PReLU", "PReLU"]],
        'dropout':[0.001, 0.01, 0.1],
        'learning_rate':[0.00001, 0.0001, 0.001],
        'epochs':[500],
        'gamma':[0, 0.5, 1, 2],
        'alpha':[0.25, 0.50, 0.75, 0.90]
        }


num_of_settings = len(list(ParameterGrid(params)))
print ("Grid Search: Trying {} different parameter settings...".format(num_of_settings))
param_num = 0

for param_set in list(ParameterGrid(params)):
        if param_num < 113:
                param_num+=1
        else:
                print ("[{} out of {} combinations]: {}".format(param_num, num_of_settings, param_set))
                param_num += 1



Grid Search: Trying 864 different parameter settings...
[113 out of 864 combinations]: {'activations': ['ReLU', 'ReLU'], 'alpha': 0.5, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [128, 128], 'learning_rate': 0.001}
[114 out of 864 combinations]: {'activations': ['ReLU', 'ReLU'], 'alpha': 0.5, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [256, 256], 'learning_rate': 1e-05}
[115 out of 864 combinations]: {'activations': ['ReLU', 'ReLU'], 'alpha': 0.5, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [256, 256], 'learning_rate': 0.0001}
[116 out of 864 combinations]: {'activations': ['ReLU', 'ReLU'], 'alpha': 0.5, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [256, 256], 'learning_rate': 0.001}
[117 out of 864 combinations]: {'activations': ['ReLU', 'ReLU'], 'alpha': 0.5, 'dropout': 0.001, 'epochs': 500, 'gamma': 0.5, 'layer_sizes': [64, 64], 'learning_rate': 1e-05}
[118 out of 864 combinations]: {'activations': ['ReLU', 'ReLU'], 'al

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid


params = {'layer_sizes':[[64, 64], [128, 128], [256, 256]],
        'activations': [["relu", "relu"]],
        'dropout':[0.001, 0.01, 0.1],
        'learning_rate':[0.001, 0.005],
        'epochs':[500],
        'gamma':[0, 0.5, 1, 2],
        'alpha':[0.25, 0.50, 0.75, 0.90]
        }


num_of_settings = len(list(ParameterGrid(params)))
print ("Grid Search: Trying {} different parameter settings...".format(num_of_settings))
param_num = 0

for param_set in list(ParameterGrid(params)):
    print ("[{} out of {} combinations]: {}".format(param_num, num_of_settings, param_set))
    train_probs = pd.DataFrame({"paramset": param_num, "param": param_set})
    param_num += 1

Grid Search: Trying 288 different parameter settings...
[0 out of 288 combinations]: {'activations': ['relu', 'relu'], 'alpha': 0.25, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [64, 64], 'learning_rate': 0.001}
[1 out of 288 combinations]: {'activations': ['relu', 'relu'], 'alpha': 0.25, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [64, 64], 'learning_rate': 0.005}
[2 out of 288 combinations]: {'activations': ['relu', 'relu'], 'alpha': 0.25, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [128, 128], 'learning_rate': 0.001}
[3 out of 288 combinations]: {'activations': ['relu', 'relu'], 'alpha': 0.25, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [128, 128], 'learning_rate': 0.005}
[4 out of 288 combinations]: {'activations': ['relu', 'relu'], 'alpha': 0.25, 'dropout': 0.001, 'epochs': 500, 'gamma': 0, 'layer_sizes': [256, 256], 'learning_rate': 0.001}
[5 out of 288 combinations]: {'activations': ['relu', 'relu'], 'alpha': 0.25, 

In [12]:
lista_parametros = list(ParameterGrid(params))
lista_parametros

[{'activations': ['relu', 'relu'],
  'alpha': 0.25,
  'dropout': 0.001,
  'epochs': 500,
  'gamma': 0,
  'layer_sizes': [64, 64],
  'learning_rate': 0.001},
 {'activations': ['relu', 'relu'],
  'alpha': 0.25,
  'dropout': 0.001,
  'epochs': 500,
  'gamma': 0,
  'layer_sizes': [64, 64],
  'learning_rate': 0.005},
 {'activations': ['relu', 'relu'],
  'alpha': 0.25,
  'dropout': 0.001,
  'epochs': 500,
  'gamma': 0,
  'layer_sizes': [128, 128],
  'learning_rate': 0.001},
 {'activations': ['relu', 'relu'],
  'alpha': 0.25,
  'dropout': 0.001,
  'epochs': 500,
  'gamma': 0,
  'layer_sizes': [128, 128],
  'learning_rate': 0.005},
 {'activations': ['relu', 'relu'],
  'alpha': 0.25,
  'dropout': 0.001,
  'epochs': 500,
  'gamma': 0,
  'layer_sizes': [256, 256],
  'learning_rate': 0.001},
 {'activations': ['relu', 'relu'],
  'alpha': 0.25,
  'dropout': 0.001,
  'epochs': 500,
  'gamma': 0,
  'layer_sizes': [256, 256],
  'learning_rate': 0.005},
 {'activations': ['relu', 'relu'],
  'alpha': 0.25

In [13]:
df = pd.DataFrame (lista_parametros, columns = ['activations', 'alpha', 'dropout', 'epochs', 'gamma', 'layer_sizes', 'learning_rate'])
df

Unnamed: 0,activations,alpha,dropout,epochs,gamma,layer_sizes,learning_rate
0,"[relu, relu]",0.25,0.001,500,0.0,"[64, 64]",0.001
1,"[relu, relu]",0.25,0.001,500,0.0,"[64, 64]",0.005
2,"[relu, relu]",0.25,0.001,500,0.0,"[128, 128]",0.001
3,"[relu, relu]",0.25,0.001,500,0.0,"[128, 128]",0.005
4,"[relu, relu]",0.25,0.001,500,0.0,"[256, 256]",0.001
...,...,...,...,...,...,...,...
283,"[relu, relu]",0.90,0.100,500,2.0,"[64, 64]",0.005
284,"[relu, relu]",0.90,0.100,500,2.0,"[128, 128]",0.001
285,"[relu, relu]",0.90,0.100,500,2.0,"[128, 128]",0.005
286,"[relu, relu]",0.90,0.100,500,2.0,"[256, 256]",0.001


In [14]:
df['paramset'] = np.arange(len(df))
df

Unnamed: 0,activations,alpha,dropout,epochs,gamma,layer_sizes,learning_rate,paramset
0,"[relu, relu]",0.25,0.001,500,0.0,"[64, 64]",0.001,0
1,"[relu, relu]",0.25,0.001,500,0.0,"[64, 64]",0.005,1
2,"[relu, relu]",0.25,0.001,500,0.0,"[128, 128]",0.001,2
3,"[relu, relu]",0.25,0.001,500,0.0,"[128, 128]",0.005,3
4,"[relu, relu]",0.25,0.001,500,0.0,"[256, 256]",0.001,4
...,...,...,...,...,...,...,...,...
283,"[relu, relu]",0.90,0.100,500,2.0,"[64, 64]",0.005,283
284,"[relu, relu]",0.90,0.100,500,2.0,"[128, 128]",0.001,284
285,"[relu, relu]",0.90,0.100,500,2.0,"[128, 128]",0.005,285
286,"[relu, relu]",0.90,0.100,500,2.0,"[256, 256]",0.001,286


In [15]:
#df.to_csv(f'C:/Users/renan/Desktop/experiments/GCN/gridsearch/paramset2.tsv', sep='\t', index=False)

In [11]:
%%capture cap --no-stderr

param_num = 0
for param_set in list(ParameterGrid(params)):
    print ("[{} out of {} combinations]: {}".format(param_num, num_of_settings, param_set))
    param_num += 1


with open('params.txt', 'w') as f:
    f.write(str(cap))

NameError: name 'cap' is not defined

In [None]:
# Execução para os dados de teste

test_gen = generator.flow(labeled_test.index, test_targets)
test_metrics = model.evaluate(test_gen)

print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 0.2463
	acc: 0.9123
	auc_roc: 0.8162
	auc_pr: 0.4019


In [None]:
# Gerando um dataframe com as probabilidades na etapa de teste

nodes_test = labeled_test.index
gen_test = generator.flow(nodes_test)
test_predictions = model.predict(gen_test)

test_predictions2 = target_encoding.inverse_transform(test_predictions.squeeze())

train_probs = pd.DataFrame({"Predicted": test_predictions2, "Probability": test_predictions.squeeze(), "True": labeled_test['label']})

# Save to csv
pred_test_csv_file = f'test_predictions_hprd_6a.csv'
with open(pred_test_csv_file, mode='w') as f:
    train_probs.to_csv(f)
f.close()

In [None]:
all_nodes = series_classes.index
all_gen = generator.flow(all_nodes)
all_predictions = model.predict(all_gen)

In [None]:
node_predictions = target_encoding.inverse_transform(all_predictions.squeeze())
df = pd.DataFrame({"Predicted": node_predictions, "True": series_classes})

#df.head(20)

print(df['True'].value_counts(), "\n")
df['Predicted'].value_counts()

 0.0    8486
-1.0    4633
 1.0     868
Name: True, dtype: int64 



0.0    13597
1.0      390
Name: Predicted, dtype: int64

In [None]:
# Salvar a variável all_predictions e a node_predictions para comparar
# Usar a all_predictions para gerar gráficos da fig3 do artigo, sobre distribuição de nós 

df_predictions = pd.DataFrame({"percent": all_predictions.squeeze(), "binary": node_predictions, "true": series_classes})

# Save to csv
pred_csv_file = f'all_predictions_hprd_6a.csv'
with open(pred_csv_file, mode='w') as f:
    df_predictions.to_csv(f)
f.close()