# Node classification with Graph Convolutional Network (GCN)

In [151]:
import numpy as np

import mygene
import h5py
import pickle
import argparse
import networkx as nx
import seaborn as sns

import pandas as pd
import os

import stellargraph as sg
from stellargraph.layer import GCN
import matplotlib.pyplot as plt
%matplotlib inline

In [152]:
import sys

from stellargraph import StellarGraph
from stellargraph import datasets
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    ClusterNodeGenerator,
)

from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE
from stellargraph.utils import plot_history

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score
from IPython.display import display, HTML

import tensorflow as tf
from scipy.sparse import csr_matrix, lil_matrix
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras import Model
from keras.models import Sequential
from keras.layers import Dense

from imblearn.over_sampling import SMOTE

In [153]:
# Directory holding the IREF input tables. It is defined once instead of being
# repeated in every path, and can be overridden through the GNN_DATA_DIR
# environment variable so the notebook also runs on other machines; the default
# is the original location.
DATA_DIR = os.environ.get(
    'GNN_DATA_DIR',
    'C:/Users/renan/Desktop/UFRGS/GNN/data/data_final_v2',
)

# Network table (tab-separated); later used as the `edges` of the graph.
network = pd.read_csv(f'{DATA_DIR}/IREF_network.tsv', sep='\t')

# Per-gene feature table, indexed by the `gene` column.
features = pd.read_csv(f'{DATA_DIR}/IREF_features_omics.tsv', sep='\t', index_col='gene')

# Per-gene label table, indexed by the `gene` column.
labels = pd.read_csv(f'{DATA_DIR}/IREF_labels_semisupervised.tsv', sep='\t', index_col='gene')

In [154]:
# Column slices to pass to `drop`, per omics type (notes carried over from the
# original; "and" denotes two successive drops). Presumably each entry is what
# must be removed to keep only that type -- dropping 16:64 leaves the 16 CNA
# columns -- TODO confirm for the other types.
#   Mutations        0:48
#   CNA              16:64
#   DNA methylation  0:32 and 16:48
#   Gene expression  0:16 and 16:48

# Remove the columns at positions 16..63 from `features`, in place.
omics_columns_to_drop = features.columns[16:64]
features.drop(columns=omics_columns_to_drop, inplace=True)
#features.drop(columns=features.columns[16:48], inplace=True)
features

Unnamed: 0_level_0,CNA: BLCA,CNA: BRCA,CNA: CESC,CNA: COAD,CNA: ESCA,CNA: HNSC,CNA: KIRC,CNA: KIRP,CNA: LIHC,CNA: LUAD,CNA: LUSC,CNA: PRAD,CNA: READ,CNA: STAD,CNA: THCA,CNA: UCEC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A1BG,0.0,0.000000,0.167401,0.000000,0.000000,0.255263,0.000000,0.095477,0.000000,0.000000,0.000000,0.000000,0.244898,0.00000,0.000000,0.261719
A1CF,0.0,0.000000,0.000000,0.000000,0.283951,0.000000,0.000000,0.082251,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.165017
A2M,0.0,0.000000,0.000000,0.174419,0.000000,0.000000,0.000000,0.000000,0.000000,0.289256,0.000000,0.000000,0.129870,0.23913,0.000000,0.000000
AAAS,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.129870,0.18323,0.000000,0.000000
AACS,0.0,0.227126,0.000000,0.000000,0.000000,0.159091,0.000000,0.000000,0.175074,0.274793,0.178794,0.129129,0.000000,0.00000,0.000000,0.217822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
ZYG11B,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000
ZYX,0.0,0.000000,0.264317,0.000000,0.000000,0.000000,0.376344,0.000000,0.100890,0.000000,0.000000,0.000000,0.000000,0.00000,0.010526,0.207921
ZZEF1,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.670623,0.000000,0.000000,0.000000,0.000000,0.00000,0.084211,0.000000


In [155]:
# Encode the labels numerically: False -> 0, True -> 1, missing -> -1.
#
# The column is re-assigned explicitly instead of calling
# `labels["label"].replace(..., inplace=True)`: that chained in-place call acts
# on an intermediate Series and does not update `labels` when pandas
# Copy-on-Write is active. The final cast pins the float dtype regardless of
# how `replace` handles the object column.
labels["label"] = (
    labels["label"]
    .replace({False: 0, True: 1})
    .fillna(-1)
    .astype("float64")
)

In [156]:
# Assemble the graph: `features` supplies the nodes (with their feature
# vectors) and `network` supplies the edges.
G = StellarGraph(nodes=features, edges=network)

# Print the textual summary of the graph.
graph_summary = G.info()
print(graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 14627, Edges: 133095

 Node types:
  default: [14627]
    Features: float32 vector, length 16
    Edge types: default-default->default

 Edge types:
    default-default->default: [133095]
        Weights: all 1 (default)
        Features: none


In [157]:
# Class distribution of the label column; missing values, if any, are counted
# as their own row.
series_classes = labels.label

class_counts = series_classes.value_counts(dropna=False)
class_counts.to_frame()

Unnamed: 0,label
0.0,8838
-1.0,4915
1.0,874


In [158]:
# Split the labelled genes into train and test sets

train_ratio = 0.80
test_ratio = 0.20

# Fixed seed so the partition (and every score derived from it) is reproducible.
SPLIT_SEED = 42

# Only rows whose label is not -1 take part in the split.
labeled_data = labels[labels['label'] != -1]
labeled_data = labeled_data.sample(frac=1, random_state=SPLIT_SEED)

# Hold out `test_ratio` of the labelled rows for testing; the rest is used for
# training. Stratification is done explicitly on the class column so that both
# sets keep the same class proportions.
labeled_train, labeled_test = model_selection.train_test_split(
    labeled_data,
    test_size=test_ratio,
    stratify=labeled_data['label'],
    random_state=SPLIT_SEED,
)

# Sanity check: every labelled row ended up in exactly one of the two sets.
assert len(labeled_train) + len(labeled_test) == len(labeled_data)

print("Train: ", len(labeled_train))
print("Test: ", len(labeled_test))
print("\nTotal: ", len(labeled_train)+len(labeled_test))

Train:  7769
Test:  1943

Total:  9712


In [159]:
# Definition of the Focal Loss cost function

import dill

from keras import backend as K

def binary_focal_loss(gamma=2., alpha=.25):
    """Build a binary focal loss function suitable for `model.compile(loss=...)`.

    For each prediction the loss is
    ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``, where ``p_t`` is the
    predicted probability of the true class and ``alpha_t`` is ``alpha`` for
    targets equal to 1 and ``1 - alpha`` otherwise.

    Only `tf` operations are used, so the loss runs on the same backend as the
    `tensorflow.keras` objects it is compiled with.

    Args:
        gamma: Exponent of the modulating factor ``(1 - p_t)``; with 0 the
            factor is 1 and the loss is an alpha-weighted cross entropy.
        alpha: Weight given to targets equal to 1; all other targets are
            weighted by ``1 - alpha``.

    Returns:
        A function ``(y_true, y_pred) -> scalar tensor``.
    """

    def binary_focal_loss_fixed(y_true, y_pred):
        """Focal loss of `y_pred` against `y_true`.

        Both tensors need at least two axes because the loss is summed over
        axis 1 before being averaged.
        """
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        # Keep predictions strictly inside (0, 1) so that log() never sees 0.
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1.0 - epsilon)
        is_positive = tf.equal(y_true, 1.0)
        # p_t: probability the model assigns to the true class.
        p_t = tf.where(is_positive, y_pred, 1.0 - y_pred)
        # alpha_t: alpha for targets equal to 1, (1 - alpha) for the rest.
        alpha_factor = tf.ones_like(y_true) * alpha
        alpha_t = tf.where(is_positive, alpha_factor, 1.0 - alpha_factor)
        cross_entropy = -tf.math.log(p_t)
        # The modulating factor shrinks the contribution of well-classified examples.
        weight = alpha_t * tf.pow(1.0 - p_t, gamma)
        loss = weight * cross_entropy
        # Sum over axis 1, then average the per-row sums.
        return tf.reduce_mean(tf.reduce_sum(loss, axis=1))

    return binary_focal_loss_fixed

In [160]:
# Binarise the class labels: the encoder is fitted on the training labels only
# and then applied, unchanged, to both the training and the test labels.
target_encoding = preprocessing.LabelBinarizer()
target_encoding.fit(labeled_train)

train_targets = target_encoding.transform(labeled_train)
test_targets = target_encoding.transform(labeled_test)

In [161]:
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout

from sklearn.model_selection import KFold

num_folds = 5

# Fixed seed so the fold assignment is identical on every run.
CV_SEED = 42

auc_pr_per_fold = []
loss_per_fold = []

kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=CV_SEED)

# The network dimensions do not depend on the fold, so compute them once.
nb_classes = train_targets.shape[1]
input_dim = features.shape[1]

separator = '------------------------------------------------------------------------'


def build_mlp(input_dim, nb_classes):
    """Return a new, compiled feed-forward classifier.

    Architecture: two Dense(64) layers, each followed by a 'ReLU' activation
    and Dropout(0.01), then a Dense(nb_classes) layer with sigmoid activation.
    Compiled with binary cross-entropy, Adam (learning rate 0.001) and the
    metrics "acc", "auc_roc" and "auc_pr".

    Args:
        input_dim: Number of input features per sample.
        nb_classes: Number of output units.

    Returns:
        The compiled model, with freshly initialised weights.
    """
    mlp = Sequential()
    mlp.add(Dense(64, input_dim=input_dim))
    mlp.add(Activation('ReLU'))
    mlp.add(Dropout(0.01))
    mlp.add(Dense(64))
    mlp.add(Activation('ReLU'))
    mlp.add(Dropout(0.01))
    mlp.add(Dense(nb_classes))
    mlp.add(Activation('sigmoid'))

    mlp.compile(
        loss='binary_crossentropy',
        #loss=[binary_focal_loss(gamma=0, alpha=0.85)],
        optimizer=optimizers.Adam(learning_rate=0.001),
        metrics=["acc", metrics.AUC(curve="ROC", name="auc_roc"), metrics.AUC(curve="PR", name="auc_pr")]
    )
    return mlp


for fold_no, (train_idx, val_idx) in enumerate(
        kfold.split(labeled_train.index, train_targets), start=1):

    # A new model per fold, so no weights carry over from the previous fold.
    model = build_mlp(input_dim, nb_classes)

    X_train = features.loc[labeled_train.index[train_idx]].values
    y_train = train_targets[train_idx]

    X_val = features.loc[labeled_train.index[val_idx]].values
    y_val = train_targets[val_idx]

    # Generate a print
    print(separator)
    print(f'Training for fold {fold_no} ...')

    history = model.fit(X_train, y_train, epochs=300, validation_data=(X_val, y_val), batch_size=8, verbose=2)

    # Generate generalization metrics on the validation part of this fold.
    # Metrics are looked up by name rather than by position in the list.
    scores = model.evaluate(X_val, y_val, verbose=0)
    fold_scores = dict(zip(model.metrics_names, scores))
    print('\n')
    print(f'Score for fold {fold_no}: loss of {fold_scores["loss"]}; auc_pr of {fold_scores["auc_pr"]*100}%')
    auc_pr_per_fold.append(fold_scores["auc_pr"] * 100)
    loss_per_fold.append(fold_scores["loss"])

    # Save history to csv. The path is handed to pandas directly so that it
    # opens and closes the file itself; writing through a text-mode handle
    # opened without newline='' can corrupt line endings on Windows.
    hist_df = pd.DataFrame(history.history)

    hist_csv_file = f'fold{fold_no}_iref_2a.csv'
    hist_df.to_csv(hist_csv_file)

# Provide average scores
print(separator)
print('Score per fold')
for i, (fold_loss, fold_auc_pr) in enumerate(zip(loss_per_fold, auc_pr_per_fold), start=1):
    print(separator)
    print(f'> Fold {i} - Loss: {fold_loss} - AUC PR: {fold_auc_pr}%')
print(separator)
print('Average scores for all folds:')
print(f'> AUC PR: {np.mean(auc_pr_per_fold)} (+- {np.std(auc_pr_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print(separator)

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/300
777/777 - 4s - loss: 0.3236 - acc: 0.9099 - auc_roc: 0.4851 - auc_pr: 0.0889 - val_loss: 0.3043 - val_acc: 0.9099 - val_auc_roc: 0.5468 - val_auc_pr: 0.1140
Epoch 2/300
777/777 - 2s - loss: 0.3022 - acc: 0.9101 - auc_roc: 0.5361 - auc_pr: 0.1147 - val_loss: 0.3087 - val_acc: 0.9099 - val_auc_roc: 0.5367 - val_auc_pr: 0.1099
Epoch 3/300
777/777 - 2s - loss: 0.3012 - acc: 0.9101 - auc_roc: 0.5526 - auc_pr: 0.1314 - val_loss: 0.3039 - val_acc: 0.9099 - val_auc_roc: 0.5198 - val_auc_pr: 0.1062
Epoch 4/300
777/777 - 1s - loss: 0.2995 - acc: 0.9101 - auc_roc: 0.5701 - auc_pr: 0.1233 - val_loss: 0.3036 - val_acc: 0.9099 - val_auc_roc: 0.5372 - val_auc_pr: 0.1146
Epoch 5/300
777/777 - 2s - loss: 0.2987 - acc: 0.9101 - auc_roc: 0.5709 - auc_pr: 0.1380 - val_loss: 0.3058 - val_acc: 0.9099 - val_auc_roc: 0.5280 - val_auc_pr: 0.1123
Epoch 6/300
777/777 - 2s - loss: 0.2975 - acc: 0.9101 - au

In [162]:
# Evaluation on the held-out test data.
# Note: `model` is whatever the cross-validation cell assigned last, i.e. the
# network trained in its final fold.

X_test = features.loc[labeled_test.index].values
y_test = test_targets

test_metrics = model.evaluate(X_test, y_test)
print("\nTest Set Metrics:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print(f"\t{metric_name}: {metric_value:0.4f}")


Test Set Metrics:
	loss: 0.4025
	acc: 0.9068
	auc_roc: 0.5522
	auc_pr: 0.1165
