# Node classification with GraphSAGE

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mygene
import h5py
import pickle
import argparse
import networkx as nx
import seaborn as sns
%matplotlib inline

In [53]:
import os, sys

import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph import datasets
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    ClusterNodeGenerator,
)

from stellargraph.layer import GCN, DeepGraphInfomax, GraphSAGE, GAT, APPNP, HinSAGE
from stellargraph.utils import plot_history

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.manifold import TSNE
from sklearn.metrics import average_precision_score
from IPython.display import display, HTML

import tensorflow as tf
from scipy.sparse import csr_matrix, lil_matrix
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras import Model
from keras.models import Sequential
from keras.layers import Dense

from imblearn.over_sampling import SMOTE

In [54]:
# Directory holding the IREF input tables. Set the GNN_DATA_DIR environment
# variable to run the notebook on another machine; the default is the folder
# originally hard-coded in this cell.
DATA_DIR = os.environ.get(
    "GNN_DATA_DIR", "C:/Users/renan/Desktop/UFRGS/GNN/data/data_final_v2"
)

# Table used as the edge list of the graph (tab-separated).
network = pd.read_csv(f"{DATA_DIR}/IREF_network.tsv", sep='\t')

# Node feature table, indexed by the 'gene' column.
features = pd.read_csv(f"{DATA_DIR}/IREF_features_complete.tsv", sep='\t', index_col='gene')

# Node label table, indexed by the 'gene' column.
labels = pd.read_csv(f"{DATA_DIR}/IREF_labels_semisupervised.tsv", sep='\t', index_col='gene')

In [55]:
# Mutations 0:48
# CNA 16:64
# DNA Methylation 0:32 and 16:48
# Gene Expression 0:16 and 16:48

#features.drop(features.iloc[:, 0:16], inplace = True, axis = 1)
#features.drop(features.iloc[:, 16:48], inplace = True, axis = 1)
#features

In [56]:
# Convert the boolean labels to 0/1 and mark empty (missing) labels with -1.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on labels["label"]: that call mutates an
# intermediate Series, which pandas does not guarantee to propagate to the
# DataFrame (it is a no-op under copy-on-write).
# The explicit float cast keeps a numeric column without relying on implicit
# downcasting inside replace().
labels["label"] = (
    labels["label"]
    .replace({False: 0, True: 1})
    .fillna(-1)
    .astype(float)
)

In [57]:
# Assemble the graph: `features` supplies the nodes (with their feature
# vectors) and `network` supplies the edges.
G = StellarGraph(nodes=features, edges=network)

# Show a textual summary of the graph.
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 14627, Edges: 133095

 Node types:
  default: [14627]
    Features: float32 vector, length 68
    Edge types: default-default->default

 Edge types:
    default-default->default: [133095]
        Weights: all 1 (default)
        Features: none


In [58]:
# Label column for every gene in the label table.
series_classes = labels.label

# Class distribution; dropna=False so missing values would be counted too.
series_classes.value_counts(dropna=False).to_frame()

Unnamed: 0,label
0.0,8838
-1.0,4915
1.0,874


In [59]:
# Split the labeled data into train and test sets only, using an 80/20 ratio.

train_ratio = 0.80
test_ratio = 0.20

# Fixed seed so the shuffle and the split are identical on every run.
split_seed = 101

# Keep only labeled nodes (-1 marks the unlabeled ones) and shuffle them.
labeled_data = labels[labels['label'] != -1]
labeled_data = labeled_data.sample(frac=1, random_state=split_seed)

# Hold out `test_ratio` of the labeled nodes for testing and keep the rest for
# training. Stratify on the label column explicitly so both splits keep the
# class proportions regardless of any other column present in the table.
labeled_train, labeled_test = model_selection.train_test_split(
    labeled_data,
    test_size=test_ratio,
    stratify=labeled_data['label'],
    random_state=split_seed,
)


print("Train: ", len(labeled_train))
print("Test: ", len(labeled_test))
print("\nTotal: ", len(labeled_train)+len(labeled_test))

Train:  7769
Test:  1943

Total:  9712


In [60]:
# Undersample the training set: keep every label-1 row and draw the same
# number of label-0 rows, so both classes are equally represented.

minor = labeled_train.loc[labeled_train['label'] == 1]
major = labeled_train.loc[labeled_train['label'] == 0].sample(
    n=len(minor), random_state=101
)

# Label-1 rows first, then the sampled label-0 rows; keep only the label column.
labeled_train_undersampling = pd.concat([minor, major], axis=0)['label']

# Display the resulting class counts.
labeled_train_undersampling.value_counts().to_frame()

Unnamed: 0,label
0.0,699
1.0,699


In [61]:
# Definition of the Focal Loss cost function

import dill

from keras import backend as K

def binary_focal_loss(gamma=2., alpha=.25):
    """Build a binary focal loss function.

    The returned callable weights the cross entropy of each prediction by
    ``alpha_t * (1 - p_t) ** gamma``, where ``p_t`` is the predicted
    probability of the true class, and ``alpha_t`` is ``alpha`` where the
    target equals 1 and ``1 - alpha`` elsewhere.

    Args:
        gamma: Exponent applied to ``(1 - p_t)``.
        alpha: Weight used where the target equals 1.

    Returns:
        A function ``(y_true, y_pred) -> loss`` that closes over ``gamma`` and
        ``alpha``.
    """

    def binary_focal_loss_fixed(y_true, y_pred):
        """Return the focal loss of ``y_pred`` against ``y_true``.

        Both inputs are cast to float32. The weighted losses are summed over
        axis 1 and the result is averaged over the remaining axis.
        """
        targets = tf.cast(y_true, tf.float32)

        # Clip predictions away from 0 and 1 so the logarithm below stays finite.
        eps = K.epsilon()
        probs = K.clip(tf.cast(y_pred, tf.float32), eps, 1.0 - eps)

        # Positions whose target equals 1.
        is_positive = K.equal(targets, 1)

        # Probability the model assigned to the true class.
        p_t = tf.where(is_positive, probs, 1 - probs)

        # Class weight: alpha where the target is 1, (1 - alpha) otherwise.
        alpha_pos = alpha * K.ones_like(targets)
        alpha_t = tf.where(is_positive, alpha_pos, 1 - alpha_pos)

        # Focal weight multiplied by the cross entropy of the true class.
        focal_weight = alpha_t * K.pow((1 - p_t), gamma)
        weighted = focal_weight * (-K.log(p_t))

        # Sum over axis 1, then average the per-row sums.
        return K.mean(K.sum(weighted, axis=1))

    return binary_focal_loss_fixed

In [62]:
# Encoder that turns the class labels into binary indicator targets.
target_encoding = preprocessing.LabelBinarizer()

# Fit the encoder on the undersampled training labels only.
train_targets = target_encoding.fit_transform(labeled_train_undersampling)

# Pass the label column rather than the whole test DataFrame, so the encoder
# receives a 1-D label vector exactly as it does for the training data.
test_targets = target_encoding.transform(labeled_test['label'])

In [63]:
# Settings handed to the GraphSAGE node generator.
batch_size = 50
num_samples = [10, 5]

# One generator over the whole graph; its flow() calls below select node subsets.
generator = GraphSAGENodeGenerator(
    G, batch_size=batch_size, num_samples=num_samples
)

In [64]:
from sklearn.model_selection import KFold

# Stratified k-fold cross-validation of the GraphSAGE classifier on the
# undersampled training set. A fresh model is built for each fold; after the
# loop, `model` refers to the model trained on the last fold.
num_folds = 5

# Per-fold validation results, filled in by the loop below.
auc_pr_per_fold = []
loss_per_fold = []

# Seed the shuffle so the fold assignment is the same on every run.
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=101)

fold_no = 1
for train, test in kfold.split(labeled_train_undersampling.index, train_targets):

  # `train` and `test` are positional indices into the undersampled training set.
  train_gen = generator.flow(labeled_train_undersampling.index[train], train_targets[train], shuffle=True)

  graphsage_model = GraphSAGE(
    layer_sizes=[64, 64], generator=generator, bias=True, dropout=0.01,
  )

  # Sigmoid head on top of the GraphSAGE output, one unit per target column.
  x_inp, x_out = graphsage_model.in_out_tensors()
  prediction = layers.Dense(units=train_targets.shape[1], activation="sigmoid")(x_out)

  model = Model(inputs=x_inp, outputs=prediction)
  model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.binary_crossentropy,
    #loss=[binary_focal_loss(gamma=0.5, alpha=0.50)],
    metrics=["acc", metrics.AUC(curve="ROC", name="auc_roc"), metrics.AUC(curve="PR", name="auc_pr")],
  )

  # Validation data for this fold: the held-out part of the training set.
  val_gen = generator.flow(labeled_train_undersampling.index[test], train_targets[test])


  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
    train_gen, epochs=300, validation_data=val_gen, verbose=2, shuffle=False
  )

  # Generate generalization metrics.
  # Index 0 is the loss; index 3 is taken to be auc_pr, i.e. the third metric
  # passed to compile() above (assumes metrics_names follows that order).
  scores = model.evaluate(val_gen, verbose=0)
  print('\n')
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[3]} of {scores[3]*100}%')
  auc_pr_per_fold.append(scores[3] * 100)
  loss_per_fold.append(scores[0])

  # Save history to csv.
  # The path is passed straight to pandas so it opens and closes the file
  # itself. Writing through a text handle opened without newline='' can
  # produce blank rows on Windows, and the extra f.close() after the `with`
  # block was redundant.
  hist_df = pd.DataFrame(history.history)

  hist_csv_file = f'fold{fold_no}_iref_6b.csv'
  hist_df.to_csv(hist_csv_file)

  # Increase fold number
  fold_no = fold_no + 1

# Provide average scores
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(len(auc_pr_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - AUC PR: {auc_pr_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> AUC PR: {np.mean(auc_pr_per_fold)} (+- {np.std(auc_pr_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/300
23/23 - 5s - loss: 0.6894 - acc: 0.5403 - auc_roc: 0.5449 - auc_pr: 0.5193 - val_loss: 0.6771 - val_acc: 0.5964 - val_auc_roc: 0.6237 - val_auc_pr: 0.5917
Epoch 2/300
23/23 - 2s - loss: 0.6605 - acc: 0.6342 - auc_roc: 0.6814 - auc_pr: 0.6578 - val_loss: 0.6585 - val_acc: 0.5893 - val_auc_roc: 0.6786 - val_auc_pr: 0.6599
Epoch 3/300
23/23 - 2s - loss: 0.6321 - acc: 0.6592 - auc_roc: 0.7322 - auc_pr: 0.7261 - val_loss: 0.6341 - val_acc: 0.6500 - val_auc_roc: 0.7110 - val_auc_pr: 0.7028
Epoch 4/300
23/23 - 2s - loss: 0.6065 - acc: 0.6896 - auc_roc: 0.7616 - auc_pr: 0.7621 - val_loss: 0.6208 - val_acc: 0.6536 - val_auc_roc: 0.7203 - val_auc_pr: 0.7098
Epoch 5/300
23/23 - 2s - loss: 0.5743 - acc: 0.7236 - auc_roc: 0.8006 - auc_pr: 0.8020 - val_loss: 0.6034 - val_acc: 0.6750 - val_auc_roc: 0.7480 - val_auc_pr: 0.7477
Epoch 6/300
23/23 - 1s - loss: 0.5504 - acc: 0.7567 - auc_roc: 0.821

In [65]:
# Evaluation on the held-out test data.
# NOTE: `model` is whatever the cross-validation cell left behind, i.e. the
# model trained on its last fold.

test_gen = generator.flow(labeled_test.index, targets=test_targets)
test_metrics = model.evaluate(test_gen)

# One line per metric, in the order reported by the model.
print("\nTest Set Metrics:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))


Test Set Metrics:
	loss: 1.3193
	acc: 0.6907
	auc_roc: 0.6976
	auc_pr: 0.1887


In [66]:
# Run the model on every node present in the label table (no targets needed).
all_nodes = series_classes.index
all_mapper = generator.flow(node_ids=all_nodes)
all_predictions = model.predict(x=all_mapper)

In [67]:
# Map the model outputs back to class labels with the fitted encoder.
node_predictions = target_encoding.inverse_transform(all_predictions)

# Predicted class next to the known label (which is -1 for unlabeled genes).
df = pd.DataFrame(data={"Predicted": node_predictions, "True": series_classes})

# Counts of the true labels are printed; counts of the predicted labels are
# the cell's displayed value.
print(df["True"].value_counts(), "\n")
df["Predicted"].value_counts()

 0.0    8838
-1.0    4915
 1.0     874
Name: True, dtype: int64 



0.0    9059
1.0    5568
Name: Predicted, dtype: int64

In [68]:
# Save both all_predictions and node_predictions so they can be compared.
# all_predictions is used to generate the plots of Fig. 3 of the article,
# about the distribution of nodes.

df_predictions = pd.DataFrame({"percent": all_predictions.squeeze(), "binary": node_predictions, "true": series_classes})

# Save to csv.
# The path is passed straight to pandas so it opens and closes the file
# itself. Writing through a text handle opened without newline='' can produce
# blank rows on Windows, and the extra f.close() after the `with` block was
# redundant.
pred_csv_file = 'predictions_iref_6b.csv'
df_predictions.to_csv(pred_csv_file)