In [None]:
import os
import pandas as pd
import numpy as np
np.random.seed(12)
from matplotlib import pyplot as plt
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import torch
torch.manual_seed(12)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler

import dgl
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from dgl.nn import EdgeGATConv
from dgl import batch
import dgl.function as fn

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, f1_score, precision_recall_curve
import optuna
from optuna import trial
from optuna.samplers import TPESampler

# 0. Check GPU

In [None]:
# Record the installed DGL version so the run can be reproduced later.
print(dgl.__version__)

In [None]:
# Shell escape: ask the NVIDIA driver which GPUs are visible to this runtime.
!nvidia-smi

Fri Jun 30 08:08:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (The original called the misspelled `torch.cuda.is_availabe`, which raises AttributeError.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Load Data

In [None]:
# Change ./GC to your dataset folder.
# The folder path has to be passed as a string; the bare `./GC` in the original
# was a SyntaxError.
dataset = dgl.data.CSVDataset('./GC')
len(dataset)

In [None]:
# Preview only the first few samples: printing every sample of the dataset
# floods the notebook output without adding information.
for index in range(min(5, len(dataset))):
    print(dataset[index])

In [None]:
# Each sample unpacks into a graph and the data stored alongside it;
# look at the structure of the first graph.
graph0, data0 = dataset[0]
print(graph0)

In [None]:
# Data that accompanies the first graph (the second element of a sample is
# what section 2 collects as the label).
print(data0)

In [None]:
# Give every node of every graph a self loop, keeping each graph paired with its data.
# NOTE: `dataset` is re-bound to a plain list here, so re-running this cell
# would add the self loops a second time.
self_dataset = [(dgl.add_self_loop(graph), data) for graph, data in dataset]
dataset = self_dataset

In [None]:
# check num_edges to see if self loops are added properly
# graph0 is re-bound to the self-looped graph here; objective() and section 5
# read the node/edge feature sizes from it.
graph0, data0 = dataset[0]
print(graph0)

# 2. Train/Test Split

In [None]:
# Collect the second element of every (graph, data) sample as that sample's label.
labels = np.array([label for _graph, label in dataset])
print(f"{labels}, with {len(labels)} labels")

In [None]:
# Target proportions of the train / validation / test subsets.
train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2


def _split_class_indices(indices, train_ratio, val_ratio, test_ratio, seed=12):
    """Split the sample indices of one class into (train, val, test) arrays.

    The split is done in two steps: first `train_ratio` of the indices go to
    the training part, then the remainder is divided between validation and
    test in the proportion val_ratio : test_ratio. The original cell hard-coded
    0.5 for the second step, so editing `val_ratio` / `test_ratio` had no effect.

    Args:
        indices: 1-D array of sample indices belonging to a single class.
        train_ratio, val_ratio, test_ratio: target proportions of each subset.
        seed: random_state passed to both `train_test_split` calls.

    Returns:
        Tuple (train_indices, val_indices, test_indices).
    """
    train_part, holdout = train_test_split(indices, train_size=train_ratio, random_state=seed)
    # Share of the hold-out part that belongs to validation (0.5 for 0.2 / 0.2).
    val_share = val_ratio / (val_ratio + test_ratio)
    val_part, test_part = train_test_split(holdout, train_size=val_share, random_state=seed)
    return train_part, val_part, test_part


neg_indices = np.where(labels == 0)[0]
pos_indices = np.where(labels == 1)[0]

# Split each class separately so every subset keeps the same class balance.
train_neg, val_neg, test_neg = _split_class_indices(neg_indices, train_ratio, val_ratio, test_ratio)
train_pos, val_pos, test_pos = _split_class_indices(pos_indices, train_ratio, val_ratio, test_ratio)

train_indices = np.concatenate([train_neg, train_pos])
val_indices = np.concatenate([val_neg, val_pos])
test_indices = np.concatenate([test_neg, test_pos])

In [None]:
# Materialise the three subsets by looking up each selected index in `dataset`.
train_dataset = [dataset[index] for index in train_indices]
val_dataset = [dataset[index] for index in val_indices]
test_dataset = [dataset[index] for index in test_indices]

In [None]:
# Sanity check: the three subset sizes should add up to the total and
# follow the train / val / test ratios chosen above.
print(f"Total dataset: {len(dataset)}, \nTrain:Val:Test = {len(train_dataset)}:{len(val_dataset)}:{len(test_dataset)}")

In [None]:
# Shuffle each subset in place (train, then val, then test) so that negatives
# and positives, which were concatenated class by class, are interleaved.
for subset in (train_dataset, val_dataset, test_dataset):
    np.random.shuffle(subset)

# 3. Model

In [None]:
class EdgeGATClassifier(nn.Module):
    """Graph-level classifier: two EdgeGATConv layers, mean readout, linear head.

    Args:
        in_feats: size of each node feature vector (read from ``ndata['feat']``).
        edge_feats: size of each edge feature vector (read from ``edata['feat']``).
        hidden_feats: per-head output size of both EdgeGATConv layers.
        out_feats: number of outputs of the final linear layer
            (1 for a single binary-classification logit).
        num_heads: number of attention heads in each EdgeGATConv layer.
    """

    def __init__(self, in_feats, edge_feats, hidden_feats, out_feats, num_heads):
        super().__init__()
        self.conv1 = EdgeGATConv(in_feats, edge_feats, hidden_feats, num_heads)
        self.conv2 = EdgeGATConv(hidden_feats, edge_feats, hidden_feats, num_heads)
        # Honour `out_feats`; the original hard-coded 1 and silently ignored the
        # argument. Existing callers pass 1, so their behaviour is unchanged.
        self.fc = nn.Linear(hidden_feats, out_feats)

    def forward(self, bg):
        """Compute one output row per graph in the (batched) graph `bg`.

        Args:
            bg: DGL graph (typically a batch) with node features in
                ``ndata['feat']`` and edge features in ``edata['feat']``.

        Returns:
            Output of the linear head applied to the mean-pooled node
            representation of each graph, with the last dimension squeezed
            when it has size 1 (i.e. when ``out_feats == 1``).
        """
        h = bg.ndata['feat']  # node features
        e = bg.edata['feat']  # edge features

        h = self.conv1(bg, h, e)
        h = F.leaky_relu(h)
        # Average over the attention-head axis (dim 1) and flatten, so the
        # second layer receives a single hidden vector per node.
        h = torch.mean(h, dim=1, keepdim=True)
        h = torch.flatten(h, start_dim=1)

        h = self.conv2(bg, h, e)
        h = F.leaky_relu(h)
        h = torch.mean(h, dim=1, keepdim=True)

        # local_scope() keeps the temporary 'h' field from leaking into the
        # caller's graph once the readout is done.
        with bg.local_scope():
            bg.ndata['h'] = h
            h = dgl.mean_nodes(bg, 'h')  # per-graph mean over nodes

        h = self.fc(h)
        h = torch.squeeze(h, dim=-1)
        return h

# 4. Training with Optuna
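- Each Optuna trial trains the model with sampled hyperparameters and is scored by its loss on the validation set (the `criterion`, `BCEWithLogitsLoss`); the study searches for the hyperparameters that minimize this loss.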

In [None]:
def objective(trial):
    """Optuna objective: train an EdgeGATClassifier and return its validation loss.

    Hyperparameters sampled from `trial`:
        hidden_dim    -> exponent e in [3, 6], used as 2 ** e
        num_heads     -> integer in [2, 5]
        learning_rate -> exponent e in [-5, -2], used as 10 ** e
        num_epochs    -> integer in {50, 60, ..., 100}
        batch_size    -> exponent e in [5, 8], used as 2 ** e

    The model is trained for `num_epochs` epochs on `train_dataset`. After every
    epoch the mean validation loss is reported to Optuna, so the study's pruner
    can stop unpromising trials early. (The original sampled `num_epochs` but
    trained for a single epoch only, and reported partial per-batch averages.)

    Args:
        trial: the `optuna.trial.Trial` being evaluated.

    Returns:
        Mean validation loss (BCEWithLogitsLoss) after the last epoch.

    Raises:
        optuna.TrialPruned: when the pruner decides to stop the trial.
        ValueError: when the validation dataloader yields no batches.
    """
    input_dim = graph0.ndata['feat'].shape[1]
    edge_dim = graph0.edata['feat'].shape[1]
    num_classes = 1

    hidden_dim = 2 ** trial.suggest_int('hidden_dim', 3, 6)
    num_heads = trial.suggest_int('num_heads', 2, 5)
    # Fixed typo: `susgest_int` raised AttributeError on every trial.
    learning_rate = 10 ** trial.suggest_int('learning_rate', -5, -2)
    # suggest_int(step=10) replaces the deprecated suggest_discrete_uniform and
    # yields an int that can be used directly in range().
    num_epochs = trial.suggest_int('num_epochs', 50, 100, step=10)
    batch_size = 2 ** trial.suggest_int('batch_size', 5, 8)

    # load splitted data
    train_dataloader = GraphDataLoader(train_dataset, batch_size=batch_size, drop_last=False)
    val_dataloader = GraphDataLoader(val_dataset, batch_size=batch_size, drop_last=False)

    # define model
    model = EdgeGATClassifier(input_dim, edge_dim, hidden_dim, num_classes, num_heads)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    average_val_loss = float('inf')
    for epoch in range(num_epochs):
        # train mode
        model.train()
        for batched_graph, labels in train_dataloader:
            preds = model(batched_graph)
            loss = criterion(preds.float(), labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluation mode
        model.eval()
        val_loss = 0.0
        num_val_batches = 0
        with torch.no_grad():
            for batched_graph, labels in val_dataloader:
                pred = model(batched_graph)
                val_loss += criterion(pred.float(), labels.float()).item()
                num_val_batches += 1

        if num_val_batches == 0:
            raise ValueError("validation dataloader produced no batches")
        average_val_loss = val_loss / num_val_batches

        # Report once per epoch, on the full validation set, so pruning
        # decisions compare like with like across trials.
        trial.report(average_val_loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return average_val_loss

In [None]:
# A seeded TPE sampler makes the sequence of suggested hyperparameters repeatable.
sampler = TPESampler(seed=12)
study = optuna.create_study(
    direction='minimize',  # objective() returns a loss
    sampler=sampler,
    study_name='EdgeGAT',
)
study.optimize(objective, n_trials=50)

In [None]:
# Best hyperparameters as stored by Optuna. hidden_dim, learning_rate and
# batch_size are stored as exponents (objective() uses 2 ** / 10 ** of them).
study.best_trial.params

In [None]:
# Lowest objective value (validation loss) found by the study.
study.best_value

# 5. With best params

In [None]:
# Fixed sizes: one output logit, feature widths read from the first graph.
num_classes = 1
input_dim = graph0.ndata['feat'].shape[1]
edge_dim = graph0.edata['feat'].shape[1]

# Decode the best trial's parameters the same way objective() does
# (powers of two / ten for the exponent-valued ones).
best_params = study.best_trial.params
hidden_dim = 2 ** best_params['hidden_dim']
num_heads = best_params['num_heads']
learning_rate = 10 ** best_params['learning_rate']
num_epochs = int(best_params['num_epochs'])
batch_size = 2 ** best_params['batch_size']

In [None]:
# One loader per subset, all with the tuned batch size and keeping the last partial batch.
train_dataloader, val_dataloader, test_dataloader = (
    GraphDataLoader(subset, batch_size=batch_size, drop_last=False)
    for subset in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Final model, loss and optimizer, built from the best hyperparameters.
model = EdgeGATClassifier(
    in_feats=input_dim,
    edge_feats=edge_dim,
    hidden_feats=hidden_dim,
    out_feats=num_classes,
    num_heads=num_heads,
)
criterion = nn.BCEWithLogitsLoss()  # expects raw logits, applies the sigmoid itself
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train for the tuned number of epochs; the progress bar shows the running
# mean of the batch losses within the current epoch.
with tqdm(range(num_epochs), desc='Training Progress', unit='epoch') as epoch_progress:
    for epoch in epoch_progress:
        model.train()
        running_loss = 0

        for batch_count, (batched_graph, labels) in enumerate(train_dataloader, start=1):
            optimizer.zero_grad()
            logits = model(batched_graph)
            batch_loss = criterion(logits.float(), labels.float())
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            mean_loss = running_loss / batch_count
            epoch_progress.set_postfix({'Loss' : f"{mean_loss:.4f}"})

# 6. Validation
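- The validation set is used to choose the decision threshold: the probability cutoff that maximizes the F1 score on the validation data, which is then used to turn predicted probabilities into class labels.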

In [None]:
def find_optimal_threshold(model, dataloader):
    """Find the probability threshold that maximizes F1 on `dataloader`.

    Runs the model in eval mode over every batch, converts the logits to
    probabilities with a sigmoid, computes the precision-recall curve and
    picks the threshold with the highest F1. Also prints the result and
    plots the curve with the chosen point highlighted.

    Args:
        model: module called as ``model(batched_graph)`` and returning logits.
        dataloader: iterable of ``(batched_graph, labels)`` pairs.

    Returns:
        The threshold (element of the array returned by
        `precision_recall_curve`) with the highest F1 score.

    Raises:
        ValueError: if `dataloader` yields no batches.
    """
    model.eval()
    all_pred = []
    all_labels = []
    with torch.no_grad():
        for batched_graph, labels in dataloader:
            all_pred.append(model(batched_graph))
            all_labels.append(labels.float())

    if not all_pred:
        raise ValueError("dataloader produced no batches")

    # scikit-learn expects 1-D host arrays; flatten so that column-shaped
    # outputs/labels are handled and move tensors off the GPU if needed.
    scores = torch.sigmoid(torch.cat(all_pred)).reshape(-1).cpu().numpy()
    targets = torch.cat(all_labels).reshape(-1).cpu().numpy()
    precision, recall, threshold = precision_recall_curve(targets, scores)

    # precision/recall have one more entry than threshold: the final
    # (precision=1, recall=0) point has no threshold, so it is excluded from
    # the search. Points with precision + recall == 0 get F1 = 0 instead of NaN.
    p, r = precision[:-1], recall[:-1]
    denom = p + r
    f1 = np.divide(2 * p * r, denom, out=np.zeros_like(denom), where=denom > 0)
    ix = int(np.argmax(f1))
    opt_thr = threshold[ix]
    print(f"Optimal Threshold : {opt_thr}, F1 Score : {f1[ix]}")

    plt.plot(recall, precision, marker=',', label='EdgeGAT')
    plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Optimal', linewidths=3)
    plt.title('Precision - Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()

    return opt_thr

In [None]:
# Choose the decision threshold on the validation set (not the test set),
# so the test metrics below are not tuned on the data they are measured on.
best_threshold = find_optimal_threshold(model, val_dataloader)

# 7. Test

In [None]:
def evaluate(model, dataloader, threshold):
    """Score `model` on `dataloader` using a fixed decision threshold.

    Args:
        model: module called as ``model(batched_graph)`` and returning logits.
        dataloader: iterable of ``(batched_graph, labels)`` pairs.
        threshold: probabilities >= threshold are predicted as class 1.

    Returns:
        Tuple ``(f1, accuracy, recall)``.

    Raises:
        ValueError: if `dataloader` yields no batches.
    """
    model.eval()
    all_pred = []
    all_labels = []
    with torch.no_grad():
        for batched_graph, labels in dataloader:
            all_pred.append(model(batched_graph))
            all_labels.append(labels.float())

    if not all_pred:
        raise ValueError("dataloader produced no batches")

    # Flatten both sides to 1-D host arrays. Without this, an (N, 1) vs (N,)
    # mismatch would silently broadcast to (N, N) in the accuracy comparison,
    # and GPU tensors could not be handed to scikit-learn.
    probs = torch.sigmoid(torch.cat(all_pred)).reshape(-1).cpu().numpy()
    targets = torch.cat(all_labels).reshape(-1).cpu().numpy()
    pred_labels = (probs >= threshold).astype(int)

    f1 = f1_score(targets, pred_labels)
    accuracy = float((pred_labels == targets).mean())
    recall = recall_score(targets, pred_labels)

    return f1, accuracy, recall

In [None]:
# Final held-out metrics, using the threshold selected on the validation set.
f1, accuracy, recall = evaluate(model, test_dataloader, best_threshold)
print(f"Accuracy : {accuracy:.4f}, Recall : {recall:.4f}, F1 : {f1:.4f}")