# GCN Experiments

In [1]:
import os
import sys

import numpy as np
import pandas as pd

module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

import torch
import torch_geometric as pyg
from torch_geometric.nn import GCNConv, Linear
from torch.nn import functional as F

from src import temporal_loader_v2 as tl
from src.utils import to_homogeneous, score

In [2]:
# Index of the CUDA device selected via torch.cuda.set_device() in experiment()
# when a GPU is available.
cuda_device = 4
# Fix the random seed so repeated runs of the notebook are comparable.
pyg.seed_everything(42)

### Model

In [3]:
class GNN(torch.nn.Module):
    """Stack of ``GCNConv`` layers followed by a linear classification head.

    Args:
        num_features: Size of each input node feature vector.
        dim: Width of every hidden graph-convolution layer.
        num_classes: Number of output logits per node.
        num_layers: Total number of graph-convolution layers. The first one
            is ``conv1``; the remaining ``num_layers - 1`` live in ``gcs``.
        model_type: Accepted for call-site compatibility; not used here.
    """

    def __init__(self, num_features, dim=16, num_classes=2, num_layers=2, model_type='gcn'):
        super().__init__()

        self.num_layers = num_layers
        # Input layer maps raw features to the hidden width.
        self.conv1 = GCNConv(num_features, dim)
        # Remaining hidden layers all keep the hidden width.
        self.gcs = torch.nn.ModuleList(
            [GCNConv(dim, dim) for _ in range(num_layers - 1)]
        )
        # Per-node classifier on top of the last hidden representation.
        self.lin = Linear(dim, num_classes)

    def forward(self, x, edge_index, data=None, save_embedding=False):
        """Return per-node class logits.

        Every graph convolution is followed by ReLU and dropout (dropout uses
        ``F.dropout``'s default probability and is only active in training
        mode). ``data`` and ``save_embedding`` are accepted but unused.
        """
        for conv in (self.conv1, *self.gcs):
            hidden = F.relu(conv(x, edge_index))
            x = F.dropout(hidden, training=self.training)
        return self.lin(x)
    

def train(model, data, optimizer):
    """Perform one full-batch optimisation step and return the loss.

    Args:
        model: Callable module invoked as ``model(data.x, data.edge_index)``.
        data: Object exposing ``x``, ``edge_index``, ``y`` and ``train_mask``.
        optimizer: Optimizer whose gradients are reset and which is stepped once.

    Returns:
        The cross-entropy loss over the nodes selected by ``data.train_mask``,
        as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x, data.edge_index)
    selected = data.train_mask
    # Only the nodes in the training mask contribute to the loss.
    loss = F.cross_entropy(logits[selected], data.y[selected])

    loss.backward()
    optimizer.step()
    return loss.item()


@torch.no_grad()
def test(model, data):
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)

    accs = []
    for split in ['train_mask', 'val_mask']:
        mask = data[split]
        acc = (pred[mask] == data.y[mask]).sum() / mask.sum()
        accs.append(float(acc))
    return accs

In [6]:
def experiment(start, end, test_list, model_type):
    """Train a GNN on one slice of the DNS dataset and log test scores.

    The function builds a ``tl.DNS`` dataset, trains a two-layer ``GNN`` for
    201 epochs on the homogeneous training graph, then evaluates the model on
    every graph in ``dataset.test_data``. For each test graph one row
    (``model_type,start,end,index`` followed by the score values) is appended
    to ``results_copy.csv`` and the scores are printed.

    Args:
        start: Forwarded to ``tl.DNS`` as ``start`` (presumably the first
            time step of the training window -- confirm against the loader).
        end: Forwarded to ``tl.DNS`` as ``end``.
        test_list: Forwarded to ``tl.DNS`` as ``test_list``.
        model_type: Label passed to ``GNN`` and written to the results file.
    """
    dataset = tl.DNS(root='../data/DNS_2m', start=start, end=end, test_list=test_list,
                     balance_gt=False, domain_file='domains2.csv')
    data = to_homogeneous(dataset.train_data)  # training data

    model = GNN(data.x.size(1), dim=64, num_classes=2, num_layers=2, model_type=model_type)

    # Bind `device` on every path: it is needed again for the test graphs
    # below, and leaving it undefined on CPU-only machines raised NameError.
    if torch.cuda.is_available():
        torch.cuda.set_device(cuda_device)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    data, model = data.to(device), model.to(device)

    with torch.no_grad():  # Initialize lazy modules.
        model(data.x, data.edge_index)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)

    for epoch in range(201):
        loss = train(model, data, optimizer)
        # Accuracies are only reported every 20 epochs, so only evaluate then.
        if epoch % 20 == 0:
            train_acc, val_acc = test(model, data)
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
                  f'Val: {val_acc:.4f}')

    model.eval()
    for index, test_data in enumerate(dataset.test_data):
        test_data = to_homogeneous(test_data).to(device)
        with torch.no_grad():
            pred = model(test_data.x, test_data.edge_index).argmax(dim=-1)
        mask = test_data['val_mask']
        scores = score(pred[mask], test_data.y[mask])

        # Append one CSV row per test graph: run identifiers, then the scores.
        with open("results_copy.csv", "a") as logger:
            logger.write("{},{},{},{},".format(model_type, start, end, index))
            logger.write(",".join(str(x) for x in scores.values()))
            logger.write('\n')

        for metric, val in scores.items():
            print(metric, ':{:.4f}'.format(val))
    
# Slide the training window [offset, offset + 6] over five starting points and
# evaluate each run on the two indices that follow the window.
for model_type in ('gcn',):
    for offset in range(5):
        window_end = offset + 6
        experiment(offset, window_end, [window_end + 1, window_end + 2], model_type)

Total labeled 897635
Labeled node count for 0, 6: 31778
Labeled node count for 0, 7: 2610
Labeled node count for 0, 8: 2083
Epoch: 000, Loss: 0.7043, Train: 0.7094, Val: 0.7169
Epoch: 020, Loss: 0.5079, Train: 0.7903, Val: 0.7954
Epoch: 040, Loss: 0.4562, Train: 0.8026, Val: 0.8066
Epoch: 060, Loss: 0.4276, Train: 0.8075, Val: 0.8121
Epoch: 080, Loss: 0.4195, Train: 0.8093, Val: 0.8118
Epoch: 100, Loss: 0.4176, Train: 0.8101, Val: 0.8127
Epoch: 120, Loss: 0.4156, Train: 0.8107, Val: 0.8131
Epoch: 140, Loss: 0.4144, Train: 0.8112, Val: 0.8135
Epoch: 160, Loss: 0.4123, Train: 0.8115, Val: 0.8132
Epoch: 180, Loss: 0.4121, Train: 0.8115, Val: 0.8137
Epoch: 200, Loss: 0.4127, Train: 0.8112, Val: 0.8132
tn, fp, fn, tp 1307 298 159 846
acc :0.8249
f1 :0.8266
auc :0.8281
prec :0.7395
recall :0.8418
fpr :0.1857
mi_f1 :0.8249
ma_f1 :0.8193
tn, fp, fn, tp 1059 250 116 658
acc :0.8243
f1 :0.8266
auc :0.8296
prec :0.7247
recall :0.8501
fpr :0.1910
mi_f1 :0.8243
ma_f1 :0.8175
Total labeled 897635
La