# Load DNS and mDNS datasets

In [1]:
import os
import sys

import numpy as np
import pandas as pd

# Make the notebook's working directory importable so the project-local
# `libs` package resolves regardless of how the kernel was launched.
module_path = os.path.abspath('.')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

import torch
import torch_geometric.transforms as T

from libs.utils import score
from libs.loader import DNS

## Load Graphs

In [2]:
def kg_path(graph_name):
    """Return the relative directory that holds the knowledge graph `graph_name`.

    Args:
        graph_name: Name of the graph folder under ``./data``.

    Returns:
        The string ``./data/<graph_name>``.
    """
    return f'./data/{graph_name}'

#### mDNS

In [3]:
# Disabled: the mDNS variant is not loaded in this run; only the DNS graph in the next section is used.
# dataset = DNS(root=kg_path('mDNS'), transform=T.Compose([T.NormalizeFeatures(), T.ToUndirected()]), balance_gt=True)
# data = dataset[0]
# # data['domain_node']['test_mask'].unique(return_counts=True)
# # # datadomain_node.test_mask

#### DNS

In [4]:
# Build the DNS dataset through the project loader. Each graph is passed
# through feature normalisation and then made undirected.
# NOTE(review): `balance_gt=True` presumably balances the ground-truth labels
# (the captured output reports equal malicious/benign counts) — confirm in libs.loader.
dataset = DNS(
    root=kg_path('DNS'),
    transform=T.Compose([
        T.NormalizeFeatures(),
        T.ToUndirected(),
    ]),
    balance_gt=True,
)
data = dataset[0]

# Display the feature-matrix size of the 'domain_node' node type.
data['domain_node'].x.size()

Remove parallel edges: type
similar    50910
dtype: int64
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
Mal nodes [     3     17     25 ... 373420 373431 373464]
Mal nodes 4386
Ben nodes 4386
8772 [359268 293963 301564 ...  59017  83459  74034]


torch.Size([373475, 10])

In [5]:
# Replace the per-type graph with the loader's homogeneous view; the RGCN
# below consumes a single `x`, `edge_index` and `edge_type`.
data = dataset.to_homogeneous()

# Report how many nodes fall inside / outside the train and test masks.
for split_name in ('train_mask', 'test_mask'):
    print(getattr(data, split_name).unique(return_counts=True))

# Display the number of edges per relation id.
data.edge_type.unique(return_counts=True)

(tensor([False,  True]), tensor([442682,   4386]))
(tensor([False,  True]), tensor([444437,   2631]))


(tensor([0, 1, 2]), tensor([ 89472, 730438,  77678]))

In [6]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import RGCNConv, Linear


class RGCN(torch.nn.Module):
    """Node classifier made of stacked ``RGCNConv`` layers and a linear head.

    Args:
        num_features: Size of each input node feature vector.
        dim: Hidden width shared by every convolution layer.
        num_classes: Number of output logits per node.
        num_layers: Total number of ``RGCNConv`` layers. The first maps
            ``num_features -> dim``; the remaining ``num_layers - 1`` map
            ``dim -> dim``.
        num_relations: Number of distinct edge types handed to every
            ``RGCNConv``. Defaults to 3, the value that used to be hard-coded,
            so existing callers are unaffected.
    """

    def __init__(self, num_features, dim=16, num_classes=2, num_layers=2,
                 num_relations=3):
        super().__init__()

        self.num_layers = num_layers
        # Construction order (conv1, extra convs, head) is kept so parameter
        # initialisation consumes the RNG in the same sequence as before.
        self.conv1 = RGCNConv(num_features, dim, num_relations=num_relations)
        self.gcs = nn.ModuleList([
            RGCNConv(dim, dim, num_relations=num_relations)
            for _ in range(1, num_layers)
        ])
        self.lin = Linear(dim, num_classes)

    def forward(self, x, edge_index, edge_type):
        """Return per-node class logits.

        Every convolution is followed by ReLU and dropout (dropout is only
        active while the module is in training mode); the linear head is
        applied to the final hidden representation.
        """
        for conv in (self.conv1, *self.gcs):
            x = F.relu(conv(x, edge_index, edge_type))
            x = F.dropout(x, training=self.training)
        return self.lin(x)


    
# Seed *before* the model is built: parameter initialisation draws from the
# global torch RNG, so seeding only in a later cell leaves the initial weights
# (and therefore the reported metrics) non-reproducible across kernel restarts.
torch.manual_seed(42)

# The input width is taken from the homogeneous feature matrix.
model = RGCN(data.x.size(1), dim=64, num_classes=2, num_layers=2)

model

RGCN(
  (conv1): RGCNConv(12, 64, num_relations=3)
  (gcs): ModuleList(
    (0): RGCNConv(64, 64, num_relations=3)
  )
  (lin): Linear(64, 2, bias=True)
)

In [7]:
# Preferred GPU index; only honoured when that device actually exists.
cuda_device = 2
torch.manual_seed(42)

# Always bind `device`, so later cells can rely on it on CPU-only hosts too.
if torch.cuda.is_available():
    # Fall back to GPU 0 rather than raising "invalid device ordinal" on
    # machines that expose fewer GPUs than the preferred index assumes.
    if cuda_device >= torch.cuda.device_count():
        cuda_device = 0
    torch.cuda.set_device(cuda_device)
    device = torch.device('cuda', cuda_device)
else:
    device = torch.device('cpu')

data, model = data.to(device), model.to(device)

# One warm-up forward pass without gradients (kept from the original:
# "Initialize lazy modules"). The model is still in training mode here, so
# dropout consumes RNG state; removing this pass would change later results.
with torch.no_grad():
    out = model(data.x, data.edge_index, data.edge_type)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)


def train():
    """Run one full-graph optimisation step and return the loss as a float.

    Uses the notebook-level `model`, `optimizer` and `data`; the cross-entropy
    loss is computed only on nodes selected by `data.train_mask`.
    """
    model.train()
    optimizer.zero_grad()

    train_nodes = data.train_mask
    logits = model(data.x, data.edge_index, data.edge_type)
    step_loss = F.cross_entropy(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()


@torch.no_grad()
def test():
    model.eval()
    pred = model(data.x, data.edge_index, data.edge_type).argmax(dim=-1)

    accs = []
    for split in ['train_mask', 'val_mask', 'test_mask']:
        mask = data[split]
        acc = (pred[mask] == data.y[mask]).sum() / mask.sum()
        accs.append(float(acc))
    return accs


# Train for 200 full-graph epochs and log accuracies every 20th epoch.
for epoch in range(1, 201):
    loss = train()
    # Evaluate only on the epochs that are reported: test() runs in eval mode
    # under no_grad, so skipping it elsewhere does not affect training.
    if epoch % 20 == 0:
        train_acc, val_acc, test_acc = test()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
              f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

# Final evaluation on the held-out test nodes with the project's `score` helper.
model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index, data.edge_type).argmax(dim=-1)
mask = data['test_mask']
scores = score(pred[mask], data.y[mask])
# The loop variable must not be called `score`: that would rebind the imported
# helper to a float and make this cell fail when it is executed a second time.
for metric, value in scores.items():
    print(metric, ':{:.2f}'.format(value))

Epoch: 020, Loss: 0.5196, Train: 0.7617, Val: 0.7578, Test: 0.7651
Epoch: 040, Loss: 0.5006, Train: 0.7656, Val: 0.7641, Test: 0.7689
Epoch: 060, Loss: 0.4876, Train: 0.7684, Val: 0.7652, Test: 0.7700
Epoch: 080, Loss: 0.4771, Train: 0.7688, Val: 0.7675, Test: 0.7704
Epoch: 100, Loss: 0.4702, Train: 0.7699, Val: 0.7704, Test: 0.7735
Epoch: 120, Loss: 0.4704, Train: 0.7736, Val: 0.7755, Test: 0.7765
Epoch: 140, Loss: 0.4680, Train: 0.7779, Val: 0.7801, Test: 0.7796
Epoch: 160, Loss: 0.4616, Train: 0.7761, Val: 0.7789, Test: 0.7754
Epoch: 180, Loss: 0.4584, Train: 0.7759, Val: 0.7858, Test: 0.7784
Epoch: 200, Loss: 0.4558, Train: 0.7827, Val: 0.7903, Test: 0.7837
tn, fp, fn, tp 1099 222 347 963
tn, fp, fn, tp 1099 222 347 963
acc :0.78
f1 :0.78
auc :0.78
prec :0.81
recall :0.74
fpr :0.17
