# Load DNS and mDNS datasets

In [14]:
import os
import sys

import numpy as np
import pandas as pd

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import torch
import torch_geometric.transforms as T

from src.utils import score
from src.loader import DNS

## Load Graphs

In [15]:
def kg_path(graph_name):
    """Return the relative path of the data directory for ``graph_name``.

    Args:
        graph_name: Name of the graph folder under ``../data`` (e.g. ``'DNS'``).

    Returns:
        The string ``'../data/<graph_name>'``.
    """
    return f'../data/{graph_name}'

#### mDNS

In [16]:
# dataset = DNS(root=kg_path('mDNS'), transform=T.Compose([T.NormalizeFeatures(), T.ToUndirected()]), balance_gt=True)
# data = dataset[0]
# data['domain_node']['test_mask'].unique(return_counts=True)
# # datadomain_node.test_mask

#### DNS

In [17]:
# Transform pipeline applied when a graph is read from the dataset:
# feature normalisation followed by conversion to an undirected graph.
dns_transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToUndirected(),
])
dataset = DNS(
    root=kg_path('DNS'),
    transform=dns_transform,
    balance_gt=True,
)
# The first (index 0) entry is the heterogeneous DNS graph; leaving it as the
# last expression makes the notebook display its summary.
data = dataset[0]
data

Remove parallel edges: type
similar    50910
dtype: int64


HeteroData(
  [1mip_node[0m={
    num_nodes=73593,
    x=[73593, 2]
  },
  [1mdomain_node[0m={
    num_nodes=373475,
    x=[373475, 10],
    y=[373475],
    train_mask=[373475],
    test_mask=[373475],
    val_mask=[373475]
  },
  [1m(domain_node, apex, domain_node)[0m={ edge_index=[2, 178944] },
  [1m(domain_node, resolves, ip_node)[0m={ edge_index=[2, 730438] },
  [1m(domain_node, similar, domain_node)[0m={ edge_index=[2, 155356] },
  [1m(ip_node, rev_resolves, domain_node)[0m={ edge_index=[2, 730438] }
)

In [18]:
# Rebind `data` to the homogeneous view of the graph; later cells read
# `data.x` from this object.
data = dataset.to_homogeneous()
# Show how many nodes are inside / outside the train and test splits.
for mask_name in ('train_mask', 'test_mask'):
    print(getattr(data, mask_name).unique(return_counts=True))

(tensor([False,  True]), tensor([442682,   4386]))
(tensor([False,  True]), tensor([444437,   2631]))


In [19]:
from torch_geometric.nn import GATConv, HeteroConv, Linear


import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GCNConv, SAGEConv
import sys
sys.path.append('../')


class GNN(torch.nn.Module):
    """Stack of graph-convolution layers followed by a linear classifier.

    Args:
        num_features: Size of the input node features.
        dim: Hidden size shared by every convolution layer.
        num_classes: Number of output logits per node.
        num_layers: Total number of convolution layers; the first one is always
            created, so values below 2 yield a single convolution.
        model_type: ``'sage'`` selects ``SAGEConv``, ``'gcn'`` selects
            ``GCNConv``; any other value falls back to ``GATConv``.
    """

    def __init__(self, num_features, dim=16, num_classes=2, num_layers=2, model_type='gcn'):
        super().__init__()

        # Resolve the layer class once instead of repeating the choice per layer.
        if model_type == 'sage':
            conv_cls = SAGEConv
        elif model_type == 'gcn':
            conv_cls = GCNConv
        else:
            conv_cls = GATConv

        # Input layer maps raw features to the hidden size.
        self.conv1 = conv_cls(num_features, dim)
        # Remaining (num_layers - 1) hidden-to-hidden layers.
        self.gcs = nn.ModuleList(conv_cls(dim, dim) for _ in range(1, num_layers))
        self.num_layers = num_layers
        self.lin = Linear(dim, num_classes)

    def forward(self, x, edge_index, data=None, save_embedding=False):
        """Return per-node class logits.

        ``data`` and ``save_embedding`` are accepted but not used.
        """
        hidden = self.conv1(x, edge_index)
        hidden = F.dropout(F.relu(hidden), training=self.training)
        for layer_idx in range(self.num_layers - 1):
            hidden = self.gcs[layer_idx](hidden, edge_index)
            hidden = F.dropout(F.relu(hidden), training=self.training)
        return self.lin(hidden)


    


In [22]:
# Index of the CUDA device that `experiment` selects via torch.cuda.set_device.
cuda_device = 3
# Seed PyTorch's global RNG; it is set once here and not re-seeded per run.
torch.manual_seed(42)
# `score` is applied to the test-split predictions inside `experiment`.
from src.utils import score

def train(model, data, optimizer):
    """Perform one full-graph optimisation step.

    The loss is the cross-entropy between the model output and ``data.y``,
    restricted to the nodes selected by ``data.train_mask``.

    Returns:
        The loss value of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(data.x, data.edge_index)
    train_nodes = data.train_mask
    step_loss = F.cross_entropy(logits[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()


@torch.no_grad()
def test(model, data):
    model.eval()
    pred = model(data.x, data.edge_index).argmax(dim=-1)

    accs = []
    for split in ['train_mask', 'val_mask']:
        mask = data[split]
        acc = (pred[mask] == data.y[mask]).sum() / mask.sum()
        accs.append(float(acc))
    return accs

def experiment(model, start, end, test_list, model_type):
    """Train ``model`` on the DNS graph, score it on the test split and log the result.

    The DNS dataset is loaded from ``../data/DNS`` on every call and converted to
    a homogeneous graph. The model is trained full-batch for 201 epochs with Adam,
    printing loss and train/validation accuracy every 20 epochs. The test-split
    scores are then appended as one CSV row to ``resultsdns_copy.csv`` (the model
    type followed by the values of the dict returned by ``score``) and printed.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns per-node
            class logits.
        start: Accepted for call compatibility; not used.
        end: Accepted for call compatibility; not used.
        test_list: Accepted for call compatibility; not used.
        model_type: Label written as the first column of the CSV row.

    Returns:
        None.
    """
    dataset = DNS(
        root='../data/DNS',
        transform=T.Compose([T.NormalizeFeatures(), T.ToUndirected()]),
        balance_gt=True,
    )
    data = dataset.to_homogeneous()  # training data

    # Bind `device` on both paths so the function also works on CPU-only
    # machines (it is needed regardless of whether CUDA is present).
    if torch.cuda.is_available():
        # Select the configured GPU; a plain 'cuda' device then refers to it.
        torch.cuda.set_device(cuda_device)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    data, model = data.to(device), model.to(device)

    with torch.no_grad():  # Initialize lazy modules.
        model(data.x, data.edge_index)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)

    for epoch in range(0, 201):
        loss = train(model, data, optimizer)
        train_acc, val_acc = test(model, data)
        if epoch % 20 == 0:
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
                  f'Val: {val_acc:.4f}')

    # Final evaluation on the held-out test split of the same graph.
    model.eval()
    with torch.no_grad():
        pred = model(data.x, data.edge_index).argmax(dim=-1)
    mask = data['test_mask']
    scores = score(pred[mask], data.y[mask])

    # Append (never overwrite) so repeated runs accumulate in one file.
    with open("resultsdns_copy.csv", "a") as logger:
        logger.write(model_type + ',')
        logger.write(",".join(str(x) for x in scores.values()))
        logger.write('\n')

    for metric, val in scores.items():
        print(metric, ':{:.4f}'.format(val))
    
# Train and evaluate five freshly initialised models per architecture.
for model_type in ['gcn', 'sage']:
    for i in range(5):
        # Only the feature width of the notebook-level `data` is needed here;
        # `experiment` loads its own copy of the dataset.
        model = GNN(data.x.size(1), dim=64, num_classes=2,
                    num_layers=2, model_type=model_type)
        # The values derived from `i` are forwarded but not used by `experiment`.
        experiment(model, i, i + 6, [i + 7, i + 8], model_type)

Remove parallel edges: type
similar    50910
dtype: int64
Epoch: 000, Loss: 0.7016, Train: 0.5768, Val: 0.5801
Epoch: 020, Loss: 0.5693, Train: 0.7004, Val: 0.6986
Epoch: 040, Loss: 0.5234, Train: 0.7503, Val: 0.7504
Epoch: 060, Loss: 0.4901, Train: 0.7608, Val: 0.7635
Epoch: 080, Loss: 0.4654, Train: 0.7695, Val: 0.7675
Epoch: 100, Loss: 0.4683, Train: 0.7775, Val: 0.7726
Epoch: 120, Loss: 0.4611, Train: 0.7775, Val: 0.7738
Epoch: 140, Loss: 0.4548, Train: 0.7816, Val: 0.7818
Epoch: 160, Loss: 0.4516, Train: 0.7823, Val: 0.7795
Epoch: 180, Loss: 0.4575, Train: 0.7811, Val: 0.7789
Epoch: 200, Loss: 0.4527, Train: 0.7839, Val: 0.7823
tn, fp, fn, tp 1074 227 390 940
acc :0.7655
f1 :0.7647
auc :0.7661
prec :0.8055
recall :0.7068
fpr :0.1745
Remove parallel edges: type
similar    50910
dtype: int64
Epoch: 000, Loss: 0.7042, Train: 0.6835, Val: 0.6860
Epoch: 020, Loss: 0.5704, Train: 0.6956, Val: 0.6991
Epoch: 040, Loss: 0.5340, Train: 0.7485, Val: 0.7442
Epoch: 060, Loss: 0.4970, Train: 0.

In [20]:
# cuda_device = 3
# torch.manual_seed(42)

# model = GNN(data.x.size(1), dim=64, num_classes=2,
#                   num_layers=2, model_type='gcn')
# if torch.cuda.is_available():
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     torch.cuda.set_device(cuda_device)

#     data, model = data.to(device), model.to(device)

# with torch.no_grad():  # Initialize lazy modules.
#     out = model(data.x, data.edge_index)

# optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)

# def train(model, optimizer, data):
#     model.train()
#     optimizer.zero_grad()
#     out = model(data.x, data.edge_index)
#     mask = data.train_mask
#     loss = F.cross_entropy(out[mask], data.y[mask])
#     loss.backward()
#     optimizer.step()
#     return float(loss)


# @torch.no_grad()
# def test():
#     model.eval()
#     pred = model(data.x, data.edge_index).argmax(dim=-1)

#     accs = []
#     for split in ['train_mask', 'val_mask', 'test_mask']:
#         mask = data[split]
#         acc = (pred[mask] == data.y[mask]).sum() / mask.sum()
#         accs.append(float(acc))
#     return accs


# for epoch in range(1, 201):
#     loss = train()
#     train_acc, val_acc, test_acc = test()
#     if epoch % 20 == 0:
#         print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, '
#               f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')
    
# model.eval()
# with torch.no_grad():
#     pred = model(data.x, data.edge_index).argmax(dim=-1)
# mask = data['test_mask']
# scores = score(pred[mask],data.y[mask])
# with open("resultsdns_copy.csv", "a") as logger:
#             logger.write('gcn,')
#             logger.write(",".join(str(x) for x in scores.values()))
#             logger.write('\n')
# for metric, score in scores.items():
#     print(metric, ':{:.2f}'.format(score))

Epoch: 020, Loss: 0.5827, Train: 0.7020, Val: 0.7003, Test: 0.7001
Epoch: 040, Loss: 0.5217, Train: 0.7540, Val: 0.7544, Test: 0.7480
Epoch: 060, Loss: 0.4907, Train: 0.7620, Val: 0.7658, Test: 0.7613
Epoch: 080, Loss: 0.4746, Train: 0.7693, Val: 0.7698, Test: 0.7662
Epoch: 100, Loss: 0.4783, Train: 0.7727, Val: 0.7744, Test: 0.7723
Epoch: 120, Loss: 0.4774, Train: 0.7729, Val: 0.7772, Test: 0.7685
Epoch: 140, Loss: 0.4732, Train: 0.7763, Val: 0.7801, Test: 0.7739
Epoch: 160, Loss: 0.4745, Train: 0.7718, Val: 0.7766, Test: 0.7681
Epoch: 180, Loss: 0.4666, Train: 0.7756, Val: 0.7795, Test: 0.7704
Epoch: 200, Loss: 0.4652, Train: 0.7763, Val: 0.7783, Test: 0.7674
tn, fp, fn, tp 1079 222 390 940
acc :0.77
f1 :0.77
auc :0.77
prec :0.81
recall :0.71
fpr :0.17
