<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-packages" data-toc-modified-id="Import-packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import packages</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Build-a-graph-data" data-toc-modified-id="Build-a-graph-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Build a graph data</a></span></li><li><span><a href="#Define-the-GNN-model" data-toc-modified-id="Define-the-GNN-model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Define the GNN model</a></span></li><li><span><a href="#Model-training" data-toc-modified-id="Model-training-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Model training</a></span></li><li><span><a href="#predict-results-for-unkown-nodes" data-toc-modified-id="predict-results-for-unkown-nodes-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>predict results for unkown nodes</a></span></li></ul></div>

# Import packages 

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch_geometric
import torch.nn.functional as F
import torch.utils.data

# Load the data

In [2]:
TRAINDATA_DIR = "./TrainData/"

In [3]:
def get_user_data(user_idx):
    train_data_path  = TRAINDATA_DIR+'WorkerData_{}.csv'.format(user_idx)
    t_data = pd.read_csv(train_data_path)
    # read edges data
    train_edges_path = TRAINDATA_DIR+'WorkerDataEdges_{}.csv'.format(user_idx)
    t_edges = pd.read_csv(train_edges_path)
    return t_data,t_edges

In [4]:
data, edges = get_user_data(0)

In [5]:
data.head(5)

Unnamed: 0,txId,class,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,Local_feature_8,...,Aggregate_feature_63,Aggregate_feature_64,Aggregate_feature_65,Aggregate_feature_66,Aggregate_feature_67,Aggregate_feature_68,Aggregate_feature_69,Aggregate_feature_70,Aggregate_feature_71,Aggregate_feature_72
0,0,0,-0.113002,-0.026215,,1.135523,-1.201369,0.001483,0.830915,-0.080147,...,-0.003175,-0.004194,-0.02606,-0.176617,-0.120613,0.162968,1.063787,-0.170322,-0.093145,-0.216814
1,1,1,-0.113002,-0.026195,5.258644,1.135523,-1.201369,0.001483,1.320101,-0.080147,...,-0.003175,-0.004194,-0.02606,-0.176617,-0.120613,0.130886,1.063787,-0.188369,-0.093145,-0.193143
2,2,2,-0.113002,,-0.139726,,1.018602,0.001483,-0.26122,-0.080147,...,-0.003175,-0.004194,-0.02606,-0.176617,-0.120613,0.130491,-0.048171,-0.188716,-0.093145,-0.193143
3,3,1,-0.113002,-0.026203,1.154065,-1.084907,1.018602,0.001483,4.410493,-0.080147,...,-0.003175,-0.004194,-0.02606,-0.176617,-0.120613,,-1.160129,-0.18377,-0.093145,-0.216814
4,4,1,-0.113002,,8.645669,0.292393,1.018602,0.001483,2.27628,-0.080147,...,-0.003175,-0.004194,-0.02606,-0.176617,0.079617,,0.726534,0.599062,-0.076243,-0.216814


In [6]:
data['class'].value_counts() # 2 for unknown, 1 for benign, 2 for malicous

2    3101
1     845
0     329
Name: class, dtype: int64

# Build a graph data

In [7]:
data = data.fillna(data.mean())
# get edge index from edges
pyg_data = {}
pyg_data['edge_index'] = torch.LongTensor(edges.to_numpy()).t().contiguous()
pyg_data['x'] = torch.tensor(data.iloc[:, 2:].to_numpy(), dtype=torch.float)
y = torch.LongTensor(data.iloc[:, 1].to_numpy())
pyg_data['y'] = y
# 0 for illicit, 1 for licit, 2 for unknown, 3 for to be predicted.
pyg_data['train_mask'] = (y!=2)
pyg_data['test_mask'] = (y==2)
pyg_data['num_classes'] = 2
pyg_data['num_node_features'] = pyg_data['x'].size(1)

graph_data = torch_geometric.data.Data.from_dict(pyg_data)

In [8]:
graph_data

Data(x=[4275, 165], edge_index=[2, 4541], y=[4275], train_mask=[4275], test_mask=[4275], num_classes=2, num_node_features=165)

# Define the GNN model

In [9]:
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(graph_data.num_node_features, 200)
        self.conv2 = GCNConv(200, graph_data.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Model training

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
graph_data = graph_data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in tqdm(range(200)):
    optimizer.zero_grad()
    out = model(graph_data)
    loss = F.nll_loss(out[graph_data.train_mask], graph_data.y[graph_data.train_mask])
    loss.backward()
    optimizer.step()

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




# predict results for unkown nodes

In [74]:
model.eval()
pred = model(graph_data).argmax(dim=1)

In [75]:
1 - pred[graph_data.test_mask].sum().item()/len(graph_data.test_mask)

0.3939181286549708