In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [30]:
# File names of the pre-split CSV files, resolved against the working directory.
train_name, test_name = "30s_split_Train_data.csv", "30s_split_Test_data.csv"

# Read the training split first, then the testing split.
train_data, test_data = [pd.read_csv(csv_path) for csv_path in (train_name, test_name)]

In [31]:
def make_label(df):
    """Derive the ``Label`` and ``Attack`` columns from the raw ``label`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a ``label`` column whose value is ``'normal'``
        for benign rows and any other value for attack rows.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``label`` has been removed and two columns have
        been appended: ``Label`` (0 where ``label == 'normal'``, 1 otherwise)
        and ``Attack`` (a copy of the original ``label`` values).  The input
        frame is not modified.
    """
    # Vectorised comparison: works for any index (not only 0..n-1) and avoids
    # one ``.loc`` lookup per row.
    is_attack = df['label'] != 'normal'
    labelled = df.assign(Label=is_attack.astype('int64'), Attack=df['label'])
    return labelled.drop(columns="label")

In [32]:
# Attach the Label/Attack columns to both splits (training first, then testing).
train_data, test_data = [make_label(split_frame) for split_frame in (train_data, test_data)]

In [33]:
# Strip surrounding whitespace from every column name of the training frame.
train_data.rename(columns=str.strip, inplace=True)

# Cast the endpoint columns (IP addresses and ports) to strings, one at a time.
for endpoint_col in ("#1:Client IP", "3:Client Port", "2:Server IP", "4:Server Port"):
    train_data[endpoint_col] = train_data[endpoint_col].apply(str)

In [34]:
# Strip surrounding whitespace from every column name of the testing frame.
test_data.rename(columns=str.strip, inplace=True)

# Cast the endpoint columns (IP addresses and ports) to strings, one at a time.
for endpoint_col in ("#1:Client IP", "3:Client Port", "2:Server IP", "4:Server Port"):
    test_data[endpoint_col] = test_data[endpoint_col].apply(str)

In [35]:
# Remove the client/server port columns from both splits, modifying each frame in place.
for split_frame in (train_data, test_data):
    split_frame.drop(columns=["3:Client Port", "4:Server Port"], inplace=True)

In [36]:
# train_data.drop(columns=[" 6:Start Time", ' 3:Client Port', ' 4:Server Port'], inplace=True)
# test_data.drop(columns=[" 6:Start Time", ' 3:Client Port', ' 4:Server Port'], inplace=True)

In [39]:
# Separate the two target columns from the remaining feature columns of each split.
target_cols = ['Label', "Attack"]
X_train, y_train = train_data.drop(columns=target_cols), train_data[target_cols]
X_test, y_test = test_data.drop(columns=target_cols), test_data[target_cols]

In [40]:
# Turn +/-inf into NaN and then every NaN into 0, in place, for both feature frames.
for feature_frame in (X_train, X_test):
    feature_frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    feature_frame.fillna(0, inplace=True)

In [41]:
scaler = StandardScaler()
# Scale every column except the two IP-address columns and the start time.
# The frame's own column order is kept (de-duplicated via dict.fromkeys):
# going through set() made the order depend on string hashing, which can
# differ from one interpreter session to the next.
cols_to_norm = list(dict.fromkeys(
    X_train.drop(columns=["#1:Client IP", "2:Server IP", "6:Start Time"]).columns))
# Scaling statistics are estimated on the training split only.
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
# Pack every column from position 5 onward into one list per row, stored in 'h'.
# NOTE(review): the offset 5 is positional - confirm it still selects the
# intended feature columns now that the port columns have been dropped.
X_train['h'] = X_train.iloc[:, 5:].values.tolist()

# Transform on testing set (re-using the statistics fitted on the training set)
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 5:].values.tolist()

# Re-attach the target columns next to the processed features.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [42]:
# Learn the attack-name -> integer mapping from the training labels only.
lab_enc = preprocessing.LabelEncoder().fit(train_data["Attack"])

# Encode the "Attack" column of both splits with that mapping (training first).
for split_frame in (train, test):
    split_frame["Attack"] = lab_enc.transform(split_frame["Attack"])

In [43]:
# ---- Training graph ----
# Every row of `train` becomes one edge between its client and server IP;
# a MultiGraph is used so repeated IP pairs keep separate edges.
train_nx = nx.from_pandas_edgelist(
    train,
    source="#1:Client IP",
    target="2:Server IP",
    edge_attr=["h", "Label", "Attack"],
    create_using=nx.MultiGraph(),
)
# Convert to a directed graph and hand it to DGL together with the edge attributes.
train_g = dgl.from_networkx(train_nx.to_directed(), edge_attrs=['h', 'Attack', 'Label'])
# Node features are all ones, with the same width as the edge feature vectors.
train_feat_width = train_g.edata['h'].shape[1]
nfeat_weight = torch.ones([train_g.number_of_nodes(), train_feat_width])
train_g.ndata['h'] = nfeat_weight

# ---- Testing graph ----
test_nx = nx.from_pandas_edgelist(
    test,
    source="#1:Client IP",
    target="2:Server IP",
    edge_attr=["h", "Label", "Attack"],
    create_using=nx.MultiGraph(),
)
# Sorted node identifiers of the testing graph, taken before the DGL conversion.
node = sorted(test_nx.nodes())
test_g = dgl.from_networkx(test_nx.to_directed(), edge_attrs=['h', 'Attack', 'Label'])
test_feat_width = test_g.edata['h'].shape[1]
nfeat_weight = torch.ones([test_g.number_of_nodes(), test_feat_width])
test_g.ndata['h'] = nfeat_weight

In [44]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    """One message-passing layer that produces node and edge embeddings.

    Each node receives the mean of the features of its incoming edges, which
    is concatenated with the node's own features and passed through
    ``W_apply`` and the activation.  An edge embedding is then computed by
    applying ``W_edge`` to the concatenated embeddings of the edge's two
    endpoints.

    Parameters
    ----------
    ndim_in : int
        Width of the input node features.
    edims : int
        Width of the input edge features.
    ndim_out : int
        Width of the output node embeddings.
    activation : callable or None
        Non-linearity applied after ``W_apply``.  ``None`` falls back to
        ``F.relu``, which is what the layer always used before.
    edge_out : int, optional
        Width of the output edge embeddings (default 256).
    """

    def __init__(self, ndim_in, edims, ndim_out, activation, edge_out=256):
        super(SAGELayer, self).__init__()
        self.W_apply = nn.Linear(ndim_in + edims, ndim_out)
        # Honour the caller's activation instead of silently forcing ReLU.
        self.activation = activation if activation is not None else F.relu
        # The edge projection consumes two node embeddings (source and
        # destination), so its input width follows ndim_out.
        self.W_edge = nn.Linear(ndim_out * 2, edge_out)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise ``W_apply`` with the ReLU gain (``W_edge`` keeps its default init)."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
        """Send each edge's feature vector as the message ``'m'``."""
        return {'m': edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
        """Return ``(node_embeddings, edge_embeddings)`` for graph ``g_dgl``.

        Features are concatenated along dimension 2, so ``nfeats`` and
        ``efeats`` must be 3-D; this notebook reshapes them to
        ``(count, 1, width)`` before calling the model.
        """
        # local_scope keeps the temporary 'h' / 'h_neigh' fields from leaking
        # into the caller's graph.
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Mean of the incoming edge features for every node.
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            g.ndata['h'] = self.activation(
                self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

            # Compute edge embeddings from the two endpoint embeddings.
            u, v = g.edges()
            edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
            return g.ndata['h'], edge

In [45]:
class SAGE(nn.Module):
    """Encoder made of a single ``SAGELayer``.

    Parameters
    ----------
    ndim_in : int
        Width of the input node features.
    ndim_out : int
        Width of the node embeddings produced by the layer.
    edim : int
        Width of the input edge features.
    activation : callable
        Non-linearity handed to the layer.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        # Forward the requested size/activation instead of hard-coding 128 / ReLU.
        self.layers.append(SAGELayer(ndim_in, edim, ndim_out, activation))

    def forward(self, g, nfeats, efeats, corrupt=False):
        """Return ``(node_embeddings, edge_embeddings)``, each summed over dimension 1.

        With ``corrupt=True`` the edge features are randomly permuted across
        edges before encoding; node features are left untouched.
        """
        if corrupt:
            e_perm = torch.randperm(g.number_of_edges())
            efeats = efeats[e_perm]
        for layer in self.layers:
            nfeats, e_feats = layer(g, nfeats, efeats)
        # Dimension 1 is the extra axis the notebook adds when reshaping the
        # features to (count, 1, width); summing over it removes that axis.
        return nfeats.sum(1), e_feats.sum(1)

In [46]:
class Discriminator(nn.Module):
    """Bilinear scorer: ``score = features @ (weight @ summary)``."""

    def __init__(self, n_hidden):
        super().__init__()
        # Square weight matrix; its values are set by reset_parameters().
        self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        self.reset_parameters()

    def uniform(self, size, tensor):
        """Fill ``tensor`` in place from U(-1/sqrt(size), 1/sqrt(size)); ``None`` is ignored."""
        limit = 1.0 / math.sqrt(size)
        if tensor is None:
            return
        tensor.data.uniform_(-limit, limit)

    def reset_parameters(self):
        """Re-draw the weight matrix, using its first dimension as ``size``."""
        self.uniform(self.weight.size(0), self.weight)

    def forward(self, features, summary):
        """Score every row of ``features`` against the ``summary`` vector."""
        projected_summary = torch.matmul(self.weight, summary)
        return torch.matmul(features, projected_summary)

In [47]:
class DGI(nn.Module):
    """Contrastive objective over the encoder's edge embeddings.

    The encoder is run once on the real edge features and once on randomly
    permuted ones; a bilinear discriminator is trained to tell the two sets
    of edge embeddings apart with the help of a graph-level summary vector.

    Parameters
    ----------
    ndim_in, ndim_out, edim, activation
        Passed through to ``SAGE``.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(DGI, self).__init__()
        self.encoder = SAGE(ndim_in, ndim_out, edim, activation)
        # 256 is the width of the edge embeddings produced by the encoder layer.
        self.discriminator = Discriminator(256)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
        """Return the scalar loss for graph ``g``.

        (A second, truncated ``forward`` that used to precede this one was
        dead code - Python kept only the last definition - and was removed.)
        """
        positive = self.encoder(g, n_features, e_features, corrupt=False)
        negative = self.encoder(g, n_features, e_features, corrupt=True)

        # Keep only the edge embeddings (second element of the encoder output).
        positive = positive[1]
        negative = negative[1]

        # Graph-level summary: sigmoid of the mean real edge embedding.
        summary = torch.sigmoid(positive.mean(dim=0))

        positive = self.discriminator(positive, summary)
        negative = self.discriminator(negative, summary)

        # Real embeddings are pushed towards 1, corrupted ones towards 0.
        l1 = self.loss(positive, torch.ones_like(positive))
        l2 = self.loss(negative, torch.zeros_like(negative))

        return l1 + l2

In [48]:
# Feature widths are read from the training graph.
ndim_in = train_g.ndata['h'].shape[1]
edim = train_g.edata['h'].shape[1]

# Model sizes.
hidden_features = ndim_out = 128
num_layers = 1

# Optimisation settings.
learning_rate, epochs = 1e-3, 4000

In [49]:
# Build the model from the sizes defined in the hyper-parameter cell.
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

# Use the configured learning rate rather than repeating the literal here,
# so changing `learning_rate` actually affects training.
dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=learning_rate,
                weight_decay=0.)

In [50]:
# Format node and edge features for E-GraphSAGE:
# insert a singleton middle axis, (count, width) -> (count, 1, width).
for feature_store in (train_g.ndata, train_g.edata):
    flat_feats = feature_store['h']
    feature_store['h'] = torch.reshape(
        flat_feats, (flat_feats.shape[0], 1, flat_feats.shape[1]))

In [51]:
# Placeholder for a device transfer: the graph is assigned to itself, so
# nothing is moved here and the graph stays on its current device.
train_g = train_g

In [None]:
cnt_wait = 0      # epochs since the last improvement (only counted, see below)
best = 1e9        # lowest loss seen so far, kept as a plain float
best_t = 0        # epoch at which `best` was reached
dur = []          # per-epoch wall-clock durations, recorded from epoch 3 on
node_features = train_g.ndata['h']
edge_features = train_g.edata['h']

for epoch in range(epochs):
    dgi.train()
    # The first three epochs are excluded from the timing statistics.
    if epoch >= 3:
        t0 = time.time()

    dgi_optimizer.zero_grad()
    loss = dgi(train_g, node_features, edge_features)
    loss.backward()
    dgi_optimizer.step()

    # Work with a Python float so `best` is a number rather than a tensor
    # that is still attached to the autograd graph.
    loss_value = loss.item()

    if loss_value < best:
        best = loss_value
        best_t = epoch
        cnt_wait = 0
        # NOTE: the checkpoint is written after optimizer.step(), i.e. it holds
        # the weights obtained *after* the update computed from this loss.
        torch.save(dgi.state_dict(), 'best_dgi.pkl')
    else:
        cnt_wait += 1

    # Early stopping is disabled (`patience` is not defined in this notebook):
    # if cnt_wait == patience:
    #     print('Early stopping!')
    #     break

    if epoch >= 3:
        dur.append(time.time() - t0)

    if epoch % 50 == 0:
        # No durations exist yet at epoch 0; report NaN explicitly instead of
        # letting np.mean([]) emit a RuntimeWarning.
        mean_dur = np.mean(dur) if dur else float('nan')
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
            "ETputs(KTEPS) {:.2f}".format(epoch, mean_dur,
              loss_value,
              train_g.num_edges() / mean_dur / 1000))

In [53]:
# Restore the weights saved by the training loop at its lowest loss.
best_state = torch.load('best_dgi.pkl')
dgi.load_state_dict(best_state)

<All keys matched successfully>

In [54]:
# The encoder returns (node embeddings, edge embeddings); keep the edge ones.
# This is inference only, so autograd bookkeeping is switched off to avoid
# building a graph over every edge.
with torch.no_grad():
    training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.detach().cpu().numpy()

In [55]:
# Give the testing graph's node and edge features the same layout as the
# training graph's: (count, width) -> (count, 1, width).
for feature_store in (test_g.ndata, test_g.edata):
    flat_feats = feature_store['h']
    feature_store['h'] = torch.reshape(
        flat_feats, (flat_feats.shape[0], 1, flat_feats.shape[1]))

In [56]:
# Placeholder for a device transfer: the graph is assigned to itself, so
# nothing is moved here and the graph stays on its current device.
test_g = test_g

In [57]:
# The encoder returns (node embeddings, edge embeddings); keep the edge ones.
# This is inference only, so autograd bookkeeping is switched off to avoid
# building a graph over every edge.
with torch.no_grad():
    testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.detach().cpu().numpy()

In [58]:
# Training split: one row per graph edge, embedding columns first, then the
# attack name (decoded back from its integer code) and the binary label.
train_attack_codes = train_g.edata['Attack'].detach().cpu().numpy()
train_binary_labels = train_g.edata['Label'].detach().cpu().numpy()
df_train = pd.DataFrame(training_emb)
df_train["Attack"] = lab_enc.inverse_transform(train_attack_codes)
df_train["Label"] = train_binary_labels

# Testing split, built the same way.
test_attack_codes = test_g.edata['Attack'].detach().cpu().numpy()
test_binary_labels = test_g.edata['Label'].detach().cpu().numpy()
df_test = pd.DataFrame(testing_emb)
df_test["Attack"] = lab_enc.inverse_transform(test_attack_codes)
df_test["Label"] = test_binary_labels

In [59]:
# Display the training embedding frame: the edge-embedding columns followed by
# the decoded "Attack" name and the binary "Label".
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,0.391535,-1.050940,0.642646,0.841976,0.665297,0.174362,0.000600,0.044048,-0.864206,-0.593591,...,-0.061245,-1.190410,-0.293939,-0.354126,0.582553,2.666381,-0.440458,0.234124,lfi,1
1,0.229704,-0.997782,0.832264,0.753838,0.337912,0.050973,-0.074340,0.230677,-0.911263,-0.680979,...,-0.338866,-1.168872,-0.389057,-0.319773,0.776764,2.966134,-0.405200,0.467887,lfi,1
2,0.229704,-0.997782,0.832264,0.753838,0.337912,0.050973,-0.074340,0.230677,-0.911263,-0.680979,...,-0.338866,-1.168872,-0.389057,-0.319773,0.776764,2.966134,-0.405200,0.467887,lfi,1
3,0.229704,-0.997782,0.832264,0.753838,0.337912,0.050973,-0.074340,0.230677,-0.911263,-0.680979,...,-0.338866,-1.168872,-0.389057,-0.319773,0.776764,2.966134,-0.405200,0.467887,lfi,1
4,0.229704,-0.997782,0.832264,0.753838,0.337912,0.050973,-0.074340,0.230677,-0.911263,-0.680979,...,-0.338866,-1.168872,-0.389057,-0.319773,0.776764,2.966134,-0.405200,0.467887,lfi,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239325,0.147891,-0.194757,0.070401,-0.296726,-0.112638,0.109214,-0.012451,0.243264,0.055302,-0.173221,...,-0.539052,-0.076908,-0.102183,-0.075548,0.036759,0.527297,0.061740,-0.100416,directory_scan,1
239326,0.147891,-0.194757,0.070401,-0.296726,-0.112638,0.109214,-0.012451,0.243264,0.055302,-0.173221,...,-0.539052,-0.076908,-0.102183,-0.075548,0.036759,0.527297,0.061740,-0.100416,directory_scan,1
239327,0.147891,-0.194757,0.070401,-0.296726,-0.112638,0.109214,-0.012451,0.243264,0.055302,-0.173221,...,-0.539052,-0.076908,-0.102183,-0.075548,0.036759,0.527297,0.061740,-0.100416,directory_scan,1
239328,0.147891,-0.194757,0.070401,-0.296726,-0.112638,0.109214,-0.012451,0.243264,0.055302,-0.173221,...,-0.539052,-0.076908,-0.102183,-0.075548,0.036759,0.527297,0.061740,-0.100416,directory_scan,1
