In [76]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

In [78]:
# Read the train and test flow tables from their CSV files.
train_name, test_name = "30s_split_Train_data.csv", "30s_split_Test_data.csv"

train_data = pd.read_csv(train_name)
test_data = pd.read_csv(test_name)

In [80]:
# Class balance of the training split (bracket access avoids attribute/column ambiguity).
train_data["label"].value_counts()

label
directory_scan        15111
normal                15081
csrf                  14338
reflected_xss         13910
brute_force           11958
open_redirect         11654
sql_injection         10924
lfi                    9100
subdomain_takeover     7525
command_injection      5728
ssti                   4336
Name: count, dtype: int64

In [81]:
# Class balance of the test split (bracket access avoids attribute/column ambiguity).
test_data["label"].value_counts()

label
normal                6464
directory_scan        6440
csrf                  6305
reflected_xss         5931
brute_force           5042
open_redirect         5002
sql_injection         4701
lfi                   3891
subdomain_takeover    3233
command_injection     2522
ssti                  1755
Name: count, dtype: int64

In [84]:
# Remove the start-time and port columns from the training frame, in place.
train_data.drop(
    [" 6:Start Time", " 3:Client Port", " 4:Server Port"],
    axis="columns",
    inplace=True,
)

In [85]:
# Remove the start-time and port columns from the test frame, in place.
test_data.drop(
    [" 6:Start Time", " 3:Client Port", " 4:Server Port"],
    axis="columns",
    inplace=True,
)

In [88]:
# train_data = train_data.groupby(by='label').sample(frac=0.1, random_state=13)

In [90]:
# test_data = test_data.groupby(by='label').sample(frac=0.1, random_state=13)

In [93]:
# Separate the feature columns from the target column for each split.
# y_* stay single-column DataFrames (not Series) so they can be concatenated back later.
X_train, y_train = train_data.drop("label", axis=1), train_data.loc[:, ["label"]]
X_test, y_test = test_data.drop("label", axis=1), test_data.loc[:, ["label"]]

In [96]:
# Turn +/-inf into NaN and then zero-fill every NaN, one split at a time, in place.
X_train.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
X_train.fillna(value=0, inplace=True)

X_test.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
X_test.fillna(value=0, inplace=True)

In [97]:
# Normalise the numeric flow features and pack them into a per-row list column 'h'.
# NOTE(review): sklearn's Normalizer rescales each *row* to unit norm and learns nothing
# in fit(); if per-column scaling was intended, a StandardScaler would be needed instead.
scaler = Normalizer()

# The first two columns hold the client/server IP addresses, so they are skipped.
# A plain ordered list (instead of list(set(...))) keeps the feature order identical on
# every run, and excluding 'h' lets this cell be re-run without feeding the packed list
# column back into the normaliser.
cols_to_norm = [col for col in X_train.columns[2:] if col != "h"]
scaler.fit(X_train[cols_to_norm])

# Transform the training set.
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
X_train["h"] = X_train[cols_to_norm].values.tolist()

# Transform the test set; 'h' is built in the training column order so that the
# feature vectors of both splits line up position by position.
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test["h"] = X_test[cols_to_norm].values.tolist()

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [99]:
data = pd.concat([train, test])

# Fit the encoder on the untouched string labels kept in y_train / y_test rather than on
# train/test, whose 'label' column is overwritten below. This keeps the cell idempotent:
# re-running it no longer re-fits the encoder on already-encoded integers.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(pd.concat([y_train["label"], y_test["label"]]))

# Lookup table: class name <-> integer code.
label_mapping = pd.DataFrame({
    'Original_Label': lab_enc.classes_,
    'Encoded_Label': lab_enc.transform(lab_enc.classes_)
})

# Encode the training labels (train rows are in the same order as y_train).
train["label"] = lab_enc.transform(y_train["label"])

# Encode the test labels (test rows are in the same order as y_test).
test["label"] = lab_enc.transform(y_test["label"])

In [100]:
# Display the class-name -> integer-code table produced by the label encoder.
label_mapping

Unnamed: 0,Original_Label,Encoded_Label
0,brute_force,0
1,command_injection,1
2,csrf,2
3,directory_scan,3
4,lfi,4
5,normal,5
6,open_redirect,6
7,reflected_xss,7
8,sql_injection,8
9,ssti,9


In [154]:
def build_flow_graph(flows, normal_label,
                     src_col="#1:Client IP", dst_col=" 2:Server IP"):
    """Turn a flow table into a DGL graph plus the IP belonging to each node.

    Args:
        flows: DataFrame with the two IP columns, an 'h' column (per-flow feature
            list) and an integer-encoded 'label' column.
        normal_label: encoded value of the benign class; every node starts with it.
        src_col: name of the client-IP column (edge source).
        dst_col: name of the server-IP column (edge target).

    Returns:
        (graph, node_names): the DGL graph carrying edge data 'h' and 'label', node
        data 'label' and an all-ones node feature 'h' as wide as the edge features;
        and the sorted list of node IPs.
    """
    nx_g = nx.from_pandas_edgelist(flows, src_col, dst_col,
                                   ["h", "label"], create_using=nx.MultiGraph())

    # Every node is labelled benign until a non-benign flow says otherwise.
    nx.set_node_attributes(nx_g, normal_label, "label")

    # Tag the client of every non-benign flow with that flow's label.
    # The flow table is walked instead of nx_g.edges(): an undirected MultiGraph
    # yields each edge in adjacency order, so its first endpoint may be the server.
    for client_ip, flow_label in zip(flows[src_col], flows["label"]):
        if flow_label != normal_label:
            nx_g.nodes[client_ip]["label"] = int(flow_label)

    # NOTE(review): downstream cells pair this sorted list with embedding rows, which
    # assumes dgl.from_networkx numbers nodes in sorted order -- confirm for your DGL version.
    node_names = sorted(nx_g.nodes())

    graph = dgl.from_networkx(nx_g.to_directed(),
                              edge_attrs=["h", "label"], node_attrs=["label"])
    # Nodes have no features of their own: use a constant all-ones vector.
    graph.ndata["h"] = torch.ones([graph.number_of_nodes(), graph.edata["h"].shape[1]])
    return graph, node_names


# Look the benign class code up from the fitted encoder instead of hard-coding 5.
NORMAL_LABEL = int(lab_enc.transform(["normal"])[0])

# Training graph
train_g, train_g_nodeName = build_flow_graph(train, NORMAL_LABEL)

# Testing graph
test_g, test_g_nodeName = build_flow_graph(test, NORMAL_LABEL)


In [158]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    """One message-passing layer that aggregates edge features into node embeddings.

    Each node averages the features of its incoming edges, concatenates that mean
    with its own features and passes the result through a linear map and an
    activation. An embedding per edge is then produced from the concatenated
    embeddings of the edge's two endpoints.

    Args:
        ndim_in: size of the last axis of the incoming node features.
        edims: size of the last axis of the edge features.
        ndim_out: size of the node embedding produced by this layer.
        activation: non-linearity applied after the node update; ReLU when None.
    """

    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        self.W_apply = nn.Linear(ndim_in + edims, ndim_out)
        # Honour the caller's activation (it used to be ignored in favour of ReLU).
        self.activation = activation if activation is not None else F.relu
        # The edge head consumes two concatenated node embeddings, so its input width
        # is 2 * ndim_out (previously hard-coded to 128 * 2, which only fit ndim_out == 128).
        self.W_edge = nn.Linear(ndim_out * 2, 256)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the node-update weights with the ReLU gain."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
        """Send each edge's own feature vector as the message 'm'."""
        return {'m': edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
        """Return (node_embeddings, edge_embeddings) for the given graph.

        nfeats and efeats are concatenated along dim 2, so both must be 3-D.
        The graph's stored features are not modified (work happens in a local scope).
        """
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Mean of incoming edge features per node -> 'h_neigh'.
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            g.ndata['h'] = self.activation(
                self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

            # Compute edge embeddings from the two endpoint embeddings.
            u, v = g.edges()
            edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
            return g.ndata['h'], edge

In [159]:
class SAGE(nn.Module):
    """Encoder: a stack (currently one) of SAGELayer modules.

    Args:
        ndim_in: size of the last axis of the node features.
        ndim_out: accepted for interface compatibility. The layer width stays fixed
            at 128 because the discriminator in this file is built with 128 as well.
        edim: size of the last axis of the edge features.
        activation: non-linearity handed to the layer.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))

    def forward(self, g, nfeats, efeats, corrupt=False):
        """Encode the graph and return (node_embeddings, edge_embeddings).

        With corrupt=True the node features are shuffled across nodes first, which
        yields the negative sample used by the DGI objective. The input tensor itself
        is left untouched (the previous code attached a stray `.values` attribute to
        it and drew an unused NumPy permutation).
        """
        if corrupt:
            n_perm = torch.randperm(g.number_of_nodes())
            nfeats = nfeats[n_perm]

        for layer in self.layers:
            nfeats, e_feats = layer(g, nfeats, efeats)

        # Collapse the singleton middle axis: (N, 1, D) -> (N, D).
        return nfeats.sum(1), e_feats.sum(1)

In [160]:
class Discriminator(nn.Module):
    """Bilinear scorer: score = features @ (weight @ summary).

    Args:
        n_hidden: embedding width; the weight matrix is n_hidden x n_hidden.
    """

    def __init__(self, n_hidden):
        super().__init__()
        self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        self.reset_parameters()

    def uniform(self, size, tensor):
        """Fill `tensor` in place from U(-1/sqrt(size), 1/sqrt(size)); no-op for None."""
        limit = 1.0 / math.sqrt(size)
        if tensor is None:
            return
        tensor.data.uniform_(-limit, limit)

    def reset_parameters(self):
        """Re-draw the weight matrix, scaling the range by its first dimension."""
        self.uniform(self.weight.size(0), self.weight)

    def forward(self, features, summary):
        """Score every row of `features` against the graph-level `summary` vector."""
        projected_summary = self.weight @ summary
        return features @ projected_summary

In [161]:
class DGI(nn.Module):
    """Deep-Graph-Infomax style wrapper: encoder + discriminator + contrastive loss.

    Args:
        ndim_in: size of the last axis of the node features.
        ndim_out: forwarded to the encoder.
        edim: size of the last axis of the edge features.
        activation: accepted for interface compatibility; the encoder is built with ReLU.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(DGI, self).__init__()
        self.encoder = SAGE(ndim_in, ndim_out, edim, F.relu)
        # 128 matches the node-embedding width the encoder produces.
        self.discriminator = Discriminator(128)
        self.loss = nn.BCEWithLogitsLoss()

    # A second, incomplete `forward` used to be defined before this one; Python kept
    # only the later definition, so the dead duplicate has been removed.
    def forward(self, g, n_features, e_features):
        """Return the DGI loss for one pass over graph `g`.

        Real node embeddings should score high and embeddings computed from shuffled
        node features should score low, both against a summary of the real graph.
        """
        positive = self.encoder(g, n_features, e_features, corrupt=False)
        negative = self.encoder(g, n_features, e_features, corrupt=True)

        # The encoder returns (node_embeddings, edge_embeddings); train on the nodes.
        positive = positive[0]
        negative = negative[0]

        # Graph-level summary: squashed mean of the real node embeddings.
        summary = torch.sigmoid(positive.mean(dim=0))

        positive = self.discriminator(positive, summary)
        negative = self.discriminator(negative, summary)

        l1 = self.loss(positive, torch.ones_like(positive))
        l2 = self.loss(negative, torch.zeros_like(negative))

        return l1 + l2

In [162]:
# Feature widths are read off the training graph; the rest are fixed settings.
ndim_in, edim = train_g.ndata['h'].shape[1], train_g.edata['h'].shape[1]
hidden_features = ndim_out = 128
num_layers = 1
learning_rate = 1e-3
epochs = 4000

In [163]:
# Build the DGI model and its optimizer.
dgi = DGI(ndim_in, ndim_out, edim, F.relu)

# Use the configured `learning_rate` instead of repeating the literal, so that changing
# the hyper-parameter cell actually changes the optimizer.
dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                                 lr=learning_rate,
                                 weight_decay=0.)

In [164]:
# Format node and edge features for E-GraphSAGE: (N, D) -> (N, 1, D).
# The encoder concatenates node and edge features along dim 2, so both need the
# singleton middle axis. The dim() guard makes the cell safe to re-run: a second
# pass would otherwise operate on the already 3-D tensors.
if train_g.ndata['h'].dim() == 2:
    train_g.ndata['h'] = train_g.ndata['h'].unsqueeze(1)

if train_g.edata['h'].dim() == 2:
    train_g.edata['h'] = train_g.edata['h'].unsqueeze(1)

In [165]:
# NOTE(review): despite the original "Convert to GPU" caption, this self-assignment is a
# no-op -- no device transfer happens here, so the graph stays where it was created.
train_g = train_g

In [None]:
# cnt_wait = 0
# best = 1e9
# best_t = 0
# dur = []
# node_features = train_g.ndata['h']
# edge_features = train_g.edata['h']

# for epoch in range(epochs):
#     dgi.train()
#     if epoch >= 3:
#         t0 = time.time()

#     dgi_optimizer.zero_grad()
#     loss = dgi(train_g, node_features, edge_features)
#     loss.backward()
#     dgi_optimizer.step()

#     if loss < best:
#         best = loss
#         best_t = epoch
#         cnt_wait = 0
#         torch.save(dgi.state_dict(), 'best_dgi.pkl')
#     else:
#         cnt_wait += 1

#   # if cnt_wait == patience:
#   #     print('Early stopping!')
#   #     break

#     if epoch >= 3:
#         dur.append(time.time() - t0)

#     if epoch % 50 == 0:

#         print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | "
#             "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur),
#               loss.item(),
#               train_g.num_edges() / np.mean(dur) / 1000))

In [167]:
# Restore the trained weights. The training loop above is commented out, so this cell
# needs an existing 'best_dgi.pkl' on disk.
# map_location='cpu' lets a checkpoint saved from a GPU run load here, where the model
# is never moved off the CPU.
# SECURITY: torch.load unpickles the file -- only load checkpoints you produced yourself.
dgi.load_state_dict(torch.load('best_dgi.pkl', map_location='cpu'))

<All keys matched successfully>

In [168]:
# Edge embeddings of the training graph (second element of the encoder output).
# Inference only: no_grad avoids building an autograd graph over every edge.
with torch.no_grad():
    training_edge_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_edge_emb = training_edge_emb.detach().cpu().numpy()

In [169]:
# Node embeddings of the training graph (first element of the encoder output).
# Inference only: no_grad avoids building an autograd graph.
# NOTE(review): the encoder is also run once for the edge embeddings; a single forward
# pass returning both outputs would halve the cost.
with torch.no_grad():
    training_node_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[0]
training_node_emb = training_node_emb.detach().cpu().numpy()

In [170]:
# Give the test graph's features the same (N, 1, D) layout as the training graph.
# The dim() guard makes the cell safe to re-run: a second pass would otherwise operate
# on the already 3-D tensors.
if test_g.ndata['h'].dim() == 2:
    test_g.ndata['h'] = test_g.ndata['h'].unsqueeze(1)

if test_g.edata['h'].dim() == 2:
    test_g.edata['h'] = test_g.edata['h'].unsqueeze(1)

In [171]:
# NOTE(review): despite the original "Convert to GPU" caption, this self-assignment is a
# no-op -- no device transfer happens here, so the graph stays where it was created.
test_g = test_g

In [172]:
# Edge embeddings of the test graph (second element of the encoder output).
# Inference only: no_grad avoids building an autograd graph over every edge.
with torch.no_grad():
    testing_edge_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_edge_emb = testing_edge_emb.detach().cpu().numpy()

In [173]:
# Node embeddings of the test graph (first element of the encoder output).
# Inference only: no_grad avoids building an autograd graph.
# NOTE(review): the encoder is also run once for the edge embeddings; a single forward
# pass returning both outputs would halve the cost.
with torch.no_grad():
    testing_node_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[0]
testing_node_emb = testing_node_emb.detach().cpu().numpy()

In [174]:
# One row per edge: the embedding columns followed by the encoded edge label.
df_train_edge = pd.DataFrame(training_edge_emb).assign(
    label=train_g.edata['label'].detach().cpu().numpy()
)

df_test_edge = pd.DataFrame(testing_edge_emb).assign(
    label=test_g.edata['label'].detach().cpu().numpy()
)

In [175]:
# One row per node: the embedding columns followed by the node label, decoded back to
# its class name with the fitted label encoder.
train_node_codes = train_g.ndata['label'].detach().cpu().numpy()
df_train_node = pd.DataFrame(training_node_emb).assign(
    label=lab_enc.inverse_transform(train_node_codes)
)

test_node_codes = test_g.ndata['label'].detach().cpu().numpy()
df_test_node = pd.DataFrame(testing_node_emb).assign(
    label=lab_enc.inverse_transform(test_node_codes)
)

In [176]:
# Embedding-only views (label column removed) to feed into t-SNE.
df_train_nolabel = df_train_node.drop(columns='label')
df_test_nolabel = df_test_node.drop(columns='label')

In [None]:
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Fit t-SNE on the training node embeddings (fixed seed for repeatability).
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
tsne_result = tsne.fit_transform(df_train_nolabel)

# Put the 2-D coordinates in a DataFrame next to each node's label.
tsne_df = pd.DataFrame(tsne_result, columns=['TSNE1', 'TSNE2']).assign(
    label=df_train_node['label']
)

# Scatter plot coloured by label.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=tsne_df, x='TSNE1', y='TSNE2', hue='label',
                palette='viridis', alpha=0.5, ax=ax)
ax.set(xlabel='TSNE1', ylabel='TSNE2', title='t-SNE Plot')
ax.legend(title='label')
plt.show()

In [178]:
# Sanity check: show each distinct (original label, encoded label) pair in the test split.
a = pd.concat((test_data['label'], test['label']), axis="columns")
a.drop_duplicates()

Unnamed: 0,label,label.1
6774,brute_force,0
10905,command_injection,1
2144,csrf,2
24426,directory_scan,3
50105,lfi,4
11449,normal,5
17287,open_redirect,6
7552,reflected_xss,7
36150,sql_injection,8
5140,ssti,9


In [179]:
# tsne_df['label'] comes from df_train_node, whose labels were already decoded to class
# names with lab_enc.inverse_transform, so the former hand-written code -> name table
# never matched anything. Decode only if the column is still numeric, and use the
# fitted encoder so the mapping cannot drift from the data.
if pd.api.types.is_numeric_dtype(tsne_df['label']):
    tsne_df['label'] = lab_enc.inverse_transform(tsne_df['label'].astype(int))

# Attach the IP of each node. train_g_nodeName is the sorted node list; this assumes
# embedding row i belongs to the i-th sorted node -- TODO confirm against dgl.from_networkx.
tsne_df['IP'] = train_g_nodeName

In [180]:
# Write the current tsne_df (training-split t-SNE results) to train.csv without the index column.
tsne_df.to_csv("train.csv", index=False)

In [None]:
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Fit t-SNE on the test node embeddings (fixed seed for repeatability).
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
tsne_result = tsne.fit_transform(df_test_nolabel)

# Put the 2-D coordinates in a DataFrame next to each node's label.
tsne_df = pd.DataFrame(tsne_result, columns=['TSNE1', 'TSNE2']).assign(
    label=df_test_node['label']
)

# Scatter plot coloured by label.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=tsne_df, x='TSNE1', y='TSNE2', hue='label',
                palette='viridis', alpha=0.5, ax=ax)
ax.set(xlabel='TSNE1', ylabel='TSNE2', title='t-SNE Plot')
ax.legend(title='label')
plt.show()

In [182]:
# tsne_df['label'] comes from df_test_node, whose labels were already decoded to class
# names with lab_enc.inverse_transform, so the former hand-written code -> name table
# never matched anything. Decode only if the column is still numeric, and use the
# fitted encoder so the mapping cannot drift from the data.
if pd.api.types.is_numeric_dtype(tsne_df['label']):
    tsne_df['label'] = lab_enc.inverse_transform(tsne_df['label'].astype(int))

# Attach the IP of each node. test_g_nodeName is the sorted node list; this assumes
# embedding row i belongs to the i-th sorted node -- TODO confirm against dgl.from_networkx.
tsne_df['IP'] = test_g_nodeName

In [183]:
# Write the current tsne_df (test-split t-SNE results) to test.csv without the index column.
tsne_df.to_csv("test.csv", index=False)