# CIC2018 100%Data Evaluation
- Load the full (100%) CSE-CIC-IDS2018 dataset from the current working directory and evaluate anomaly-detection performance.
- To run this notebook you need Python 3.6, TensorFlow, pandas, NumPy, scikit-learn, and the `dagmm` module imported below.

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from dagmm import DAGMM

# Data path: base directory for the input files is the current working directory
import os
url_base = os.getcwd()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Data Import

In [2]:
# Load the data
# Full CSE-CIC-IDS2018 dataset
url_data = f"{url_base}/all.csv"

# Schema file (column names and column types), parsed below as "name:type" pairs
url_info = f"{url_base}/all_names.names"

# Read the schema; the first line of the file is skipped
df_info = pd.read_csv(
    url_info,
    sep=":",
    skiprows=1,
    index_col=False,
    names=["colname", "type"],
)

colnames = df_info["colname"].values
# Columns whose type text contains "continuous" are read as float, all others as str
is_continuous = df_info["type"].str.contains("continuous")
coltypes = np.where(is_continuous, "float", "str")

# Read the dataset itself using the names and dtypes taken from the schema
column_dtypes = {name: kind for name, kind in zip(colnames, coltypes)}
df = pd.read_csv(url_data, names=colnames, index_col=False, dtype=column_dtypes)

# One-hot encode every column except the last one (presumably the label column;
# confirm against the schema file) and keep the result as a plain array
features = df.iloc[:, :-1]
X = pd.get_dummies(features).values

# Sanity check: report every row of X that contains +/-inf, followed by the column
# index of each infinite entry in that row. The mask is computed once for the whole
# array, so Python-level iteration only happens over offending rows/columns
# instead of over every row and every element.
inf_mask = np.isinf(X)
for row_idx in np.flatnonzero(inf_mask.any(axis=1)):
    print(X[row_idx])
    for col_idx in np.flatnonzero(inf_mask[row_idx]):
        print(col_idx)

# Build the target flag: 1 = normal ("BENIGN") traffic, 0 = malicious traffic
is_benign = df.Label == "BENIGN"
y = np.where(is_benign, 1, 0)
print(y)

# Split the data 50/50 into train and test sets with a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=123,
)

# Train on malicious samples only: keep just the rows labelled 0
malicious_rows = y_train == 0
X_train, y_train = X_train[malicious_rows], y_train[malicious_rows]

[1 1 1 ... 1 1 1]


## Fit Data to DAGMM Model
The following points differ from the original paper:
- $\lambda_2$ is set to 0.0001 (paper: 0.005)
- Add small value($10^{-6}$) to diagonal elements of GMM covariance (paper: no additional value)

A standard scaler is applied to the input data (the default in this DAGMM implementation).

In [3]:
# Hyperparameters for the DAGMM model, grouped so each one can be tuned in one place.
# NOTE(review): comp_* / est_* presumably configure the compression and estimation
# networks of DAGMM; confirm against the dagmm module.
dagmm_params = dict(
    comp_hiddens=[60, 30, 10, 1],
    comp_activation=tf.nn.tanh,
    est_hiddens=[10, 4],
    est_dropout_ratio=0.5,
    est_activation=tf.nn.tanh,
    learning_rate=0.0001,
    epoch_size=2000,  # alternative value left by the author: epoch_size=200
    minibatch_size=1024,
    random_seed=1111,
)
model = DAGMM(**dagmm_params)

In [4]:
# Train the model on X_train (the malicious-only training subset built earlier)
model.fit(X_train)




Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use keras.layers.dropout instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

 epoch 100/2000 : loss = 21.973
 epoch 200/2000 : loss = 17.978
 epoch 300/2000 : loss = 15.635
 epoch 400/2000 : loss = 14.513
 epoch 500/2000 : loss = 13.775
 epoch 600/2000 : loss = 12.954
 epoch 700/2000 : loss = 12.383
 epoch 800/2000 : loss = 12.126
 epoch 900/2000 : loss = 10.403
 epoch 100

## Apply model to test data

In [5]:
# Score the held-out test set with the trained model
y_pred = model.predict(X_test)

# Index positions (int32) of the label-1 and label-0 samples within the test set
positive = np.flatnonzero(y_test == 1).astype(np.int32)  # positions of positive samples
negative = np.flatnonzero(y_test == 0).astype(np.int32)  # positions of negative samples
print('positive', positive, len(positive))
print('negative', negative, len(negative))


positive [     0      1      2 ... 524280 524281 524282] 379923
negative [     3      5      8 ... 524269 524279 524283] 144361


In [6]:
# Energy threshold for anomaly detection = 35th percentile of the predicted energies.
# The starting value is arbitrary: adjust it step by step by comparing the resulting
# precision, recall and F1 score. A larger energy means a more anomalous sample.
energy_percentile = 35
anomaly_energy_threshold = np.percentile(y_pred, energy_percentile)
print(f"Energy thleshold to detect anomaly : {anomaly_energy_threshold:.3f}")

Energy thleshold to detect anomaly : 8.989


In [7]:
# # Energy thleshold to detect anomaly = 60% percentile of energies
# anomaly_energy_threshold = np.percentile(y_pred, 60)
# print(f"Energy thleshold to detect anomaly : {anomaly_energy_threshold:.3f}")

In [8]:
# Detect anomalies from test data
# Flag the test samples: 1 = normal, 0 = malicious. The model was trained on malicious
# samples, so an energy at or above the threshold is treated as normal traffic.
is_normal = y_pred >= anomaly_energy_threshold
y_pred_flag = np.where(is_normal, 1, 0)

In [9]:
# Binary precision / recall / F1 of the predicted flags against the true test labels
prec, recall, fscore, _ = precision_recall_fscore_support(
    y_test, y_pred_flag, average="binary"
)
# Left-align the metric names to a common width so the "=" signs line up
for metric_name, metric_value in (("Precision", prec), ("Recall", recall), ("F1-Score", fscore)):
    print(f" {metric_name:<9} = {metric_value:.3f}")

 Precision = 0.998
 Recall    = 0.895
 F1-Score  = 0.944


## Input Adversarial Example

In [12]:
# Results copied in from elsewhere

# Original malicious sample
se = [57420.0 ,1718.0 ,6.0 ,43.0 ,1.0 ,1.0 ,0.0 ,6.0 ,0.0 ,0.0 ,0.0 ,0.0 ,6.0 ,6.0 ,6.0 ,0.0 ,43.0 ,0.0 ,43.0 ,43.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,40.0 ,20.0 ,23255.81395 ,23255.81395 ,0.0 ,6.0 ,2.0 ,3.464101615 ,12.0 ,0.0 ,0.0 ,0.0 ,1.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,3.0 ,0.0 ,6.0 ,40.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,0.0 ,1.0 ,6.0 ,29200.0 ,0.0 ,0.0 ,40.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0]

# Malicious adversarial example generated by the DeepFool white-box attack
deepfool_ae = [65535.0 ,0.0 ,7.64 ,7907271.5 ,79.33 ,1674.26 ,3000.83 ,331326.97 ,376.95 ,0.0 ,64.42 ,111.8 ,232.84 ,11.66 ,129.99 ,0.0 ,-13.0 ,0.0 ,4333376.5 ,2078429.0 ,5820347.0 ,1387770.62 ,0.0 ,3090121.5 ,3410977.25 ,1358718.38 ,1473928.0 ,183327.69 ,1060969.5 ,1546550.5 ,0.02 ,0.0 ,0.0 ,0.0 ,1799528.0 ,390757.16 ,131820.67 ,0.0 ,0.0 ,111.86 ,1.15 ,0.0 ,0.0 ,0.0 ,0.02 ,0.0 ,0.86 ,0.0 ,0.06 ,0.0 ,0.0 ,0.69 ,0.0 ,66.03 ,122.1 ,1784553.75 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,84.91 ,3007.71 ,1677.15 ,326482.81 ,32786.21 ,6592.64 ,0.0 ,60.0 ,51610.84 ,5918.59 ,24319.35 ,12805.21 ,1018882.62 ,0.0 ,1241427.5 ,844242.06]

# Malicious adversarial example generated by the JSMA white-box attack
jsmas_ae = [65535.0 ,65534.0 ,12.8 ,43.0 ,1.0 ,1.0 ,0.0 ,6.0 ,0.0 ,0.0 ,0.0 ,0.0 ,6.0 ,6.0 ,6.0 ,0.0 ,43.0 ,0.0 ,43.0 ,43.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,-0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,40.0 ,20.0 ,23255.81 ,23255.81 ,0.0 ,6.0 ,2.0 ,3.46 ,12.0 ,0.0 ,0.0 ,0.0 ,1.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,3.0 ,0.0 ,6.0 ,40.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,0.0 ,1.0 ,6.0 ,29200.0 ,0.0 ,0.0 ,40.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0 ,0.0]

# Malicious adversarial example produced by a GAN trained to generate malicious adversarial examples
gan_ae = [65467.08, 0.0, 17.0, 37.07, 1.0, 0.0, 0.0, 0.0, 0.0, 26.93, 0.59, 0.0, 30.89, 62.34, 23.96, 0.0, -13.0, 0.0, -5.85, -13.0, 0.0, 0.0, 0.0, 3.58, 0.0, 96.56, 0.0, 17067.28, 21.46, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 83175.84, 136.55, 27.13, 1.37, 49.49, 0.21, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.94, 224.44, 0.57, 89.26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.43, 0.0, 521.33, -1.0, -1.0, 0.0, 13.33, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.58]

In [13]:
# Score the original malicious sample and the three adversarial variants.
# The original note here quoted a threshold of 13.284; the value actually in effect
# is whatever anomaly_energy_threshold currently holds (it is printed below).
# Adversarial examples whose energy crosses the threshold are classified as the
# positive (normal) class.
se_pred = model.predict([se])
deepfool_ae_pred = model.predict([deepfool_ae])
jsmas_ae_pred = model.predict([jsmas_ae])
gan_ae_pred = model.predict([gan_ae])

# Header line: DAGMM classifier result, with the threshold; above it counts as normal traffic
print('DAGMM分类器结果:(阈值为 %.3f, 超过阈值视为正常流量)' % (anomaly_energy_threshold))
# print(se_pred,deepfool_ae_pred,jsmas_ae_pred,gan_ae_pred)
# Energies, in order: original malicious sample, DeepFool, JSMA and GAN adversarial examples
print('原恶意样本 %s、DeepFool生成恶意对抗样本 %s、JSMA生成恶意对抗样本 %s、GAN训练生成恶意对抗样本 %s' % (se_pred,deepfool_ae_pred,jsmas_ae_pred,gan_ae_pred))


DAGMM分类器结果:(阈值为 8.989, 超过阈值视为正常流量)
原恶意样本 [-5.3201056]、DeepFool生成恶意对抗样本 [14.917352]、JSMA生成恶意对抗样本 [22.228298]、GAN训练生成恶意对抗样本 [21.5451]


In [15]:
# Load the saved batch of adversarial examples from the working directory.
# A forward slash is used so the relative path resolves on both Windows and POSIX
# systems; the previous '.\w...' literal was Windows-only and contained the invalid
# escape sequence '\w'.
ae_list = np.load('./wpgan-gp_1b.npy')


def Avg_predict_rate(ae_list, n_features=78):
    """Print and return the fraction of adversarial examples scored as normal traffic.

    Each example is scored with the notebook-level ``model``. An example counts as
    having fooled the detector when its energy is at or above the notebook-level
    ``anomaly_energy_threshold``. This is the same ``>=`` rule used to build
    ``y_pred_flag`` for the test set.

    Parameters
    ----------
    ae_list : array-like
        Batch of adversarial examples. ``len(ae_list)`` is the number of examples
        and the data must be reshapeable to ``(len(ae_list), n_features)``.
    n_features : int, optional
        Number of features per example. Defaults to 78, the value previously
        hard-coded in this function.

    Returns
    -------
    float
        Fooling rate in ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``ae_list`` is empty, since the rate would be undefined.
    """
    lens = len(ae_list)
    if lens == 0:
        raise ValueError("ae_list is empty; cannot compute an average fooling rate")
    print(lens)

    # Flatten any extra axes so the model receives one row per example
    ae_list = np.reshape(ae_list, [lens, n_features])
    ae_pred = np.asarray(model.predict(ae_list))
    print(ae_pred)

    # Count, in one vectorized pass, the examples whose energy reaches the threshold
    fooled = int(np.count_nonzero(ae_pred >= anomaly_energy_threshold))

    rate = fooled / lens
    print("平均欺骗率：{:.2%}".format(rate))
    return rate

# Report the average fooling rate for the loaded batch of adversarial examples
Avg_predict_rate(ae_list)

100
[13.003081  13.116829  13.120704  13.12457   13.131608  13.095785
 13.031114  13.1552305 13.134641  13.073575  13.0170965 13.103628
 13.144381  13.098549  13.077541  13.035868  13.1006155 13.131209
 13.040616  13.106663  13.134786  13.093657  13.102045  13.085722
 13.084902  13.106194  13.093969  13.115697  13.031781  13.340057
 13.099696  13.119233  13.154044  13.12309   13.107837  13.111301
 13.103506  13.081823  13.127545  13.086446  13.080645  13.083693
 13.102724  13.128983  13.011786  13.098311  13.043823  13.119553
 13.065891  13.131181  13.1132965 13.094242  12.9655285 13.069887
 13.0815    13.14597   13.046418  13.103576  13.108398  13.116695
 13.110867  13.090963  13.135449  13.152494  13.082224  13.0915365
 13.090788  13.102898  13.085703  13.1539545 13.016435  13.11187
 13.133017  13.111604  13.019829  13.070192  13.144799  13.132201
 13.08416   13.137001  13.097878  13.126645  13.119916  13.038234
 13.149957  13.148299  13.096014  13.140618  13.147709  13.0873165
 13.1