# Predict DL methods
- Environment setup
- Load both 2017, 2018 original and corrected CICIDS datasets
- Apply methods

### Environment setup

In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!wget https://downloads.rclone.org/v1.63.0/rclone-v1.63.0-linux-amd64.deb
!apt install ./rclone-v1.63.0-linux-amd64.deb
!rclone config
!sudo apt-get -y install fuse3


--2023-08-19 07:43:18--  https://downloads.rclone.org/v1.63.0/rclone-v1.63.0-linux-amd64.deb
Resolving downloads.rclone.org (downloads.rclone.org)... 95.217.6.16, 2a01:4f9:c012:7154::1
Connecting to downloads.rclone.org (downloads.rclone.org)|95.217.6.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18373062 (18M) [application/vnd.debian.binary-package]
Saving to: ‘rclone-v1.63.0-linux-amd64.deb’


2023-08-19 07:43:22 (7.16 MB/s) - ‘rclone-v1.63.0-linux-amd64.deb’ saved [18373062/18373062]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'rclone' instead of './rclone-v1.63.0-linux-amd64.deb'
The following NEW packages will be installed:
  rclone
0 upgraded, 1 newly installed, 0 to remove and 16 not upgraded.
Need to get 0 B/18.4 MB of archives.
After this operation, 56.7 MB of additional disk space will be used.
Get:1 /content/rclone-v1.63.0-linux-amd64.deb rclone amd64 1.63.0 [18.4 MB]
Selecti

In [3]:
!sudo mkdir /content/onedrive
!nohup rclone --vfs-cache-mode writes mount onedrive: /content/onedrive &

nohup: appending output to 'nohup.out'


### Load both 2017, 2018 original and corrected CICIDS datasets

In [1]:
# Import Necessary Libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt # plotting
import seaborn as sns
from sklearn.manifold import TSNE
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score, accuracy_score

# for Deep Learning
import tensorflow as tf

from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from keras import regularizers
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

import time
import os


In [5]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m1.0 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.7.0


In [2]:
# data2017_corrected = pd.read_parquet('/content/onedrive/data_2017_corrected.parquet', engine="fastparquet")
# data2018_original = pd.read_parquet('/content/onedrive/data_2018_original.parquet', engine="fastparquet")
data2018_corrected = pd.read_parquet('/content/onedrive/data_2018_corrected_sampling.parquet', engine="fastparquet")

In [21]:
# data2017_corrected_resampling = pd.read_parquet('/content/onedrive/data_2017_corrected_resampling.parquet', engine="fastparquet")
# data2018_original_resampling = pd.read_parquet('/content/onedrive/data_2018_original_resampling.parquet', engine="fastparquet")
data2018_corrected_resampling = pd.read_parquet('/content/onedrive/data_2018_corrected_sampling_resampling.parquet', engine="fastparquet")

In [7]:
data2017_corrected = data2017_corrected.drop(['Fwd RST Flags',
                                                   'Bwd RST Flags',
                                                   'ICMP Code',
                                                   'ICMP Type',
                                                   'Total TCP Flow Time',
                                                   'Attempted Category'], axis = 1)

In [None]:
data2017_corrected_resampling = data2017_corrected_resampling.drop(['Fwd RST Flags',
                                                   'Bwd RST Flags',
                                                   'ICMP Code',
                                                   'ICMP Type',
                                                   'Total TCP Flow Time',
                                                   'Attempted Category'], axis = 1)

In [3]:
data2018_corrected = data2018_corrected.drop(['Fwd RST Flags',
                                              'Bwd RST Flags',
                                              'ICMP Code',
                                              'ICMP Type',
                                              'Total TCP Flow Time'], axis = 1)

In [22]:
data2018_corrected_resampling = data2018_corrected_resampling.drop(['Fwd RST Flags',
                                              'Bwd RST Flags',
                                              'ICMP Code',
                                              'ICMP Type',
                                              'Total TCP Flow Time'], axis = 1)

In [4]:
def get_x_y(df):
    X = df.loc[:, df.columns != 'label_encoded']
    y = df['label_encoded']
    return X, y

In [5]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [6]:
# Load the saved model
input_dim = 76 #num of feature columns

class AutoEncoder(Model):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = tf.keras.Sequential([
                  tf.keras.layers.Dense(64, activation="relu"),
                  tf.keras.layers.Dense(32, activation="relu"),
                  tf.keras.layers.Dense(16, activation="relu"),
              ])
        self.decoder = tf.keras.Sequential([
                  tf.keras.layers.Dense(32, activation="relu"),
                  tf.keras.layers.Dense(64, activation="relu"),
                  tf.keras.layers.Dense(input_dim, activation="sigmoid")
              ])
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')
    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        label_pred = self.output_layer(decoded)
        return label_pred

autoencoder = AutoEncoder()


In [None]:
def predict_ae_from_upstream_model(df, evaluate_x, evaluate_y, is_resampling = False):
  autoencoder.compile(loss='mae',
                    optimizer='rmsprop',
                    metrics=['acc',f1_m, precision_m, recall_m])
  if (is_resampling):
    autoencoder.load_weights("/content/onedrive/models/autoencoder_classifier_resampling.ckpt")
  else:
    autoencoder.load_weights("/content/onedrive/models/autoencoder_classifier.ckpt")

  ypred = autoencoder.predict(evaluate_x)
  # Define a threshold for converting predictions to binary class labels
  threshold = 0.5

  # Convert predictions to binary class labels
  predicted_classes = (ypred >= threshold).astype(int)
  df['ypreds_ae'] = predicted_classes
  print(classification_report(evaluate_y, predicted_classes))
  autoencoder.evaluate(evaluate_x, evaluate_y, verbose=2)

### 2017 improved

In [None]:
X_test, y_test = get_x_y(data2017_corrected)

data_test_2017_corrected = pd.concat([X_test, y_test], axis=1)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
predict_ae_from_upstream_model(data_test_2017_corrected, X_test, y_test)

              precision    recall  f1-score   support

           0       0.85      0.99      0.91   1594540
           1       0.94      0.44      0.60    505431

    accuracy                           0.86   2099971
   macro avg       0.89      0.71      0.75   2099971
weighted avg       0.87      0.86      0.84   2099971

65625/65625 - 80s - loss: 0.1428 - acc: 0.8572 - f1_m: 0.2182 - precision_m: 0.3467 - recall_m: 0.1860 - 80s/epoch - 1ms/step


### Resampling

In [None]:
X_test, y_test = get_x_y(data2017_corrected_resampling)

data_test_2017_corrected_r = pd.concat([X_test, y_test], axis=1)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
predict_ae_from_upstream_model(data_test_2017_corrected_r, X_test, y_test, True)

              precision    recall  f1-score   support

           0       0.77      0.89      0.82    505431
           1       0.87      0.73      0.79    505431

    accuracy                           0.81   1010862
   macro avg       0.82      0.81      0.81   1010862
weighted avg       0.82      0.81      0.81   1010862

31590/31590 - 40s - loss: 0.1891 - acc: 0.8107 - f1_m: 0.3952 - precision_m: 0.4931 - recall_m: 0.3669 - 40s/epoch - 1ms/step


### 2018 original

In [None]:
X_test, y_test = get_x_y(data2018_original)
del data2018_original
data_test_2018_original = pd.concat([X_test, y_test], axis=1)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
predict_ae_from_upstream_model(data_test_2018_original, X_test, y_test)

              precision    recall  f1-score   support

           0       0.87      0.97      0.92   9493199
           1       0.03      0.01      0.01   1325845

    accuracy                           0.85  10819044
   macro avg       0.45      0.49      0.47  10819044
weighted avg       0.77      0.85      0.81  10819044

338096/338096 - 441s - loss: 0.1503 - acc: 0.8497 - f1_m: 0.0017 - precision_m: 0.0071 - recall_m: 0.0011 - 441s/epoch - 1ms/step


In [None]:
del autoencoder

### Resampling

In [None]:
X_test, y_test = get_x_y(data2018_original_resampling)

In [None]:
del data2018_original_resampling

In [None]:
data_test_2018_original_r = pd.concat([X_test, y_test], axis=1)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
predict_ae_from_upstream_model(data_test_2018_original_r, X_test, y_test, True)

              precision    recall  f1-score   support

           0       0.57      0.67      0.61   1325845
           1       0.60      0.49      0.54   1325845

    accuracy                           0.58   2651690
   macro avg       0.58      0.58      0.58   2651690
weighted avg       0.58      0.58      0.58   2651690

82866/82866 - 105s - loss: 0.4198 - acc: 0.5798 - f1_m: 0.2760 - precision_m: 0.3653 - recall_m: 0.2447 - 105s/epoch - 1ms/step


### 2018 improved

In [None]:
X_test, y_test = get_x_y(data2018_corrected)
del data2018_corrected

data_test_2018_corrected = pd.concat([X_test, y_test], axis=1)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
predict_ae_from_upstream_model(data_test_2018_corrected, X_test, y_test)

              precision    recall  f1-score   support

           0       0.62      0.97      0.76   5935341
           1       0.62      0.07      0.12   3841659

    accuracy                           0.62   9777000
   macro avg       0.62      0.52      0.44   9777000
weighted avg       0.62      0.62      0.51   9777000

305532/305532 - 398s - loss: 0.3831 - acc: 0.6169 - f1_m: 0.0282 - precision_m: 0.0329 - recall_m: 0.0263 - 398s/epoch - 1ms/step


### resampling

In [None]:
X_test, y_test = get_x_y(data2018_corrected_resampling)

In [None]:
del data2018_corrected_resampling

In [None]:
data_test_2018_corrected_r = pd.concat([X_test, y_test], axis=1)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
predict_ae_from_upstream_model(data_test_2018_corrected_r, X_test, y_test, True)

              precision    recall  f1-score   support

           0       0.65      0.81      0.72   3841654
           1       0.75      0.57      0.65   3841659

    accuracy                           0.69   7683313
   macro avg       0.70      0.69      0.69   7683313
weighted avg       0.70      0.69      0.69   7683313

240104/240104 - 311s - loss: 0.3110 - acc: 0.6897 - f1_m: 0.2927 - precision_m: 0.3431 - recall_m: 0.2861 - 311s/epoch - 1ms/step


### DNN

In [7]:
# define the model architecture
input_dim = 76 #num of feature columns

# Initialize the constructor, for dense layers added
dnn = tf.keras.Sequential()

# dnn.add(tf.keras.layers.Flatten()) #flattens the input data

# Add an input layer
dnn.add(tf.keras.layers.Dense(32, activation='sigmoid', input_dim=input_dim))

# Add one hidden layer
dnn.add(tf.keras.layers.Dense(8, activation='sigmoid'))

# Add an output layer
dnn.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
def predict_dnn_from_upstream_model(df, evaluate_x, evaluate_y, is_resampling = False):
  dnn.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['acc',f1_m,precision_m, recall_m])

  if (is_resampling):
    dnn.load_weights("/content/onedrive/models/dnn_classifier_resampling.ckpt")
  else:
    dnn.load_weights("/content/onedrive/models/dnn_classifier.ckpt")

  ypred = dnn.predict(evaluate_x)
  # Define a threshold for converting predictions to binary class labels
  threshold = 0.5

  # Convert predictions to binary class labels
  predicted_classes = (ypred >= threshold).astype(int)
  df['ypreds_dnn'] = predicted_classes
  print(classification_report(evaluate_y, predicted_classes))
  dnn.evaluate(evaluate_x, evaluate_y, verbose=2)

### 2017 improved

In [None]:
predict_dnn_from_upstream_model(data_test_2017_corrected, X_test, y_test)

              precision    recall  f1-score   support

           0       0.76      0.99      0.86   1594540
           1       0.31      0.02      0.03    505431

    accuracy                           0.75   2099971
   macro avg       0.54      0.50      0.45   2099971
weighted avg       0.65      0.75      0.66   2099971

65625/65625 - 80s - loss: 1.4043 - acc: 0.7543 - f1_m: 0.0154 - precision_m: 0.0714 - recall_m: 0.0104 - 80s/epoch - 1ms/step


In [None]:
data_test_2017_corrected.to_parquet('/content/onedrive/test_result/data_test_dl_tr_2017_corrected.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2017_corrected.parquet')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded,ypreds_ae,ypreds_dnn
0,112740690,32,16,6448,1152,403,0,201.500000,204.724205,72,...,1.199802e+01,380,343,1.610540e+07,4.988048e+05,16399772,15375229,0,0,0
1,112740560,32,16,6448,5056,403,0,201.500000,204.724205,316,...,1.574499e+01,330,285,1.610543e+07,4.987937e+05,16399782,15375263,0,0,0
2,113757377,545,0,0,0,0,0,0.000000,0.000000,0,...,7.324646e+06,18851791,19,1.221036e+07,6.935824e+06,20757030,5504997,0,0,0
3,91997219,388,0,37151,0,227,37,95.750000,55.785320,0,...,1.152782e+07,24721964,16,1.319764e+07,5.826905e+06,19776791,5817470,0,0,0
4,66966070,6,6,288,288,48,48,48.000000,0.000000,48,...,0.000000e+00,1968172,1968172,6.497443e+07,0.000000e+00,64974431,64974431,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099966,5074745,7,6,582,1204,582,0,83.142857,219.975323,602,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
2099967,209,2,2,70,230,35,35,35.000000,0.000000,115,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
2099968,116281383,24,21,699,5411,322,0,29.125000,79.231808,1448,...,8.804912e+04,328226,23012,9.639210e+06,1.321437e+06,10024910,5443180,0,0,0
2099969,149,2,2,72,104,36,36,36.000000,0.000000,52,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0


### Resampling

In [None]:
predict_dnn_from_upstream_model(data_test_2017_corrected_r, X_test, y_test, True)

              precision    recall  f1-score   support

           0       0.44      0.38      0.41    505431
           1       0.45      0.52      0.49    505431

    accuracy                           0.45   1010862
   macro avg       0.45      0.45      0.45   1010862
weighted avg       0.45      0.45      0.45   1010862

31590/31590 - 38s - loss: 7.5412 - acc: 0.4485 - f1_m: 0.2858 - precision_m: 0.3817 - recall_m: 0.2602 - 38s/epoch - 1ms/step


In [None]:
data_test_2017_corrected_r.to_parquet('/content/onedrive/test_result/data_test_dl_tr_2017_corrected_resampling.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2017_corrected_resampling.parquet')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded,ypreds_ae,ypreds_dnn
0,115514613,17,16,450,282,439,0,26.470588,106.307407,282,...,1.992275e+04,84099,14933,9.598773e+06,1.386276e+06,10014002,5197043,0,0,0
1,195613,14,14,831,15880,357,0,59.357143,115.745481,2896,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
2,117347,2,2,68,312,34,34,34.000000,0.000000,156,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,0,0,1
3,5196055,9,11,1355,5738,712,0,150.555556,270.671901,1460,...,0.000000e+00,150774,150774,5.001138e+06,0.000000e+00,5001138,5001138,0,0,0
4,60744909,14,15,1158,4292,517,0,82.714286,186.590225,2097,...,1.029975e+05,277140,24618,1.005307e+07,3.207239e+05,10215374,9413421,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010857,656477,11,5,1041,11595,347,0,94.636364,162.083480,7240,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,1,1,0
1010858,103680007,13,6,1472,11632,368,0,113.230769,176.781482,4344,...,4.668730e+05,829021,3207,3.426969e+07,5.022459e+07,92263198,5000791,1,1,0
1010859,11464350,9,5,588,11632,588,0,65.333333,196.000000,7288,...,0.000000e+00,749,749,6.423318e+06,0.000000e+00,6423318,6423318,1,1,0
1010860,177006,8,8,322,11595,322,0,40.250000,113.844192,7240,...,0.000000e+00,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0


### 2018 original

In [None]:
predict_dnn_from_upstream_model(data_test_2018_original, X_test, y_test)

NameError: ignored

In [None]:
data_test_2018_original.to_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_original.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
del data_test_2018_original, X_test, y_test

In [None]:
pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_original.parquet')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded,ypreds_ae,ypreds_dnn
0,141385.0,9.0,7.0,553.0,3773.0,202.0,0.0,61.444444,87.534438,1460.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,281.0,2.0,1.0,38.0,0.0,38.0,0.0,19.000000,26.870058,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
2,279824.0,11.0,15.0,1086.0,10527.0,385.0,0.0,98.727273,129.392497,1460.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
3,132.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
4,274016.0,9.0,13.0,1285.0,6141.0,517.0,0.0,142.777778,183.887722,1460.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10819039,20.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1
10819040,105445.0,2.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
10819041,733880.0,2.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
10819042,732728.0,2.0,2.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1


### Resampling

In [None]:
predict_dnn_from_upstream_model(data_test_2018_original_r, X_test, y_test, True)

              precision    recall  f1-score   support

           0       0.66      0.49      0.56   1325845
           1       0.59      0.74      0.66   1325845

    accuracy                           0.62   2651690
   macro avg       0.62      0.62      0.61   2651690
weighted avg       0.62      0.62      0.61   2651690

82866/82866 - 104s - loss: 3.4703 - acc: 0.6156 - f1_m: 0.3862 - precision_m: 0.4265 - recall_m: 0.3725 - 104s/epoch - 1ms/step


In [None]:
data_test_2018_original_r.to_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_original_resampling.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
del data_test_2018_original_r, X_test, y_test

In [None]:
pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_original_resampling.parquet')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded,ypreds_ae,ypreds_dnn
0,60134050.0,9.0,7.0,2044.0,4174.0,837.0,0.0,227.111111,350.915746,1460.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,1103521.0,8.0,7.0,1148.0,1581.0,677.0,0.0,143.500000,228.129662,1173.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,10086791.0,7.0,9.0,1046.0,5283.0,731.0,0.0,149.428571,267.599114,1460.0,...,0.0,208480.0,208480.0,9878310.0,0.0,9878310.0,9878310.0,0,1,0
3,85929.0,2.0,2.0,74.0,262.0,37.0,37.0,37.000000,0.000000,131.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
4,303073.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2651685,137728.0,2.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
2651686,11172035.0,5.0,5.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
2651687,22.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1
2651688,20.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1


### 2018 improved

In [None]:
predict_dnn_from_upstream_model(data_test_2018_corrected, X_test, y_test)

              precision    recall  f1-score   support

           0       0.60      0.96      0.74   5935341
           1       0.02      0.00      0.00   3841659

    accuracy                           0.58   9777000
   macro avg       0.31      0.48      0.37   9777000
weighted avg       0.37      0.58      0.45   9777000

305532/305532 - 388s - loss: 2.5857 - acc: 0.5822 - f1_m: 0.0011 - precision_m: 0.0051 - recall_m: 6.3366e-04 - 388s/epoch - 1ms/step


In [None]:
data_test_2018_corrected.to_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_corrected.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
del data_test_2018_corrected, X_test, y_test

In [None]:
pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_corrected.parquet')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded,ypreds_ae,ypreds_dnn
0,115985546,20,19,1914,1268,1304,0,95.700000,306.775093,1085,...,60803.836874,231615,20927,9.625176e+06,1.320240e+06,10025690,5433219,0,0,0
1,60837973,12,11,323,337,317,0,26.916667,91.353916,337,...,63152.965095,170207,11714,1.007032e+07,3.421857e+04,10088042,10000748,0,0,0
2,1227,1,1,42,102,42,42,42.000000,0.000000,102,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
3,887,1,1,52,108,52,52,52.000000,0.000000,108,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
4,1298,1,1,42,138,42,42,42.000000,0.000000,138,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9776995,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0
9776996,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0
9776997,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0
9776998,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0


### resampling

In [None]:
predict_dnn_from_upstream_model(data_test_2018_corrected_r, X_test, y_test)


              precision    recall  f1-score   support

           0       0.49      0.95      0.65   3841654
           1       0.27      0.02      0.03   3841659

    accuracy                           0.49   7683313
   macro avg       0.38      0.49      0.34   7683313
weighted avg       0.38      0.49      0.34   7683313

240104/240104 - 302s - loss: 3.0581 - acc: 0.4856 - f1_m: 0.0131 - precision_m: 0.0477 - recall_m: 0.0084 - 302s/epoch - 1ms/step


In [None]:
data_test_2018_corrected_r.to_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_corrected_resampling.parquet', index = False, compression=None, engine='fastparquet')

In [None]:
del data_test_2018_corrected_r

In [None]:
pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_corrected_resampling.parquet')

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label_encoded,ypreds_ae,ypreds_dnn
0,5298120,11,10,1815,1933,677,0,165.000000,261.498375,1173,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
1,90103210,13,11,1442,1731,725,0,110.923077,196.522035,1179,...,0.000000,4060528,4060528,8.604268e+07,0.000000e+00,86042682,86042682,0,0,0
2,40164,4,2,77,0,46,0,19.250000,23.056091,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,0,1,1
3,1722788,8,7,1128,1581,661,0,141.000000,222.623321,1173,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,0,0,0
4,23258044,9,10,1148,2754,677,0,127.555556,218.691056,1173,...,321757.539778,670442,34380,7.370407e+06,2.640889e+06,10329404,5252514,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7683308,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0
7683309,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0
7683310,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0
7683311,2,1,1,0,0,0,0,0.000000,0.000000,0,...,0.000000,0,0,0.000000e+00,0.000000e+00,0,0,1,0,0


### Retrain the model, fine tuning

In [8]:
# data_test_2017_corrected = pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2017_corrected.parquet', engine="fastparquet")
# data_test_2017_corrected_r = pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2017_corrected_resampling.parquet', engine="fastparquet")
# data_test_2018_original = pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_original.parquet', engine="fastparquet")
# data_test_2018_original_r = pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_original_resampling.parquet', engine="fastparquet")
data_test_2018_corrected = pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_corrected.parquet', engine="fastparquet")
# data_test_2018_corrected_r = pd.read_parquet('/content/onedrive/test_result/data_test_dl_tr_2018_corrected_resampling.parquet', engine="fastparquet")

In [9]:
def gen_misclassified_data(df, y):
  return df[df['label_encoded'] != df[y]]

In [10]:
def gen_train_x_y(df, is_resampling = False):
  X_retrained = df.loc[:, 'Flow Duration': 'Idle Min']
  y_retrained = df['label_encoded']

  scaler = StandardScaler()
  X_retrained = scaler.fit_transform(X_retrained)

  return X_retrained, y_retrained

In [11]:
epochs = 16
batch_size = 128

def retrain_autoencoder(df, X_test, y_test, is_resampling):


  misclassfied_data = gen_misclassified_data(df, 'ypreds_ae')
  X_train, y_train = gen_train_x_y(misclassfied_data, is_resampling)


  model = "/content/onedrive/models/autoencoder_classifier.ckpt"

  if is_resampling:
    model = "/content/onedrive/models/autoencoder_classifier_resampling.ckpt"

  autoencoder = AutoEncoder()
  autoencoder.load_weights(model)


  autoencoder.compile(loss='mae',
                    optimizer='rmsprop',
                    metrics=['acc',f1_m, precision_m, recall_m])

  autoencoder.fit(X_train, y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  shuffle=True,
                  validation_split=0.1,
                  verbose=1)

  ypred = autoencoder.predict(X_test)
  # Define a threshold for converting predictions to binary c
  threshold = 0.5
  # Convert predictions to binary class labels
  predicted_classes = (ypred >= threshold).astype(int)
  print(classification_report(y_test, predicted_classes))

In [12]:
batch_size = 32
def retrain_dnn(df, X_test, y_test, is_resampling):

  misclassfied_data = gen_misclassified_data(df, 'ypreds_dnn')
  X_train, y_train = gen_train_x_y(misclassfied_data, is_resampling)


  model = "/content/onedrive/models/dnn_classifier.ckpt"

  if is_resampling:
    model = "/content/onedrive/models/dnn_classifier_resampling.ckpt"

  dnn.load_weights(model)


  dnn.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['acc',f1_m,precision_m, recall_m])

  dnn.fit(X_train, y_train,
                      epochs=epochs,
                      batch_size=batch_size,
                      shuffle=True,
                      validation_split=0.1,
                      verbose=1)

  ypred = dnn.predict(X_test)
  # Define a threshold for converting predictions to binary class
  threshold = 0.5
  # Convert predictions to binary class labels
  predicted_classes = (ypred >= threshold).astype(int)
  print(classification_report(y_test, predicted_classes))


### 2017 improved

In [17]:
X_test, y_test = get_x_y(data2017_corrected)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [18]:
del data2017_corrected

In [19]:
retrain_autoencoder(data_test_2017_corrected, X_test, y_test, False)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.76      1.00      0.86   1594540
           1       0.00      0.00      0.00    505431

    accuracy                           0.76   2099971
   macro avg       0.38      0.50      0.43   2099971
weighted avg       0.58      0.76      0.66   2099971



  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
retrain_dnn(data_test_2017_corrected, X_test, y_test, False)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.97      0.08      0.15   1594540
           1       0.25      0.99      0.41    505431

    accuracy                           0.30   2099971
   macro avg       0.61      0.54      0.28   2099971
weighted avg       0.80      0.30      0.21   2099971



In [23]:
del data_test_2017_corrected, X_test, y_test

### resampling

In [None]:
X_test, y_test = get_x_y(data2017_corrected_resampling)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
del data2017_corrected_resampling

In [None]:
retrain_autoencoder(data_test_2017_corrected_r, X_test, y_test, True)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67    505431
           1       0.00      0.00      0.00    505431

    accuracy                           0.50   1010862
   macro avg       0.25      0.50      0.33   1010862
weighted avg       0.25      0.50      0.33   1010862



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
retrain_dnn(data_test_2017_corrected_r, X_test, y_test, True)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.66      0.92      0.77    505431
           1       0.87      0.53      0.66    505431

    accuracy                           0.72   1010862
   macro avg       0.77      0.72      0.71   1010862
weighted avg       0.77      0.72      0.71   1010862



In [None]:
del data_test_2017_corrected_r, X_test, y_test


### 2018 original

In [30]:
X_test, y_test = get_x_y(data2018_original)
del data2018_original

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [31]:
retrain_autoencoder(data_test_2018_original, X_test, y_test, False)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.88      1.00      0.94   9493199
           1       1.00      0.02      0.04   1325845

    accuracy                           0.88  10819044
   macro avg       0.94      0.51      0.49  10819044
weighted avg       0.89      0.88      0.83  10819044



In [32]:
retrain_dnn(data_test_2018_original, X_test, y_test, False)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.85      0.57      0.68   9493199
           1       0.08      0.28      0.13   1325845

    accuracy                           0.54  10819044
   macro avg       0.47      0.43      0.41  10819044
weighted avg       0.76      0.54      0.62  10819044



In [33]:
del data_test_2018_original, X_test, y_test

### resampling

In [None]:
X_test, y_test = get_x_y(data2018_original_resampling)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
del data2018_original_resampling


In [None]:
retrain_autoencoder(data_test_2018_original_r, X_test, y_test, True)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.31      0.00      0.00   1325845
           1       0.50      1.00      0.67   1325845

    accuracy                           0.50   2651690
   macro avg       0.41      0.50      0.33   2651690
weighted avg       0.41      0.50      0.33   2651690



In [None]:
retrain_dnn(data_test_2018_original_r, X_test, y_test, True)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.47      0.63      0.54   1325845
           1       0.43      0.27      0.33   1325845

    accuracy                           0.45   2651690
   macro avg       0.45      0.45      0.43   2651690
weighted avg       0.45      0.45      0.43   2651690



In [None]:
del data_test_2018_original_r, X_test, y_test


### 2018 Improved

In [14]:
X_test, y_test = get_x_y(data2018_corrected)
del data2018_corrected

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [42]:
retrain_autoencoder(data_test_2018_corrected, X_test, y_test, False)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.67      0.86      0.75   5935341
           1       0.62      0.34      0.44   3841659

    accuracy                           0.66   9777000
   macro avg       0.64      0.60      0.60   9777000
weighted avg       0.65      0.66      0.63   9777000



In [15]:
retrain_dnn(data_test_2018_corrected, X_test, y_test, False)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       1.00      0.47      0.64   5935341
           1       0.55      1.00      0.71   3841659

    accuracy                           0.68   9777000
   macro avg       0.77      0.73      0.67   9777000
weighted avg       0.82      0.68      0.66   9777000



In [16]:
del data_test_2018_corrected, X_test, y_test

### resampling

In [None]:
X_test, y_test = get_x_y(data2018_corrected_resampling)

scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
del data2018_corrected_resampling


In [None]:
retrain_autoencoder(data_test_2018_corrected_r, X_test, y_test, True)


In [29]:
retrain_dnn(data_test_2018_corrected_r, X_test, y_test, True)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
              precision    recall  f1-score   support

           0       0.94      0.60      0.73   3841654
           1       0.70      0.96      0.81   3841659

    accuracy                           0.78   7683313
   macro avg       0.82      0.78      0.77   7683313
weighted avg       0.82      0.78      0.77   7683313



In [30]:
del data_test_2018_corrected_r, X_test, y_test
