In [1]:
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support

import joblib

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
import tensorflow.keras as keras

In [2]:
MODEL_DIR_PATH = "/Users/smalih/CICIDS_models/my_model"
PROCESSED_DIR_PATH = "/Users/smalih/CICIDS_models/processed_dataset"

In [3]:
def reshape_dataset_cnn(x: np.ndarray) -> np.ndarray:
    # Add padding columns
    result = np.zeros((x.shape[0], 49)) # changed 81 to 49 as 48 features
    result[:, :-1] = x # changed -3 to -1 as only one column is padding

    # Reshaping dataset
    result = np.reshape(result, (result.shape[0], 7, 7))
    result = result[..., tf.newaxis]
    return result

def preprocessing(df: pd.DataFrame) -> (np.ndarray, np.ndarray):
    # Shuffle the dataset
    df = df.sample(frac=1)

    # Split features and labels
    x = df.iloc[:, df.columns != 'Label']
    y = df[['Label']].to_numpy()

    # Scale the features between 0 ~ 1
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    if not os.path.isfile(os.path.join(MODEL_DIR_PATH, 'x_scaler.pkl')):
        joblib.dump(scaler, os.path.join(MODEL_DIR_PATH, 'x_scaler.pkl'))

    return x, y


In [4]:
def load_model(model_dir):
    model = keras.models.load_model(os.path.join(model_dir, '06_cnn.h5'))
    return model

def load_data(data_path):
    df = pd.read_csv(data_path)
    # Split features and labels
    # scaler = MinMaxScaler()
    scaler = joblib.load("/Users/smalih/CICIDS_models/my_model/x_scaler.pkl")
    x = scaler.fit_transform(df)
    return reshape_dataset_cnn(x)


In [5]:
def get_predictions(model, data, label_encoder):
    y_pred = model.predict(data, batch_size=1024, verbose=False)
    y_pred = label_encoder[np.argmax(y_pred, axis=1)]
    return y_pred

In [6]:
X = load_data("/Users/smalih/CICIDS_models/dataset/data_collect.csv")
model = load_model(MODEL_DIR_PATH)



In [7]:
label_encoder = np.load(os.path.join(PROCESSED_DIR_PATH, 'label_encoder.npy'), allow_pickle=True)

print(label_encoder)

['BENIGN' 'Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web Attack-Brute Force' 'Web Attack-Sql Injection'
 'Web Attack-XSS']


In [8]:

preds = get_predictions(model, X, label_encoder)

In [9]:
print(preds)

['BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'Heartbleed' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'Heartbleed' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN'
 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIGN' 'BENIG

In [10]:
attacks = preds[preds != 'BENIGN']

In [11]:
attacks

array(['Heartbleed', 'Heartbleed', 'Heartbleed', 'DDoS', 'DoS slowloris',
       'PortScan', 'DoS Hulk'], dtype=object)

In [12]:
df = pd.read_csv(os.path.join(PROCESSED_DIR_PATH, 'test.csv'), skipinitialspace=True)
logging.info("Class distribution\n{}".format(df.Label.value_counts()))
print(df.Label)

0            0
1            0
2            0
3            0
4            0
            ..
10384011    10
10384012     2
10384013    10
10384014    10
10384015     0
Name: Label, Length: 10384016, dtype: int64


In [13]:
X, Y = preprocessing(df)
del df

In [14]:
_, x_test, _, y_test = train_test_split(X, Y, stratify=Y, test_size=0.20,
                                                        random_state=np.random.randint(10))
del X, Y


In [15]:
x_test = reshape_dataset_cnn(x_test)
results = model.evaluate(x_test, y_test, batch_size=1024, verbose=0)
print("test loss, test acc:", results)

test loss, test acc: [0.004454783629626036, 0.9986898303031921]


In [16]:
preds = model.predict(x_test, batch_size=1024, verbose=0)
# del x_test


In [17]:
n_preds = np.argmax(preds, axis=1)

In [18]:
print(n_preds)

[0 0 0 ... 0 2 4]


In [19]:
# precision, recall, f1, _ = classification_report(y_test, n_preds)
# print(f"Precision: {precision:.4f}")
# print(f"Recall: {recall:.4f}")
# print(f"F1 Score: {f1:.4f}")
print(classification_report(y_test, n_preds, target_names=label_encoder))


                          precision    recall  f1-score   support

                  BENIGN       1.00      1.00      1.00   1664971
                     Bot       0.70      0.81      0.75      1417
                    DDoS       1.00      1.00      1.00     92178
           DoS GoldenEye       1.00      1.00      1.00      7824
                DoS Hulk       1.00      1.00      1.00    175617
        DoS Slowhttptest       0.99      0.99      0.99      4180
           DoS slowloris       1.00      0.99      1.00      4404
             FTP-Patator       1.00      1.00      1.00      6350
              Heartbleed       1.00      1.00      1.00         7
            Infiltration       1.00      0.96      0.98        27
                PortScan       0.99      1.00      1.00    114430
             SSH-Patator       1.00      1.00      1.00      4616
  Web Attack-Brute Force       0.48      0.99      0.64       542
Web Attack-Sql Injection       0.55      0.86      0.67         7
         

In [20]:
print(len(label_encoder))

15
