In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns
import warnings
import time
import os
import random
from tqdm import trange
import math

warnings.filterwarnings('ignore')

In [None]:
# Load Data
# Read train.csv and test.csv from the UNSW-NB15 data directory and report
# the (rows, columns) shape of each frame as soon as it is loaded.
train_path = '../../Data/UNSW-NB15/train.csv'
test_path = '../../Data/UNSW-NB15/test.csv'

train_raw = pd.read_csv(train_path)
print(train_raw.shape)

test_raw = pd.read_csv(test_path)
print(test_raw.shape)

In [None]:
# Separate the label and drop the ID column.
# 'id', 'attack_cat' and 'label' are removed from the feature matrices, and
# only numeric columns are kept; the 'label' column becomes the target.
non_feature_cols = ['id', 'attack_cat', 'label']

train_X = train_raw.drop(columns=non_feature_cols).select_dtypes(include='number')
test_X = test_raw.drop(columns=non_feature_cols).select_dtypes(include='number')

train_Y = train_raw['label']
test_Y = test_raw['label']

In [None]:
# Min-max normalize both splits using statistics of the TRAINING data only,
# so no information from the test split leaks into the scaling.
feature_min = train_X.min(axis=0)
feature_range = train_X.max(axis=0) - feature_min

# A column that is constant in the training data has max == min; dividing by
# that zero range would fill the column with NaN/inf. Use a range of 1 for
# such columns so they map to 0 in the training data instead.
feature_range = feature_range.replace(0, 1)

train_X1 = (train_X - feature_min) / feature_range
test_X1 = (test_X - feature_min) / feature_range

# Test values outside the range seen during training are clamped into [0, 1].
test_X1 = test_X1.clip(lower=0, upper=1)

In [None]:
# Intel Extension for Scikit-learn has to patch scikit-learn BEFORE the
# estimators are imported: names imported earlier stay bound to the stock
# (unpatched) classes, which makes the patch a no-op for this notebook.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.feature_selection import SelectKBest, SelectFromModel, RFECV, SequentialFeatureSelector, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [None]:
# Rank every feature by mutual information with the label, worst feature first.
#
# mutual_info_classif adds random noise to continuous features, so it is
# seeded here (random_state=0, like every other estimator in this notebook)
# to make the resulting ordering reproducible across runs.

subset_all = []
mi_scores = mutual_info_classif(train_X1, train_Y, random_state=0)
# argsort is ascending: index 0 is the feature with the lowest score.
sort_index = np.argsort(mi_scores)
cols = train_X1.columns[sort_index]
subset_all.append(cols)
print(cols)

In [None]:
# Order features with sequential backward selection (SBS) driven by a random forest.
#
# Each round runs a full backward pass over the not-yet-ranked columns down to
# a single surviving feature and appends that survivor to `cols`; the pass is
# then repeated on the remaining columns.
# NOTE(review): the survivor of a backward pass is the feature the selector
# kept, so this list appears to run best-first, unlike the ascending-score
# orderings built elsewhere in this notebook - confirm this is intended.
cols = []
model = RandomForestClassifier(random_state=0, n_jobs=-1)
selector = SequentialFeatureSelector(
    model,
    n_features_to_select=1,
    direction='backward',
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

n_rounds = train_X1.shape[1] - 1
for _ in trange(n_rounds):
    remaining = train_X1.columns.drop(cols)
    selector.fit(train_X1[remaining], train_Y)
    survivor = remaining[selector.get_support()][0]
    cols.append(survivor)

# Exactly one column is left unranked after the loop; it closes the list.
last_column = train_X1.columns.drop(cols)[0]
cols.append(last_column)

print(cols)

In [None]:
# Store the current `cols` ordering as the next entry of subset_all
# (in notebook order this is the sequential-backward-selection result).
# Re-running this cell appends the same ordering a second time.
subset_all.append(cols)

In [None]:
# Rank every feature with recursive feature elimination (RFE) on a random
# forest, worst feature first.
#
# The previous RFECV version could not produce a per-feature ordering:
#   * cv_results_ is a dict, so `cv_results_.mean_test_score` raises AttributeError;
#   * min_features_to_select equal to the feature count disables elimination;
#   * mean_test_score is indexed by subset size, not by feature.
# Plain RFE run down to a single feature yields a complete ranking instead;
# the number of features to keep is swept by the evaluation cells anyway.
from sklearn.feature_selection import RFE

model = RandomForestClassifier(random_state=0, n_jobs=-1)
selector = RFE(model, n_features_to_select=1, step=1)
selector.fit(train_X1, train_Y)
# ranking_ == 1 marks the last surviving feature; larger values were
# eliminated earlier. Reverse the ascending argsort to list the first
# eliminated (worst) feature first.
sort_index = np.argsort(selector.ranking_)[::-1]
cols = train_X1.columns[sort_index]
print(cols)

In [None]:
# Store the current `cols` ordering as the next entry of subset_all
# (in notebook order this is the recursive-elimination result).
# Re-running this cell appends the same ordering a second time.
subset_all.append(cols)

In [None]:
# Rank features by iteratively dropping the least important one according to
# a random forest refitted on the remaining columns (worst feature first).
model = RandomForestClassifier(random_state=0, n_jobs=-1)
cols = []
train_X2 = train_X1.copy()
for k in trange(train_X1.shape[1]-1):
    model.fit(train_X2, train_Y)
    # feature_importances_ is aligned with the columns the model was just
    # fitted on (train_X2), so the position must be looked up in
    # train_X2.columns. Indexing train_X1.columns goes out of step after the
    # first drop and can select a column that was already removed.
    f = train_X2.columns[np.argmin(model.feature_importances_)]
    cols.append(f)
    train_X2 = train_X2.drop([f], axis=1)
# One column is left after the loop; it is the last entry of the ordering.
cols.append(train_X2.columns[0])
print(cols)

# Register this ordering as well: the result tables and plots expect one
# entry per method ('mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)').
# Re-running this cell appends the ordering again.
subset_all.append(cols)

In [None]:
# Measure performance by cross-validated F1 score with logistic regression.
# For every feature ordering in subset_all, the model is cross-validated on
# the first 1, 2, ..., n columns of that ordering; the wall-clock time of each
# cross-validation and the (mean, std) of its fold scores are recorded.
model = LogisticRegression(max_iter=10000, random_state=0, n_jobs=-1)

cv_times_all, f1_all = [], []
for ranking in subset_all:
    cv_times, f1s = [], []
    for n_used in trange(1, train_X1.shape[1] + 1):
        leading_cols = ranking[:n_used]
        started = time.time()
        fold_scores = cross_val_score(model, train_X1[leading_cols], train_Y, scoring='f1', n_jobs=-1)
        finished = time.time()
        cv_times.append(finished - started)
        f1s.append((fold_scores.mean(), fold_scores.std()))

    cv_times_all.append(cv_times)
    f1_all.append(f1s)

In [None]:
# Persist the logistic-regression timings and F1 results, one row per method.
methods = ['mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)']

# to_csv does not create missing directories; make sure the target exists so
# the results of the long-running evaluation are not lost.
os.makedirs('../Results/Paper', exist_ok=True)

# Label only as many rows as were actually computed, so the tables can still
# be written when fewer orderings than method names are present.
pd.DataFrame(cv_times_all, index=methods[:len(cv_times_all)]).to_csv('../Results/Paper/Time_LR.csv')
pd.DataFrame(f1_all, index=methods[:len(f1_all)]).to_csv('../Results/Paper/F1_LR.csv')

In [None]:
# Plot mean F1 score (left) and cross-validation time (right) against the
# number of features used, one line per feature-ranking method.

# The labels must be the real method names: an empty list makes methods[0]
# raise IndexError and also breaks later cells that use `methods` as an index.
methods = ['mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)']
line_colors = ['blue', 'red', 'black', 'cyan']

# Entry k of every curve was computed with the first k+1 features, so the
# x-axis runs from 1 to the total number of features.
n_features = np.arange(1, train_X1.shape[1] + 1)
# f1_all holds (mean, std) pairs; keep only the mean for plotting.
mean_f1 = np.array(f1_all)[:, :, 0]

fig, axis = plt.subplots(1, 2, figsize=(12, 9))
# Figure-level title; plt.title would only label whichever panel is current.
fig.suptitle('F1 Score and Time over number of features on Logistic Regression')

axis[0].set_xlabel('Number of Features')
axis[0].set_ylabel('F1 Score')
axis[0].set_ylim((0, 1))

axis[1].set_xlabel('Number of Features')
axis[1].set_ylabel('Time')

# zip stops at the shortest input, so only methods with results are drawn.
for name, color, scores, times in zip(methods, line_colors, mean_f1, cv_times_all):
    axis[0].plot(n_features, scores, color=color, linestyle='-', label=name)
    axis[1].plot(n_features, times, color=color, linestyle='-', label=name)

axis[0].legend()
axis[1].legend()

fig.tight_layout()
plt.show()

In [None]:
# Measure performance by cross-validated F1 score with gradient boosting.
# For every feature ordering in subset_all, the model is cross-validated on
# the first 1, 2, ..., n columns of that ordering; the wall-clock time of each
# cross-validation and the (mean, std) of its fold scores are recorded.
model = GradientBoostingClassifier(random_state=0)

cv_times_all, f1_all = [], []
for ranking in subset_all:
    cv_times, f1s = [], []
    for n_used in trange(1, train_X1.shape[1] + 1):
        leading_cols = ranking[:n_used]
        started = time.time()
        fold_scores = cross_val_score(model, train_X1[leading_cols], train_Y, scoring='f1', n_jobs=-1)
        finished = time.time()
        cv_times.append(finished - started)
        f1s.append((fold_scores.mean(), fold_scores.std()))

    cv_times_all.append(cv_times)
    f1_all.append(f1s)

In [None]:
# Persist the gradient-boosting timings and F1 results, one row per method.
# The labels are defined here explicitly: the preceding plotting cell rebinds
# `methods`, and an empty index makes the DataFrame constructor fail.
methods = ['mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)']

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('../Results/Paper', exist_ok=True)

# Label only as many rows as were actually computed.
pd.DataFrame(cv_times_all, index=methods[:len(cv_times_all)]).to_csv('../Results/Paper/Time_GB.csv')
pd.DataFrame(f1_all, index=methods[:len(f1_all)]).to_csv('../Results/Paper/F1_GB.csv')

In [None]:
# Plot mean F1 score (left) and cross-validation time (right) against the
# number of features used, one line per feature-ranking method.

# The labels must be the real method names: an empty list makes methods[0]
# raise IndexError and also breaks later cells that use `methods` as an index.
methods = ['mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)']
line_colors = ['blue', 'red', 'black', 'cyan']

# Entry k of every curve was computed with the first k+1 features, so the
# x-axis runs from 1 to the total number of features.
n_features = np.arange(1, train_X1.shape[1] + 1)
# f1_all holds (mean, std) pairs; keep only the mean for plotting.
mean_f1 = np.array(f1_all)[:, :, 0]

fig, axis = plt.subplots(1, 2, figsize=(12, 9))
# Figure-level title; plt.title would only label whichever panel is current.
fig.suptitle('F1 Score and Time over number of features on Gradient Boosting')

axis[0].set_xlabel('Number of Features')
axis[0].set_ylabel('F1 Score')
axis[0].set_ylim((0, 1))

axis[1].set_xlabel('Number of Features')
axis[1].set_ylabel('Time')

# zip stops at the shortest input, so only methods with results are drawn.
for name, color, scores, times in zip(methods, line_colors, mean_f1, cv_times_all):
    axis[0].plot(n_features, scores, color=color, linestyle='-', label=name)
    axis[1].plot(n_features, times, color=color, linestyle='-', label=name)

axis[0].legend()
axis[1].legend()

fig.tight_layout()
plt.show()

In [None]:
# Keras building blocks for the DNN experiment and the stratified splitter
# used for its manual cross-validation loop.
# NOTE(review): `tensorflow.python.keras` is an internal module path rather
# than the public `tensorflow.keras` API - confirm it exists in the installed
# TensorFlow version before relying on it.
from tensorflow.python.keras import Sequential, layers, optimizers, losses, metrics, callbacks
from sklearn.model_selection import StratifiedKFold

In [None]:
def ModelCreate(input_shape):
    """Build and compile the fully connected binary classifier.

    The network stacks four blocks of Dense(50, relu) followed by
    Dropout(0.2), and ends in a single sigmoid unit. It is compiled with the
    Adam optimizer, binary cross-entropy loss and binary accuracy as metric.

    Args:
        input_shape: shape tuple of one input sample, passed to the first
            Dense layer (e.g. ``(n_features,)``).

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    for block in range(4):
        # Only the first layer declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if block == 0 else {}
        model.add(layers.Dense(50, activation='relu', **first_layer_kwargs))
        model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )
    return model

In [None]:
# Measure performance of the DNN by stratified k-fold cross-validated F1 score.
# For every feature ordering in subset_all, a network is trained and scored on
# the first 1, 2, ..., n columns of that ordering; the summed fit+predict time
# over the folds and the (mean, std) of the fold F1 scores are recorded.
cv_times_all = []
f1_all = []

kf = StratifiedKFold(shuffle=True, random_state=0)
n_folds = kf.get_n_splits()
callback = callbacks.EarlyStopping(patience=3, min_delta=0.1, restore_best_weights=True)
for i in range(len(subset_all)):
    cv_times = []
    f1s = []
    for k in trange(train_X1.shape[1]):
        cv_time = 0
        # One score slot per fold (taken from the splitter, not hard-coded).
        cv = np.zeros(shape=n_folds)
        train_X2 = train_X1[subset_all[i][:k+1]].copy()
        for j, (train_index, test_index) in enumerate(kf.split(train_X2, train_Y)):
            # Plain arrays are passed for training, validation and prediction alike.
            x_train_fold = train_X2.iloc[train_index, :].values
            x_test_fold = train_X2.iloc[test_index, :].values
            y_train_fold = train_Y.iloc[train_index].values
            y_test_fold = train_Y.iloc[test_index].values

            # Build a fresh model for every fold. Reusing one model across
            # folds keeps the weights learned on earlier folds, whose training
            # data contains the current held-out fold, and inflates the score.
            # Model construction is kept outside the timed section.
            model = ModelCreate((k+1,))

            second = time.time()
            # use_multiprocessing is omitted: it only applies to generator or
            # Sequence inputs, not to in-memory arrays.
            model.fit(x_train_fold, y_train_fold, validation_data=(x_test_fold, y_test_fold), epochs=30, callbacks=[callback], verbose=0)
            predict = model.predict(x_test_fold)
            # Threshold the sigmoid output at 0.5 and flatten (n, 1) -> (n,).
            predict = np.where(predict < 0.5, 0, 1).ravel()
            cv[j] = f1_score(y_test_fold, predict)
            second2 = time.time()
            cv_time += second2 - second
        cv_times.append(cv_time)
        f1s.append((cv.mean(), cv.std()))

    cv_times_all.append(cv_times)
    f1_all.append(f1s)

In [None]:
# Persist the DNN timings and F1 results, one row per method.
# The labels are defined here explicitly: the preceding plotting cells rebind
# `methods`, and an empty index makes the DataFrame constructor fail.
methods = ['mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)']

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('../Results/Paper', exist_ok=True)

# Label only as many rows as were actually computed.
pd.DataFrame(cv_times_all, index=methods[:len(cv_times_all)]).to_csv('../Results/Paper/Time_DNN.csv')
pd.DataFrame(f1_all, index=methods[:len(f1_all)]).to_csv('../Results/Paper/F1_DNN.csv')

In [None]:
# Plot mean F1 score (left) and cross-validation time (right) against the
# number of features used, one line per feature-ranking method.

# The labels must be the real method names: an empty list makes methods[0]
# raise IndexError.
methods = ['mi', 'sbs(rf)', 'rfe(rf)', 'im(rf)']
line_colors = ['blue', 'red', 'black', 'cyan']

# Entry k of every curve was computed with the first k+1 features, so the
# x-axis runs from 1 to the total number of features.
n_features = np.arange(1, train_X1.shape[1] + 1)
# f1_all holds (mean, std) pairs; keep only the mean for plotting.
mean_f1 = np.array(f1_all)[:, :, 0]

fig, axis = plt.subplots(1, 2, figsize=(12, 9))
# Figure-level title; plt.title would only label whichever panel is current.
fig.suptitle('F1 Score and Time over number of features on DNN')

axis[0].set_xlabel('Number of Features')
axis[0].set_ylabel('F1 Score')
axis[0].set_ylim((0, 1))

axis[1].set_xlabel('Number of Features')
axis[1].set_ylabel('Time')

# zip stops at the shortest input, so only methods with results are drawn.
for name, color, scores, times in zip(methods, line_colors, mean_f1, cv_times_all):
    axis[0].plot(n_features, scores, color=color, linestyle='-', label=name)
    axis[1].plot(n_features, times, color=color, linestyle='-', label=name)

axis[0].legend()
axis[1].legend()

fig.tight_layout()
plt.show()