In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
import random
from tqdm import trange
from scipy.stats import pointbiserialr
import math

warnings.filterwarnings('ignore')

In [None]:
# Load the UNSW-NB15 training and testing splits
train_raw = pd.read_csv('../Data/UNSW-NB15/train.csv')
display(train_raw.shape)
test_raw = pd.read_csv('../Data/UNSW-NB15/test.csv')
display(test_raw.shape)

# Separate the label, drop the ID / attack-category columns and keep only numeric features
non_feature_cols = ['id', 'attack_cat', 'label']
train_X = train_raw.drop(columns=non_feature_cols).select_dtypes(include='number')
train_Y = train_raw['label']
test_X = test_raw.drop(columns=non_feature_cols).select_dtypes(include='number')
test_Y = test_raw['label']

# Min-max scale both splits using the per-column minimum and range of the training data
train_min = train_X.min(axis=0)
train_range = train_X.max(axis=0) - train_min
test_X1 = (test_X - train_min) / train_range
train_X1 = (train_X - train_min) / train_range

# Test values outside the range seen in training are clamped into [0, 1]
test_X1 = test_X1.clip(lower=0, upper=1)

In [None]:
# Correlation-based feature selection: for every pair of features whose absolute
# Pearson correlation exceeds `threshold`, drop the one that is less correlated
# with the label.
corr = train_X1.corr().abs()

threshold = 0.8
# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation) is ignored. `where` builds a new frame instead of
# writing through `corr.values`, which is not guaranteed to be a writable view.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr = corr.where(upper_mask)
redundant = []
for j in corr.columns:
    for i in corr.index:
        # Masked cells are NaN; NaN > threshold is False, so they are skipped.
        if corr.loc[i, j] > threshold:
            redundant.append((i, j))

# Absolute correlation of every feature with the label
train_X2 = train_X1.copy()
train_X2['label'] = train_Y
corr2 = train_X2.corr().abs()

# The label column was appended last, so dropping the final entry leaves features only
corr3 = corr2['label'].iloc[:-1].copy()
drop = []

for i, j in redundant:
    # Pick the member of the pair that is less correlated with the label.
    # Previously, when `i` was the stronger feature but `j` had already been
    # dropped, control fell through to the `elif` and dropped `i` as well.
    weaker = j if corr3[i] > corr3[j] else i
    if weaker not in drop:
        drop.append(weaker)
print(drop)

train_X1 = train_X1.drop(columns=drop)
test_X1 = test_X1.drop(columns=drop)
print(train_X1.shape)
print(test_X1.shape)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV, SelectFromModel, SelectKBest, SequentialFeatureSelector, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Greedy backward elimination.
# At every step three selectors each nominate one feature to eliminate from the
# features that are still in play; the nomination whose removal gives the best
# cross-validated F1 for the logistic-regression model wins and is appended to
# `greedy_all`. `greedy_all` is therefore ordered from first-eliminated to
# last-surviving, so `greedy_all[k:]` is the subset left after k eliminations.
greedy_all = []
clf = RandomForestClassifier(random_state=0, n_jobs=-1)
model = LogisticRegression(max_iter=10000, random_state=0, n_jobs=-1)
for k in trange(train_X1.shape[1]-1):
    remaining = train_X1.columns.drop(greedy_all)
    X_remaining = train_X1[remaining]
    features = []

    # Candidate 1 (filter): the feature with the lowest mutual information with the label
    selector = SelectKBest(mutual_info_classif, k='all')
    selector.fit(X_remaining, train_Y)
    features.append(remaining[np.argsort(selector.scores_)[0]])

    # Candidate 2 (wrapper): one backward step of sequential selection. Keeping
    # len(remaining)-1 features means exactly one is eliminated, and it is the
    # one outside the support mask. (The earlier n_features_to_select=1 kept a
    # single feature and then nominated that survivor for removal.)
    selector = SequentialFeatureSelector(clf, n_features_to_select=len(remaining)-1, direction='backward', scoring='f1', cv=5, n_jobs=-1)
    selector.fit(X_remaining, train_Y)
    features.append(remaining[~selector.get_support()][0])

    # Candidate 3 (embedded): one recursive-feature-elimination step on the
    # remaining features. The earlier RFECV call was fitted on the full frame
    # with min_features_to_select equal to the feature count, so it eliminated
    # nothing, and `cv_results_` is a dict, so attribute access on it failed.
    selector = RFE(clf, n_features_to_select=len(remaining)-1, step=1)
    selector.fit(X_remaining, train_Y)
    features.append(remaining[~selector.get_support()][0])

    # Score each distinct nomination once; dict order keeps the first-listed
    # candidate ahead on ties, matching np.argmax over the original list.
    scores = {}
    for f in dict.fromkeys(features):
        cv = cross_val_score(model, X_remaining.drop(columns=[f]), train_Y, scoring='f1', n_jobs=-1)
        scores[f] = cv.mean()

    greedy_all.append(max(scores, key=scores.get))

# The loop stops with one feature left; record it last so that `greedy_all`
# lists every column and `greedy_all[k:]` is never empty for k < n_features.
greedy_all.extend(train_X1.columns.drop(greedy_all))

In [None]:
# Evaluate logistic regression on each nested subset produced by the greedy search
cv_times_all = []
f1_all = []
model = LogisticRegression(max_iter=10000, random_state=0, n_jobs=-1)
for n_removed in trange(train_X1.shape[1]):
    # Subset that is left once the first `n_removed` greedy picks are discarded
    kept_features = greedy_all[n_removed:]

    # Time the cross validation for this subset
    started = time.time()
    fold_scores = cross_val_score(model, train_X1[kept_features], train_Y, scoring='f1', n_jobs=-1)
    finished = time.time()

    cv_times_all.append(finished - started)
    f1_all.append((fold_scores.mean(), fold_scores.std()))

In [None]:
# Persist the logistic-regression results as a single row labelled 'greedy'.
# The lists are reversed so column 0 holds the smallest feature subset, and
# wrapped in an outer list so the whole series forms one row: passing the flat
# list together with a length-1 index raises a shape-mismatch ValueError.
# Each cell of the F1 file holds a (mean, std) tuple.
pd.DataFrame([cv_times_all[::-1]], index=['greedy']).to_csv('../Results/Paper/Greedy_Time_LR.csv')
pd.DataFrame([f1_all[::-1]], index=['greedy']).to_csv('../Results/Paper/Greedy_F1_LR.csv')

In [None]:
# F1 score (left) and cross-validation time (right) against the number of features.
fig, axis = plt.subplots(1, 2, figsize=(12, 9))
ax_f1, ax_time = axis

# Use a figure-level title: plt.title() before selecting a subplot attached the
# overall title to the right-hand axes only.
fig.suptitle('F1 Score and Time over number of features on Logistic Regression')

# Entry k of the result lists was measured on greedy_all[k:], i.e. on
# n_features - k features, hence the descending x values.
n_features = range(train_X1.shape[1], 0, -1)

ax_f1.set_xlabel('Number of Features')
ax_f1.set_ylabel('F1 Score')
ax_f1.set_ylim((0, 1))
ax_f1.plot(n_features, np.array(f1_all)[:,0], color='blue', linestyle='-', label='greedy')
ax_f1.legend()

ax_time.set_xlabel('Number of Features')
ax_time.set_ylabel('Time')
ax_time.plot(n_features, cv_times_all, color='blue', linestyle='-', label='greedy')
ax_time.legend()

fig.tight_layout()
plt.show()

In [None]:
# Evaluate gradient boosting on each nested subset produced by the greedy search
cv_times_all = []
f1_all = []
model = GradientBoostingClassifier(random_state=0)
for n_removed in trange(train_X1.shape[1]):
    # Subset that is left once the first `n_removed` greedy picks are discarded
    kept_features = greedy_all[n_removed:]

    # Time the cross validation for this subset
    started = time.time()
    fold_scores = cross_val_score(model, train_X1[kept_features], train_Y, scoring='f1', n_jobs=-1)
    finished = time.time()

    cv_times_all.append(finished - started)
    f1_all.append((fold_scores.mean(), fold_scores.std()))

In [None]:
# Persist the gradient-boosting results as a single row labelled 'greedy'.
# The lists are reversed so column 0 holds the smallest feature subset, and
# wrapped in an outer list so the whole series forms one row: passing the flat
# list together with a length-1 index raises a shape-mismatch ValueError.
# Each cell of the F1 file holds a (mean, std) tuple.
pd.DataFrame([cv_times_all[::-1]], index=['greedy']).to_csv('../Results/Paper/Greedy_Time_GB.csv')
pd.DataFrame([f1_all[::-1]], index=['greedy']).to_csv('../Results/Paper/Greedy_F1_GB.csv')

In [None]:
# F1 score (left) and cross-validation time (right) against the number of features.
fig, axis = plt.subplots(1, 2, figsize=(12, 9))
ax_f1, ax_time = axis

# Use a figure-level title: plt.title() before selecting a subplot attached the
# overall title to the right-hand axes only.
fig.suptitle('F1 Score and Time over number of features on Gradient Boosting')

# Entry k of the result lists was measured on greedy_all[k:], i.e. on
# n_features - k features, hence the descending x values.
n_features = range(train_X1.shape[1], 0, -1)

ax_f1.set_xlabel('Number of Features')
ax_f1.set_ylabel('F1 Score')
ax_f1.set_ylim((0, 1))
ax_f1.plot(n_features, np.array(f1_all)[:,0], color='blue', linestyle='-', label='greedy')
ax_f1.legend()

ax_time.set_xlabel('Number of Features')
ax_time.set_ylabel('Time')
ax_time.plot(n_features, cv_times_all, color='blue', linestyle='-', label='greedy')
ax_time.legend()

fig.tight_layout()
plt.show()

In [None]:
from keras import Sequential, layers, losses, metrics, callbacks
from sklearn.model_selection import StratifiedKFold

In [None]:
def ModelCreate(input_shape):
    """Build and compile the binary classifier network.

    The network is four Dense(50, relu) layers, each followed by Dropout(0.2),
    and a single sigmoid output unit. It is compiled with the Adam optimizer,
    binary cross-entropy loss and binary accuracy as the tracked metric.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Dense layer.

    Returns
    -------
    The compiled Sequential model.
    """
    hidden_units = 50
    dropout_rate = 0.2
    n_hidden_layers = 4

    model = Sequential()
    for depth in range(n_hidden_layers):
        # Only the first layer declares the input shape
        dense_kwargs = {'input_shape': input_shape} if depth == 0 else {}
        model.add(layers.Dense(hidden_units, activation='relu', **dense_kwargs))
        model.add(layers.Dropout(dropout_rate))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )
    return model

In [None]:
# Evaluate the DNN on each nested subset produced by the greedy search, using
# stratified k-fold cross validation implemented by hand.
cv_times_all = []
f1_all = []
kf = StratifiedKFold(shuffle=True, random_state=0)
callback = callbacks.EarlyStopping(patience=3, min_delta=0.1, restore_best_weights=True)
for k in trange(train_X1.shape[1]):
    # Subset that is left once the first k greedy picks are discarded
    train_X2 = train_X1[greedy_all[k:]].copy()
    cv_time = 0
    # One score slot per fold (instead of a hard-coded 5)
    cv = np.zeros(shape=kf.get_n_splits())
    for j, (train_index, test_index) in enumerate(kf.split(train_X2, train_Y)):
        x_train_fold, x_test_fold = train_X2.iloc[train_index, :].values, train_X2.iloc[test_index, :].values
        y_train_fold, y_test_fold = train_Y.iloc[train_index].values, train_Y.iloc[test_index].values

        # Build a fresh model for every fold. Reusing one model across folds
        # carried weights fitted on earlier folds' training data, which contains
        # the samples held out in later folds, into those later folds.
        # Construction stays outside the timed region, as before.
        model = ModelCreate((train_X2.shape[1],))

        second = time.time()
        # NOTE(review): the held-out fold drives early stopping and is also the
        # fold that is scored below, so the reported F1 may be optimistic.
        model.fit(x_train_fold, y_train_fold, validation_data=(x_test_fold, y_test_fold), epochs=30, callbacks=[callback], verbose=0)
        # `use_multiprocessing` is dropped: the inputs are in-memory arrays and
        # newer Keras releases no longer accept the argument.
        predict = model.predict(x_test_fold, verbose=0)
        # Threshold the sigmoid output at 0.5 and flatten (n, 1) -> (n,)
        predict = np.where(predict < 0.5, 0, 1).ravel()
        cv[j] = f1_score(y_test_fold, predict)
        second2 = time.time()
        cv_time += second2 - second
    cv_times_all.append(cv_time)
    f1_all.append((cv.mean(), cv.std()))

In [None]:
# Persist the DNN results as a single row labelled 'greedy'.
# The lists are reversed so column 0 holds the smallest feature subset, and
# wrapped in an outer list so the whole series forms one row: passing the flat
# list together with a length-1 index raises a shape-mismatch ValueError.
# Each cell of the F1 file holds a (mean, std) tuple.
pd.DataFrame([cv_times_all[::-1]], index=['greedy']).to_csv('../Results/Paper/Greedy_Time_DNN.csv')
pd.DataFrame([f1_all[::-1]], index=['greedy']).to_csv('../Results/Paper/Greedy_F1_DNN.csv')

In [None]:
# F1 score (left) and cross-validation time (right) against the number of features.
fig, axis = plt.subplots(1, 2, figsize=(12, 9))
ax_f1, ax_time = axis

# Use a figure-level title: plt.title() before selecting a subplot attached the
# overall title to the right-hand axes only.
fig.suptitle('F1 Score and Time over number of features on Deep Neuron Network')

# Entry k of the result lists was measured on greedy_all[k:], i.e. on
# n_features - k features, hence the descending x values.
n_features = range(train_X1.shape[1], 0, -1)

ax_f1.set_xlabel('Number of Features')
ax_f1.set_ylabel('F1 Score')
ax_f1.set_ylim((0, 1))
ax_f1.plot(n_features, np.array(f1_all)[:,0], color='blue', linestyle='-', label='greedy')
ax_f1.legend()

ax_time.set_xlabel('Number of Features')
ax_time.set_ylabel('Time')
ax_time.plot(n_features, cv_times_all, color='blue', linestyle='-', label='greedy')
ax_time.legend()

fig.tight_layout()
plt.show()