In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearnex import patch_sklearn

In [3]:
# Load Data
train_raw = pd.read_csv('../Data/UNSW-NB15/train.csv')
print(train_raw.shape)
test_raw = pd.read_csv('../Data/UNSW-NB15/test.csv')
print(test_raw.shape)

# Seperate label and Drop
train_X = train_raw.drop(['id', 'attack_cat', 'label'], axis=1).select_dtypes(include='number')
train_Y = train_raw['label']
test_X = test_raw.drop(['id', 'attack_cat', 'label'], axis=1).select_dtypes(include='number')
test_Y = test_raw['label']

# Normalize data with min, max of training data
test_X1 = (test_X - train_X.min(axis=0)) / (train_X.max(axis=0) - train_X.min(axis=0))
train_X1 = (train_X - train_X.min(axis=0)) / (train_X.max(axis=0) - train_X.min(axis=0))

test_X1[test_X1 < 0] = 0
test_X1[test_X1 > 1] = 1

(82332, 45)
(175341, 45)


In [4]:
model_lr = LogisticRegression(max_iter=10000, random_state=0, n_jobs=-1)
model_gb = GradientBoostingClassifier(random_state=0)

In [5]:
Individual_feature_sets = pd.read_csv('../Results/Individual_Feature_sets.csv')
stopping_points = pd.read_csv('../Results/stopping_points.csv')

In [6]:
f1_all = []
for i in range(7):
    f1s = []
    for k in trange(2, Individual_feature_sets.shape[1]+1):
        features = Individual_feature_sets.iloc[i, 1:k]
        model_lr.fit(train_X[features], train_Y)
        predict = model_lr.predict(test_X[features])
        f1s.append(f1_score(test_Y, predict))
    f1_all.append(f1s)

100%|██████████| 24/24 [00:37<00:00,  1.58s/it]
100%|██████████| 24/24 [00:31<00:00,  1.31s/it]
100%|██████████| 24/24 [00:30<00:00,  1.29s/it]
100%|██████████| 24/24 [00:28<00:00,  1.19s/it]
100%|██████████| 24/24 [00:30<00:00,  1.29s/it]
100%|██████████| 24/24 [00:28<00:00,  1.18s/it]
100%|██████████| 24/24 [00:26<00:00,  1.11s/it]


In [7]:
pd.DataFrame(f1_all, index=['chi2', 'ANOVA', 'mutualinfo', 'sfs(rf)', 'sfs(lr)', 'im(rf)', 'im(lr)']).to_csv('../Results/Individual_F1_LR_Test.csv')

In [8]:
f1_all = []
for i in range(7):
    f1s = []
    for k in trange(2, Individual_feature_sets.shape[1]+1):
        features = Individual_feature_sets.iloc[i, 1:k]
        model_gb.fit(train_X[features], train_Y)
        predict = model_gb.predict(test_X[features])
        f1s.append(f1_score(test_Y, predict))
    f1_all.append(f1s)

100%|██████████| 24/24 [03:35<00:00,  8.99s/it]
100%|██████████| 24/24 [03:32<00:00,  8.87s/it]
100%|██████████| 24/24 [04:01<00:00, 10.08s/it]
100%|██████████| 24/24 [03:35<00:00,  8.98s/it]
100%|██████████| 24/24 [03:29<00:00,  8.72s/it]
100%|██████████| 24/24 [03:40<00:00,  9.17s/it]
100%|██████████| 24/24 [03:38<00:00,  9.12s/it]


In [9]:
pd.DataFrame(f1_all, index=['chi2', 'ANOVA', 'mutualinfo', 'sfs(rf)', 'sfs(lr)', 'im(rf)', 'im(lr)']).to_csv('../Results/Individual_F1_GB_Test.csv')

In [10]:
from tensorflow.python.keras import Sequential, layers, optimizers, losses, metrics, callbacks, backend

In [11]:
def ModelCreate(input_shape):
    model = Sequential()
    model.add(layers.Dense(50, activation='relu', input_shape=input_shape))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy])
    return model

In [12]:
callback = callbacks.EarlyStopping(monitor='loss', mode='min', patience=3, min_delta=0.01, restore_best_weights=True)

f1_all = []
for i in range(7):
    f1s = []
    for k in trange(2, Individual_feature_sets.shape[1]+1):
        model_nn = ModelCreate((k-1,))
        features = Individual_feature_sets.iloc[i, 1:k]
        model_nn.fit(train_X[features], train_Y, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
        predict = model_nn.predict(test_X[features], use_multiprocessing=True)
        predict = np.where(predict < 0.5, 0, 1)
        f1s.append(f1_score(test_Y, predict))
    f1_all.append(f1s)

100%|██████████| 24/24 [52:12<00:00, 130.52s/it]
100%|██████████| 24/24 [53:22<00:00, 133.42s/it]
100%|██████████| 24/24 [55:37<00:00, 139.07s/it]
100%|██████████| 24/24 [54:48<00:00, 137.04s/it]
100%|██████████| 24/24 [53:38<00:00, 134.12s/it]
100%|██████████| 24/24 [52:00<00:00, 130.01s/it]
100%|██████████| 24/24 [50:34<00:00, 126.44s/it]


In [13]:
pd.DataFrame(f1_all, index=['chi2', 'ANOVA', 'mutualinfo', 'sfs(rf)', 'sfs(lr)', 'im(rf)', 'im(lr)']).to_csv('../Results/Individual_F1_DNN_Test.csv')

In [15]:
set_feature_sets = pd.read_csv('../Results/Set_Feature_sets.csv')

In [24]:
set_feature_sets.iloc[0, 2]

"['dur', 'sttl', 'synack', 'smean', 'ct_state_ttl', 'ct_dst_sport_ltm']"

In [34]:
f_raw = set_feature_sets.iloc[0, 2].translate({ord(i): None for i in "[] '"})
f_str = f_raw.split(',')
print(f_str)

['dur', 'sttl', 'synack', 'smean', 'ct_state_ttl', 'ct_dst_sport_ltm']


In [36]:
f1_all = []
for i in range(3):
    f1s = []
    for k in trange(1, set_feature_sets.shape[1]):
        features = set_feature_sets.iloc[i, k].translate({ord(i): None for i in "[] '"}).split(',')
        if features[0] != '':
            model_lr.fit(train_X[features], train_Y)
            predict = model_lr.predict(test_X[features])
            f1s.append(f1_score(test_Y, predict))
        else:
            f1s.append(0)
    f1_all.append(f1s)

100%|██████████| 24/24 [00:51<00:00,  2.13s/it]
100%|██████████| 24/24 [00:12<00:00,  1.98it/s]
100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


In [37]:
pd.DataFrame(f1_all, index=['union', 'intersection', 'quorum']).to_csv('../Results/Set_F1_LR_Test.csv')

In [38]:
f1_all = []
for i in range(3):
    f1s = []
    for k in trange(1, set_feature_sets.shape[1]):
        features = set_feature_sets.iloc[i, k].translate({ord(i): None for i in "[] '"}).split(',')
        if features[0] != '':
            model_lr.fit(train_X[features], train_Y)
            predict = model_lr.predict(test_X[features])
            f1s.append(f1_score(test_Y, predict))
        else:
            f1s.append(0)
    f1_all.append(f1s)

100%|██████████| 24/24 [00:43<00:00,  1.80s/it]
100%|██████████| 24/24 [00:12<00:00,  1.98it/s]
100%|██████████| 24/24 [00:24<00:00,  1.01s/it]


In [39]:
pd.DataFrame(f1_all, index=['union', 'intersection', 'quorum']).to_csv('../Results/Set_F1_GB_Test.csv')

In [42]:
f1_all = []
for i in range(3):
    f1s = []
    for k in trange(1, set_feature_sets.shape[1]):
        features = set_feature_sets.iloc[i, k].translate({ord(i): None for i in "[] '"}).split(',')
        if features[0] != '':
            model_nn = ModelCreate((len(features),))
            model_nn.fit(train_X[features], train_Y, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
            predict = model_nn.predict(test_X[features], use_multiprocessing=True)
            predict = np.where(predict < 0.5, 0, 1)
            f1s.append(f1_score(test_Y, predict))
            backend.clear_session()
        else:
            f1s.append(0)
    f1_all.append(f1s)

100%|██████████| 24/24 [49:23<00:00, 123.49s/it]
100%|██████████| 24/24 [29:07<00:00, 72.80s/it] 
100%|██████████| 24/24 [45:03<00:00, 112.63s/it]


In [43]:
pd.DataFrame(f1_all, index=['union', 'intersection', 'quorum']).to_csv('../Results/Set_F1_DNN_Test.csv')

In [44]:
greedy_feature_sets = pd.read_csv('../Results/Greedy_Feature_sets.csv')

In [45]:
f1_all = []
for k in trange(2, greedy_feature_sets.shape[1]+1):
    features = greedy_feature_sets.iloc[0, 1:k]
    model_lr.fit(train_X[features], train_Y)
    predict = model_lr.predict(test_X[features])
    f1_all.append(f1_score(test_Y, predict))

100%|██████████| 23/23 [00:32<00:00,  1.40s/it]


In [46]:
pd.DataFrame([f1_all], index=['greedy']).to_csv('../Results/Greedy_F1_LR_Test.csv')

In [47]:
f1_all = []
for k in trange(2, greedy_feature_sets.shape[1]+1):
    features = greedy_feature_sets.iloc[0, 1:k]
    model_gb.fit(train_X[features], train_Y)
    predict = model_gb.predict(test_X[features])
    f1_all.append(f1_score(test_Y, predict))

100%|██████████| 23/23 [03:31<00:00,  9.20s/it]


In [48]:
pd.DataFrame([f1_all], index=['greedy']).to_csv('../Results/Greedy_F1_GB_Test.csv')

In [49]:
f1_all = []
for k in trange(2, greedy_feature_sets.shape[1]+1):
    features = greedy_feature_sets.iloc[0, 1:k]
    model_nn = ModelCreate((len(features),))
    model_nn.fit(train_X[features], train_Y, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
    predict = model_nn.predict(test_X[features], use_multiprocessing=True)
    predict = np.where(predict < 0.5, 0, 1)
    f1_all.append(f1_score(test_Y, predict))
    backend.clear_session()

100%|██████████| 23/23 [52:33<00:00, 137.09s/it]


In [50]:
pd.DataFrame([f1_all], index=['greedy']).to_csv('../Results/Greedy_F1_DNN_Test.csv')

# Paper

In [51]:
greedy_lr = ['dmean', 'dload',
       'sload', 'swin', 'ct_state_ttl', 'djit',
       'sjit', 'dinpkt', 'synack', 'dttl',
       'ct_dst_sport_ltm', 'sinpkt']
greedy_gb = ['smean', 'rate', 'ct_ftp_cmd',
       'ct_flw_http_mthd', 'sttl', 'response_body_len',
       'dtcpb', 'stcpb', 'dmean', 'dload',
       'sload', 'swin', 'ct_state_ttl', 'djit',
       'sjit', 'dinpkt', 'synack', 'dttl',
       'ct_dst_sport_ltm', 'sinpkt']
greedy_nn = ['swin', 'ct_state_ttl', 'djit',
       'sjit', 'dinpkt', 'synack', 'dttl',
       'ct_dst_sport_ltm', 'sinpkt']

In [52]:
model_lr.fit(train_X[greedy_lr], train_Y)
predict = model_lr.predict(test_X[greedy_lr])
print(f1_score(test_Y, predict))

0.7591398282881212


In [53]:
model_gb.fit(train_X[greedy_gb], train_Y)
predict = model_gb.predict(test_X[greedy_gb])
print(f1_score(test_Y, predict))

0.926429676787895


In [54]:
model_nn = ModelCreate((len(greedy_nn),))
model_nn.fit(train_X[greedy_nn], train_Y, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
predict = model_nn.predict(test_X[greedy_nn], use_multiprocessing=True)
predict = np.where(predict < 0.5, 0, 1)
print(f1_score(test_Y, predict))

0.8070279034873309
