In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Load Data
train_raw = pd.read_csv('../Data/KDDCUP99/train.csv')
test_raw = pd.read_csv('../Data/KDDCUP99/test.csv')
print(train_raw.shape)
print(test_raw.shape)

# Seperate label and Drop ID
def LabelEncode(x):
    if x == 'normal.':
        return 0
    else:
        return 1
train_X = train_raw.drop(['type'], axis=1).select_dtypes(include='number')
print(train_X.shape)
test_X = test_raw.drop(['type'], axis=1).select_dtypes(include='number')
print(test_X.shape)
train_Y = train_raw['type'].apply(LabelEncode)
test_Y = test_raw['type'].apply(LabelEncode)

# Normalize data with min, max of training data
test_X1 = (test_X - train_X.min(axis=0)) / (train_X.max(axis=0) - train_X.min(axis=0))
train_X1 = (train_X - train_X.min(axis=0)) / (train_X.max(axis=0) - train_X.min(axis=0))

train_X1.fillna(0, inplace=True)
test_X1.fillna(0, inplace=True)

(494021, 42)
(311029, 42)
(494021, 38)
(311029, 38)


In [4]:
model_lr = LogisticRegression(max_iter=10000, random_state=0, n_jobs=-1)
model_gb = GradientBoostingClassifier(random_state=0)

In [5]:
Individual_feature_sets = pd.read_csv('../Results/KDDCUP99/Feature_sets.csv')

In [None]:
f1_all = []
for i in range(7):
    f1s = []
    for k in trange(2, Individual_feature_sets.shape[1]+1):
        features = Individual_feature_sets.iloc[i, 1:k]
        model_lr.fit(train_X[features], train_Y)
        predict = model_lr.predict(test_X[features])
        f1s.append(f1_score(test_Y, predict))
    f1_all.append(f1s)

In [19]:
pd.DataFrame(f1_all, index=['chi2', 'ANOVA', 'mutualinfo', 'sfs(rf)', 'sfs(lr)', 'im(rf)', 'im(lr)']).to_csv('../Results/KDDCUP99/Individual_F1_LR_Test.csv')

In [6]:
f1_all = []
for i in range(7):
    f1s = []
    for k in trange(2, Individual_feature_sets.shape[1]+1):
        features = Individual_feature_sets.iloc[i, 1:k]
        model_gb.fit(train_X[features], train_Y)
        predict = model_gb.predict(test_X[features])
        f1s.append(f1_score(test_Y, predict))
    f1_all.append(f1s)

100%|██████████| 25/25 [07:22<00:00, 17.71s/it]
100%|██████████| 25/25 [07:26<00:00, 17.86s/it]
100%|██████████| 25/25 [08:12<00:00, 19.70s/it]
100%|██████████| 25/25 [07:56<00:00, 19.04s/it]
100%|██████████| 25/25 [07:24<00:00, 17.79s/it]
100%|██████████| 25/25 [08:06<00:00, 19.48s/it]
100%|██████████| 25/25 [07:12<00:00, 17.29s/it]


In [7]:
pd.DataFrame(f1_all, index=['chi2', 'ANOVA', 'mutualinfo', 'sfs(rf)', 'sfs(lr)', 'im(rf)', 'im(lr)']).to_csv('../Results/KDDCUP99/Individual_F1_GB_Test.csv')

In [7]:
from tensorflow.python.keras import Sequential, layers, optimizers, losses, metrics, callbacks, backend

In [8]:
def ModelCreate(input_shape):
    model = Sequential()
    model.add(layers.Dense(50, activation='relu', input_shape=input_shape))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(50, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy])
    return model

In [9]:
callback = callbacks.EarlyStopping(monitor='loss', mode='min', patience=3, min_delta=0.01, restore_best_weights=True)

In [10]:
f1_all = []
for i in range(7):
    f1s = []
    for k in trange(2, Individual_feature_sets.shape[1]+1):
        model_nn = ModelCreate((k-1,))
        features = Individual_feature_sets.iloc[i, 1:k]
        model_nn.fit(train_X[features], train_Y, batch_size=1024, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
        predict = model_nn.predict(test_X[features], use_multiprocessing=True)
        predict = np.where(predict < 0.5, 0, 1)
        f1s.append(f1_score(test_Y, predict))
    f1_all.append(f1s)

100%|██████████| 25/25 [13:09<00:00, 31.57s/it]
100%|██████████| 25/25 [13:37<00:00, 32.69s/it]
100%|██████████| 25/25 [17:27<00:00, 41.91s/it]
100%|██████████| 25/25 [15:56<00:00, 38.27s/it]
100%|██████████| 25/25 [14:37<00:00, 35.11s/it]
100%|██████████| 25/25 [15:09<00:00, 36.38s/it]
100%|██████████| 25/25 [12:18<00:00, 29.55s/it]


In [11]:
pd.DataFrame(f1_all, index=['chi2', 'ANOVA', 'mutualinfo', 'sfs(rf)', 'sfs(lr)', 'im(rf)', 'im(lr)']).to_csv('../Results/KDDCUP99/Individual_F1_DNN_Test.csv')

In [12]:
set_feature_sets = pd.read_csv('../Results/KDDCUP99/Set_Feature_sets.csv')

In [13]:
f1_all = []
for i in range(3):
    f1s = []
    for k in trange(1, set_feature_sets.shape[1]):
        features = set_feature_sets.iloc[i, k].translate({ord(i): None for i in "[] '"}).split(',')
        if features[0] != '':
            model_lr.fit(train_X[features], train_Y)
            predict = model_lr.predict(test_X[features])
            f1s.append(f1_score(test_Y, predict))
        else:
            f1s.append(0)
    f1_all.append(f1s)

100%|██████████| 25/25 [01:07<00:00,  2.72s/it]
100%|██████████| 25/25 [00:35<00:00,  1.42s/it]
100%|██████████| 25/25 [00:46<00:00,  1.87s/it]


In [14]:
pd.DataFrame(f1_all, index=['union', 'intersection', 'quorum']).to_csv('../Results/KDDCUP99/Set_F1_LR_Test.csv')

In [15]:
f1_all = []
for i in range(3):
    f1s = []
    for k in trange(1, set_feature_sets.shape[1]):
        features = set_feature_sets.iloc[i, k].translate({ord(i): None for i in "[] '"}).split(',')
        if features[0] != '':
            model_gb.fit(train_X[features], train_Y)
            predict = model_gb.predict(test_X[features])
            f1s.append(f1_score(test_Y, predict))
        else:
            f1s.append(0)
    f1_all.append(f1s)

100%|██████████| 25/25 [09:02<00:00, 21.71s/it]
100%|██████████| 25/25 [06:18<00:00, 15.13s/it]
100%|██████████| 25/25 [07:58<00:00, 19.15s/it]


In [16]:
pd.DataFrame(f1_all, index=['union', 'intersection', 'quorum']).to_csv('../Results/KDDCUP99/Set_F1_GB_Test.csv')

In [17]:
f1_all = []
for i in range(3):
    f1s = []
    for k in trange(1, set_feature_sets.shape[1]):
        features = set_feature_sets.iloc[i, k].translate({ord(i): None for i in "[] '"}).split(',')
        if features[0] != '':
            model_nn = ModelCreate((len(features),))
            model_nn.fit(train_X[features], train_Y, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
            predict = model_nn.predict(test_X[features], use_multiprocessing=True)
            predict = np.where(predict < 0.5, 0, 1)
            f1s.append(f1_score(test_Y, predict))
            backend.clear_session()
        else:
            f1s.append(0)
    f1_all.append(f1s)

100%|██████████| 25/25 [3:42:16<00:00, 533.47s/it]  
100%|██████████| 25/25 [2:20:40<00:00, 337.61s/it]  
100%|██████████| 25/25 [2:44:35<00:00, 395.02s/it]  


In [18]:
pd.DataFrame(f1_all, index=['union', 'intersection', 'quorum']).to_csv('../Results/KDDCUP99/Set_F1_DNN_Test.csv')

In [5]:
greedy_feature_sets = ['count', 'logged_in', 'dst_bytes', 'dst_host_count', 'srv_diff_host_rate', 'dst_host_srv_diff_host_rate', 'same_srv_rate', 'dst_host_diff_srv_rate', 'duration', 'num_access_files', 'src_bytes', 'diff_srv_rate', 'dst_host_srv_rerror_rate', 'num_root', 'num_file_creations', 'is_guest_login', 'wrong_fragment', 'num_shells', 'su_attempted', 'root_shell', 'land', 'urgent', 'num_failed_logins', 'num_outbound_cmds', 'is_host_login']

In [23]:
f1_all = []
for k in trange(len(greedy_feature_sets)):
    features = greedy_feature_sets[:k+1]
    model_lr.fit(train_X[features], train_Y)
    predict = model_lr.predict(test_X[features])
    f1_all.append(f1_score(test_Y, predict))

100%|██████████| 25/25 [01:01<00:00,  2.47s/it]


In [24]:
pd.DataFrame([f1_all], index=['greedy']).to_csv('../Results/KDDCUP99/Greedy_F1_LR_Test.csv')

In [25]:
f1_all = []
for k in trange(len(greedy_feature_sets)):
    features = greedy_feature_sets[:k+1]
    model_gb.fit(train_X[features], train_Y)
    predict = model_gb.predict(test_X[features])
    f1_all.append(f1_score(test_Y, predict))

100%|██████████| 25/25 [08:13<00:00, 19.75s/it]


In [26]:
pd.DataFrame([f1_all], index=['greedy']).to_csv('../Results/KDDCUP99/Greedy_F1_GB_Test.csv')

In [10]:
f1_all = []
for k in trange(len(greedy_feature_sets)):
    features = greedy_feature_sets[:k+1]
    model_nn = ModelCreate((len(features),))
    model_nn.fit(train_X[features], train_Y, batch_size=1024, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
    predict = model_nn.predict(test_X[features], use_multiprocessing=True)
    predict = np.where(predict < 0.5, 0, 1)
    f1_all.append(f1_score(test_Y, predict))
    backend.clear_session()

100%|██████████| 25/25 [14:47<00:00, 35.50s/it]


In [11]:
pd.DataFrame([f1_all], index=['greedy']).to_csv('../Results/KDDCUP99/Greedy_F1_DNN_Test.csv')

# Paper

In [51]:
greedy_lr = ['dmean', 'dload',
       'sload', 'swin', 'ct_state_ttl', 'djit',
       'sjit', 'dinpkt', 'synack', 'dttl',
       'ct_dst_sport_ltm', 'sinpkt']
greedy_gb = ['smean', 'rate', 'ct_ftp_cmd',
       'ct_flw_http_mthd', 'sttl', 'response_body_len',
       'dtcpb', 'stcpb', 'dmean', 'dload',
       'sload', 'swin', 'ct_state_ttl', 'djit',
       'sjit', 'dinpkt', 'synack', 'dttl',
       'ct_dst_sport_ltm', 'sinpkt']
greedy_nn = ['swin', 'ct_state_ttl', 'djit',
       'sjit', 'dinpkt', 'synack', 'dttl',
       'ct_dst_sport_ltm', 'sinpkt']

In [52]:
model_lr.fit(train_X[greedy_lr], train_Y)
predict = model_lr.predict(test_X[greedy_lr])
print(f1_score(test_Y, predict))

0.7591398282881212


In [53]:
model_gb.fit(train_X[greedy_gb], train_Y)
predict = model_gb.predict(test_X[greedy_gb])
print(f1_score(test_Y, predict))

0.926429676787895


In [54]:
model_nn = ModelCreate((len(greedy_nn),))
model_nn.fit(train_X[greedy_nn], train_Y, epochs=30, callbacks=[callback], use_multiprocessing=True, verbose=0)
predict = model_nn.predict(test_X[greedy_nn], use_multiprocessing=True)
predict = np.where(predict < 0.5, 0, 1)
print(f1_score(test_Y, predict))

0.8070279034873309
