In [437]:
## Used Code From https://www.kaggle.com/code/timgoodfellow/nsl-kdd-explorations to get NSL-KDD setup

In [438]:
import pandas as pd
import tensorflow as tf
import numpy as np
from mpi4py import MPI

In [439]:
# Package Options
pd.set_option("display.max_columns", 10)

In [440]:
colnames = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
            'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
            'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
            'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
            'dst_host_srv_rerror_rate', 'attack_type', 'level']
train_df = pd.read_csv("Data/KDDTrain+.txt", names=colnames, sep=",")
test_df = pd.read_csv("Data/KDDTest+.txt", names=colnames, sep=",")

# display DataFrame
train_df.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,...,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,level
0,0,tcp,ftp_data,SF,491,...,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,...,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,...,1.0,0.0,0.0,neptune,19


In [441]:
# Create an attack flag
# map normal to 0, all attacks to 1
is_attack_train = train_df.attack_type.map(lambda a: 0 if a == 'normal' else 1)
is_attack_test = test_df.attack_type.map(lambda a: 0 if a == 'normal' else 1)

#data_with_attack = df.join(is_attack, rsuffix='_flag')
train_df['attack_flag'] = is_attack_train
test_df['attack_flag'] = is_attack_test

In [442]:
# lists to hold our attack classifications
dos_attacks = ['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm']
probe_attacks = ['ipsweep','mscan','nmap','portsweep','saint','satan']
privilege_attacks = ['buffer_overflow','loadmdoule','perl','ps','rootkit','sqlattack','xterm']
access_attacks = ['ftp_write','guess_passwd','http_tunnel','imap','multihop','named','phf','sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster','xclock','xsnoop']

# we will use these for plotting below
attack_labels = ['Normal','DoS','Probe','Privilege','Access']

# helper function to pass to data frame mapping
def map_attack(attack):
    if attack in dos_attacks:
        # dos_attacks map to 1
        attack_type = 1
    elif attack in probe_attacks:
        # probe_attacks mapt to 2
        attack_type = 2
    elif attack in privilege_attacks:
        # privilege escalation attacks map to 3
        attack_type = 3
    elif attack in access_attacks:
        # remote access attacks map to 4
        attack_type = 4
    else:
        # normal maps to 0
        attack_type = 0

    return attack_type

# map the data and join to the data set
attack_map = train_df.attack_type.apply(map_attack)
train_df['attack_map'] = attack_map

test_attack_map = test_df.attack_type.apply(map_attack)
test_df['attack_map'] = test_attack_map

# view the result
test_df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,...,dst_host_srv_rerror_rate,attack_type,level,attack_flag,attack_map
0,0,tcp,private,REJ,0,...,1.00,neptune,21,1,1
1,0,tcp,private,REJ,0,...,1.00,neptune,21,1,1
2,2,tcp,ftp_data,SF,12983,...,0.00,normal,21,0,0
3,0,icmp,eco_i,SF,20,...,0.00,saint,15,1,2
4,1,tcp,telnet,RSTO,0,...,0.71,mscan,11,1,2
...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,...,0.00,normal,21,0,0
22540,0,tcp,http,SF,317,...,0.00,normal,21,0,0
22541,0,tcp,http,SF,54540,...,0.07,back,15,1,1
22542,0,udp,domain_u,SF,42,...,0.00,normal,21,0,0


In [443]:
onehot_features = ['protocol_type', 'service', 'flag']
other_features = ['duration', 'src_bytes', 'dst_bytes', 'land',
            'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
            'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
            'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
            'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
            'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
            'dst_host_srv_rerror_rate', 'level']
train_onehot = pd.get_dummies(train_df[onehot_features])
test_onehot = pd.get_dummies(test_df[onehot_features])

test_index = np.arange(len(test_df.index))
column_diffs = list(set(train_onehot.columns.values)-set(test_onehot.columns.values))
diff_df = pd.DataFrame(0, index=test_index, columns=column_diffs)
column_order = train_onehot.columns.to_list()
test_temp = test_onehot.join(diff_df)
test_final = test_temp[column_order].fillna(0)

test_set = test_final.join(test_df[other_features])
train_set = train_onehot.join(train_df[other_features])

train_set_norm = train_set.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
test_set_norm = test_set.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
train_set_norm = train_set_norm.fillna(0)
test_set_norm = test_set_norm.fillna(0)
num_inputs = len(train_set.columns.to_list())
num_outputs_multi = len(attack_labels)

In [458]:
batch_size = 64
# make data into tensors
train_set = tf.convert_to_tensor(train_set_norm)
test_set = tf.convert_to_tensor(test_set_norm)
# train labels (sparse)
train_multi_y = tf.convert_to_tensor(train_df['attack_map'].to_list())
train_binary_y = tf.convert_to_tensor(train_df['attack_flag'])
# test labels (sparse)
test_binary_y = tf.convert_to_tensor(test_df['attack_flag'])
test_multi_y = tf.convert_to_tensor(test_df['attack_map'].to_list())
# create tensorflow dataset for test and train
training_binary = tf.data.Dataset.from_tensor_slices((train_set, train_binary_y)).batch(batch_size)
training_multi = tf.data.Dataset.from_tensor_slices((train_set, train_multi_y)).batch(batch_size)
test_binary = tf.data.Dataset.from_tensor_slices((test_set, test_binary_y)).batch(batch_size)
test_multi = tf.data.Dataset.from_tensor_slices((test_set, test_multi_y)).batch(batch_size)

In [445]:
# binary classification model
binary_model = tf.keras.Sequential()
binary_model.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(num_inputs,)))
binary_model.add(tf.keras.layers.Dense(256, activation='relu'))
binary_model.add(tf.keras.layers.Dense(128, activation='relu'))
binary_model.add(tf.keras.layers.Dense(64, activation='relu'))
binary_model.add(tf.keras.layers.Dense(10, activation='relu'))
binary_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [446]:
# multi classification model
multi_model = tf.keras.Sequential()
multi_model.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(num_inputs,)))
multi_model.add(tf.keras.layers.Dense(256, activation='relu'))
multi_model.add(tf.keras.layers.Dense(128, activation='relu'))
multi_model.add(tf.keras.layers.Dense(64, activation='relu'))
multi_model.add(tf.keras.layers.Dense(10, activation='relu'))
multi_model.add(tf.keras.layers.Dense(num_outputs_multi, activation='softmax'))

In [447]:
def scheduler(epoch, lr):
  if epoch < 1:
    return lr
  else:
    return lr * tf.math.exp(-0.1)

In [455]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
binary_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [456]:
binary_model.fit(training_binary, epochs=10, callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1709ef430>

In [None]:
binary_model.evaluate(test_binary)

In [452]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
multi_model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [453]:
multi_model.fit(training_multi, epochs=10, callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x159bae260>

In [457]:
multi_model.evaluate(test_multi)



[8.15997314453125, 0.783135175704956]