# Network Intrusion Detection with Deep Learning

In [1]:
# For a broad introduction to the problem and dataset: https://arxiv.org/pdf/1701.02145.pdf
# For modern results using deep learning: http://ieeexplore.ieee.org/document/7777224/

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## The Data

In [3]:
# For the original '99 KDD dataset: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
# For the NSL-KDD Train+/Test+ data: https://github.com/defcom17/NSL_KDD

In [4]:
# Read the feature names file. The first line is skipped; for every other
# line the column name is the text before the first ':'.
with open('kddcup.names', 'r') as infile:
    kdd_names = list(infile)
kdd_cols = [line.partition(':')[0] for line in kdd_names[1:]]

In [5]:
# The Train+/Test+ datasets include sample difficulty rating and the attack class

In [6]:
# The Train+/Test+ files carry two extra trailing columns.
kdd_cols.extend(['class', 'difficulty'])

In [7]:
# Load the train and test splits with the same column names.
kdd, kdd_t = (
    pd.read_csv(path, names=kdd_cols)
    for path in ('KDDTrain+.txt', 'KDDTest+.txt')
)

In [8]:
kdd.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [9]:
# Consult the linked references for attack categories: 
# https://www.researchgate.net/post/What_are_the_attack_types_in_the_NSL-KDD_TEST_set_For_example_processtable_is_a_attack_type_in_test_set_Im_wondering_is_it_prob_DoS_R2L_U2R
# The traffic can be grouped into 5 categories: Normal, DOS, U2R, R2L, Probe
# or more coarsely into Normal vs Anomalous for the binary classification task

In [10]:
kdd_cols = [kdd.columns[0]] + sorted(list(set(kdd.protocol_type.values))) + sorted(list(set(kdd.service.values))) + sorted(list(set(kdd.flag.values))) + kdd.columns[4:].tolist()

In [11]:
# Build {attack name: category} from the whitespace-separated two-column
# file. The context manager closes the file handle (it was previously left
# open), and blank lines are skipped so they cannot break the pair parsing.
with open('training_attack_types', 'r') as attack_file:
    attack_map = dict(line.split() for line in attack_file if line.strip())

In [12]:
attack_map

{'apache2': 'dos',
 'arppoison': 'dos',
 'back': 'dos',
 'buffer_overflow': 'u2r',
 'casesen': 'u2r',
 'crashiis': 'dos',
 'desnuke': 'dos',
 'dict': 'r2l',
 'eject': 'u2r',
 'fdformat': 'u2r',
 'ffbconfig': 'u2r',
 'framespoof': 'r2l',
 'ftp_write': 'r2l',
 'ftpwrite': 'r2l',
 'guess_passwd': 'r2l',
 'guest': 'r2l',
 'httptunnel': 'r2l',
 'illegal-sniffer': 'probe',
 'imap': 'r2l',
 'ipsweep': 'probe',
 'land': 'dos',
 'loadmodule': 'u2r',
 'lsdomain': 'probe',
 'mailbomb': 'dos',
 'mscan': 'probe',
 'msscan': 'probe',
 'multihop': 'r2l',
 'named': 'r2l',
 'ncftp': 'r2l',
 'neptune': 'dos',
 'netbus': 'r2l',
 'netcat': 'r2l',
 'nmap': 'probe',
 'ntfsdos': 'u2r',
 'ntinfoscan': 'probe',
 'nukepw': 'u2r',
 'perl': 'u2r',
 'phf': 'r2l',
 'pod': 'dos',
 'portsweep': 'probe',
 'ppmacro': 'r2l',
 'processtable': 'dos',
 'ps': 'u2r',
 'queso': 'probe',
 'rootkit': 'u2r',
 'saint': 'probe',
 'satan': 'probe',
 'sechole': 'u2r',
 'secret': 'u2r',
 'selfping': 'dos',
 'sendmail': 'r2l',
 'smurf

In [13]:
# Here we opt for the 5-class problem

In [14]:
# Collapse individual attack names into their category in both splits;
# labels without an entry in attack_map are left unchanged by replace().
for frame in (kdd, kdd_t):
    frame['class'] = frame['class'].replace(attack_map)

In [15]:
def cat_encode(df, col):
    """One-hot encode a single categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``col`` removed and one indicator column appended
        per distinct value of ``col``, named after the value (no prefix).
    """
    # Encode the Series itself rather than ``.values`` so the indicator frame
    # keeps df's index. With a bare array the dummies get a fresh 0..n-1
    # index and concat(axis=1) misaligns rows (introducing NaNs) whenever
    # df's index is not the default RangeIndex.
    dummies = pd.get_dummies(df[col])
    return pd.concat([df.drop(col, axis=1), dummies], axis=1)

In [16]:
def log_trns(df, col):
    """Return ``log(1 + x)`` of column ``col`` of ``df`` as a new Series.

    The input frame is left untouched; the caller assigns the result back.
    """
    column = df[col]
    return np.log1p(column)

In [17]:
# One-hot encode each categorical column in both splits.
cat_lst = ['protocol_type', 'service', 'flag']
for col in cat_lst:
    kdd, kdd_t = cat_encode(kdd, col), cat_encode(kdd_t, col)

In [18]:
# Apply log(1 + x) to these numeric columns in both splits.
log_lst = ['duration', 'src_bytes', 'dst_bytes']
for col in log_lst:
    for frame in (kdd, kdd_t):
        frame[col] = log_trns(frame, col)

In [19]:
# Put the training frame in the canonical column order.
kdd = kdd[kdd_cols]
# Indicator columns for categories that never occur in the test split are
# added as all-zero columns so both frames end up with identical columns.
absent_in_test = [name for name in kdd_cols if name not in kdd_t.columns]
for name in absent_in_test:
    kdd_t[name] = 0
kdd_t = kdd_t[kdd_cols]

In [20]:
# Now we have used one-hot encoding and log scaling

In [21]:
kdd.head()

Unnamed: 0,duration,icmp,tcp,udp,IRC,X11,Z39_50,aol,auth,bgp,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
0,0.0,0,1,0,0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0.0,0,0,1,0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0.0,0,1,0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos,19
3,0.0,0,1,0,0,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0.0,0,1,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [22]:
# Detach the difficulty rating and the label from each frame; pop() removes
# the column in place, leaving only feature columns behind.
difficulty, target = kdd.pop('difficulty'), kdd.pop('class')
y_diff, y_test = kdd_t.pop('difficulty'), kdd_t.pop('class')

In [23]:
# One-hot encode the labels. The two splits are encoded separately, so the
# test columns are explicitly aligned to the training columns: column i must
# mean the same class in both arrays even if a class is absent from a split.
target = pd.get_dummies(target)
y_test = pd.get_dummies(y_test)
unseen = sorted(set(y_test.columns) - set(target.columns))
if unseen:
    # A test label outside the training classes would otherwise silently
    # shift or widen the test label matrix relative to the model outputs.
    raise ValueError('Test labels not present in training classes: {}'.format(unseen))
# Missing classes become all-zero columns; the cast keeps a single dtype so
# a later ``.values`` does not fall back to an object array.
y_test = y_test.reindex(columns=target.columns, fill_value=0).astype(target.values.dtype)

In [24]:
# Preview only the first rows instead of dumping the whole label frame.
target.head(10)

Unnamed: 0,dos,normal,probe,r2l,u2r
0,0,1,0,0,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
5,1,0,0,0,0
6,1,0,0,0,0
7,1,0,0,0,0
8,1,0,0,0,0
9,1,0,0,0,0


In [25]:
# Preview only the first rows instead of dumping the whole label frame.
y_test.head(10)

Unnamed: 0,dos,normal,probe,r2l,u2r
0,1,0,0,0,0
1,1,0,0,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,1,0,0
5,0,1,0,0,0
6,0,1,0,0,0
7,0,0,0,1,0
8,0,1,0,0,0
9,0,0,0,1,0


In [26]:
# Switch from DataFrames to plain NumPy arrays for scaling and training.
# kdd / kdd_t themselves stay as frames; only the new names hold arrays.
train, target = kdd.values, target.values
test, y_test = kdd_t.values, y_test.values

In [27]:
# We rescale features to [0, 1]

In [28]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
min_max_scaler = MinMaxScaler().fit(train)
train, test = (min_max_scaler.transform(part) for part in (train, test))

In [29]:
train.shape

(125973, 122)

In [30]:
# List every feature column with its positional index in the train matrix.
for idx, col in enumerate(kdd.columns):
    print('{} {}'.format(idx, col))

0 duration
1 icmp
2 tcp
3 udp
4 IRC
5 X11
6 Z39_50
7 aol
8 auth
9 bgp
10 courier
11 csnet_ns
12 ctf
13 daytime
14 discard
15 domain
16 domain_u
17 echo
18 eco_i
19 ecr_i
20 efs
21 exec
22 finger
23 ftp
24 ftp_data
25 gopher
26 harvest
27 hostnames
28 http
29 http_2784
30 http_443
31 http_8001
32 imap4
33 iso_tsap
34 klogin
35 kshell
36 ldap
37 link
38 login
39 mtp
40 name
41 netbios_dgm
42 netbios_ns
43 netbios_ssn
44 netstat
45 nnsp
46 nntp
47 ntp_u
48 other
49 pm_dump
50 pop_2
51 pop_3
52 printer
53 private
54 red_i
55 remote_job
56 rje
57 shell
58 smtp
59 sql_net
60 ssh
61 sunrpc
62 supdup
63 systat
64 telnet
65 tftp_u
66 tim_i
67 time
68 urh_i
69 urp_i
70 uucp
71 uucp_path
72 vmnet
73 whois
74 OTH
75 REJ
76 RSTO
77 RSTOS0
78 RSTR
79 S0
80 S1
81 S2
82 S3
83 SF
84 SH
85 src_bytes
86 dst_bytes
87 land
88 wrong_fragment
89 urgent
90 hot
91 num_failed_logins
92 logged_in
93 num_compromised
94 root_shell
95 su_attempted
96 num_root
97 num_file_creations
98 num_shells
99 num_access_files


## The Model

In [31]:
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Activation, Merge, Reshape, Dropout
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [32]:
# We apply a fairly simple MLP architecture

In [33]:
def build_embedding_network(input_dim=122, n_classes=5):
    """Build and compile the feed-forward classifier used in this notebook.

    Despite the name, the network contains no embedding layer: it is a stack
    of fully connected layers ending in a softmax.

    Parameters
    ----------
    input_dim : int, default 122
        Number of input features per sample.
    n_classes : int, default 5
        Width of the final softmax layer (one unit per class).

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy loss, the Adam
        optimizer and accuracy as the reported metric.
    """
    model = Sequential()
    # NOTE(review): this first layer has no activation, so it is a purely
    # linear projection feeding the next Dense layer. Kept as-is so the
    # architecture (and reported results) are unchanged -- confirm intent.
    model.add(Dense(64, input_dim=input_dim))

    # Three identical hidden blocks: Dense(32) -> ReLU -> Dropout(0.15).
    for _ in range(3):
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dropout(.15))

    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [35]:
# We use early stopping on a holdout validation set

In [36]:
NN = build_embedding_network()
# Stop training once the validation loss has failed to improve for
# 3 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [37]:
# Train for at most 100 epochs, holding out 10% of the training rows for
# validation; the early-stopping callback may end training sooner.
NN.fit(
    x=train,
    y=target,
    epochs=100,
    validation_split=0.1,
    batch_size=128,
    callbacks=[early_stopping],
)

Train on 113375 samples, validate on 12598 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


<keras.callbacks.History at 0x7f2d50b53fd0>

## The Performance

In [38]:
from sklearn.metrics import confusion_matrix
# Class probabilities for every test row.
preds = NN.predict(test)
# Turn probability rows and one-hot rows into integer class indices.
pred_lbls = preds.argmax(axis=1)
true_lbls = y_test.argmax(axis=1)

In [39]:
# Returns [loss, accuracy] on the test split (accuracy is the only metric
# the model was compiled with).
NN.evaluate(x=test, y=y_test)



[2.2678563233129179, 0.77537260468417313]

In [40]:
# With the confusion matrix, we can aggregate model predictions
# This helps to understand the mistakes and refine the model

In [41]:
# Rows are true classes and columns are predicted classes; the class indices
# follow the one-hot column order shown earlier (dos, normal, probe, r2l, u2r).
confusion_matrix(true_lbls, pred_lbls)

array([[5856, 1503,   98,    1,    0],
       [  90, 9396,  222,    2,    1],
       [ 219,  238, 1951,   13,    0],
       [   1, 2581,   28,  276,    1],
       [   0,   64,    0,    2,    1]])

In [42]:
from sklearn.metrics import f1_score
# Per-class F1 averaged with weights proportional to each class's support.
f1_score(y_true=true_lbls, y_pred=pred_lbls, average='weighted')

0.74004048472759298

In [43]:
# Overall, we report model performance similar to the reference above.
# Their research suggests using unsupervised pretraining with autoencoders over
# both train and test before adding classifier layers for fine-tuning.
# I have done no parameter tuning but report comparable performance.
# Note the model has difficulty with U2R and R2L.