In [1]:
import numpy as np
import pandas as pd
import sys
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop, adam
from keras import backend as K

from cleverhans.attacks import FastGradientMethod, SaliencyMapMethod
from cleverhans.utils_tf import model_train , model_eval , batch_eval, model_argmax
from cleverhans.attacks_tf import jacobian_graph
from cleverhans.utils import other_classes
from cleverhans.utils_keras import KerasModelWrapper

import tensorflow as tf
from tensorflow.python.platform import flags

from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import accuracy_score, roc_curve, auc, f1_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import pickle
plt.style.use('bmh')

K.set_learning_phase(1)

FLAGS = flags.FLAGS

flags.DEFINE_integer('nb_epochs', 5, 'Number of epochs to train model')
flags.DEFINE_integer('batch_size', 64, 'Size of training batches')
flags.DEFINE_integer('learning_rate', 0.005, 'Learning rate for training')
flags.DEFINE_integer('nb_classes', 5, 'Number of classification classes')
flags.DEFINE_integer('source_samples', 10, 'Nb of test set examples to attack')

print()
print()
print("================================= Start of preprocessing stage ==============================")

names = ['duration', 'protocol', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack_type', 'other']

df_train = pd.read_csv('../NSL_KDD/KDDTrain+.txt', names=names, header=None)
df_test = pd.read_csv('../NSL_KDD/KDDTest+.txt', names=names, header=None)
print("Initial training and test data shapes: ", df_train.shape, df_test.shape)

full = pd.concat([df_train, df_test])
assert full.shape[0] == df_train.shape[0] + df_test.shape[0]

full['label'] = full['attack_type']

# DoS Attacks
full.loc[full.label == 'neptune', 'label'] = 'dos'
full.loc[full.label == 'back', 'label'] = 'dos'
full.loc[full.label == 'land', 'label'] = 'dos'
full.loc[full.label == 'pod', 'label'] = 'dos'
full.loc[full.label == 'smurf', 'label'] = 'dos'
full.loc[full.label == 'teardrop', 'label'] = 'dos'
full.loc[full.label == 'mailbomb', 'label'] = 'dos'
full.loc[full.label == 'processtable', 'label'] = 'dos'
full.loc[full.label == 'udpstorm', 'label'] = 'dos'
full.loc[full.label == 'apache2', 'label'] = 'dos'
full.loc[full.label == 'worm', 'label'] = 'dos'

# User-to-root (U2R)
full.loc[full.label == 'buffer_overflow', 'label'] = 'u2r'
full.loc[full.label == 'loadmodule', 'label'] = 'u2r'
full.loc[full.label == 'perl', 'label'] = 'u2r'
full.loc[full.label == 'rootkit', 'label'] = 'u2r'
full.loc[full.label == 'sqlattack', 'label'] = 'u2r'
full.loc[full.label == 'xterm', 'label'] = 'u2r'
full.loc[full.label == 'ps', 'label'] = 'u2r'

# Remote-to-local (R2L)
full.loc[full.label == 'ftp_write', 'label'] = 'r2l'
full.loc[full.label == 'guess_passwd', 'label'] = 'r2l'
full.loc[full.label == 'imap', 'label'] = 'r2l'
full.loc[full.label == 'multihop', 'label'] = 'r2l'
full.loc[full.label == 'phf', 'label'] = 'r2l'
full.loc[full.label == 'spy', 'label'] = 'r2l'
full.loc[full.label == 'warezclient', 'label'] = 'r2l'
full.loc[full.label == 'warezmaster', 'label'] = 'r2l'
full.loc[full.label == 'xlock', 'label'] = 'r2l'
full.loc[full.label == 'xsnoop', 'label'] = 'r2l'
full.loc[full.label == 'snmpgetattack', 'label'] = 'r2l'
full.loc[full.label == 'httptunnel', 'label'] = 'r2l'
full.loc[full.label == 'snmpguess', 'label'] = 'r2l'
full.loc[full.label == 'sendmail', 'label'] = 'r2l'
full.loc[full.label == 'named', 'label'] = 'r2l'

# Probe attacls
full.loc[full.label == 'satan', 'label'] = 'probe'
full.loc[full.label == 'ipsweep', 'label'] = 'probe'
full.loc[full.label == 'nmap', 'label'] = 'probe'
full.loc[full.label == 'portsweep', 'label'] = 'probe'
full.loc[full.label == 'saint', 'label'] = 'probe'
full.loc[full.label == 'mscan', 'label'] = 'probe'

full = full.drop(['other', 'attack_type'], axis=1)
print("Unique labels", full.label.unique())
full = full.sample(frac=1).reset_index(drop=True)
# Generate One - Hot encoding
full2 = pd.get_dummies(full, drop_first=False)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d




Initial training and test data shapes:  (125973, 43) (22544, 43)
Unique labels ['normal' 'dos' 'r2l' 'probe' 'u2r']


In [2]:
full2

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_dos,label_normal,label_probe,label_r2l,label_u2r
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,208,6845,0,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0,78,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Separate training and test sets again
features = list(full2.columns[:-5])   # Due to One-Hot encoding
y_train = np.array(full2[0: df_train.shape[0]][['label_normal', 'label_dos', 'label_probe', 'label_r2l', 'label_u2r']])
X_train = full2[0: df_train.shape[0]][features]

y_test = np.array(full2[df_train.shape[0]: ][['label_normal', 'label_dos', 'label_probe', 'label_r2l', 'label_u2r']])
X_test = full2[df_train.shape[0]: ][features]

# Scale data
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = np.array(scaler.transform(X_train))
X_test_scaled = np.array(scaler.transform(X_test))

In [4]:
# Generate label encoding for Logistic regression
labels = full.label.unique()
le = LabelEncoder()
le.fit(labels)

LabelEncoder()

In [8]:
full.label

0            dos
1         normal
2            dos
3            dos
4            dos
5         normal
6          probe
7            dos
8         normal
9            dos
10        normal
11           dos
12           dos
13           dos
14         probe
15        normal
16           r2l
17        normal
18           dos
19        normal
20        normal
21        normal
22        normal
23        normal
24         probe
25        normal
26        normal
27        normal
28        normal
29           dos
           ...  
148487       dos
148488       dos
148489     probe
148490    normal
148491    normal
148492     probe
148493    normal
148494    normal
148495    normal
148496    normal
148497    normal
148498    normal
148499       dos
148500       dos
148501     probe
148502       dos
148503       dos
148504    normal
148505    normal
148506       r2l
148507    normal
148508       dos
148509    normal
148510       dos
148511    normal
148512    normal
148513       dos
148514       d

In [7]:
y_full = le.transform(full.label)
y_full

array([0, 1, 0, ..., 0, 1, 0])