In [1]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
import itertools
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Input,Dropout,Dense
from keras.models import Model
from keras import regularizers
from keras.utils.data_utils import get_file
%matplotlib inline

Using TensorFlow backend.


In [2]:


# Downloading training and test sets to local drive
try:
    training_set_path = get_file('KDDTrain%2B.csv', origin='https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain%2B.csv')
except:
    print('Error downloading')
    raise
    

try:
    test_set_path = get_file('KDDTest%2B.csv', origin='https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest%2B.csv')
except:
    print('Error downloading')
    raise
training_df = pd.read_csv(training_set_path, header=None)
testing_df = pd.read_csv(test_set_path, header=None)


In [3]:
training_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [4]:
testing_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11


In [5]:
 columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome',
    'difficulty'
]
training_df.columns = columns
testing_df.columns = columns
testing_df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'outcome', 'difficulty'],
      dtype='object')

In [6]:
print("Training set has {} rows.".format(len(training_df)))
print("Testing set has {} rows.".format(len(testing_df)))

Training set has 125973 rows.
Testing set has 22543 rows.


In [7]:
training_outcomes=training_df["outcome"].unique()
testing_outcomes=testing_df["outcome"].unique()
print("The training set has {} possible outcomes \n".format(len(training_outcomes)) )
print(", ".join(training_outcomes)+".")
print("\nThe testing set has {} possible outcomes \n".format(len(testing_outcomes)))
print(", ".join(testing_outcomes)+".")

The training set has 23 possible outcomes 

normal, neptune, warezclient, ipsweep, portsweep, teardrop, nmap, satan, smurf, pod, back, guess_passwd, ftp_write, multihop, rootkit, buffer_overflow, imap, warezmaster, phf, land, loadmodule, spy, perl.

The testing set has 38 possible outcomes 

neptune, normal, saint, mscan, guess_passwd, smurf, apache2, satan, buffer_overflow, back, warezmaster, snmpgetattack, processtable, pod, httptunnel, nmap, ps, snmpguess, ipsweep, mailbomb, portsweep, multihop, named, sendmail, loadmodule, xterm, worm, teardrop, rootkit, xlock, perl, land, xsnoop, sqlattack, ftp_write, imap, udpstorm, phf.


In [8]:
# A list ot attack names that belong to each general attack type
dos_attacks=["snmpgetattack","back","land","neptune","smurf","teardrop","pod","apache2","udpstorm","processtable","mailbomb"]
r2l_attacks=["snmpguess","worm","httptunnel","named","xlock","xsnoop","sendmail","ftp_write","guess_passwd","imap","multihop","phf","spy","warezclient","warezmaster"]
u2r_attacks=["sqlattack","buffer_overflow","loadmodule","perl","rootkit","xterm","ps"]
probe_attacks=["ipsweep","nmap","portsweep","satan","saint","mscan"]

# Our new labels
classes=["Normal","Dos","R2L","U2R","Probe"]

#Helper function to label samples to 5 classes
def label_attack (row):
    if row["outcome"] in dos_attacks:
        return classes[1]
    if row["outcome"] in r2l_attacks:
        return classes[2]
    if row["outcome"] in u2r_attacks:
        return classes[3]
    if row["outcome"] in probe_attacks:
        return classes[4]
    return classes[0]


#We combine the datasets temporarily to do the labeling 
test_samples_length = len(testing_df)
df=pd.concat([training_df,testing_df])
df["Class"]=df.apply(label_attack,axis=1)


# The old outcome field is dropped since it was replaced with the Class field, the difficulty field will be dropped as well.
df=df.drop("outcome",axis=1)
df=df.drop("difficulty",axis=1)


df[:5]

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,Normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,Normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,Dos
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,Normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Normal


In [9]:
#we again split the data into training and test sets.
training_df= df.iloc[:-test_samples_length, :]
testing_df= df.iloc[-test_samples_length:,:]

In [10]:
print(testing_df.shape)

(22543, 42)


In [11]:
training_outcomes=training_df["Class"].unique()
testing_outcomes=testing_df["Class"].unique()
print("The training set has {} possible outcomes \n".format(len(training_outcomes)) )
print(", ".join(training_outcomes)+".")
print("\nThe testing set has {} possible outcomes \n".format(len(testing_outcomes)))
print(", ".join(testing_outcomes)+".")

The training set has 5 possible outcomes 

Normal, Dos, R2L, Probe, U2R.

The testing set has 5 possible outcomes 

Dos, Normal, Probe, R2L, U2R.


In [12]:
# Helper function for scaling continous values
def minmax_scale_values(training_df,testing_df, col_name):
    scaler = MinMaxScaler()
    scaler = scaler.fit(training_df[col_name].values.reshape(-1, 1))#reshape(-1,1)-->1 column only...and reshape(-1,2) means 2 colums.
    train_values_standardized = scaler.transform(training_df[col_name].values.reshape(-1, 1))
    training_df[col_name] = train_values_standardized
    test_values_standardized = scaler.transform(testing_df[col_name].values.reshape(-1, 1))
    testing_df[col_name] = test_values_standardized
    
         
#Helper function for one hot encoding
def encode_text(training_df,testing_df, name):
    #pd.get_dummies==https://towardsdatascience.com/the-dummys-guide-to-creating-dummy-variables-f21faddb1d40
    training_set_dummies = pd.get_dummies(training_df[name])
    testing_set_dummies = pd.get_dummies(testing_df[name])
    for x in training_set_dummies.columns:
        dummy_name = "{}_{}".format(name, x)
        training_df[dummy_name] = training_set_dummies[x]
        if x in testing_set_dummies.columns :
            testing_df[dummy_name]=testing_set_dummies[x]
        else :
            testing_df[dummy_name]=np.zeros(len(testing_df))
    training_df.drop(name, axis=1, inplace=True)
    testing_df.drop(name, axis=1, inplace=True)
    
    
sympolic_columns=["protocol_type","service","flag"]
label_column="Class"
for column in df.columns :
    if column in sympolic_columns:
        encode_text(training_df,testing_df,column)
    elif not column == label_column:
        minmax_scale_values(training_df,testing_df, column)

In [13]:
training_df.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,3.558064e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0.0,1.057999e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.0,1.681203e-07,6.223962e-06,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0.0,1.442067e-07,3.20626e-07,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
testing_df.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,4.7e-05,9.408217e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,0.0,1.449313e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,2.3e-05,0.0,1.145093e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
x,y=training_df,training_df.pop("Class").values
x=x.values
x_test,y_test=testing_df,testing_df.pop("Class").values
x_test=x_test.values
y0=np.ones(len(y),np.int8)
y0[np.where(y==classes[0])]=0
y0_test=np.ones(len(y_test),np.int8)
y0_test[np.where(y_test==classes[0])]=0
print(y0_test)

[1 1 0 ... 1 0 1]


In [16]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier()

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(x,y)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
y_pred=clf.predict(x_test)
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7587721243845096


In [18]:
from sklearn.linear_model import LogisticRegression

LR=LogisticRegression()

LR.fit(x,y)
y_pred1=LR.predict(x_test)
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred1))

Accuracy: 0.7420928891451892


In [19]:
x.shape

(125973, 122)

In [20]:
#Autoencoders (AE) are neural networks that aims to copy their inputs to their outputs.
#They work by compressing the input into a latent-space representation, and
#then reconstructing the output from this representation
#Buildling and training the model
xtrain=pd.DataFrame()
def getModel():
    inp = Input(shape=(x.shape[1],))
    # Encoder Layers
    encoded1 = Dense(120, activation = 'relu')(inp)
    encoded2 = Dense(110, activation = 'relu')(encoded1)
    encoded3 = Dense(100, activation = 'relu')(encoded2)
    encoded4 = Dense(90, activation = 'relu')(encoded3)
    encoded5 = Dense(80, activation = 'relu')(encoded4)
    encoded6 = Dense(70, activation = 'relu')(encoded5)
    encoded7 = Dense(60, activation = 'relu')(encoded6)
    encoded8 = Dense(50, activation = 'relu')(encoded7)
    encoded9 = Dense(40, activation = 'relu')(encoded8)
    encoded10 = Dense(30, activation = 'relu')(encoded9)
    encoded11 = Dense(20, activation = 'relu')(encoded10)
    encoded12 = Dense(10, activation = 'relu')(encoded11)

    # Decoder Layers
    decoded1 = Dense(20, activation = 'relu')(encoded12)
    decoded2 = Dense(30, activation = 'relu')(decoded1)
    decoded3 = Dense(40, activation = 'relu')(decoded2)
    decoded4 = Dense(50, activation = 'relu')(decoded3)
    decoded5 = Dense(60, activation = 'relu')(decoded4)
    decoded6 = Dense(70, activation = 'relu')(decoded5)
    decoded7 = Dense(80, activation = 'relu')(decoded6)
    decoded8 = Dense(90, activation = 'relu')(decoded7)
    decoded9 = Dense(100, activation = 'relu')(decoded8)
    decoded10 = Dense(110, activation = 'relu')(decoded9)
    decoded11 = Dense(120, activation = 'relu')(decoded10)
    decoded12 = Dense(x.shape[1], activation = 'sigmoid')(decoded11)
    
    
    
    from sklearn.preprocessing import minmax_scale
    train_scaled = minmax_scale(x, axis = 0)
    test_scaled = minmax_scale(x_test, axis = 0)
    encoder =Model(inp, encoded12)
    encoded_train = pd.DataFrame(encoder.predict(train_scaled))
    encoded_test = pd.DataFrame(encoder.predict(test_scaled))
    xtrain=encoded_train
    xtest=encoded_test
    print(encoded_train.shape)
    
    
    from sklearn.ensemble import RandomForestClassifier

    #Create a Gaussian Classifier
    clf=RandomForestClassifier()
    
    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(xtrain,y)
    y_pred=clf.predict(xtest)
    h=pd.DataFrame(y_pred,columns=['Attack'])
    print(h['Attack'].value_counts())
    from sklearn import metrics
    # Model Accuracy, how often is the classifier correct?
    print("Accuracy from random forest after applying autoencoder:",metrics.accuracy_score(y_test, y_pred))
        

    autoencoder = Model(inp, decoded12)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error',metrics=['accuracy']) #adam-->adaptive learning rate optimization algorithm
    return autoencoder

autoencoder=getModel() 
history=autoencoder.fit(x[np.where(y0==0)],x[np.where(y0==0)],
               epochs=4,
                batch_size=300,
                shuffle=True,
                validation_split=0.1
                       )



(125973, 10)
Normal    13283
Dos        7451
Probe      1757
R2L          50
U2R           2
Name: Attack, dtype: int64
Accuracy from random forest after applying autoencoder: 0.7213769241006077
Train on 60608 samples, validate on 6735 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [21]:
#Autoencoders (AE) are neural networks that aims to copy their inputs to their outputs.
#They work by compressing the input into a latent-space representation, and
#then reconstructing the output from this representation
#Buildling and training the model
xtrain=pd.DataFrame()
def getModel():
    inp = Input(shape=(x.shape[1],))
    # Encoder Layers
    encoded1 = Dense(120, activation = 'relu')(inp)
    encoded2 = Dense(110, activation = 'relu')(encoded1)
    encoded3 = Dense(100, activation = 'relu')(encoded2)
    encoded4 = Dense(90, activation = 'relu')(encoded3)
    encoded5 = Dense(80, activation = 'relu')(encoded4)
    encoded6 = Dense(70, activation = 'relu')(encoded5)
    encoded7 = Dense(60, activation = 'relu')(encoded6)
    encoded8 = Dense(50, activation = 'relu')(encoded7)
    encoded9 = Dense(40, activation = 'relu')(encoded8)
    encoded10 = Dense(30, activation = 'relu')(encoded9)
    encoded11 = Dense(20, activation = 'relu')(encoded10)
    encoded12 = Dense(10, activation = 'relu')(encoded11)

    # Decoder Layers
    decoded1 = Dense(20, activation = 'relu')(encoded12)
    decoded2 = Dense(30, activation = 'relu')(decoded1)
    decoded3 = Dense(40, activation = 'relu')(decoded2)
    decoded4 = Dense(50, activation = 'relu')(decoded3)
    decoded5 = Dense(60, activation = 'relu')(decoded4)
    decoded6 = Dense(70, activation = 'relu')(decoded5)
    decoded7 = Dense(80, activation = 'relu')(decoded6)
    decoded8 = Dense(90, activation = 'relu')(decoded7)
    decoded9 = Dense(100, activation = 'relu')(decoded8)
    decoded10 = Dense(110, activation = 'relu')(decoded9)
    decoded11 = Dense(120, activation = 'relu')(decoded10)
    decoded12 = Dense(x.shape[1], activation = 'sigmoid')(decoded11)
    
    
    
    from sklearn.preprocessing import minmax_scale
    train_scaled = minmax_scale(x, axis = 0)
    test_scaled = minmax_scale(x_test, axis = 0)
    encoder =Model(inp, encoded12)
    encoded_train = pd.DataFrame(encoder.predict(train_scaled))
    encoded_test = pd.DataFrame(encoder.predict(test_scaled))
    xtrain=encoded_train
    xtest=encoded_test
    print(encoded_train.shape)
    
    
    from sklearn.ensemble import RandomForestClassifier

    #Create a Gaussian Classifier
    clf=RandomForestClassifier()
    
    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(xtrain,y0)
    y_pred1=clf.predict(xtest)
    h1=pd.DataFrame(y_pred1,columns=['Attack'])
    print(h1['Attack'].value_counts())
    from sklearn import metrics
    # Model Accuracy, how often is the classifier correct?
    print("Accuracy from random forest after applying autoencoder:",metrics.accuracy_score(y0_test, y_pred1))
    
    accuracy=metrics.accuracy_score(y0_test,y_pred1)
    recall=metrics.recall_score(y0_test,y_pred1)
    precision=metrics.precision_score(y0_test,y_pred1)
    f1=metrics.f1_score(y0_test,y_pred1)
    print("Performance over the testing data set \n")
    print("Accuracy : {} , Recall : {} , Precision : {} , F1 : {}\n".format(accuracy,recall,precision,f1 ))
        

    autoencoder = Model(inp, decoded12)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error',metrics=['accuracy']) #adam-->adaptive learning rate optimization algorithm
    return autoencoder

autoencoder=getModel() 
history=autoencoder.fit(x[np.where(y0==0)],x[np.where(y0==0)],
               epochs=4,
                batch_size=300,
                shuffle=True,
                validation_split=0.1
                       )



(125973, 10)
0    14279
1     8264
Name: Attack, dtype: int64
Accuracy from random forest after applying autoencoder: 0.7565097813068358
Performance over the testing data set 

Accuracy : 0.7565097813068358 , Recall : 0.6081196914205563 , Precision : 0.9443368828654405 , F1 : 0.7398208276058207

Train on 60608 samples, validate on 6735 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
