# Deep Learning

In [1]:
from keras.layers import Input, Dense, Lambda, Layer
from keras.models import Model
from keras import regularizers
import keras
import pandas as pd
import numpy as np
from keras import backend as K
from keras import metrics
from collections import namedtuple
pd.set_option("display.max_rows",35)
%matplotlib inline

Using TensorFlow backend.


In [2]:
kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels.pkl")

#y_train_labels = pd.read_pickle("dataset/kdd_train_2labels_y.pkl")
#y_train_labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
#y_test_labels = pd.read_pickle("dataset/kdd_test_2labels_y.pkl")

output_columns_2labels = ['is_Attack','is_Normal']

from sklearn import model_selection as ms
from sklearn import preprocessing as pp

x_input = kdd_train_2labels.drop(output_columns_2labels, axis = 1)
y_output = kdd_train_2labels.loc[:,output_columns_2labels]

ss = pp.StandardScaler()
x_input = ss.fit_transform(x_input)

#le = pp.LabelEncoder()
#y_train = le.fit_transform(y_train_labels).reshape(-1, 1)
#y_test = le.transform(y_test_labels).reshape(-1, 1)

y_train = kdd_train_2labels.loc[:,output_columns_2labels]

x_train, x_valid, y_train, y_valid = ms.train_test_split(x_input, 
                              y_train, 
                              test_size=0.1)
#x_valid, x_test, y_valid, y_test = ms.train_test_split(x_valid, y_valid, test_size = 0.4)

x_test = kdd_test_2labels.drop(output_columns_2labels, axis = 1)
y_test = kdd_test_2labels.loc[:,output_columns_2labels]

x_test = ss.transform(x_test)

x_train = np.hstack((x_train, y_train))
x_valid = np.hstack((x_valid, y_valid))

x_test = np.hstack((x_test, np.random.normal(loc = 0, scale = 0.01, size = y_test.shape)))

In [3]:
input_dim = 124
intermediate_dim = 124
latent_dim = 32
batch_size = 1409
epochs = 5
hidden_layers = 8

class Train:
    def train():
        Train.x = Input(shape=(input_dim,))
        
        hidden_encoder = Train.x
        for i in range(hidden_layers):
            hidden_encoder = Dense(intermediate_dim, activation='relu')(hidden_encoder)

        mean_encoder = Dense(latent_dim, activation=None)(hidden_encoder)

        logvar_encoder = Dense(latent_dim, activation=None)(hidden_encoder)

        def get_distrib(args):

            Train.mean_encoder, Train.logvar_encoder = args

            # Sample epsilon
            epsilon = np.random.normal(loc=0.0, scale=0.05, size = (batch_size, latent_dim))

            # Sample latent variable
            z = mean_encoder + K.exp(logvar_encoder / 2) * epsilon
            return z

        z = Lambda(get_distrib)([mean_encoder, logvar_encoder])

        hidden_decoder = z
        for i in range(hidden_layers):
            hidden_decoder = Dense(intermediate_dim, activation="relu")(hidden_decoder)

        Train.x_ = Dense(input_dim, activation=None)(hidden_decoder)

def get_loss(args):
    x, x_ = args
    xent_loss = metrics.binary_crossentropy(x, x_) #input_dim *
    kl_loss = - 0.5 * K.sum(1 + Train.logvar_encoder - K.square(Train.mean_encoder) - K.exp(Train.logvar_encoder), axis=-1)
    label_loss = K.mean(K.argmax(Train.x[:,-2:], axis = 1) - K.argmax(Train.x_[:,-2:], axis = 1))
    
    ls = xent_loss + kl_loss
    #ls += label_loss
    
    return ls



In [4]:
import itertools
#features_arr = [4, 16, 32, 256, 1024]
#hidden_layers_arr = [2, 6, 10, 100]

features_arr = [4, 16, 32, 122]
hidden_layers_arr = [2, 6, 10]

epoch_arr = [50]

def label_accuracy(y_true, y_pred):
    return K.mean(K.equal(K.argmax(y_true, axis = 1), K.argmax(y_pred, axis = 1)))

score = namedtuple("score", ['epoch', 'no_of_features','hidden_layers','train_score', 'test_score'])
scores = []
predictions = pd.DataFrame()

for e, h, f in itertools.product(epoch_arr, hidden_layers_arr, features_arr):
    
    print(" \n Current Layer Attributes - epochs:{} hidden layers:{} features count:{}".format(e,h,f))
    latent_dim = f
    epochs = e
    hidden_layers = h

    Train.train()

    vae_model = Model(inputs = Train.x, outputs = Train.x_ )
    vae_model.compile(optimizer = "adam", loss = "mean_squared_error", metrics = ['accuracy', label_accuracy] )
    #vae_model.compile(optimizer = "adam", loss = Lambda(get_loss)([Train.x, Train.x_]), metrics = ['accuracy', label_accuracy] )

    train_size = x_train.shape[0] - x_train.shape[0]%batch_size
    valid_size = x_valid.shape[0] - x_valid.shape[0]%batch_size

    vae_model.fit(x = x_train[:train_size,:], y = x_train[:train_size,:], 
                  shuffle=True, epochs=epochs, 
                  batch_size = batch_size, 
                  #validation_data = (x_valid[:valid_size,:], x_valid[:valid_size,:]),
                  validation_data = (x_test, x_test),
                  verbose = 1)
    
    score_train = vae_model.evaluate(x_valid[:valid_size,:], y = x_valid[:valid_size,:],
                               batch_size = batch_size,
                               verbose = 1)
    
    score_test = vae_model.evaluate(x_test, y = x_test,
                           batch_size = batch_size,
                           verbose = 1)
    y_test_pred = vae_model.predict(x_test, batch_size=batch_size)
    
    y_pred = np.argmax(y_test_pred[:,-2:], axis = 1)
    y_test_1d = np.argmax(y_test.values, axis = 1)
    
    #y_pred[y_pred >= y_test_pred[:,-1].mean()] = 1
    #y_pred[y_pred < y_test_pred[:,-1].mean()] = 0
    
    label_acc = np.mean(np.equal(y_test_1d, y_pred))
    
    scores.append(score(e,f,h,score_train[-1], label_acc)) #score_test[-1]))
    
    curr_pred = pd.DataFrame({"{}_{}_{}".format(e,f,h):y_pred},)
    predictions = pd.concat([predictions, curr_pred], axis = 1)
    
    print("\n Train Acc: {}, Test Acc: {}, Label Acc: {}".format(score_train[-1], 
                                                                 score_test[-1], 
                                                                 label_acc)  )
    
scores = pd.DataFrame(scores)

    

 
 Current Layer Attributes - epochs:50 hidden layers:2 features count:4
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.8445706218481064, Test Acc: 0.7562987916171551, Label Acc: 0.77541696238467
 
 Current Layer Attributes - epochs:50 hidden layers:2 features count:16
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoc

Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

 Train Acc: 0.7657026275992393, Test Acc: 0.8757097236812115, Label Acc: 0.8129435770049681
 
 Current Layer Attributes - epochs:50 hidden layers:2 features count:32
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

 Train Acc: 0.0, Test 

Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.0, Test Acc: 0.0, Label Acc: 0.43075762952448543
 
 Current Layer Attributes - epochs:50 hidden layers:6 features count:4
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 

Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.9361249059438705, Test Acc: 0.8989088013768196, Label Acc: 0.7813165365507452
 
 Current Layer Attributes - epochs:50 hidden layers:6 features count:32
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 

Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.891944631934166, Test Acc: 0.743745569139719, Label Acc: 0.7535042583392477
 
 Current Layer Attributes - epochs:50 hidden layers:6 features count:122
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.7365152686834335, Test Acc: 0.77036

Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.8706529513001442, Test Acc: 0.8266057521104813, Label Acc: 0.7554559971611071
 
 Current Layer Attributes - epochs:50 hidden layers:10 features count:16
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50

Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

 Train Acc: 0.8787260502576828, Test Acc: 0.7887686342000961, Label Acc: 0.8222143364088006
 
 Current Layer Attributes - epochs:50 hidden layers:10 features count:122
Train on 112720 samples, validate on 22544 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/

Epoch 48/50
Epoch 49/50
Epoch 50/50
 Train Acc: 0.9054293930530548, Test Acc: 0.816270399838686, Label Acc: 0.7818488289567069


In [5]:
scores.sort_values("test_score", ascending=False)

Unnamed: 0,epoch,no_of_features,hidden_layers,train_score,test_score
10,50,32,10,0.878726,0.822214
7,50,122,6,0.736515,0.821283
1,50,16,2,0.765703,0.812944
9,50,16,10,0.88263,0.802253
4,50,4,6,0.902147,0.792362
11,50,122,10,0.905429,0.781849
5,50,16,6,0.936125,0.781317
0,50,4,2,0.844571,0.775417
8,50,4,10,0.870653,0.755456
6,50,32,6,0.891945,0.753504


In [6]:
predictions.to_pickle("dataset/vae_only_predictions.pkl")
scores.to_pickle("dataset/vae_only_scores.pkl")