# Deep Learning

In [1]:
from keras.layers import Input, Dense, Lambda, Layer
from keras.models import Model
from keras import regularizers
import keras
import pandas as pd
import numpy as np
from keras import backend as K
from keras import metrics
from collections import namedtuple
pd.set_option("display.max_rows",35)
%matplotlib inline

Using TensorFlow backend.


In [2]:
kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels.pkl")

#y_train_labels = pd.read_pickle("dataset/kdd_train_2labels_y.pkl")
#y_train_labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
#y_test_labels = pd.read_pickle("dataset/kdd_test_2labels_y.pkl")

output_columns_2labels = ['is_Attack','is_Normal']

from sklearn import model_selection as ms
from sklearn import preprocessing as pp

x_input = kdd_train_2labels.drop(output_columns_2labels, axis = 1)
y_output = kdd_train_2labels.loc[:,output_columns_2labels]

ss = pp.StandardScaler()
x_input = ss.fit_transform(x_input)

#le = pp.LabelEncoder()
#y_train = le.fit_transform(y_train_labels).reshape(-1, 1)
#y_test = le.transform(y_test_labels).reshape(-1, 1)

y_train = kdd_train_2labels.loc[:,output_columns_2labels]

x_train, x_valid, y_train, y_valid = ms.train_test_split(x_input, 
                              y_train, 
                              test_size=0.1)
#x_valid, x_test, y_valid, y_test = ms.train_test_split(x_valid, y_valid, test_size = 0.4)

x_test = kdd_test_2labels.drop(output_columns_2labels, axis = 1)
y_test = kdd_test_2labels.loc[:,output_columns_2labels]

x_test = ss.transform(x_test)

x_train = np.hstack((x_train, y_train))
x_valid = np.hstack((x_valid, y_valid))

x_test = np.hstack((x_test, np.random.normal(loc = 0, scale = 0.01, size = y_test.shape)))

In [3]:
input_dim = 124
intermediate_dim = 124
latent_dim = 32
batch_size = 1409
epochs = 5
hidden_layers = 8

class Train:
    def train():
        Train.x = Input(shape=(input_dim,))
        
        hidden_encoder = Train.x
        for i in range(hidden_layers):
            hidden_encoder = Dense(intermediate_dim, activation='relu')(hidden_encoder)

        mean_encoder = Dense(latent_dim, activation=None)(hidden_encoder)

        logvar_encoder = Dense(latent_dim, activation=None)(hidden_encoder)

        def get_distrib(args):

            Train.mean_encoder, Train.logvar_encoder = args

            # Sample epsilon
            epsilon = np.random.normal(loc=0.0, scale=0.05, size = (batch_size, latent_dim))

            # Sample latent variable
            z = mean_encoder + K.exp(logvar_encoder / 2) * epsilon
            return z

        z = Lambda(get_distrib)([mean_encoder, logvar_encoder])

        hidden_decoder = z
        for i in range(hidden_layers):
            hidden_decoder = Dense(intermediate_dim, activation="relu")(hidden_decoder)

        Train.x_ = Dense(input_dim, activation=None)(hidden_decoder)

def get_loss(args):
    x, x_ = args
    xent_loss = metrics.binary_crossentropy(x, x_) #input_dim *
    kl_loss = - 0.5 * K.sum(1 + Train.logvar_encoder - K.square(Train.mean_encoder) - K.exp(Train.logvar_encoder), axis=-1)
    label_loss = K.mean(K.argmax(Train.x[:,-2:], axis = 1) - K.argmax(Train.x_[:,-2:], axis = 1))
    
    ls = xent_loss + kl_loss
    #ls += label_loss
    
    return ls



In [4]:
import itertools
#features_arr = [4, 16, 32, 256, 1024]
#hidden_layers_arr = [2, 6, 10, 100]

features_arr = [4, 16, 32, 122]
hidden_layers_arr = [2, 6, 10]

epoch_arr = [10]

def label_accuracy(y_true, y_pred):
    return K.mean(K.equal(K.argmax(y_true, axis = 1), K.argmax(y_pred, axis = 1)))

score = namedtuple("score", ['epoch', 'no_of_features','hidden_layers','train_score', 'test_score'])
scores = []
predictions = pd.DataFrame()

for e, h, f in itertools.product(epoch_arr, hidden_layers_arr, features_arr):
    
    print(" \n Current Layer Attributes - epochs:{} hidden layers:{} features count:{}".format(e,h,f))
    latent_dim = f
    epochs = e
    hidden_layers = h

    Train.train()

    vae_model = Model(inputs = Train.x, outputs = Train.x_ )
    vae_model.compile(optimizer = "adam", loss = "mean_squared_error", metrics = ['accuracy', label_accuracy] )
    #vae_model.compile(optimizer = "adam", loss = Lambda(get_loss)([Train.x, Train.x_]), metrics = ['accuracy', label_accuracy] )

    train_size = x_train.shape[0] - x_train.shape[0]%batch_size
    valid_size = x_valid.shape[0] - x_valid.shape[0]%batch_size

    vae_model.fit(x = x_train[:train_size,:], y = x_train[:train_size,:], 
                  shuffle=True, epochs=epochs, 
                  batch_size = batch_size, 
                  #validation_data = (x_valid[:valid_size,:], x_valid[:valid_size,:]),
                  validation_data = (x_test, x_test),
                  verbose = 1)
    
    score_train = vae_model.evaluate(x_valid[:valid_size,:], y = x_valid[:valid_size,:],
                               batch_size = batch_size,
                               verbose = 1)
    
    score_test = vae_model.evaluate(x_test, y = x_test,
                           batch_size = batch_size,
                           verbose = 1)
    y_test_pred = vae_model.predict(x_test, batch_size=batch_size)
    
    y_pred = np.argmax(y_test_pred[:,-2:], axis = 1)
    y_test_1d = np.argmax(y_test.values, axis = 1)
    
    #y_pred[y_pred >= y_test_pred[:,-1].mean()] = 1
    #y_pred[y_pred < y_test_pred[:,-1].mean()] = 0
    
    label_acc = np.mean(np.equal(y_test_1d, y_pred))
    
    scores.append(score(e,f,h,score_train[-1], label_acc)) #score_test[-1]))
    
    curr_pred = pd.DataFrame({"{}_{}_{}".format(e,f,h):y_pred},)
    predictions = pd.concat([predictions, curr_pred], axis = 1)
    
    print("\n Train Acc: {}, Test Acc: {}, Label Acc: {}".format(score_train[-1], 
                                                                 score_test[-1], 
                                                                 label_acc)  )
    
scores = pd.DataFrame(scores)

    

 
 Current Layer Attributes - epochs:10 hidden layers:2 features count:4
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.8545954525470734, Test Acc: 0.8408002182841301, Label Acc: 0.7944907735982967
 
 Current Layer Attributes - epochs:10 hidden layers:2 features count:16
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.6480660066008568, Test Acc: 0.7032469809055328, Label Acc: 0.7665897799858056
 
 Current Layer Attributes - epochs:10 hidden layers:2 features count:32
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.14992902614176273, Test Acc: 0.15041696559637785, Label Acc: 0.7997249822569198
 
 Current Laye

 Train Acc: 0.03805890679359436, Test Acc: 0.09394960990175605, Label Acc: 0.5689318665720369
 
 Current Layer Attributes - epochs:10 hidden layers:6 features count:4
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.8783711865544319, Test Acc: 0.8320617415010929, Label Acc: 0.7854418026969482
 
 Current Layer Attributes - epochs:10 hidden layers:6 features count:16
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.9220191612839699, Test Acc: 0.8680358417332172, Label Acc: 0.7764815471965933
 
 Current Layer Attributes - epochs:10 hidden layers:6 features count:32
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.8

Epoch 9/10
Epoch 10/10
 Train Acc: 0.7398864403367043, Test Acc: 0.6986781395971775, Label Acc: 0.7694730305180979
 
 Current Layer Attributes - epochs:10 hidden layers:10 features count:4
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

 Train Acc: 0.6304116472601891, Test Acc: 0.5899574086070061, Label Acc: 0.7806068133427964
 
 Current Layer Attributes - epochs:10 hidden layers:10 features count:16
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

 Train Acc: 0.8234563618898392, Test Acc: 0.7898775711655617, Label Acc: 0.7919180269694819
 
 Current Layer Attributes - epochs:10 hidden layers:10 features count:32
Train on 112720 samples, validate on 22544 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
E

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Train Acc: 0.6501951888203621, Test Acc: 0.696371540427208, Label Acc: 0.7638839602555003


In [5]:
scores.sort_values("test_score", ascending=False)

Unnamed: 0,epoch,no_of_features,hidden_layers,train_score,test_score
2,10,32,2,0.149929,0.799725
0,10,4,2,0.854595,0.794491
9,10,16,10,0.823456,0.791918
4,10,4,6,0.878371,0.785442
6,10,32,6,0.868169,0.781893
8,10,4,10,0.630412,0.780607
5,10,16,6,0.922019,0.776482
7,10,122,6,0.739886,0.769473
10,10,32,10,0.670334,0.768453
1,10,16,2,0.648066,0.76659


In [6]:
predictions.to_pickle("dataset/vae_only_predictions.pkl")
scores.to_pickle("dataset/vae_only_scores.pkl")