# Deep Learning

In [1]:
from keras.layers import Input, Dense, Lambda, Layer
from keras.models import Model
from keras import regularizers
import keras
import pandas as pd
import numpy as np
from keras import backend as K
from keras import metrics
from collections import namedtuple
pd.set_option("display.max_rows",35)
%matplotlib inline

Using TensorFlow backend.


In [2]:
from sklearn import model_selection as ms
from sklearn import preprocessing as pp

# Fixed seed so the train/validation split and the test-time label placeholder
# are identical on every Restart & Run All.
RANDOM_SEED = 42

# NOTE(security): read_pickle can execute arbitrary code while loading; only
# point these paths at files you produced yourself.
kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels.pkl")
y_train_labels = pd.read_pickle("dataset/kdd_train_2labels_y.pkl")
y_test_labels = pd.read_pickle("dataset/kdd_test_2labels_y.pkl")

output_columns_2labels = ['is_Attack','is_Normal']

# Features only: the two indicator columns are dropped from the frame.
x_input = kdd_train_2labels.drop(output_columns_2labels, axis = 1)

# Standardise with statistics fitted on the training frame; the same fitted
# scaler is reused for the test frame below.
# NOTE(review): the scaler is fitted before the train/validation split, so the
# validation rows contribute to the scaling statistics.
ss = pp.StandardScaler()
x_input = ss.fit_transform(x_input)

# Encode the labels as a single integer column (shape (n, 1)) so it can be
# stacked next to the features.
le = pp.LabelEncoder()
y_train = le.fit_transform(y_train_labels).reshape(-1, 1)
y_test = le.transform(y_test_labels).reshape(-1, 1)

x_train, x_valid, y_train, y_valid = ms.train_test_split(x_input,
                              y_train,
                              test_size=0.2,
                              random_state=RANDOM_SEED)

x_test = kdd_test_2labels.drop(output_columns_2labels, axis = 1)
x_test = ss.transform(x_test)

# The label is appended as the last input column for training/validation rows.
x_train = np.hstack((x_train, y_train))
x_valid = np.hstack((x_valid, y_valid))

# Test rows get small Gaussian noise in place of the (unknown) label column;
# the generator is seeded so the placeholder is reproducible.
test_label_placeholder = np.random.RandomState(RANDOM_SEED).normal(
    loc = 0, scale = 0.05, size = y_test.shape)
x_test = np.hstack((x_test, test_label_placeholder))

In [3]:
# Module-level hyper-parameters read by Train.train() when it builds the graph.

# Width of the network input and of the reconstructed output.
# NOTE(review): presumably the scaled feature count plus the one appended
# label column -- confirm against x_train.shape[1].
input_dim = 123
# Number of units in every hidden Dense layer of the encoder and decoder.
intermediate_dim = 80
# Size of the latent mean / log-variance vectors (reassigned per configuration
# by the sweep loop further down).
latent_dim = 32
# Mini-batch size; training and validation data are truncated to a multiple
# of this value before fitting.
# NOTE(review): 1409 presumably divides the test-set row count exactly -- confirm.
batch_size = 1409
# Default number of training epochs (reassigned by the sweep loop).
epochs = 5
# Default number of hidden layers on each side (reassigned by the sweep loop).
hidden_layers = 8

class Train:
    """Builds the encoder/decoder graph and publishes its tensors as class attributes.

    After ``Train.train()`` has run, the following attributes are set:

    * ``Train.x``              -- the model input placeholder
    * ``Train.x_``             -- the reconstructed output tensor
    * ``Train.mean_encoder``   -- latent mean tensor
    * ``Train.logvar_encoder`` -- latent log-variance tensor

    The architecture is driven by the module-level globals ``input_dim``,
    ``intermediate_dim``, ``latent_dim`` and ``hidden_layers`` as they are at
    call time.
    """

    @staticmethod
    def train():
        """Construct a fresh graph from the current global hyper-parameters.

        Takes no arguments and returns nothing; results are stored on the
        class (see the class docstring).
        """
        Train.x = Input(shape=(input_dim,))

        # Encoder: a stack of identical ReLU layers.
        hidden_encoder = Train.x
        for _ in range(hidden_layers):
            hidden_encoder = Dense(intermediate_dim, activation='relu')(hidden_encoder)

        mean_encoder = Dense(latent_dim, activation=None)(hidden_encoder)
        logvar_encoder = Dense(latent_dim, activation=None)(hidden_encoder)

        # Publish the latent statistics so get_loss() can build the KL term.
        Train.mean_encoder = mean_encoder
        Train.logvar_encoder = logvar_encoder

        def get_distrib(args):
            """Reparameterisation step: z = mean + exp(logvar / 2) * epsilon."""
            mean, logvar = args

            # Draw the noise with a backend op so it is re-sampled on every
            # forward pass and follows the actual batch size. A NumPy draw
            # here would be baked into the graph as one constant of fixed
            # (batch_size, latent_dim) shape.
            epsilon = K.random_normal(shape=(K.shape(mean)[0], latent_dim),
                                      mean=0.0, stddev=0.05)

            return mean + K.exp(logvar / 2) * epsilon

        z = Lambda(get_distrib)([mean_encoder, logvar_encoder])

        # Decoder: mirrors the encoder depth and width.
        hidden_decoder = z
        for _ in range(hidden_layers):
            hidden_decoder = Dense(intermediate_dim, activation="relu")(hidden_decoder)

        # Linear output layer: the reconstruction is unbounded.
        Train.x_ = Dense(input_dim, activation=None)(hidden_decoder)


def get_loss(x, x_):
    """Return |mean(reconstruction cross-entropy + KL divergence)| for a batch.

    ``Train.train()`` must have been called first: the KL term is built from
    ``Train.mean_encoder`` and ``Train.logvar_encoder``.

    NOTE(review): binary cross-entropy expects targets and predictions in
    [0, 1], while ``Train.x_`` comes from a linear layer -- confirm this loss
    is appropriate before compiling a model with it.
    """
    mean_encoder = Train.mean_encoder
    logvar_encoder = Train.logvar_encoder

    xent_loss = input_dim * metrics.binary_crossentropy(x, x_)
    kl_loss = - 0.5 * K.sum(1 + logvar_encoder - K.square(mean_encoder) - K.exp(logvar_encoder), axis=-1)
    return K.abs(K.mean(xent_loss + kl_loss))



In [4]:
import itertools

# Grid of configurations to try: latent sizes x hidden-layer depths x epochs.
features_arr = [4, 8, 16, 32, 256, 1024]
hidden_layers_arr = [2, 6, 10, 100]

epoch_arr = [1]

# NOTE: 'train_score' holds the loss measured on the validation split
# (x_valid), not on the rows the model was fitted on.
score = namedtuple("score", ['epoch', 'no_of_features','hidden_layers','train_score', 'test_score'])
scores = []
# One named Series per configuration; joined into a frame once after the loop
# instead of re-concatenating a growing DataFrame on every iteration.
prediction_columns = []

for e, h, f in itertools.product(epoch_arr, hidden_layers_arr, features_arr):
    
    print(" \n Current Layer Attributes - epochs:{} hidden layers:{} features count:{}".format(e,h,f))
    # Train.train() reads these module-level globals when building the graph.
    latent_dim = f
    epochs = e
    hidden_layers = h

    # Drop the previous configuration's graph so models do not accumulate in
    # the backend session across iterations.
    K.clear_session()
    Train.train()

    vae_model = Model(inputs = Train.x, outputs = Train.x_ )
    vae_model.compile(optimizer = "adam", loss = "mean_squared_error" )

    # Truncate to a whole number of batches.
    train_size = x_train.shape[0] - x_train.shape[0]%batch_size
    valid_size = x_valid.shape[0] - x_valid.shape[0]%batch_size

    # Autoencoder set-up: the input rows are also the regression target.
    vae_model.fit(x = x_train[:train_size,:], y = x_train[:train_size,:], 
                  shuffle=True, epochs=epochs, 
                  batch_size = batch_size, 
                  validation_data = (x_valid[:valid_size,:], x_valid[:valid_size,:]),
                  verbose = 0)
    score_train = vae_model.evaluate(x_valid[:valid_size,:], y = x_valid[:valid_size,:],
                               batch_size = batch_size,
                               verbose = 0)
    score_test = vae_model.evaluate(x_test, y = x_test,
                           batch_size = batch_size,
                           verbose = 1)
    y_test_pred = vae_model.predict(x_test, batch_size=batch_size)
    
    # The last output column is thresholded at its own mean to get a 0/1
    # prediction. The threshold is computed once and applied to an untouched
    # copy: thresholding in place through a view would change the column
    # (and therefore its mean) between the ">=" and "<" passes and could
    # leave some values un-binarised.
    reconstructed_label = y_test_pred[:,-1]
    threshold = reconstructed_label.mean()
    y_pred = (reconstructed_label >= threshold).astype(reconstructed_label.dtype)
    
    scores.append(score(e,f,h,score_train, score_test))
    prediction_columns.append(pd.Series(y_pred, name = "{}_{}_{}".format(e,f,h)))
    
    print("\n Train Loss: {}, Test Loss: {}".format(score_train, score_test)  )
    
predictions = pd.concat(prediction_columns, axis = 1) if prediction_columns else pd.DataFrame()
scores = pd.DataFrame(scores)

    

 
 Current Layer Attributes - epochs:1 hidden layers:2 features count:4
 Train Loss: 0.865976894603056, Test Loss: 1.9359437078237534
 
 Current Layer Attributes - epochs:1 hidden layers:2 features count:8
 Train Loss: 0.8005657160983366, Test Loss: 1.8731894120573997
 
 Current Layer Attributes - epochs:1 hidden layers:2 features count:16
 Train Loss: 0.8843785594491398, Test Loss: 2.8732849955558777
 
 Current Layer Attributes - epochs:1 hidden layers:2 features count:32
 Train Loss: 0.8616735549534068, Test Loss: 1.9559364691376686
 
 Current Layer Attributes - epochs:1 hidden layers:2 features count:256
 Train Loss: 1.8556335919043596, Test Loss: 1443.4688436612487
 
 Current Layer Attributes - epochs:1 hidden layers:2 features count:1024
 Train Loss: 0.7048630118370056, Test Loss: 15.560515195131302
 
 Current Layer Attributes - epochs:1 hidden layers:6 features count:4
 Train Loss: 0.7588324091013741, Test Loss: 1.8503988049924374
 
 Current Layer Attributes - epochs:1 hidden lay

In [5]:
# Display the per-configuration loss table assembled by the sweep loop.
scores

Unnamed: 0,epoch,no_of_features,hidden_layers,train_score,test_score
0,1,4,2,0.865977,1.935944
1,1,8,2,0.800566,1.873189
2,1,16,2,0.884379,2.873285
3,1,32,2,0.861674,1.955936
4,1,256,2,1.855634,1443.468844
5,1,1024,2,0.704863,15.560515
6,1,4,6,0.758832,1.850399
7,1,8,6,0.760923,1.84578
8,1,16,6,0.768247,1.837274
9,1,32,6,0.781125,1.859858


In [6]:
# Persist the sweep results: one prediction column per configuration, plus the
# table of losses, both written as pickles under dataset/.
predictions.to_pickle("dataset/vae_only_predictions.pkl")
scores.to_pickle("dataset/vae_only_scores.pkl")