# Read Data Sample

In [1]:
import pandas as pd
import numpy as np
from collections import namedtuple
pd.set_option("display.max_rows",35)
%matplotlib inline

In [2]:
class dataset:
    kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
    kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels.pkl")
    
    kdd_train_5labels = pd.read_pickle("dataset/kdd_train_5labels.pkl")
    kdd_test_5labels = pd.read_pickle("dataset/kdd_test_5labels.pkl")
    

In [3]:
dataset.kdd_train_2labels.shape

(125973, 124)

In [4]:
dataset.kdd_test_2labels.shape

(22544, 124)

In [5]:
from sklearn import model_selection as ms
from sklearn import preprocessing as pp

class preprocess:
    
    output_columns_2labels = ['is_Attack','is_Normal']
    
    x_input = dataset.kdd_train_2labels.drop(output_columns_2labels, axis = 1)
    y_output = dataset.kdd_train_2labels.loc[:,output_columns_2labels]

    x_test_input = dataset.kdd_test_2labels.drop(output_columns_2labels, axis = 1)
    y_test = dataset.kdd_test_2labels.loc[:,output_columns_2labels]

    ss = pp.StandardScaler()

    x_train = ss.fit_transform(x_input)
    x_test = ss.transform(x_test_input)

    y_train = y_output.values


In [6]:
import tensorflow as tf


In [7]:
class network(object):
    
    input_dim = 122
    classes = 2
    hidden_encoder_dim = 80
    hidden_layers = 1
    latent_dim = 10

    hidden_decoder_dim = 80
    lam = 0.01
    
    def __init__(self, classes, hidden_layers, num_of_features):
        self.classes = classes
        self.hidden_layers = hidden_layers
        self.latent_dim = num_of_features
            
    def build_layers(self):
        tf.reset_default_graph()
        #learning_rate = tf.Variable(initial_value=0.001)

        input_dim = self.input_dim
        classes = self.classes
        hidden_encoder_dim = self.hidden_encoder_dim
        hidden_layers = self.hidden_layers
        latent_dim = self.latent_dim
        hidden_decoder_dim = self.hidden_decoder_dim
        lam = self.lam
        
        with tf.variable_scope("Input"):
            self.x = tf.placeholder("float", shape=[None, input_dim])
            self.y_ = tf.placeholder("float", shape=[None, classes])
            self.keep_prob = tf.placeholder("float")
        
        with tf.variable_scope("Layer_Encoder"):

            hidden_encoder = tf.layers.dense(self.x, hidden_encoder_dim, activation = tf.nn.relu, kernel_regularizer=tf.nn.l2_loss)
            for h in range(hidden_layers - 1):
                hidden_encoder = tf.layers.dense(hidden_encoder, hidden_encoder_dim, activation = tf.nn.relu, kernel_regularizer=tf.nn.l2_loss)
            
        with tf.variable_scope("Layer_Mean"):
            mu_encoder = tf.layers.dense(hidden_encoder, latent_dim, activation = None, kernel_regularizer=tf.nn.l2_loss)

        with tf.variable_scope("Layer_Variance"):
            logvar_encoder = tf.layers.dense(hidden_encoder, latent_dim, activation = None, kernel_regularizer=tf.nn.l2_loss)

        with tf.variable_scope("Sampling_Distribution"):
            # Sample epsilon
            epsilon = tf.random_normal(tf.shape(logvar_encoder), mean=0.0, stddev=0.05, name='epsilon')

            # Sample latent variable
            std_encoder = tf.exp(0.5 * logvar_encoder)
            z = mu_encoder + tf.multiply(std_encoder, epsilon)
            
            tf.summary.histogram("Sample_Distribution", z)

        with tf.variable_scope("Layer_Decoder"):
            hidden_decoder = tf.layers.dense(z, hidden_decoder_dim, activation = tf.nn.relu, kernel_regularizer=tf.nn.l2_loss)
            for h in range(hidden_layers - 1):
                hidden_decoder = tf.layers.dense(hidden_decoder, hidden_decoder_dim, activation = tf.nn.relu, kernel_regularizer=tf.nn.l2_loss)

        with tf.variable_scope("Layer_Reconstruction"):
            x_hat = tf.layers.dense(hidden_decoder, input_dim, activation = None)
            
        with tf.variable_scope("Layer_Dense_Hidden"):
            hidden_output = tf.layers.dense(z,latent_dim, activation=tf.nn.relu)

        with tf.variable_scope("Layer_Dense_Softmax"):
            y = tf.layers.dense(z, classes, activation=tf.nn.softmax)

        with tf.variable_scope("Loss"):
            BCE = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=x_hat, labels=self.x), reduction_indices=1)
            KLD = -0.5 * tf.reduce_sum(1 + logvar_encoder - tf.pow(mu_encoder, 2) - tf.exp(logvar_encoder), reduction_indices=1)
            softmax_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = self.y_, logits = y))

            loss = tf.reduce_mean(BCE + KLD + softmax_loss)

            self.regularized_loss = tf.abs(loss, name = "Regularized_loss")
            correct_prediction = tf.equal(tf.argmax(self.y_, 1), tf.argmax(y, 1))
            self.tf_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name = "Accuracy")

        with tf.variable_scope("Optimizer"):
            learning_rate=0.001
            train_op = tf.train.AdamOptimizer(learning_rate)
            self.optimizer = train_op.minimize(self.regularized_loss)  
            
        # add op for merging summary
        self.summary_op = tf.summary.merge_all()
        self.pred = tf.argmax(y, 1)
        self.actual = tf.argmax(self.y_, 1)

        # add Saver ops
        self.saver = tf.train.Saver()
        

In [8]:
import collections

class Train:    
    
    result = namedtuple("score", ['epoch', 'no_of_features','hidden_layers','train_score', 'test_score'])

    predictions = pd.DataFrame()

    results = []
    best_acc = 0
    
    def train(epochs, net, h,f):
        batch_iterations = 100
    
        with tf.Session() as sess:
            summary_writer_train = tf.summary.FileWriter('./logs/kdd/VAE/training', graph=sess.graph)
            summary_writer_valid = tf.summary.FileWriter('./logs/kdd/VAE/validation')

            sess.run(tf.global_variables_initializer())

            for epoch in range(1, (epochs+1)):
                x_train, x_valid, y_train, y_valid, = ms.train_test_split(preprocess.x_train, 
                                                                          preprocess.y_train, 
                                                                          test_size=0.4)
                batch_indices = np.array_split(np.arange(x_train.shape[0]), 
                                           batch_iterations)
                                                                          
                for i in batch_indices:
                    _, train_loss, summary_str = sess.run([net.optimizer, 
                                                           net.regularized_loss, 
                                                           net.summary_op],
                                                          feed_dict={net.x: x_train[i,:], 
                                                                     net.y_: y_train[i,:], 
                                                                     net.keep_prob:0.8})
                    summary_writer_train.add_summary(summary_str, epoch)


                valid_accuracy, summary_str = sess.run([net.tf_accuracy, net.summary_op], 
                                                      feed_dict={net.x: x_valid, 
                                                                 net.y_: y_valid, 
                                                                 net.keep_prob:1})
                summary_writer_valid.add_summary(summary_str, epoch)

                if epoch % 10 == 0:
                    print("Step {} | Training Loss: {:.6f} | Validation Accuracy: {:.6f}".format(epoch, train_loss, valid_accuracy))

            accuracy, pred_value, actual_value = sess.run([net.tf_accuracy, 
                                                           net.pred, 
                                                           net.actual], 
                                                          feed_dict={net.x: preprocess.x_test, 
                                                                     net.y_: preprocess.y_test, 
                                                                     net.keep_prob:1})


            print("Accuracy on Test data: {}".format(accuracy))
            
            curr_pred = pd.DataFrame({"{}_{}_{}".format(epochs,f,h):pred_value},)
            Train.predictions = pd.concat([Train.predictions, curr_pred], axis = 1)
            
            if accuracy > Train.best_acc:
                Train.best_acc = accuracy
                Train.pred_value = pred_value
                Train.actual_value = actual_value
                Train.best_parameters = "Hidden Layers:{}, Features Count:{}".format(h, f)
                #net.saver.save(sess, "dataset/epochs_{}_hidden layers_{}_features count_{}".format(epochs,h,f))
            Train.results.append(Train.result(epochs, h, f,valid_accuracy, accuracy))
            

In [9]:
import itertools
class Hyperparameters:
    features_arr = [2, 4, 8, 16, 32, 64, 128, 256]
    hidden_layers_arr = [2, 4, 6, 10]
    epochs = [1]
    
    for e, h, f in itertools.product(epochs, hidden_layers_arr, features_arr):
        print("Current Layer Attributes - epochs:{} hidden layers:{} features count:{}".format(e,h,f))
        n = network(2,h,f)
        n.build_layers()
        Train.train(e, n, h,f)
        

Current Layer Attributes - epochs:1 hidden layers:2 features count:2
Accuracy on Test data: 0.7088804244995117
Current Layer Attributes - epochs:1 hidden layers:2 features count:4
Accuracy on Test data: 0.30797550082206726
Current Layer Attributes - epochs:1 hidden layers:2 features count:8
Accuracy on Test data: 0.5132185816764832
Current Layer Attributes - epochs:1 hidden layers:2 features count:16
Accuracy on Test data: 0.254879355430603
Current Layer Attributes - epochs:1 hidden layers:2 features count:32
Accuracy on Test data: 0.6949520707130432
Current Layer Attributes - epochs:1 hidden layers:2 features count:64
Accuracy on Test data: 0.832061767578125
Current Layer Attributes - epochs:1 hidden layers:2 features count:128
Accuracy on Test data: 0.7676100134849548
Current Layer Attributes - epochs:1 hidden layers:2 features count:256
Accuracy on Test data: 0.7645936608314514
Current Layer Attributes - epochs:1 hidden layers:4 features count:2
Accuracy on Test data: 0.783844947814

In [10]:
df_results = pd.DataFrame(Train.results)

In [11]:
df_results

Unnamed: 0,epoch,no_of_features,hidden_layers,train_score,test_score
0,1,2,2,0.624648,0.70888
1,1,2,4,0.181584,0.307976
2,1,2,8,0.409446,0.513219
3,1,2,16,0.296348,0.254879
4,1,2,32,0.751161,0.694952
5,1,2,64,0.830284,0.832062
6,1,2,128,0.888748,0.76761
7,1,2,256,0.88589,0.764594
8,1,4,2,0.759099,0.783845
9,1,4,4,0.753007,0.511932


In [14]:
Train.predictions.to_pickle("dataset/vae_dense_predictions.pkl")
df_results.to_pickle("dataset/vae_dense_scores.pkl")