# Read Data Sample

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline

In [2]:
class dataset:
    kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels.pkl")
    kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels.pkl")
    
    kdd_train_5labels = pd.read_pickle("dataset/kdd_train_5labels.pkl")
    kdd_test_5labels = pd.read_pickle("dataset/kdd_test_5labels.pkl")
    

In [3]:
dataset.kdd_train_2labels.shape

(125973, 124)

In [4]:
dataset.kdd_test_2labels.shape

(22544, 124)

In [5]:
class preprocess:
    
    output_columns_2labels = ['is_Attack','is_Normal']

    from sklearn import model_selection as ms
    from sklearn import preprocessing as pp

    x_input = dataset.kdd_train_2labels
    x_test_input = dataset.kdd_test_2labels

    ss = pp.StandardScaler()

    x_train = ss.fit_transform(x_input)
    x_valid = ss.transform(x_test_input)
    x_test = ss.transform(x_test_input)
    


In [6]:
import tensorflow as tf


In [7]:
class network(object):
    
    input_dim = 124
    hidden_encoder_dim = 40
    hidden_decoder_dim = 40
    latent_dim = 10
    hidden_layers = 1
    
    def __init__(self, hidden_layers, num_of_features):
        self.hidden_layers = hidden_layers
        self.latent_dim = num_of_features
            
    def build_layers(self):
        tf.reset_default_graph()
        #learning_rate = tf.Variable(initial_value=0.001)

        with tf.variable_scope("Input"):
            self.x = tf.placeholder("float", shape=[None, self.input_dim])
            self.keep_prob = tf.placeholder("float")
        
        with tf.variable_scope("Layer_Encoder"):

            hidden_encoder = tf.layers.dense(self.x, self.hidden_encoder_dim, activation = tf.nn.relu)
            for h in range(self.hidden_layers - 1):
                hidden_encoder = tf.layers.dense(hidden_encoder, self.hidden_encoder_dim, activation = tf.nn.relu)
            
        with tf.variable_scope("Layer_Mean"):
            mu_encoder = tf.layers.dense(hidden_encoder, self.latent_dim, activation = None)

        with tf.variable_scope("Layer_Variance"):
            logvar_encoder = tf.layers.dense(hidden_encoder, self.latent_dim, activation = None)

        with tf.variable_scope("Sampling_Distribution"):
            # Sample epsilon
            epsilon = tf.random_normal(tf.shape(logvar_encoder), name='epsilon')

            # Sample latent variable
            std_encoder = tf.exp(0.5 * logvar_encoder)
            z = mu_encoder + tf.multiply(std_encoder, epsilon)

        with tf.variable_scope("Layer_Decoder"):
            hidden_decoder = tf.layers.dense(z, self.hidden_decoder_dim, activation = tf.nn.relu)
            for h in range(self.hidden_layers - 1):
                hidden_decoder = tf.layers.dense(hidden_decoder, self.hidden_decoder_dim, activation = tf.nn.relu)

        with tf.variable_scope("Layer_Reconstruction"):
            x_hat = tf.layers.dense(hidden_decoder, self.input_dim, activation = tf.nn.relu)
            
        with tf.variable_scope("Loss"):
            BCE = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=x_hat, labels=self.x), reduction_indices=1)
            KLD = -0.5 * tf.reduce_mean(1 + logvar_encoder - tf.pow(mu_encoder, 2) - tf.exp(logvar_encoder), reduction_indices=1)
            
            loss = tf.reduce_mean(BCE + KLD)

            self.regularized_loss = tf.abs(loss, name = "Regularized_loss")
            y_ = self.x[:,-2:]
            y = x_hat[:,-2:]
            correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1))
            self.tf_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name = "Accuracy")

        with tf.variable_scope("Optimizer"):
            learning_rate=0.01
            self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.regularized_loss)
            
        # add op for merging summary
        self.summary_op = tf.summary.merge_all()
        self.pred = tf.argmax(y, 1)
        self.actual = tf.argmax(y_, 1)

        # add Saver ops
        # saver = tf.train.Saver()
        

In [8]:
class Train:    
    def train(epochs, net):
        batch_iterations = 100

        batch_indices = np.array_split(np.arange(preprocess.x_train.shape[0]), 
                                   batch_iterations)
        with tf.Session() as sess:
            summary_writer_train = tf.summary.FileWriter('./logs/kdd/VAE/training', graph=sess.graph)
            summary_writer_valid = tf.summary.FileWriter('./logs/kdd/VAE/validation')

            sess.run(tf.global_variables_initializer())

            for epoch in range(0, epochs):
                for i in batch_indices:
                    print(i)
                    x = preprocess.x_train[i,:]
                    _, train_loss, summary_str = sess.run([net.optimizer, 
                                                           net.regularized_loss, 
                                                           net.summary_op],
                                                          feed_dict={net.x: x, 
                                                                     net.keep_prob:0.8})
                    summary_writer_train.add_summary(summary_str, epoch)


                accuracy, summary_str = sess.run([net.tf_accuracy, net.summary_op], 
                                                      feed_dict={net.x: preprocess.x_valid, 
                                                                 net.keep_prob:1})
                summary_writer_valid.add_summary(summary_str, epoch)

                if epoch % 10 == 0:
                    print("Step {} | Training Loss: {:.4f} | Validation Accuracy: {:.4f}".format(epoch, train_loss, accuracy))

            accuracy, pred_value, actual_value = sess.run([net.tf_accuracy, 
                                                           net.pred, 
                                                           net.actual], 
                                                          feed_dict={net.x: preprocess.x_test, 
                                                                     net.keep_prob:1})


            print("Accuracy on Test data: {}".format(accuracy))
            return accuracy, pred_value, actual_value

In [9]:
import itertools
import collections
class Hyperparameters:
    features_arr = [8, 32, 64, 128, 256, 512]
    hidden_layers_arr = [2, 8]
    
    epochs = [30]
    result = collections.namedtuple("result", ["epochs", "hidden_layers", "feature_count",
                                  "accuracy"])
    results = []
    best_acc = 0
    for e, h, f in itertools.product(epochs, hidden_layers_arr, features_arr):
        print("Current Layer Attributes - epochs:{} hidden layers:{} features count:{}".format(e,h,f))
        n = network(h,f)
        n.build_layers()
        acc, pred, actual = Train.train(e, n)
        if acc > best_acc:
            best_acc = acc
            pred_value = pred
            actual_value = actual
        results.append(result(e, h, f,acc))

Current Layer Attributes - epochs:30 hidden layers:2 features count:8
[   0    1    2 ..., 1257 1258 1259]


TypeError: Fetch argument None has invalid type <class 'NoneType'>

In [None]:
df_results = pd.DataFrame(Hyperparameters.results)

In [None]:
df_results

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    np.set_printoptions(precision=4)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j].round(4),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import confusion_matrix
cm_2labels = confusion_matrix(y_pred = Hyperparameters.pred_value, y_true = Hyperparameters.actual_value)
plt.figure(figsize=[6,6])
plot_confusion_matrix(cm_2labels, preprocess.output_columns_2labels, normalize = True)