In [1]:
import numpy as np 
from numpy import argmax
from numpy import array_equal
from numpy import array
import statistics

import tensorflow as tf
import itertools
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras import Input
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Lambda
from tensorflow.keras import backend as K

from keras.metrics import categorical_crossentropy

from itertools import chain

tf.keras.backend.set_floatx('float64')


In [2]:
#Checking for tensorflow-GPU
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")


Default GPU Device: /device:GPU:0


In [3]:
#Unzipping Attack and Validation folders

# Unzip Attack Folder to retrive subfolfers
from zipfile import ZipFile
file_name = "Attack_Data_Master.zip"

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('Unizipped Attack File')
    
# Unzip Validation Folder to retrive subfolfers
from zipfile import ZipFile
file_name = "Validation_Data_Master.zip"

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('Unizipped Validation File')
    
# Unzip Training Folder to retrive subfolfers
from zipfile import ZipFile
file_name = "Training_Data_Master.zip"

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('Unizipped Training File')

# list holding normal training data
train_normal_data = [] 



from pathlib import Path 
# list that holds attack vectors
train_attack_data = []

# Set file path ot the folder for iterations
input_dir = Path.cwd() / "Attack_Data_Master"
# store all the files ending with .txt in a list called files
files = list (input_dir.rglob("*.txt*"))
# Iterate over all the txt files and append the attack in the list train_attack data
for FILE in files:
  with open (FILE, 'r') as f:
    stringData = (f.read())
    string_split_data = stringData.split(" ")
    del (string_split_data[-1])
    train_attack_data.append(string_split_data)
    

for i in range(len(train_attack_data)):
  for j in range(len(train_attack_data[i])):
    train_attack_data[i][j] = int(train_attack_data[i][j])

from pathlib import Path 
# list that holds validation vectors
train_validation_data = []

# Set file path ot the folder for iterations
input_dir = Path.cwd() / "Validation_Data_Master"
# store all the files ending with .txt in a list called files
files = list (input_dir.rglob("*.txt*"))
# Iterate over all the txt files and append the attack in the list train_attack data
for FILE in files:
  with open (FILE, 'r') as f:
    stringData = (f.read())
    string_split_data = stringData.split(" ")
    del (string_split_data[-1])
    train_validation_data.append(string_split_data)
    

# list that holds training vectors
train_normal_data = []

# Set file path ot the folder for iterations
input_dir = Path.cwd() / "Training_Data_Master"
# store all the files ending with .txt in a list called files
files = list (input_dir.rglob("*.txt*"))
# Iterate over all the txt files and append the attack in the list train_attack data
for FILE in files:
  with open (FILE, 'r') as f:
    stringData = (f.read())
    string_split_data = stringData.split(" ")
    del (string_split_data[-1])
    train_normal_data.append(string_split_data)
    

for i in range(len(train_normal_data)):
  for j in range(len(train_normal_data[i])):
    train_normal_data[i][j] = int(train_normal_data[i][j])
print("Normal Data     --->  train_normal_data")
print("Attack Data     --->  train_attack_data")
print("Validation Data --->  train_validation_data")


Unizipped Attack File
Unizipped Validation File
Unizipped Training File
Normal Data     --->  train_normal_data
Attack Data     --->  train_attack_data
Validation Data --->  train_validation_data


In [4]:
#Statistics of the data

#Shortest request
shortest_seq = 1000;
#Longest request
longest_seq = 0;
#Average Request size in dataset
avg_seq = 0

sum = 0;
for i in range(len(train_normal_data)):
    curr_sequence_length = len(train_normal_data[i])
    if curr_sequence_length < shortest_seq:
        shortest_seq = curr_sequence_length
    if curr_sequence_length > longest_seq:
        longest_seq = curr_sequence_length
    sum += curr_sequence_length
avg_seq = int(sum/len(train_normal_data))

print(f"Shortest Request Length is {shortest_seq}")
print(f"Longest Request Length is {longest_seq}")
print(f"Average Request Length is {avg_seq}")
    

Shortest Request Length is 79
Longest Request Length is 2948
Average Request Length is 369


In [5]:
len(train_validation_data)

4372

In [6]:
#Number of unique system calls in the normal list
unique_normal = []
# System calls that are present in the attack list but not in normal
attack_not_in_normal = []

#Appending unique system calls from the normal list
for i in range(len(train_normal_data)):
    for j in range(len(train_normal_data[i])):
        curr_sys_call = train_normal_data[i][j]
        if curr_sys_call in unique_normal:
            continue
        else:
            unique_normal.append(curr_sys_call)

#Appending unique system calls from the attack list that are not present in normal
for i in range(len(train_attack_data)):
    for j in range(len(train_attack_data[i])):
        curr_sys_call = train_attack_data[i][j]
        if curr_sys_call not in unique_normal:
            attack_not_in_normal.append(curr_sys_call)

In [7]:
# Unique system calls in normal list
len(unique_normal)

150

In [8]:
# Unique system calls in attack list not present in normal
attack_not_in_normal

[324, 324, 324, 324, 324, 324, 324, 173, 156]

In [9]:
freq_dict = {}
#Appending unique system calls from the normal list
for i in range(len(train_normal_data)):
    for j in range(len(train_normal_data[i])):
        curr_sys_call = train_normal_data[i][j]
        if curr_sys_call in freq_dict:
            freq_dict[curr_sys_call] += 1 
        else:
            freq_dict[curr_sys_call] = 0


In [12]:
sorted_normal_dict = {}
sorted_keys = sorted(freq_dict, key=freq_dict.get)  # [1, 3, 2]

for w in sorted_keys:
    sorted_normal_dict[w] = freq_dict[w]

In [9]:
#Generating one hot vectors
dict = {}
vocab_size = 341

for x in range(vocab_size):
    arr=[]
    arr = [0 for i in range(vocab_size)] 
    arr[x] = 1
    dict[x] = arr

In [10]:
#Sequence size
n = 15
#N-Gram
m = 2

In [11]:
# This method receives a request array and returns one hot encoded version of that array
def generate_one_hot(request):
    temp = []
    for sys_call in request:
        temp.append(dict[int(sys_call)])
    return temp

# This method recevies a request array, start and  end of the request and populates x_train and y_train
# with returned one hot version from the generate_trainSet method
def split_request(source, target, request, start, end):
    while(len(request)-start >= n+m):
        source.append(generate_one_hot(request[start:end]))
        start += m
        end += m
        target.append(generate_one_hot(request[start:end]))
    

In [12]:
x_train = []
y_train = [] 

# Populating x_train and y_train with sources and targets
for i in range(len(train_normal_data)):
    split_request(x_train, y_train, train_normal_data[i],0,n)

In [13]:
print(len(x_train))
print(len(y_train))
print(len(train_normal_data))


147595
147595
833


In [14]:
# Minimizing for batch size compatibility
del x_train[147000:]
del y_train[147000:]

In [15]:
print(len(x_train))
print(len(y_train))

147000
147000


In [16]:
# Converting x_train and y_train into np arrays
x_train = np.array(x_train)
y_train = np.array(y_train)

In [17]:
#x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
print(x_train.shape)
print(y_train.shape)

(147000, 15, 341)
(147000, 15, 341)


In [18]:
# split test data into training and testing sets
x_train, x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.10,random_state=4)

In [19]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(132300, 15, 341)
(132300, 15, 341)
(14700, 15, 341)
(14700, 15, 341)


In [None]:
n_timesteps = n
n_features = vocab_size
numberOfUnits = 200

input= Input(shape=(n_timesteps, n_features))

lstm1 = LSTM(numberOfUnits,return_sequences=True, return_state=True)
all_state_h, state_h, state_c = lstm1(input) 
states = [state_h, state_c]

lstm2 = LSTM(numberOfUnits,return_sequences=True)
all_state_h = lstm2(all_state_h,initial_state=states)

dense = (Dense(n_features, activation='softmax'))
output = dense(all_state_h)
model_LSTM_return_sequences_return_state = Model(input,output,
                                name='model_LSTM_all_state_h_return_state')
model_LSTM_return_sequences_return_state.compile(loss='categorical_crossentropy', 
                                                 optimizer='adam',
                                                 metrics=['accuracy'])
model_LSTM_return_sequences_return_state.summary()

In [None]:
def train_test(model, X_train, y_train , 
               X_test, 	y_test, epochs=50, 
							        verbose=0, patience=5):
	# patient early stopping
	#es = EarlyStopping(monitor='val_accuracy', mode='max', min_delta=1, patience=20)
	es = EarlyStopping(monitor='val_loss', mode='min', 
	                   verbose=1, patience=patience)
	# train model
	print('training for ',epochs,
	      ' epochs begins with',
				' EarlyStopping(monitor= val_loss ',
				' patience=',patience,')....')
	history=model.fit(X_train, y_train, validation_split= 0.1, epochs=epochs,  verbose=verbose, callbacks=[es])
	print(epochs,' epoch training finished...')

	# report training
	# list all data in history
	print(history.history.keys())
	# evaluate the model
	_, train_acc = model.evaluate(X_train, y_train, verbose=0)
	_, test_acc = model.evaluate(X_test, 	y_test, verbose=0)
	print('\nPREDICTION ACCURACY (%):')
	print('Train: %.3f, Test: %.3f' % (train_acc*100, test_acc*100))
	# summarize history for accuracy
	plt.plot(history.history['accuracy'])
	plt.plot(history.history['val_accuracy'])
	plt.title(model.name+' accuracy')
	plt.ylabel('accuracy')
	plt.xlabel('epoch')
	plt.legend(['train', 'val'], loc='upper left')
	plt.show()
	# summarize history for loss
	plt.plot(history.history['loss'])
	plt.plot(history.history['val_loss'])
	plt.title(model.name+' loss')
	plt.ylabel('loss')
	plt.xlabel('epoch')
	plt.legend(['train', 'val'], loc='upper left')
	plt.show()

In [None]:
train_test(model_LSTM_return_sequences_return_state, x_train, y_train , x_test, y_test)

In [None]:
# import os.path
# if os.path.isfile('models/LSTM_HIDS.h5') is False:
#     model_LSTM_return_sequences_return_state.save('models/LSTM_HIDS.h5')

In [29]:
# Receives one hot represetation and returns index where value = 1
def one_hot_decode(arr):
    for index,num in enumerate(arr):
        if num == 1:
            return index
        
# Receives an array to append to and a 3D-array that is one hot encoded      
def decode(arr, three_d_array):
    for seq in three_d_array:
        temp = []
        for one_hot in seq:
            temp.append(one_hot_decode(one_hot))
        arr.append(temp)
    
        

In [30]:
# Decodes prediction done by LSTM and stores it in arr.
def prediction_decode(arr, prediction):
    for seq in prediction:
        predict_temp = []
        for one_hot in seq:
            predict_temp.append(argmax(one_hot))
        arr.append(predict_temp)
    
    

In [31]:
# perfect match
from nltk.translate.bleu_score import sentence_bleu
def calc_belu(target, prediction):
    reference = []
    candidate = []
    reference.append(target)
    candidate.extend(prediction)
    return sentence_bleu(reference, candidate, weights=(0, 1))

In [None]:
# VALIDATION

In [None]:
print(train_validation_data[2])

In [32]:
for i in range(len(train_validation_data)):
    for j in range(len(train_validation_data[i])):
        train_validation_data[i][j] = int(train_validation_data[i][j])

In [None]:
print(len(train_validation_data))

In [38]:
def calc_request_belu_score(request,start,end):
    request_sources = []
    request_targets = []
    request_prediction = []
    request_scores= []
    
    decoded_request_targets = []
    
    while(len(request)-start >= n+m):
        request_sources.append(generate_one_hot(request[start:end]))
        start += m
        end += m
        request_targets.append(request[start:end])
    
    i = 1
    while(i < len(request_sources)):
        prediction_decode( request_prediction, model_encoder_decoder.predict(request_sources[i-1:i]))
        i += 1
    
    for i in range(len(request_prediction)):
        request_scores.append(calc_belu(request_targets[i],request_prediction[i]))
        
    print(request_scores)
    return request_scores

In [39]:
means = []

for i in range(len(train_validation_data)-2300-1000):
    print(f'{i+1}/{len(train_validation_data)-2300-1000}')
    print('-------------------------------------------------------------------')
    means.append(statistics.mean(calc_request_belu_score(train_validation_data[i],0,n)))
    
    

1/1072
-------------------------------------------------------------------


InvalidArgumentError: Graph execution error:

Invalid input_h shape: [1,1,100] [1,300,100]
	 [[{{node CudnnRNN}}]]
	 [[model_encoder_decoder/decoder_lstm/PartitionedCall]] [Op:__inference_predict_function_735290]

In [None]:
print(statistics.mean(means))
print(statistics.mean(attack_means))

In [None]:
len(means)

#Binary Classification

In [None]:
# threshold = 0.834

threshold = .85

false_negative = []
false_positive = []
normal = []
anomaly = []

for avg in means:
    if avg > threshold:
        normal.append(avg)
    else:
        false_negative.append(avg)

for avg in attack_means:
    if avg > threshold:
        false_positive.append(avg)
    else:
        anomaly.append(avg)

    
    


In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = 'Safe','False_Negative'
sizes = [len(normal),len(false_negative)]
explode = (0, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = 'Attack', 'False_Positive'
sizes = [len(anomaly),len(false_positive)]
explode = (0, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
#Binary End

In [None]:
threshold = .91

weak_t = 0.78

medium_t = 0.65

percentages = []

def calculate_threat(avg_list):

    weak_count = 0
    medium_count = 0
    strong_count = 0
    green_light = 0

    for i in range(len(avg_list)):
        if avg_list[i] >= threshold:
            green_light += 1
        elif avg_list[i] >= weak_t:
            weak_count += 1
        elif avg_list[i] >= medium_t:
            medium_count += 1
        else:
            strong_count += 1
    

    
    print(f'Green Light: {green_light/len(avg_list)}')        
    print(f'Weak threat: {weak_count/len(avg_list)}')
    print(f'Medium threat: {medium_count/len(avg_list)}')
    print(f'Strong threat: {strong_count/len(avg_list)}')
    
    percentages.append(green_light/len(avg_list)* 100)
    percentages.append(weak_count/len(avg_list) * 100)
    percentages.append(medium_count/len(avg_list) * 100)
    percentages.append(strong_count/len(avg_list) * 100)

In [None]:
calculate_threat(means)

In [None]:
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = 'Safe', 'Weak Threat', 'Medium Threat', 'Strong Threat'
sizes = [percentages[0],percentages[1],percentages[2],percentages[3]]
explode = (0.2, 0, 0, 0)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
percentages = []
calculate_threat(attack_means)

In [None]:

# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = 'Safe', 'Weak Threat', 'Medium Threat', 'Strong Threat'
sizes = [percentages[0],percentages[1],percentages[2],percentages[3]]
explode = (0.6, 0, 0, 0.3)  # only "explode" the 2nd slice (i.e. 'Hogs')

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

In [None]:
attack_means = []

for i in range((len(train_attack_data))):
    print(f'{i+1}/{len(train_attack_data)}')
    print('-------------------------------------------------------------------')
    attack_means.append(statistics.mean(calc_request_belu_score(train_attack_data[i],0,n)))
    
    
        
    

In [None]:
plt.hist(means)
plt.hist(attack_means)