In [36]:
import numpy as np 
import tensorflow as tf
import itertools
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import one_hot

from keras.metrics import categorical_crossentropy

from itertools import chain


In [37]:
# Report the default GPU device TensorFlow sees, if there is one.
# gpu_device_name() returns an empty string when no GPU is available.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(gpu_name))


Default GPU Device: /device:GPU:0


In [38]:
#Unzipping Attack and Validation folders, then loading all three datasets
from pathlib import Path
from zipfile import ZipFile


def _read_trace(path):
    """Read one trace file and return its system-call numbers as a list of ints.

    The file is expected to contain whitespace-separated integers. Splitting
    on any whitespace means a trailing space or newline yields no empty
    token, so nothing has to be deleted from the end of the list (and the
    last call is not lost when a file has no trailing separator).

    Raises:
        ValueError: if a token in the file is not an integer.
    """
    with open(path, 'r') as f:
        return [int(token) for token in f.read().split()]


# Extract both archives into the current working directory.
# The context variable is deliberately not called `zip`, which would
# replace the built-in zip() for every later cell.
for archive_name, label in (
    ("Attack_Data_Master.zip", "Attack"),
    ("Validation_Data_Master.zip", "Validation"),
):
    with ZipFile(archive_name, 'r') as archive:
        archive.extractall()
    print(f'Unizipped {label} File')

# Normal training data: UTD-0001.txt ... UTD-0833.txt, read from the
# current working directory (the same files the cell read before).
NUM_NORMAL_FILES = 833
train_normal_data = [
    _read_trace(f"UTD-{file_number:04d}.txt")
    for file_number in range(1, NUM_NORMAL_FILES + 1)
]

# Attack data: every file matching *.txt* anywhere under Attack_Data_Master.
# rglob yields paths in arbitrary order, so sort to make the order reproducible.
input_dir = Path.cwd() / "Attack_Data_Master"
files = sorted(input_dir.rglob("*.txt*"))
train_attack_data = [_read_trace(path) for path in files]

# Validation data: every file matching *.txt* anywhere under Validation_Data_Master.
input_dir = Path.cwd() / "Validation_Data_Master"
files = sorted(input_dir.rglob("*.txt*"))
train_validation_data = [_read_trace(path) for path in files]

print("Normal Data     --->  train_normal_data")
print("Attack Data     --->  train_attack_data")
print("Validation Data --->  train_validation_data")


Unizipped Attack File
Unizipped Validation File
Normal Data     --->  train_normal_data
Attack Data     --->  train_attack_data
Validation Data --->  train_validation_data


In [39]:
#Statistics of the data

# Length of every normal request, computed once.
normal_seq_lengths = [len(request) for request in train_normal_data]
if not normal_seq_lengths:
    raise ValueError("train_normal_data is empty; cannot compute request-length statistics")

# min()/max() need no sentinel start values, so the result is correct even
# when every request is longer than 1000 calls.
#Shortest request
shortest_seq = min(normal_seq_lengths)
#Longest request
longest_seq = max(normal_seq_lengths)

# The running total is NOT stored in a variable called `sum`: that name would
# shadow the built-in sum() for every later cell. (If an older version of this
# cell already ran in the current kernel, restart it or run `del sum` first.)
total_seq_length = sum(normal_seq_lengths)
#Average Request size in dataset, rounded down to a whole number
avg_seq = total_seq_length // len(normal_seq_lengths)
    

In [23]:
# Report the request-length statistics, one per line
print(
    f"Shortest Request Length is {shortest_seq}",
    f"Longest Request Length is {longest_seq}",
    f"Average Request Length is {avg_seq}",
    sep="\n",
)

Shortest Request Length is 79
Longest Request Length is 2948
Average Request Length is 369


In [40]:
# Distinct system calls seen in the normal traces, in first-seen order.
# dict.fromkeys de-duplicates while keeping insertion order and uses hashing,
# so this avoids scanning a growing list for every single call.
unique_normal = list(dict.fromkeys(
    sys_call for trace in train_normal_data for sys_call in trace
))

# Set copy for constant-time membership tests below.
normal_sys_calls = set(unique_normal)

# Every attack system call (repeats kept, original order) that never occurs
# in any normal trace.
attack_not_in_normal = [
    sys_call
    for trace in train_attack_data
    for sys_call in trace
    if sys_call not in normal_sys_calls
]

In [32]:
# Display the distinct system calls found in the normal traces (first-seen order)
unique_normal

[6,
 63,
 42,
 120,
 195,
 114,
 1,
 252,
 54,
 175,
 3,
 7,
 119,
 174,
 140,
 11,
 45,
 33,
 192,
 5,
 197,
 243,
 125,
 91,
 258,
 311,
 240,
 191,
 122,
 268,
 201,
 196,
 38,
 13,
 4,
 118,
 194,
 221,
 66,
 12,
 60,
 220,
 199,
 10,
 85,
 83,
 96,
 97,
 289,
 163,
 141,
 331,
 78,
 57,
 168,
 146,
 102,
 202,
 158,
 265,
 219,
 300,
 133,
 160,
 159,
 142,
 180,
 207,
 94,
 9,
 41,
 39,
 272,
 340,
 155,
 157,
 266,
 183,
 19,
 209,
 205,
 143,
 254,
 255,
 179,
 27,
 332,
 292,
 99,
 93,
 117,
 15,
 293,
 269,
 75,
 204,
 203,
 213,
 200,
 20,
 64,
 30,
 308,
 162,
 211,
 40,
 314,
 198,
 212,
 37,
 256,
 128,
 21,
 301,
 295,
 65,
 307,
 259,
 260,
 264,
 214,
 172,
 104,
 208,
 110,
 132,
 176,
 229,
 226,
 206,
 8,
 77,
 43,
 309,
 322,
 233,
 144,
 270,
 148,
 242,
 320,
 231,
 228,
 298,
 26,
 184,
 224,
 185,
 234,
 230]

In [42]:
# Display the attack system calls (repeats included) that never appear in a normal trace
attack_not_in_normal

[324, 324, 324, 324, 324, 324, 324, 173, 156]

In [44]:
#Generating one hot vectors
vocab_size = 341
encoded_train_normal = [one_hot(str(d), vocab_size) for d in train_normal_data]


In [53]:
# Inspect the encoded form of the first normal trace
encoded_train_normal[0]


[212,
 212,
 136,
 212,
 100,
 183,
 212,
 71,
 183,
 212,
 212,
 91,
 91,
 91,
 91,
 70,
 70,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 91,
 91,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 91,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 70,
 70,
 91,
 70,
 91,
 91,
 91,
 91,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 70,
 70,
 70,
 70,
 70,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 91,


In [56]:
# Module-level accumulators filled by preprocess(): one input window per
# entry in x_train and the matching next-call label in y_train.
x_train = []
y_train = []

def preprocess(request, num_features=5):
    """Cut one request into (window, next-call) pairs and append them to
    the module-level x_train / y_train lists.

    Window k covers request[k*num_features : (k+1)*num_features] and its
    label is the element immediately after the window.

    Args:
        request: sequence of encoded system calls (must support len(),
            slicing and integer indexing).
        num_features: number of calls per input window. Defaults to 5,
            which reproduces the original hard-coded behaviour.

    Returns:
        None. Results are appended to x_train and y_train.

    Raises:
        ValueError: if num_features is smaller than 1.

    NOTE(review): the number of windows is len(request) // (num_features + 1)
    while consecutive windows start num_features apart, so each label is also
    the first element of the next window and roughly the last sixth of every
    request is never used. This is kept unchanged to preserve the existing
    training set; confirm whether it is intended.
    """
    if num_features < 1:
        raise ValueError("num_features must be at least 1")

    window_count = len(request) // (num_features + 1)
    for window_index in range(window_count):
        start = window_index * num_features
        end = start + num_features
        x_train.append(request[start:end])
        # end never exceeds len(request) - 1 because
        # num_features * window_count < len(request).
        y_train.append(request[end])
        

In [58]:
# Start from empty lists so that re-running this cell rebuilds the training
# set instead of appending a second copy of every window.
x_train.clear()
y_train.clear()

# Slice every encoded normal request into windows and next-call labels.
for encoded_request in encoded_train_normal:
    preprocess(encoded_request)

102000