In [None]:
import pandas as pd
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Input, BatchNormalization, LeakyReLU, Dense, Reshape, Flatten, Activation 
from tensorflow.keras.layers import Dropout, multiply, GaussianNoise, MaxPooling2D, concatenate, Embedding
import time
from datetime import date
from tqdm import tqdm

In [None]:
def get_generator(optimizer, max_length):
    """Build and compile the dense generator network.

    Args:
        optimizer: Keras optimizer passed to ``compile``.
        max_length: Width of both the input vector and the generated output
            vector.

    Returns:
        A compiled ``Sequential`` model: an input dense layer of width
        ``max_length`` followed by hidden dense layers of widths
        ``max_length`` (x3), 512 and 1024, all tanh-activated, and a final
        tanh-activated dense layer of width ``max_length``.
    """
    # Widths of the hidden layers that follow the seeded input layer.
    hidden_widths = [max_length] * 3 + [512, 1024]

    generator = Sequential()

    # Only the first layer carries the input size and the seeded initializer.
    generator.add(Dense(max_length, input_dim=max_length,
                        kernel_initializer=initializers.glorot_normal(seed=42)))
    generator.add(Activation('tanh'))

    for width in hidden_widths:
        generator.add(Dense(width))
        generator.add(Activation('tanh'))

    # Output layer: tanh keeps every generated component in [-1, 1].
    generator.add(Dense(max_length, activation='tanh'))

    generator.compile(loss='binary_crossentropy', optimizer=optimizer)
    return generator

def get_discriminator(optimizer, max_length):
    """Build and compile the dense discriminator network.

    Args:
        optimizer: Keras optimizer passed to ``compile``.
        max_length: Width of the input vector (also used as the width of the
            last hidden layer).

    Returns:
        A compiled ``Sequential`` model whose single sigmoid output is
        trained with binary cross-entropy. Every hidden layer is a dense
        layer followed by ReLU and 20% dropout.
    """
    discriminator = Sequential()

    # Only the first layer carries the input size and the seeded initializer.
    discriminator.add(Dense(512, input_dim=max_length,
                            kernel_initializer=initializers.glorot_normal(seed=42)))
    discriminator.add(Activation('relu'))
    discriminator.add(Dropout(0.2))

    # Remaining hidden layers share the same Dense -> ReLU -> Dropout shape.
    for units in (1024, 1024, 1024, max_length):
        discriminator.add(Dense(units))
        discriminator.add(Activation('relu'))
        discriminator.add(Dropout(0.2))

    # Single sigmoid unit: the real-vs-generated score.
    discriminator.add(Dense(1))
    discriminator.add(Activation('sigmoid'))

    discriminator.compile(loss='binary_crossentropy', optimizer=optimizer)
    return discriminator

def get_gan_network(discriminator, generator, optimizer,input_dim):
    """Chain generator and discriminator into one compiled model.

    Args:
        discriminator: Model that scores a sample.
        generator: Model that turns a noise vector into a sample.
        optimizer: Keras optimizer passed to ``compile``.
        input_dim: Length of the noise vector fed to the generator.

    Returns:
        A compiled ``Model`` mapping noise -> discriminator score.

    Side effects:
        Sets ``discriminator.trainable`` to ``False`` on the object passed in.
    """
    # Mark the discriminator as frozen before the combined model is compiled.
    discriminator.trainable = False

    latent = Input(shape=(input_dim,))
    verdict = discriminator(generator(latent))

    gan = Model(inputs=latent, outputs=verdict)
    gan.compile(loss='binary_crossentropy', optimizer=optimizer)
    return gan

In [None]:
# Read the whole connection log into memory, one list entry per line.
# The context manager closes the handle even when reading raises.
with open('conn.log', 'r') as f:
    lines = f.readlines()

In [None]:
# Feature name -> 0-based column position within a tab-split log line.
model_type = {
    "dest_host": 4,
    "dest_port": 5,
    "service_duration": 8,
    "orig_bytes": 9,
    "resp_bytes": 10,
    "orig_pkts": 16,
    "resp_pkts": 18,
}

In [None]:
import random


def _parse_float_field(values, index):
    """Return ``values[index]`` as a float with double quotes removed.

    Falls back to 0 when the column is missing or its text is not numeric
    (for example a '-' placeholder), instead of swallowing every exception.
    """
    try:
        return float(values[index].replace('"', ''))
    except (IndexError, ValueError):
        return 0


# NOTE: this rebinds `models`, shadowing the `models` module imported from
# tensorflow.keras at the top of the notebook.
models = {}
dest_hosts = []
dest_ports = []
service_durations = []
orig_bytes = []
resp_bytes = []
orig_pkts = []
resp_pkts = []

# The first 8 lines are skipped (presumably the log header -- confirm).
for line in lines[8:]:
    # Blank lines and '#'-prefixed metadata lines carry no data columns and
    # would otherwise fail on the host/port lookups below.
    if not line.strip() or line.startswith('#'):
        continue

    values = line.strip().split('\t')
    dest_hosts.append(values[model_type["dest_host"]].replace('"',''))
    dest_ports.append(values[model_type["dest_port"]].replace('"',''))

    service_durations.append(_parse_float_field(values, model_type["service_duration"]))
    orig_bytes.append(_parse_float_field(values, model_type["orig_bytes"]))
    resp_bytes.append(_parse_float_field(values, model_type["resp_bytes"]))
    orig_pkts.append(_parse_float_field(values, model_type["orig_pkts"]))
    resp_pkts.append(_parse_float_field(values, model_type["resp_pkts"]))

# Byte value -> random float in [0, 1]. A dedicated seeded generator makes
# the encoding reproducible across kernel restarts without touching the
# global `random` state.
low = 0
high = 1
vocab_rng = random.Random(42)
floatList = [vocab_rng.uniform(low, high) for _ in range(257)]

vocab = {i: floatList[i] for i in range(257)}
vocab_size = len(vocab)

inverse_vocab = {index: token for token, index in vocab.items()}

dest_hosts_lengths = np.array([len(dest_host.replace('\n','').replace('\r','')) for dest_host in dest_hosts])
dest_ports_lengths = np.array([len(dest_port.replace('\n','').replace('\r','')) for dest_port in dest_ports])
# Hosts are padded/truncated to (mean + one standard deviation) characters.
dest_hosts_max_length = int(dest_hosts_lengths.mean() + dest_hosts_lengths.std())
dest_ports_max_length = 5

# Encode each string as its UTF-8 bytes, then pad/truncate to a fixed width.
dest_hosts_x = [np.frombuffer(bytearray(dest_host, 'utf-8'), np.uint8) for dest_host in dest_hosts]
dest_hosts_x = pad_sequences(dest_hosts_x, maxlen=dest_hosts_max_length, padding='post', value=0, truncating='post')

dest_ports_x = [np.frombuffer(bytearray(dest_port, 'utf-8'), np.uint8) for dest_port in dest_ports]
dest_ports_x = pad_sequences(dest_ports_x, maxlen=dest_ports_max_length, padding='post', value=0, truncating='post')

# Replace every byte value (padding included) by its vocabulary float.
dest_hosts_x_train = [[vocab[i] for i in item] for item in dest_hosts_x]
dest_ports_x_train = [[vocab[i] for i in item] for item in dest_ports_x]

scaled_dest_hosts = np.array(dest_hosts_x_train, dtype=np.float32)

# One row per parsed log line, built from the parsed lists so the row count
# always matches the host/port matrices.
final_arr = np.column_stack(
    [service_durations, orig_bytes, resp_bytes, orig_pkts, resp_pkts]
).astype(np.float32)

# The fitted scaler is reused later to transform the rows that get scored.
scaler = MinMaxScaler()
scaled_final_arr = scaler.fit_transform(final_arr)

# Connection feature matrix: encoded port columns followed by the scaled
# numeric columns.
scaled_final_arr = np.concatenate((dest_ports_x_train,scaled_final_arr), axis=1)
# From here on the name holds the full connection feature width
# (5 port columns + 5 numeric columns = 10), which the connection GAN uses
# as its input size.
dest_ports_max_length = scaled_final_arr.shape[1]

In [None]:
# --- Train the GAN on the encoded destination-host matrix -----------------
learning_rate = 0.00001
# Full-batch training: one batch holds the whole dataset.
batch_size = len(scaled_dest_hosts)
if batch_size == 0:
    raise ValueError("scaled_dest_hosts is empty; nothing to train on")
epochs = 200
adam = tf.keras.optimizers.legacy.Adam(learning_rate = learning_rate,beta_1 = 0.5)

# Number of batches per epoch for the chosen batch size.
batch_count = len(scaled_dest_hosts) // batch_size
gan_loss = []
discriminator_loss = []

# Build the three models; all of them use the same optimizer instance.
generator = get_generator(adam, dest_hosts_max_length)
discriminator = get_discriminator(adam, dest_hosts_max_length)
gan = get_gan_network(discriminator, generator, adam,input_dim=dest_hosts_max_length)

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=epochs * batch_count) as pbar:
    for epoch in range(epochs):
        d_loss = 0
        for index in range(batch_count):
            pbar.update(1)

            # Generated samples from Gaussian noise.
            noise = np.random.normal(0, 1, size=[batch_size, dest_hosts_max_length])
            generated_images = generator.predict_on_batch(noise)

            # Matching slice of real samples.
            image_batch = scaled_dest_hosts[index * batch_size: (index + 1) * batch_size]

            # Generated rows first (label 0), real rows second (label 1).
            X = np.vstack((generated_images, image_batch))
            y_dis = np.ones(2 * batch_size)
            y_dis[:batch_size] = 0

            # Discriminator step.
            discriminator.trainable = True
            d_loss = discriminator.train_on_batch(X, y_dis)

            # Generator step. The noise is drawn from the same N(0, 1)
            # distribution used for the discriminator's generated samples,
            # so the generator is optimized on the inputs the discriminator
            # judges it on.
            noise = np.random.normal(0, 1, size=[batch_size, dest_hosts_max_length])
            y_gen = np.ones(batch_size)
            discriminator.trainable = False
            g_loss = gan.train_on_batch(noise, y_gen)

            # Generator loss is recorded per batch.
            gan_loss.append(g_loss)
        # Discriminator loss is recorded once per epoch (last batch's value).
        discriminator_loss.append(d_loss)

# Score the training rows and summarize the distribution of scores.
orig_recon_scores = discriminator.predict(scaled_dest_hosts, verbose=False)
orig_results = pd.DataFrame(orig_recon_scores, columns=['anomaly_score'])
m = orig_results['anomaly_score'].median()
std = orig_results['anomaly_score'].std()

print("Discriminator loss: ")
print(discriminator_loss)
print('\n\n')
    

In [None]:
# --- Train the GAN on the connection feature matrix -----------------------
learning_rate = 0.00001
# Full-batch training: one batch holds the whole dataset.
batch_size = len(scaled_final_arr)
if batch_size == 0:
    raise ValueError("scaled_final_arr is empty; nothing to train on")
epochs = 200
adam = tf.keras.optimizers.legacy.Adam(learning_rate = learning_rate,beta_1 = 0.5)

# Number of batches per epoch for the chosen batch size.
batch_count = len(scaled_final_arr) // batch_size
gan_loss = []
discriminator_loss = []

# Build the three models; all of them use the same optimizer instance.
# NOTE: these rebind `generator`, `discriminator` and `gan`, replacing any
# models a previously run training cell stored under the same names.
generator = get_generator(adam, dest_ports_max_length)
discriminator = get_discriminator(adam, dest_ports_max_length)
gan = get_gan_network(discriminator, generator, adam,input_dim=dest_ports_max_length)

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=epochs * batch_count) as pbar:
    for epoch in range(epochs):
        d_loss = 0
        for index in range(batch_count):
            pbar.update(1)

            # Generated samples from Gaussian noise.
            noise = np.random.normal(0, 1, size=[batch_size, dest_ports_max_length])
            generated_images = generator.predict_on_batch(noise)

            # Matching slice of real samples.
            image_batch = scaled_final_arr[index * batch_size: (index + 1) * batch_size]

            # Generated rows first (label 0), real rows second (label 1).
            X = np.vstack((generated_images, image_batch))
            y_dis = np.ones(2 * batch_size)
            y_dis[:batch_size] = 0

            # Discriminator step.
            discriminator.trainable = True
            d_loss = discriminator.train_on_batch(X, y_dis)

            # Generator step. The noise is drawn from the same N(0, 1)
            # distribution used for the discriminator's generated samples,
            # so the generator is optimized on the inputs the discriminator
            # judges it on.
            noise = np.random.normal(0, 1, size=[batch_size, dest_ports_max_length])
            y_gen = np.ones(batch_size)
            discriminator.trainable = False
            g_loss = gan.train_on_batch(noise, y_gen)

            # Generator loss is recorded per batch.
            gan_loss.append(g_loss)
        # Discriminator loss is recorded once per epoch (last batch's value).
        discriminator_loss.append(d_loss)

# Score the training rows and summarize the distribution of scores.
orig_recon_scores = discriminator.predict(scaled_final_arr, verbose=False)
orig_results = pd.DataFrame(orig_recon_scores, columns=['anomaly_score'])
connection_median = orig_results['anomaly_score'].median()
connection_std = orig_results['anomaly_score'].std()

print("Discriminator loss: ")
print(discriminator_loss)
print('\n\n')
    

In [None]:
# Median discriminator score over the destination-host training matrix.
m

In [None]:
# Standard deviation of the discriminator scores over the destination-host training matrix.
std

In [None]:
# Inspect one raw log line as its list of tab-separated fields.
lines[18].split('\t')

In [None]:
# Total number of lines read from the log file (skipped leading lines included).
len(lines)

In [None]:
def _parse_float_field(values, index):
    """Return ``values[index]`` as a float with double quotes removed.

    Falls back to 0 when the column is missing or its text is not numeric
    (for example a '-' placeholder), instead of swallowing every exception.
    """
    try:
        return float(values[index].replace('"', ''))
    except (IndexError, ValueError):
        return 0


dest_hosts = []
dest_ports = []
service_durations = []
orig_bytes = []
resp_bytes = []
orig_pkts = []
resp_pkts = []

# The first 8 lines are skipped (presumably the log header -- confirm).
for line in lines[8:]:
    # Blank lines and '#'-prefixed metadata lines carry no data columns and
    # would otherwise fail on the host/port lookups below.
    if not line.strip() or line.startswith('#'):
        continue

    values = line.strip().split('\t')
    dest_hosts.append(values[model_type["dest_host"]].replace('"',''))
    dest_ports.append(values[model_type["dest_port"]].replace('"',''))

    service_durations.append(_parse_float_field(values, model_type["service_duration"]))
    orig_bytes.append(_parse_float_field(values, model_type["orig_bytes"]))
    resp_bytes.append(_parse_float_field(values, model_type["resp_bytes"]))
    orig_pkts.append(_parse_float_field(values, model_type["orig_pkts"]))
    resp_pkts.append(_parse_float_field(values, model_type["resp_pkts"]))

# Encode each string as its UTF-8 bytes, then pad/truncate to a fixed width.
dest_hosts_x = [np.frombuffer(bytearray(dest_host, 'utf-8'), np.uint8) for dest_host in dest_hosts]
dest_hosts_x = pad_sequences(dest_hosts_x, maxlen=dest_hosts_max_length, padding='post', value=0, truncating='post')

# Port strings are padded to 5 characters. A local name is used so that
# `dest_ports_max_length` (which holds the 10-column connection feature width
# by the time the connection GAN is trained) is not reset behind that
# training cell's back.
port_pad_length = 5
dest_ports_x = [np.frombuffer(bytearray(dest_port, 'utf-8'), np.uint8) for dest_port in dest_ports]
dest_ports_x = pad_sequences(dest_ports_x, maxlen=port_pad_length, padding='post', value=0, truncating='post')

# Replace every byte value (padding included) by its vocabulary float.
dest_hosts_x_train = [[vocab[i] for i in item] for item in dest_hosts_x]
dest_ports_x_train = [[vocab[i] for i in item] for item in dest_ports_x]

scaled_dest_hosts = np.array(dest_hosts_x_train, dtype=np.float32)

# One row per parsed log line, built from the parsed lists so the row count
# always matches the host/port matrices.
final_arr = np.column_stack(
    [service_durations, orig_bytes, resp_bytes, orig_pkts, resp_pkts]
).astype(np.float32)

# Reuse the already-fitted scaler. The result goes to its own name so the
# connection training matrix `scaled_final_arr` is not overwritten by a
# narrower 5-column array.
scaled_test_numeric = scaler.transform(final_arr)

# Feature matrix to score: encoded port columns followed by the scaled
# numeric columns.
scaled_final_test = np.concatenate((dest_ports_x_train,scaled_test_numeric), axis=1)



In [None]:
# Score every prepared row. `discriminator` is whichever model the most
# recently executed training cell bound to that name; its input width must
# match the columns of `scaled_final_test` (port columns + numeric columns).
results = discriminator.predict(scaled_final_test)

In [None]:
# Display the raw discriminator outputs (one score per scored row).
results

In [None]:
# Median discriminator score over the connection training matrix.
connection_median

In [None]:
# Standard deviation of the discriminator scores over the connection training matrix.
connection_std

In [None]:
# Bucket every score by how far below the training median it falls:
# band k (1..9) is the smallest k with score > median - k * std;
# scores at or below median - 9 * std land in band 10.
zscores = []

for row in results:
    score = row[0]
    band = next(
        (k for k in range(1, 10)
         if score > connection_median - (k * connection_std)),
        10,
    )
    zscores.append(band)

In [None]:
# Emit every band on its own line with a single write (prints nothing when
# the list is empty).
print("".join(f"{band}\n" for band in zscores), end="")

In [68]:
# Show the raw log line at a hard-coded index for manual inspection.
print(lines[3404])

1711394766.979827	C0sYpa4mZA85wY5bP6	192.168.100.129	34100	143.198.3.13	11601	tcp	ssl	86511.291214	24089117	17914541	S1	-	-	0	ShADadwtt	326237	37138617	345043	31722626	-

