In [None]:
import pandas as pd
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Input, BatchNormalization, LeakyReLU, Dense, Reshape, Flatten, Activation 
from tensorflow.keras.layers import Dropout, multiply, GaussianNoise, MaxPooling2D, concatenate, Embedding
import time
from datetime import date
from tqdm import tqdm

In [None]:
def get_generator(optimizer, max_length):
    """Build and compile the fully connected generator network.

    The stack is: a Dense(max_length) input layer with a seeded Glorot-normal
    kernel initializer, three further Dense(max_length) layers, a Dense(512)
    layer and a Dense(1024) layer, each followed by a separate tanh
    activation, and finally a Dense(max_length) output layer with a built-in
    tanh activation.

    Args:
        optimizer: Optimizer instance handed to ``compile``.
        max_length: Width of both the input vector and the output vector.

    Returns:
        The compiled ``Sequential`` model (loss: binary cross-entropy).
    """
    net = Sequential()

    # Only the first layer declares the input width and uses the seeded initializer.
    seeded_init = initializers.glorot_normal(seed=42)
    net.add(Dense(max_length, input_dim=max_length, kernel_initializer=seeded_init))
    net.add(Activation('tanh'))

    # Remaining hidden layers: three at the input width, then 512 and 1024 units.
    hidden_widths = [max_length] * 3 + [512, 1024]
    for width in hidden_widths:
        net.add(Dense(width))
        net.add(Activation('tanh'))

    # Project back to the input width; tanh keeps outputs in [-1, 1].
    net.add(Dense(max_length, activation='tanh'))

    net.compile(optimizer=optimizer, loss='binary_crossentropy')
    return net

def get_discriminator(optimizer, max_length):
    """Build and compile the fully connected discriminator network.

    The stack is: a Dense(512) input layer with a seeded Glorot-normal kernel
    initializer, three Dense(1024) layers and one Dense(max_length) layer,
    each followed by ReLU and Dropout(0.2), and finally a single sigmoid
    output unit.

    Args:
        optimizer: Optimizer instance handed to ``compile``.
        max_length: Width of the input vector (also the width of the last
            hidden layer).

    Returns:
        The compiled ``Sequential`` model (loss: binary cross-entropy).
    """
    net = Sequential()

    # Only the first layer declares the input width and uses the seeded initializer.
    seeded_init = initializers.glorot_normal(seed=42)
    net.add(Dense(512, input_dim=max_length, kernel_initializer=seeded_init))
    net.add(Activation('relu'))
    net.add(Dropout(0.2))

    # Remaining hidden layers all share the same Dense -> ReLU -> Dropout pattern.
    hidden_widths = [1024, 1024, 1024, max_length]
    for width in hidden_widths:
        net.add(Dense(width))
        net.add(Activation('relu'))
        net.add(Dropout(0.2))

    # Single sigmoid unit: one score in (0, 1) per input row.
    net.add(Dense(1))
    net.add(Activation('sigmoid'))

    net.compile(optimizer=optimizer, loss='binary_crossentropy')
    return net

def get_gan_network(discriminator, generator, optimizer, input_dim):
    """Chain the generator and discriminator into one compiled model.

    Side effect: sets ``discriminator.trainable`` to ``False`` before the
    combined model is built, and leaves it that way on return.

    Args:
        discriminator: Model applied to the generator's output.
        generator: Model applied to the input tensor.
        optimizer: Optimizer instance handed to ``compile``.
        input_dim: Width of the input vector fed to the generator.

    Returns:
        The compiled ``Model`` mapping the input through the generator and
        then the discriminator (loss: binary cross-entropy).
    """
    # Freeze the discriminator before wiring it into the combined model.
    discriminator.trainable = False

    noise_in = Input(shape=(input_dim,))
    verdict = discriminator(generator(noise_in))

    stacked = Model(inputs=noise_in, outputs=verdict)
    stacked.compile(optimizer=optimizer, loss='binary_crossentropy')
    return stacked

In [None]:
# Read the parsed log file into a list of raw lines (line endings kept).
# The context manager guarantees the handle is closed even if reading fails.
with open('parsed-sysmon-logs-recent.txt', 'r') as f:
    lines = f.readlines()

In [None]:
# Show the first raw line read from the log file as a quick sanity check.
lines[0]

In [None]:
# Split the first line on the two-backtick delimiter. Each resulting field is
# later used both as a column position (by its index) and as the key under
# which a trained model is stored.
model_types = lines[0].strip().split('``')
model_types

In [None]:
import random  # hoisted out of the per-model loop; ideally lives in the notebook's import cell

# Train one GAN per model type and keep the discriminator as the scoring model.
# Result: models[model_type] = (discriminator, vocab, max_length, median_score, std_score)
#
# NOTE(review): this rebinds `models`, which the import cell also binds via
# `from tensorflow.keras import models, layers`. The name is kept unchanged in
# case later cells read this dict -- confirm, and consider renaming.
models = {}

# Split every line into its ``-delimited fields once, instead of once per model type.
# NOTE(review): lines[0] also supplies `model_types`, so it is used as a training
# sample as well -- confirm whether that first line is a header that should be skipped.
rows = [line.strip().split('``') for line in lines]

for i, model_type in enumerate(model_types):
    # Column i of every line is the training corpus for this model type.
    values = [row[i] for row in rows]

    # Map each integer code 0..256 to a random float in [0, 1]. The mapping is
    # stored with the model so the same encoding can be reapplied later.
    low = 0
    high = 1
    floatList = [random.uniform(low, high) for _ in range(257)]
    vocab = dict(enumerate(floatList))
    vocab_size = len(vocab)
    inverse_vocab = {index: token for token, index in vocab.items()}

    # Encode each value as its UTF-8 byte sequence.
    x = [np.frombuffer(bytearray(value, 'utf-8'), np.uint8) for value in values]

    # Measure lengths on the encoded bytes (what is actually padded/truncated
    # below) rather than on characters; the two differ for non-ASCII text.
    lengths = np.array([len(seq) for seq in x])
    # Keep up to one standard deviation above the mean length; clamp to at
    # least 1 so the networks never get a zero-width input.
    max_length = max(1, int(lengths.mean() + lengths.std()))

    x = pad_sequences(x, maxlen=max_length, padding='post', value=0, truncating='post')

    # Replace every byte code with its float from `vocab`. The loop variable is
    # deliberately not named `i`, so the outer enumerate index is not rebound.
    x_train = [[vocab[byte_value] for byte_value in item] for item in x]
    scaled_x = np.array(x_train, dtype=np.float32)

    learning_rate = 0.00001
    batch_size = len(scaled_x)  # full-batch training: one batch per epoch
    epochs = 200
    adam = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, beta_1=0.5)

    # Calculating the number of batches based on the batch size
    batch_count = len(scaled_x) // batch_size
    gan_loss = []
    discriminator_loss = []

    # Initializing the networks
    generator = get_generator(adam, max_length)
    discriminator = get_discriminator(adam, max_length)
    gan = get_gan_network(discriminator, generator, adam, input_dim=max_length)

    # Labels are identical for every batch, so build them once.
    # Discriminator batch = [generated rows, real rows] -> labels [0..., 1...].
    y_dis = np.ones(2 * batch_size)
    y_dis[:batch_size] = 0
    # The generator is trained to make the discriminator output 1 for its samples.
    y_gen = np.ones(batch_size)

    # The context manager closes the progress bar even if training raises.
    with tqdm(total=epochs * batch_count) as pbar:
        for epoch in range(epochs):
            for index in range(batch_count):
                # Generate fake logs from random input noise
                noise = np.random.normal(0, 1, size=[batch_size, max_length])
                generated_images = generator.predict_on_batch(noise)

                # Obtain a batch of normal logs
                image_batch = scaled_x[index * batch_size: (index + 1) * batch_size]

                X = np.vstack((generated_images, image_batch))

                # Train discriminator. get_gan_network leaves it frozen, so it
                # must be re-enabled for its own update.
                discriminator.trainable = True
                d_loss = discriminator.train_on_batch(X, y_dis)

                # Train generator through the combined model with the
                # discriminator frozen. The noise is drawn from the same N(0, 1)
                # distribution used above, so the generator is optimised on the
                # inputs it is actually sampled with.
                noise = np.random.normal(0, 1, size=[batch_size, max_length])
                discriminator.trainable = False
                g_loss = gan.train_on_batch(noise, y_gen)

                # Record both losses at the same (per-batch) granularity
                gan_loss.append(g_loss)
                discriminator_loss.append(d_loss)

                pbar.update(1)

    # Score the training data with the trained discriminator; the median and
    # standard deviation of those scores are stored alongside the model.
    orig_recon_scores = discriminator.predict(scaled_x, verbose=0)
    orig_results = pd.DataFrame(orig_recon_scores, columns=['anomaly_score'])
    m = orig_results['anomaly_score'].median()
    std = orig_results['anomaly_score'].std()

    models[model_type] = (discriminator, vocab, max_length, m, std)