In [None]:
"""
This first module will focus on loading the dataset from the preprocessing step. It will then
remove all variables that will not be used in developing the GAN.
"""

import copy
import io

import pandas as pd
from google.colab import files

# File name written by the preprocessing step; used when reading from disk.
DATA_FILENAME = 'modified_sysmon_data.csv'

# Opens the Colab upload widget. The returned dict maps each uploaded file's
# name to its raw bytes.
uploaded = files.upload()

# Load the data
if len(uploaded) == 1:
    # Parse the bytes that were just uploaded instead of relying on the
    # hard-coded name: the upload may be stored under a different name (for
    # example when a file with that name already exists), in which case
    # reading DATA_FILENAME from disk would silently load an older copy.
    uploaded_bytes = next(iter(uploaded.values()))
    sysmon_data = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing (or more than one file) was uploaded: fall back to the file
    # that is expected to be present in the working directory.
    sysmon_data = pd.read_csv(DATA_FILENAME)

Saving modified_sysmon_data.csv to modified_sysmon_data.csv


In [None]:
import ast
# Work on an independent copy so the frame loaded from disk stays untouched
# and this cell can be re-run from the raw data.
sysmon_data2 = sysmon_data.copy(deep=True)

def convert_to_list(string):
    """Turn the string representation of a list back into a real list.

    Parameters
    ----------
    string : str or list or any
        Usually the text form of a Python list (e.g. ``"[3, 17, 4]"``) as
        read back from a CSV file. A value that is already a list is
        returned unchanged so the conversion is safe to apply twice.

    Returns
    -------
    list
        The parsed list. An empty list is returned when the value cannot be
        parsed (missing values, malformed text) or when it parses to
        something that is not a list or tuple, so callers can always index
        the result as a sequence.
    """
    if isinstance(string, list):
        # Already converted (e.g. the cell was applied to parsed data).
        return string
    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Return an empty list in case of a parsing error
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A bare literal such as "5" or "'abc'" is not a list; treat it like a
    # parsing failure instead of handing a scalar to code that indexes it.
    return []


def _last_or_zero(items):
    """Return the final element of ``items``, or 0 when ``items`` is empty."""
    return items[-1] if items else 0


# For each encoded directory column: parse the stringified list back into a
# real list, then keep its last element (0 for an empty list) in a new column.
for encoded_col, last_col in (
    ('Encoded_CurrentDirectory', 'Last_CurrentDirectory'),
    ('Encoded_ImageDirectory', 'Last_ImageDirectory'),
):
    sysmon_data2[encoded_col] = sysmon_data2[encoded_col].apply(convert_to_list)
    sysmon_data2[last_col] = sysmon_data2[encoded_col].apply(_last_or_zero)

# This variable has to be renamed or it will cause problems with the postprocessing script
sysmon_data2 = sysmon_data2.rename(columns={'Image_Encoded': 'Encoded_Image'})

# Categorical columns that get expanded into one-hot indicator columns.
# The order is significant: the indicator groups are appended to the frame in
# exactly this order, which fixes the column layout the GAN is trained on.
ONE_HOT_SOURCE_COLUMNS = [
    'TerminalSessionId',
    'FileVersion',
    'LogonId',
    'LogonGuid',
    'Company',
    'Product',
    'Description',
    'IntegrityLevel',
    'ParentUser',
    'User',
    'Encoded_Image',
    'ParentImage_Encoded',
    'ImageDirectory_Type',
    'CurrentDirectory_Type',
    'Last_CurrentDirectory',
    'Last_ImageDirectory',
]

# Build one indicator frame per categorical column; each new column is named
# "<source column>_<category value>".
one_hot_frames = [
    pd.get_dummies(sysmon_data2[column], prefix=column)
    for column in ONE_HOT_SOURCE_COLUMNS
]

# Join all indicator columns to the original DataFrame in a single concat
# (instead of one concat per column) to avoid repeatedly copying the frame.
sysmon_data2 = pd.concat([sysmon_data2, *one_hot_frames], axis=1)

# Normalize the time variable: parse it once, express it as minutes past
# midnight, then divide by the 1440 minutes in a day to land in the range 0-1.
event_time = pd.to_datetime(sysmon_data2['Time'])
minutes_past_midnight = event_time.dt.hour * 60 + event_time.dt.minute
sysmon_data2['Minutes'] = minutes_past_midnight / 1440.0

# Scale Day by a fixed divisor of 5.
# NOTE(review): this only yields values in 0-1 if Day never exceeds 5 - confirm
# against the preprocessing step.
sysmon_data2['Day'] = sysmon_data2['Day'].div(5.0)


# Columns that are no longer needed: the categorical sources that were just
# one-hot encoded, the raw time column, and the remaining listed columns that
# are not passed on to the GAN.
columns_to_drop = [
    'FileVersion',
    'LogonId',
    'LogonGuid',
    'Company',
    'Product',
    'Description',
    'IntegrityLevel',
    'ParentUser',
    'User',
    'Encoded_Image',
    'ParentImage_Encoded',
    'ImageDirectory_Type',
    'CurrentDirectory_Type',
    'TerminalSessionId',
    'ProcessId',
    'ParentProcessId',
    'CommandLine_Tokens',
    'ParentCommandLine_Tokens',
    'CurrentDirectory_Tokens',
    'Image_Directory',
    'Time',
    'Encoded_CurrentDirectory',
    'Encoded_ImageDirectory',
    'Last_CurrentDirectory',
    'Last_ImageDirectory',
]
sysmon_data2 = sysmon_data2.drop(columns=columns_to_drop)


In [None]:
"""
Warning: It's about to get really painful, really fast.
"""

# Train on an independent copy of the preprocessed frame.
sysmon_data3 = sysmon_data2.copy(deep=True)

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, ReLU, LeakyReLU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import backend as K

# Define the Wasserstein loss function
def wasserstein_loss(y_true, y_pred):
    """Return the mean of the element-wise product of targets and predictions.

    Parameters
    ----------
    y_true : tensor
        Target values supplied at training time.
    y_pred : tensor
        Model outputs for the same batch.

    Returns
    -------
    tensor
        ``mean(y_true * y_pred)`` taken over all elements.
    """
    signed_scores = y_true * y_pred
    return K.mean(signed_scores)

# Convert the DataFrame into a numpy array for training.
# The cast to float32 is explicit on purpose: when the frame mixes boolean
# indicator columns with float columns, a plain `.values` produces an
# object-dtype array that TensorFlow cannot convert to a tensor. Casting also
# fails fast here, with a clear error, if a non-numeric column is still present.
data = sysmon_data3.astype(np.float32).to_numpy()

# Define model parameters
noise_dim = 100                   # length of the random vector fed to the generator
data_dim = sysmon_data3.shape[1]  # number of feature columns per row

# Define training parameters
epochs = 5000       # number of generator updates
batch_size = 64     # rows per training batch
n_critic = 5        # critic updates per generator update
clip_value = 0.01   # critic weights are clipped to [-clip_value, clip_value]

# Generator: maps a noise vector to one synthetic feature row.
# Hidden stack: Dense + ReLU pairs with widths 1024 -> 512 -> 256 -> 512 -> 1024.
g_input = Input(shape=(noise_dim,))
g = g_input
for g_units in (1024, 512, 256, 512, 1024):
    g = Dense(g_units)(g)
    g = ReLU()(g)

# Generator Output: one value per feature column, squashed by tanh.
# NOTE(review): tanh can emit negative values while the Minutes/Day columns
# are scaled to non-negative values upstream - confirm this range is intended.
g_output = Dense(data_dim, activation='tanh')(g)
generator = Model(g_input, g_output)


# Critic: maps one feature row to a single unbounded score.
# Hidden stack: Dense + LeakyReLU(0.2) pairs; adjust the widths as needed.
c_input = Input(shape=(data_dim,))
c = c_input
for c_units in (1024, 512, 256, 512, 1024):
    c = Dense(c_units)(c)
    c = LeakyReLU(0.2)(c)

# Critic Output: a single linear unit (no activation).
c_output = Dense(1)(c)
critic = Model(c_input, c_output)


# Define the optimizers: one RMSprop instance per compiled model, both with
# the same learning rate.
critic_optimizer = RMSprop(learning_rate=0.00005)
combined_optimizer = RMSprop(learning_rate=0.00005)

# Compile the Critic on its own so it can be trained directly on real and
# generated rows.
critic.compile(loss=wasserstein_loss, optimizer=critic_optimizer)

# Combined model (stacked generator and critic): noise -> generator -> critic.
# The critic is marked non-trainable before this model is compiled.
# NOTE(review): this relies on Keras still updating the critic through its
# own, earlier compile while freezing it inside `combined` - confirm that the
# installed Keras version treats the trainable flag this way.
critic.trainable = False
combined_input = Input(shape=(noise_dim,))
combined_output = critic(generator(combined_input))
combined = Model(combined_input, combined_output)

# Compile the Combined Model
combined.compile(loss=wasserstein_loss, optimizer=combined_optimizer)

# Target labels passed to wasserstein_loss (mean of label * score):
# -1 for real rows turns the loss into minus the mean critic score,
# +1 for generated rows turns it into the mean critic score.
real = -np.ones((batch_size, 1))
fake = np.ones((batch_size, 1))

# Training Loop: each "epoch" is n_critic critic updates followed by one
# generator update.
for epoch in range(epochs):
    for _ in range(n_critic):
        # Random batch of real data (row indices drawn with replacement)
        idx = np.random.randint(0, data.shape[0], batch_size)
        real_data = data[idx]

        # Generate a batch of fake data.
        # predict_on_batch runs a single forward pass; predict() is meant for
        # whole datasets and adds per-call overhead and progress output when
        # invoked inside a loop.
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        fake_data = generator.predict_on_batch(noise)

        # Train the critic on the real batch, then on the generated batch;
        # the reported critic loss is the average of the two.
        c_loss_real = critic.train_on_batch(real_data, real)
        c_loss_fake = critic.train_on_batch(fake_data, fake)
        c_loss = 0.5 * np.add(c_loss_real, c_loss_fake)

        # Clip critic weights to [-clip_value, clip_value] after every update
        for layer in critic.layers:
            clipped_weights = [
                np.clip(weight, -clip_value, clip_value)
                for weight in layer.get_weights()
            ]
            layer.set_weights(clipped_weights)

    # Train the generator through the combined model, reusing the noise from
    # the last critic step and labelling its output as "real".
    g_loss = combined.train_on_batch(noise, real)

    # Print progress (critic loss is the one from the last critic step)
    print(f"Epoch: {epoch+1}/{epochs} - Critic Loss: {c_loss} - Generator Loss: {g_loss}")



In [None]:
"""
A module for postprocessing the generated data for verification. Most of the postprocessing is saved for a later step.
This step produces output that lets us check whether the generator produced valid data for the
one-hot encoded variables.
"""

# Fresh copy of the preprocessed frame; its column labels are reused for the
# generated rows below.
sysmon_data3 = sysmon_data2.copy(deep=True)

# Number of samples to generate
num_samples = 100

# Base names of the variables that were one-hot encoded before training.
one_hot_columns = [
    'TerminalSessionId',
    'FileVersion',
    'LogonId',
    'LogonGuid',
    'Company',
    'Product',
    'Description',
    'IntegrityLevel',
    'ImageDirectory_Type',
    'User',
    'ParentUser',
    'Encoded_Image',
    'ParentImage_Encoded',
    'CurrentDirectory_Type',
    'Last_CurrentDirectory',
    'Last_ImageDirectory',
]

# Generate samples using the trained generator: one standard-normal noise
# vector per requested sample.
random_noise = np.random.normal(loc=0, scale=1, size=(num_samples, noise_dim))
generated_samples = generator.predict(random_noise)

# Convert to DataFrame, labelling the outputs with the training columns.
generated_df = pd.DataFrame(data=generated_samples, columns=sysmon_data3.columns)

# Summary statistics of the generated (still scaled) Minutes column.
print(generated_df['Minutes'].describe())

def convert_minutes_to_time(minutes):
    """Convert a scaled time-of-day value back to an ``HH:MM:00`` string.

    Parameters
    ----------
    minutes : float
        Time of day expressed as a fraction of a 1440-minute day
        (0.0 is midnight, 0.5 is noon).

    Returns
    -------
    str
        The time formatted as ``"HH:MM:00"``. Fractional minutes are
        truncated. Values outside the valid day are clamped to the range
        ``00:00:00`` to ``23:59:00``, because a generator can emit values
        below 0 or at/above 1 that would otherwise produce invalid times
        such as ``"24:00:00"`` or negative hours.
    """
    minutes_per_day = 1440
    # Undo the scaling of minutes and truncate to whole minutes.
    total_minutes = int(minutes * minutes_per_day)
    # Clamp to the last valid minute of the day at both ends.
    total_minutes = max(0, min(total_minutes, minutes_per_day - 1))
    hours, remainder = divmod(total_minutes, 60)
    return f"{hours:02d}:{remainder:02d}:00"


def _decode_category_label(label):
    """Convert a one-hot column suffix back to a category value.

    Integer text becomes an int, integer-valued float text (e.g. "3.0")
    becomes an int, and anything else is returned unchanged as a string.
    """
    try:
        return int(label)
    except ValueError:
        pass
    try:
        as_float = float(label)
    except ValueError:
        return label
    return int(as_float) if as_float.is_integer() else label


def post_process_one_hot(df, one_hot_columns):
    """Collapse groups of one-hot columns back into single category columns.

    For every name in ``one_hot_columns`` the columns whose label starts with
    ``"<name>_"`` are treated as one group. For each row the column with the
    highest value wins, and the text after the prefix is stored as the
    category in a new column called ``<name>``.

    Matching is done on the start of the column label. A substring match
    would, for example, let ``"User_"`` also capture ``"ParentUser_..."``
    columns and mix two variables into one result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the one-hot columns. It is modified in place.
    one_hot_columns : iterable of str
        Base names of the variables that were one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with one added column per base name that had
        matching columns. A message is printed for base names without any.
    """
    for col in one_hot_columns:
        prefix = f'{col}_'
        one_hot_encoded_cols = [
            name for name in df.columns if str(name).startswith(prefix)
        ]
        if not one_hot_encoded_cols:
            print(f"No columns found for pattern: {col}_")
            continue
        # Find the column with the highest value for each row of this group
        max_indices = df[one_hot_encoded_cols].idxmax(axis=1)
        # Strip the prefix so only the category value remains
        df[col] = max_indices.map(
            lambda name: _decode_category_label(str(name)[len(prefix):])
        )
    return df

def _decode_event_id(rounded_score):
    """Map a rounded generator output to an event id: below 0.5 -> 5, else 1."""
    return 5 if rounded_score < 0.5 else 1


def _decode_day(scaled_day):
    """Undo the divide-by-5 scaling of Day and subtract the result from 14."""
    return int(14 - round(scaled_day * 5))


# Collapse every one-hot group back into a single category column
generated_df = post_process_one_hot(generated_df, one_hot_columns)

# Other post-processing steps: decode Event_ID and Day, rebuild the Time
# string from the scaled Minutes column, then discard Minutes.
generated_df['Event_ID'] = generated_df['Event_ID'].round().apply(_decode_event_id)
generated_df['Day'] = generated_df['Day'].apply(_decode_day)
generated_df['Time'] = generated_df['Minutes'].apply(convert_minutes_to_time)
generated_df = generated_df.drop(columns=['Minutes'])

# Remove the one-hot indicator columns: every column whose label contains
# "<base name>_" for any of the one-hot base names.
one_hot_markers = [f'{base_col}_' for base_col in one_hot_columns]
indicator_cols = [
    name
    for name in generated_df.columns
    if any(marker in str(name) for marker in one_hot_markers)
]
generated_df = generated_df.drop(columns=indicator_cols)

print(generated_df.head())



In [None]:
# Write the post-processed samples to CSV (without the index column) and hand
# the file to the browser for download.
output_csv = 'generated_sysmon_data3.csv'
generated_df.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>