In [None]:
"""
First, we must prepare the data and simplify it so that we can train a model on it.
The first step is to simplify the date and time features.
"""

from google.colab import files
uploaded = files.upload()

import copy
import io

import pandas as pd

# Load the data.
# files.upload() stores a re-uploaded file under a new name (e.g. 'tidy_sysmon_data (1).csv'),
# so reading the fixed file name could silently pick up an older copy. Read the uploaded bytes
# directly, and fall back to the file on disk only when nothing was uploaded in this run.
if uploaded:
    sysmon_data = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    sysmon_data = pd.read_csv('tidy_sysmon_data.csv')

# Transform 'Date_and_Time' into 'Day' and 'Time' (the column is parsed only once)
date_and_time = pd.to_datetime(sysmon_data['Date_and_Time'])
sysmon_data['Day'] = date_and_time.dt.day
sysmon_data['Time'] = date_and_time.dt.time

# Drop the 'UtcTime' feature and the original 'Date_and_Time' feature
sysmon_data = sysmon_data.drop(columns=['UtcTime', 'Date_and_Time'])



# Display the first few rows of the DataFrame to verify changes
# print(sysmon_data.head())


Saving tidy_sysmon_data.csv to tidy_sysmon_data (1).csv


In [None]:
"""
Next, we will continue to prune the features. From the dataset, some of the features may be
dropped due to uniformity, redundancy, or randomness. Following that, we will convert
some of the features into a categorical nature to make training easier. Note, this can only
be done on features where there are a small set of possibilities, or else we run the risk
of vastly increasing the dimensionality of our data.

The reasons for the drop variables are as follows:
Level - This is the same for all observations and thus is not needed.
Source - This is the same for all observations and thus is not needed.
RuleName - This is the same for all observations and thus is not needed.
Task_Category - This information corresponds exactly to Event_Id and is therefore redundant.
Hashes - These are assumed to be random and thus nothing will be learned from this information.
OriginalFileName - This corresponds exactly to the basename of Image and is thus redundant.
ProcessGuid and ParentProcessGuid - Both of these consist of five-token hexadecimal strings \
separated by dashes. The first and last tokens are consistent throughout the dataset, and the \
middle three tokens consist of randomized 4-character strings. Thus, the first and last tokens \
will be saved for remaking logs later and the two variables will be dropped from training.
"""

# Working on a copy makes it easier to re-run individual modules
sysmon_data1 = sysmon_data.copy(deep=True)

# Drop the unneeded columns
columns_to_drop = [
    'Level', 'Source', 'Task_Category', 'Hashes', 'OriginalFileName',
    'RuleName', 'ProcessGuid', 'ParentProcessGuid',
]
sysmon_data1 = sysmon_data1.drop(columns=columns_to_drop)

# Categorical variables for encoding
categorical_columns = [
    'Event_ID', 'TerminalSessionId', 'FileVersion', 'LogonId',
    'LogonGuid', 'Day', 'Company', 'Product', 'Description',
    'IntegrityLevel', 'ParentUser', 'User',
]

# First and last GUID tokens, kept as constants
Guid_Start = "{8f201f84"
Guid_End = "070000004800}"

# Per-column dictionaries mapping each original value to its integer code
encoding_dicts = {}

for column in categorical_columns:
    # Number the distinct values in order of first appearance
    distinct_values = sysmon_data1[column].unique()
    encoding_dicts[column] = dict(zip(distinct_values, range(len(distinct_values))))

    # Replace every value in the column by its code
    sysmon_data1[column] = sysmon_data1[column].map(encoding_dicts[column])

# Print the encoding dictionaries
for column, d in encoding_dicts.items():
    print(f"Encoding for {column}: {d}")


# Verify the changes
# print(sysmon_data1.head())


Encoding for Event_ID: {5: 0, 1: 1}
Encoding for TerminalSessionId: {2.0: 0, 0.0: 1, nan: 2, 3.0: 3}
Encoding for FileVersion: {'1.6.00.29964': 0, '119.0.6045.106': 1, '10.0.19041.3570 (WinBuild.160101.0800)': 2, nan: 3, '10.19.0 Build 19950': 4, '23.214.1015.0001': 5, '1.3.127.15': 6, '10.0.19041.3562 (WinBuild.160101.0800)': 7, '1, 0, 0, 2': 8, '10.0.19041.1 (WinBuild.160101.0800)': 9, '1.0.4.2': 10, '7.0.19041.3570 (WinBuild.160101.0800)': 11, '1.3.4.0': 12, '1.3.1.0': 13, '1.3.35.451': 14, '1.3.36.311': 15, '16.0.16924.20124': 16, '16.0.14326.21738': 17, '3.7.2181.36443': 18, '5.2.35.0': 19, '1.60.3467.0': 20, '1.3.2.1': 21, '16.0.16924.20054': 22, '0.14.9.0': 23, '4.2.1608.0': 24, '2.90': 25, '1.17.9.0': 26, '10.0.19041.3623 (WinBuild.160101.0800)': 27, '121.9202.5630.0': 28, '119.0.2151.44': 29, '2.0.0.5': 30, '4.18.23100.2009 (8fcbd1c22d82af16ba34560e1a70591413e88d17)': 31, '1.0.2.0': 32, '1.0.0.4': 33, '1.0.0.8': 34, '2.0.0.7': 35, '16.0.14326.21640': 36, '10.0.19645.1102 (WinB

In [None]:
"""
This section will tokenize the string variables which cannot otherwise be dropped or made categorical.
The token string elements will be the most difficult portion to generate data for, so it is important
that the data is well understood.
"""

# Start this stage from an independent copy so the cell can be re-run on its own
sysmon_data2 = sysmon_data1.copy(deep=True)

# Function to remove the executable command from a given command line
def remove_command_token(string):
    """Strip the executable from a command line and return only its arguments.

    Args:
        string: The full command line. Anything that is not a ``str``
            (for example a missing value) is rejected.

    Returns:
        ``None`` when ``string`` is not a ``str``; otherwise the argument
        portion of the command line with leading whitespace removed. The
        result is ``''`` when the command line holds only the executable.
    """
    if not isinstance(string, str):
        return None

    command_line = string.lstrip()

    # The executable is quoted only when the command line *starts* with a quote.
    # Searching for quotes anywhere would mistake a quoted argument of an
    # unquoted executable for the executable itself and discard the arguments.
    if command_line.startswith('"'):
        closing_quote_index = command_line.find('"', 1)
        if closing_quote_index != -1:
            # Skip the closing quote, then whatever whitespace follows it
            return command_line[closing_quote_index + 1:].lstrip()

    # Unquoted (or unterminated-quote) executable: it ends at the first space
    _, separator, arguments = command_line.partition(' ')
    if separator:
        return arguments.lstrip()

    # No space at all: the whole string is the executable, so no arguments remain
    return ''

# Function to tokenize a string based on a given delimiter
def tokenize_string(string, delimiter):
    """Split ``string`` on ``delimiter``.

    Args:
        string: The value to split.
        delimiter: The separator passed to ``str.split``.

    Returns:
        The list of pieces, or an empty list when ``string`` is not a ``str``.
    """
    # Non-string values (e.g. missing entries) produce no tokens
    if not isinstance(string, str):
        return []
    return string.split(delimiter)

# Function to get the directory type and update the token lists
def get_directory_branch(tokens, directory_dict):
    """Classify a tokenized path by its top-level folder and strip that folder.

    Args:
        tokens: List of path components, or ``None``.
        directory_dict: Mapping from lower-cased top-level folder name to an
            integer directory type.

    Returns:
        A ``(directory_type, updated_tokens)`` tuple. ``directory_type`` is 0
        when ``tokens`` is ``None`` or empty, or when the first token is not a
        key of ``directory_dict``. ``updated_tokens`` is ``tokens`` without its
        first component; for a 'users' path the following component is removed
        as well. ``None`` and empty input are returned unchanged.
    """
    if tokens is None:
        return 0, None  # 0 for None values
    if not tokens:
        return 0, tokens  # Empty list remains unchanged

    # Compare case-insensitively everywhere: the type lookup already did, and
    # the 'users' check must agree with it so that 'users'/'USERS' paths also
    # lose the component that follows the top-level folder.
    root_folder = tokens[0].lower()
    directory_type = directory_dict.get(root_folder, 0)  # Default to 0 if not found

    # Remove the first token (and the second if the first is 'users')
    if root_folder == 'users' and len(tokens) > 1:
        updated_tokens = tokens[2:]
    else:
        updated_tokens = tokens[1:]

    return directory_type, updated_tokens

# Function to remove '.exe' from the end of a string if it exists
def remove_exe_extension(string):
    """Return ``string`` without a trailing '.exe' (matched case-insensitively).

    ``None`` is passed through as ``None``; strings that do not end in '.exe'
    are returned unchanged.
    """
    if string is None:
        return None
    has_exe_suffix = string.lower().endswith('.exe')
    # '.exe' is four characters long, so trim exactly that many from the end
    return string[:-len('.exe')] if has_exe_suffix else string


def _strip_trailing_empty_token(tokens):
    """Drop a final '' token (left by a trailing path separator); empty or None input gives None."""
    if not tokens:
        return None
    # Only remove the last token when it really is the empty string, so a path
    # written without a trailing separator keeps its last folder.
    return tokens[:-1] if tokens[-1] == '' else tokens


# Removing the command from a command line prevents tokenization problems using ' ' char
sysmon_data2['CommandLine_Tokens'] = sysmon_data2['CommandLine'].apply(
    lambda x: tokenize_string(remove_command_token(x), ' '))
sysmon_data2['ParentCommandLine_Tokens'] = sysmon_data2['ParentCommandLine'].apply(
    lambda x: tokenize_string(remove_command_token(x), ' '))

# Applying tokenization to the path columns.
# Whole-line ' ' splits of CommandLine / ParentCommandLine are not built here:
# the argument tokens above are the only command-line columns this cell keeps.
sysmon_data2['Image_Tokens'] = sysmon_data2['Image'].apply(
    lambda x: tokenize_string(x, '\\'))
sysmon_data2['CurrentDirectory_Tokens'] = sysmon_data2['CurrentDirectory'].apply(
    lambda x: tokenize_string(x, '\\'))
sysmon_data2['ParentImage_Tokens'] = sysmon_data2['ParentImage'].apply(
    lambda x: tokenize_string(x, '\\'))

print(sysmon_data2.head())

# Update Image to only contain the last token of Image_Tokens
sysmon_data2['Image'] = sysmon_data2['Image_Tokens'].apply(
    lambda tokens: tokens[-1] if tokens else None)

# Update Image_Directory to exclude the first and last token
# The first token will always be 'C:', and the last token is saved to Image
sysmon_data2['Image_Directory'] = sysmon_data2['Image_Tokens'].apply(
    lambda tokens: tokens[1:-1] if tokens else None)

# Update CurrentDirectory_Tokens to exclude the first token
# The first token will always be 'C:'
sysmon_data2['CurrentDirectory_Tokens'] = sysmon_data2['CurrentDirectory_Tokens'].apply(
    lambda tokens: tokens[1:] if tokens else None)

# Update ParentImage to only contain the last token of ParentImage_Tokens
# This token will contain just the parent executable
sysmon_data2['ParentImage'] = sysmon_data2['ParentImage_Tokens'].apply(
    lambda tokens: tokens[-1] if tokens else None)

# Creating the dictionary for directory mappings
directoryDict = {"program files": 1, "program files (x86)": 2, "windows": 3, "users": 4}

# Apply the function to Image_Directory and CurrentDirectory_Tokens
sysmon_data2['ImageDirectory_Type'], sysmon_data2['Image_Directory'] = zip(
    *sysmon_data2['Image_Directory'].apply(lambda x: get_directory_branch(x, directoryDict)))
sysmon_data2['CurrentDirectory_Type'], sysmon_data2['CurrentDirectory_Tokens'] = zip(
    *sysmon_data2['CurrentDirectory_Tokens'].apply(lambda x: get_directory_branch(x, directoryDict)))

# Final Preprocessing Cleanup Steps
# Removes the empty string present as the final token of CurrentDirectory_Tokens
sysmon_data2['CurrentDirectory_Tokens'] = \
    sysmon_data2['CurrentDirectory_Tokens'].apply(_strip_trailing_empty_token)
# Removes the .exe extension from Image
sysmon_data2['Image'] = sysmon_data2['Image'].apply(remove_exe_extension)
# Removes the .exe extension from ParentImage
sysmon_data2['ParentImage'] = sysmon_data2['ParentImage'].apply(remove_exe_extension)
# Defines and drops the columns that are no longer needed
columns_to_drop = ['Image_Tokens', 'ParentImage_Tokens', 'CurrentDirectory',
                   'CommandLine', 'ParentCommandLine']
sysmon_data2 = sysmon_data2.drop(columns=columns_to_drop)

# Verify the changes
# print(sysmon_data2.head())

   Event_ID                                        CommandLine  Company  \
0         0  "C:\Users\Floater\AppData\Local\Microsoft\Team...        0   
1         0  "C:\Program Files (x86)\Google\Chrome\Applicat...        1   
2         0  "C:\Program Files (x86)\Google\Chrome\Applicat...        1   
3         0                 "C:\WINDOWS\System32\wsqmcons.exe"        0   
4         0                                                NaN        2   

                                    CurrentDirectory  Description  \
0  C:\Users\Floater\AppData\Local\Microsoft\Teams...            0   
1  C:\Program Files (x86)\Google\Chrome\Applicati...            1   
2  C:\Program Files (x86)\Google\Chrome\Applicati...            1   
3                               C:\WINDOWS\system32\            2   
4                                                NaN            3   

   FileVersion                                              Image  \
0            0  C:\Users\Floater\AppData\Local\Microsoft\Teams...

In [None]:
"""
After completing the previous module, Image and ParentImage may also be encoded as categorical
values. After this, we begin the final stage of encoding the tokens of CurrentDirectory_Tokens
and Image_Directory as lists of numbers. This may be the final encoding before beginning to build
a model, as CommandLine_Tokens and ParentCommandLine_Tokens are too diverse to build a model on
within the scope of this project.

Note: I am leaving some of the variables that will not be used in training intact. While this
module encodes the folders, it does not accurately capture the entire structure of the file
system, and so I am leaving the Image_Directory and CurrentDirectory_Tokens intact at this
stage. These will be removed before training or if a better encoding method is found. This will
represent a point in processing where nearly all original information about the dataset remains
intact.
"""

# Lookup tables, filled in as values are first encountered
imageDict = dict()   # executable name -> integer code
folderDict = dict()  # folder name -> integer code

# Start this stage from an independent copy so the cell can be re-run on its own
sysmon_data3 = sysmon_data2.copy(deep=True)

def encode_image(image, encoding_dict):
    """Return the integer code for ``image``, registering it on first sight.

    A value that is not yet a key of ``encoding_dict`` is added with the code
    ``len(encoding_dict) + 1``; ``encoding_dict`` is updated in place.
    """
    # Evaluated before any insertion, so codes start at 1 and grow by one
    next_code = len(encoding_dict) + 1
    return encoding_dict.setdefault(image, next_code)

def encode_folders(folder_list, encoding_dict):
    """Translate a list of folder names into a list of integer codes.

    Args:
        folder_list: Iterable of folder names, or ``None``. ``None`` is treated
            as an empty list instead of raising ``TypeError``.
        encoding_dict: Mapping from folder name to integer code. A folder that
            is not yet a key is added with the code ``len(encoding_dict) + 1``;
            the mapping is updated in place.

    Returns:
        The codes of the non-empty folder names, in input order. Falsy entries
        (``None`` or ``''``) are skipped.
    """
    if folder_list is None:
        return []

    encoded_list = []
    for folder in folder_list:
        if not folder:  # Skip None or empty folder names
            continue
        if folder not in encoding_dict:
            encoding_dict[folder] = len(encoding_dict) + 1
        encoded_list.append(encoding_dict[folder])
    return encoded_list

# Encode the executables from the dataset (Image and ParentImage share imageDict)
sysmon_data3['Image_Encoded'] = sysmon_data3['Image'].apply(
    lambda x: encode_image(x, imageDict))
sysmon_data3['ParentImage_Encoded'] = sysmon_data3['ParentImage'].apply(
    lambda x: encode_image(x, imageDict))

# Eliminate None values from the dataset.
# Both token columns are normalised: Image_Directory goes through the same folder
# encoder as CurrentDirectory_Tokens, so a None entry must become an empty list too.
for token_column in ('CurrentDirectory_Tokens', 'Image_Directory'):
    sysmon_data3[token_column] = sysmon_data3[token_column].apply(
        lambda x: x if x is not None else [])
sysmon_data3['ParentProcessId'] = sysmon_data3['ParentProcessId'].fillna(0)

# Encoding the folder structures (both columns share folderDict)
sysmon_data3['Encoded_CurrentDirectory'] = sysmon_data3['CurrentDirectory_Tokens'].apply(
    lambda x: encode_folders(x, folderDict))
sysmon_data3['Encoded_ImageDirectory'] = sysmon_data3['Image_Directory'].apply(
    lambda x: encode_folders(x, folderDict))

# Defines and drops the columns that are no longer needed
columns_to_drop = ['Image', 'ParentImage']
sysmon_data3 = sysmon_data3.drop(columns=columns_to_drop)

print(imageDict)
print(folderDict)


{'Teams': 1, 'chrome': 2, 'wsqmcons': 3, 'svchost': 4, 'WmiPrvSE': 5, 'RuntimeBroker': 6, 'backgroundTaskHost': 7, 'g2mupdate': 8, 'FileCoAuth': 9, 'MicrosoftEdgeUpdate': 10, 'TiWorker': 11, 'TrustedInstaller': 12, 'conhost': 13, 'ETD_GetSMART': 14, 'wermgr': 15, 'LocationNotificationWindows': 16, 'ProcInfo': 17, 'SearchFilterHost': 18, 'SearchProtocolHost': 19, 'E_YUBSWE': 20, 'E_YTSSWE': 21, 'taskhostw': 22, 'GoogleUpdate': 23, 'GoogleCrashHandler64': 24, 'GoogleCrashHandler': 25, 'SDXHelper': 26, 'g2mupload': 27, 'BackgroundTransferHost': 28, 'MusNotifyIcon': 29, 'sppsvc': 30, 'MusNotification': 31, 'OfficeC2RClient': 32, 'HxTsr': 33, 'TabTip': 34, 'BackgroundDownload': 35, 'dllhost': 36, 'ai': 37, 'WINWORD': 38, 'init': 39, 'wslhost': 40, 'bash': 41, 'wsl': 42, 'LogonUI': 43, 'nslF0FF.tmp': 44, 'HPDCSetup': 45, 'sc': 46, 'nso79A2.tmp': 47, 'BridgeCommunication': 48, 'escsvc64': 49, 'audiodg': 50, 'nstC333.tmp': 51, 'schtasks': 52, 'officesvcmgr': 53, 'smartscreen': 54, 'TouchpointA

In [None]:
"""
A simple module to download a dataset so that it may be easily viewed.
"""

# Write the processed frame to disk without the index, then download that file
output_csv = 'modified_sysmon_data.csv'
sysmon_data3.to_csv(output_csv, index=False)
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>