In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sumologic import SumoLogic
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import time
from datetime import date

# 1. Establish Your Use Case
The first step of any QFunction engagement is determining the cybersecurity use case to which you would like to apply AI. Most clients will perform one or more of the following:

* Monitor some high priority or high value system(s) for anomalous log activity
* Perform a single threat hunt on some user, system, network, or appliance
* Implement User Entity Behavior Analytics (UEBA) in the environment

# 2. Collect Data
Depending on your use case, we will need to collect the appropriate data. This example notebook shows how AI can be used to monitor anomalies and threats that occur on the command line of a Windows domain controller. The Windows domain controller has been configured to log event code 4688 with command line logging enabled. The logs have been collected into SumoLogic and have been retrieved in CSV form.

Generally speaking, the more logs available, the better the AI will perform. QFunction suggests collecting 3-6 months of data minimum to establish a proper baseline of what's "normal" for your users, systems, or networks.

In [None]:
# Load the exported SumoLogic results. Each row is read positionally as:
# command line, parent process name, domain name, username.
# NOTE(review): rows are split on every comma, so a command line that itself
# contains a comma would be mis-split -- confirm the export never has one.
data = {'domain_name': [], 'username': [], 'command_line': [], 'parent_process_name': []}

# The context manager closes the file even if reading raises.
with open('all-sumologic-results-november2023-tripled.csv', 'r') as f:
    lines = f.readlines()

for line in lines:
    # Skip blank lines (e.g. a trailing newline at end of file); they would
    # otherwise raise IndexError on the field lookups below.
    if not line.strip():
        continue
    message_fields = line.split(',')
    data['domain_name'].append(message_fields[2].strip())
    data['username'].append(message_fields[3].strip())
    data['command_line'].append(message_fields[0].strip())
    data['parent_process_name'].append(message_fields[1].strip())
        

In [None]:
# Training frame: one column per collected field, one row per log record.
df = pd.DataFrame(data)

# 3. Create and Train the AI Model

This AI is used to establish the "normal" for your collected data and is customized for whatever your use case may be.

Training the AI can take a number of days depending on the amount of data the AI is given. Because every client has their own data security regulations, it is up to the discretion of the client as to where the model is trained. The common options are as follows:

* Dedicated Cloud Server with GPU support (best option, yet expensive)
* Dedicated Cloud Server with enough CPU and RAM (not as fast, but still works, more cost effective)
* On-Prem Server with enough CPU and RAM (same as dedicated cloud server, may be cheaper due to on-prem)
* QFunction Cloud server (charges will be added to engagement cost)

Keep in mind that systems with high resources are only used for training; the actual production setup will require fewer resources.

In [None]:
def initialize_autoencoder_model(df):
    """Build the untrained autoencoder and choose its fixed input length.

    Each record is the space-joined ``domain_name``, ``username``,
    ``command_line`` and ``parent_process_name`` columns of ``df``. The input
    length is the mean record length plus one standard deviation (measured in
    characters, with CR/LF removed), truncated to an int. It is printed as a
    side effect.

    Returns:
        A ``(model, max_length)`` tuple. The model is not compiled here.
    """
    records = (
        df['domain_name'] + ' ' + df['username'] + ' '
        + df['command_line'] + ' ' + df['parent_process_name']
    )
    lengths = np.array(
        [len(text.replace('\n', '').replace('\r', '')) for text in records]
    )
    max_length = int(lengths.mean() + lengths.std())
    print(max_length)

    # Symmetric dense stack that narrows to a 16-unit bottleneck and widens
    # back out; the final layer emits one value per input position.
    hidden_units = (128, 64, 32, 16, 32, 64, 128)
    stack = [
        layers.Embedding(256, 4, input_length=max_length),
        layers.Flatten(),
    ]
    stack.extend(layers.Dense(units, activation='tanh') for units in hidden_units)
    stack.append(layers.Dense(max_length, activation='tanh'))

    model = models.Sequential(stack)
    return model, max_length

In [None]:
def fit(model, max_length, train_df=None):
    """Compile the autoencoder and train it to reconstruct its own input.

    Args:
        model: Keras model to compile and train in place.
        max_length: Number of bytes every record is padded/truncated to.
        train_df: Optional DataFrame with ``domain_name``, ``username``,
            ``command_line`` and ``parent_process_name`` columns. When
            omitted, the notebook-level ``df`` is used, so existing
            ``fit(model, max_length)`` calls behave as before.

    Returns:
        dict with ``"message"`` (status text) and ``"scaler"`` (the
        ``MinMaxScaler`` fitted on the training matrix, so the same scaling
        can be reused on new data instead of being thrown away).
    """
    frame = df if train_df is None else train_df
    records = (
        frame['domain_name'] + ' ' + frame['username'] + ' '
        + frame['command_line'] + ' ' + frame['parent_process_name']
    )
    # Encode every record as its raw UTF-8 bytes, then pad/truncate at the
    # end so all rows have exactly max_length columns.
    x = [np.frombuffer(bytearray(line, 'utf-8'), np.uint8) for line in records]
    x = pad_sequences(x, maxlen=max_length, padding='post', value=0, truncating='post')
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    x = x.astype(np.float32)
    model.compile(loss='mae', optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.00001))
    # Autoencoder objective: the input is also the target.
    model.fit(x, x, epochs=20)
    print(x.shape)
    info = {"message": "model trained", "scaler": scaler}
    return info

In [None]:
# Build the autoencoder sized from the training frame, then train it.
# fit() returns a small status dict.
model, max_length = initialize_autoencoder_model(df)
info = fit(model, max_length)

In [None]:
def apply(model, df, max_length, scaler=None):
    """Score every record in ``df`` by the model's reconstruction error.

    Args:
        model: Trained Keras autoencoder.
        df: DataFrame with ``domain_name``, ``username``, ``command_line``
            and ``parent_process_name`` columns.
        max_length: Number of bytes every record is padded/truncated to;
            must match the value the model was built with.
        scaler: Optional scaler already fitted on the training data (any
            object with a ``transform`` method). When omitted, a new
            ``MinMaxScaler`` is fitted on ``df`` itself, as before.

    Returns:
        DataFrame with a single ``reconstruction_loss`` column holding the
        mean absolute error per record, in the same row order as ``df``.
    """
    records = (
        df['domain_name'] + ' ' + df['username'] + ' '
        + df['command_line'] + ' ' + df['parent_process_name']
    )
    x = [np.frombuffer(bytearray(line, 'utf-8'), np.uint8) for line in records]
    x = pad_sequences(x, maxlen=max_length, padding='post', value=0, truncating='post')
    if scaler is None:
        x = MinMaxScaler().fit_transform(x)
    else:
        x = scaler.transform(x)
    # Stay in float32, the dtype used for training. Casting the scaled
    # [0, 1] values to uint8 would floor almost all of them to 0, so the
    # loss would be measured against a different target than in training.
    x = x.astype(np.float32)
    reconstruction_scores = tf.keras.losses.mae(x, model.predict(x, verbose=False))
    result = pd.DataFrame(reconstruction_scores, columns=['reconstruction_loss'])
    return result

In [None]:
# Score the training records themselves, then display summary statistics of
# their reconstruction losses as the cell output.
orig_predictions = apply(model, df, max_length)
orig_predictions.describe()

In [None]:
# Rank the training records by reconstruction loss, highest first.
p_sorted = orig_predictions.sort_values(by='reconstruction_loss', ascending=False)
print(p_sorted)

# Print the 100 records with the highest reconstruction loss. Each row is
# looked up once instead of once per field.
for i in p_sorted.index[:100]:
    row = df.iloc[i]
    print(row['domain_name'] + ' ' + row['username'] + ' ' + row['command_line'] + ' ' + row['parent_process_name'])


# 4. Connecting to Your SIEM
After the model is trained, it will be ready to process new, incoming logs to determine whether they are "normal" for your environment. These incoming logs originate from your SIEM. All SIEMs have APIs that can be used to collect new data programmatically, which will be given to the AI to determine anomalies.

Below is an example usage of retrieving new logs via the SumoLogic API and feeding the logs to the AI to determine anomalies:

In [None]:
# SumoLogic API credentials. Left blank here: fill them in before running,
# and avoid committing real keys to version control.
access_id = ''
access_key = ''
# Base URL handed to the SumoLogic client.
endpoint = 'https://api.sumologic.com/api'
sumo = SumoLogic(access_id, access_key, endpoint)
# Both values are passed to sumo.search_job() when the search is created.
time_zone = "PST"
by_receipt_time = False

In [None]:
# Pull process-creation events (event id 4688) for one collector and keep
# only the four fields the model consumes.
query = '_collector="WIN-HDJETKALNBL" | json auto | where event_id.id = 4688 | fields event_data.SubjectDomainName, event_data.SubjectUserName, event_data.CommandLine, event_data.ParentProcessName'

MS_PER_DAY = 86400000
to_time = int(time.time() * 1000)
# Look back one day. Other windows that have been used here:
#   to_time - (60000 * 30)       -> 30 minutes
#   to_time - (MS_PER_DAY * 3)   -> 3 days
#   to_time - (MS_PER_DAY * 28)  -> 28 days
from_time = to_time - MS_PER_DAY
sj = sumo.search_job(query, from_time, to_time, time_zone, by_receipt_time)

# Poll until the job reports that all results have been gathered.
delay = 5
status = sumo.search_job_status(sj)
while status['state'] != 'DONE GATHERING RESULTS':
    if status['state'] == 'CANCELLED':
        # A cancelled job has no complete result set; stop here instead of
        # scoring whatever partial data it may hold.
        raise RuntimeError('SumoLogic search job was cancelled before results were gathered')
    time.sleep(delay)
    status = sumo.search_job_status(sj)

count = int(status['messageCount'])
print("Number of results: " + str(count))

# Fetch the messages page by page. The Search Job API documents a cap of
# 10,000 messages per request, so one call cannot return a larger result
# set. No request is made at all when the job matched nothing.
page_size = 10000
messages = []
for offset in range(0, count, page_size):
    page = sumo.search_job_messages(sj, limit=min(page_size, count - offset), offset=offset)
    messages.extend(page['messages'])
r = {'messages': messages}

# DataFrame column -> key in each message's 'map', as the API returns it.
field_keys = {
    'domain_name': 'event_data.subjectdomainname',
    'username': 'event_data.subjectusername',
    'command_line': 'event_data.commandline',
    'parent_process_name': 'event_data.parentprocessname',
}
records = {column: [] for column in field_keys}

for message in messages:
    fields = message['map']
    for column, key in field_keys.items():
        # Trim whitespace and drop embedded double quotes from each value.
        records[column].append(fields[key].strip().replace('"', ''))

test_df = pd.DataFrame.from_dict(records)

In [None]:
# Score the newly retrieved logs with the trained model.
predictions = apply(model, test_df, max_length)

In [None]:
# Show the loss distribution. Only the last expression of a cell is
# displayed, so this has to be printed explicitly.
print(predictions.describe())

# Work on the loss column directly so the statistics are plain scalars on
# every pandas version (np.mean/np.std on a whole DataFrame return either a
# Series or a scalar depending on the version).
losses = predictions['reconstruction_loss']
recon_mean = losses.mean()
# ddof=0 is the population standard deviation, which is what np.std computes.
recon_stddev = losses.std(ddof=0)

# Records more than this many standard deviations above the mean loss are
# treated as anomalies.
THRESHOLD_SIGMAS = 6
stats_threshold = recon_mean + THRESHOLD_SIGMAS * recon_stddev
print(stats_threshold)

In [None]:
# Boolean frame marking the records whose loss exceeds the threshold.
z = predictions > stats_threshold
p_dropped = predictions[z['reconstruction_loss']]

# Print every flagged record as one space-separated line.
for idx in p_dropped.index:
    record = test_df.iloc[idx]
    print(' '.join([
        record['domain_name'],
        record['username'],
        record['command_line'],
        record['parent_process_name'],
    ]))

# 5. Tuning
The final step is tuning the anomaly findings to your specifications. This involves some trial and error; the findings can be adjusted using measures such as the following:
* Adjusting the threshold for which anomalies are alerted
* Adding known data to exclude from future anomaly alerting

The below example excludes a known executable to make the anomaly findings cleaner. In production use, any exclusion needs to be as specific as possible!

In [None]:
# Substrings that mark a command line as known-good.
exclude_commandline = ['msedge.exe']

# Print the flagged records again, skipping any whose command line contains
# one of the excluded substrings.
for idx in p_dropped.index:
    record = test_df.iloc[idx]
    if any(entry in record['command_line'] for entry in exclude_commandline):
        continue
    print(' '.join([
        record['domain_name'],
        record['username'],
        record['command_line'],
        record['parent_process_name'],
    ]))

In [None]:
# Persist the trained model under the path 'autoencodermodel'.
# NOTE(review): the on-disk format for a path with no file extension depends
# on the installed Keras/TensorFlow version -- confirm for the deployment target.
model.save('autoencodermodel')

# 6. Deployment
This anomaly finder will be deployed as a simple Python script and will need to be hosted on a server with adequate CPU and memory resources that can run Python scripts with Tensorflow and can connect to your SIEM. However, it can be customized to the client's specifications. Some deployment ideas include the following:

* A daily/weekly email showing all anomaly findings
* A running log file which can be ingested back into your SIEM

# 7. Maintenance
Like any AI solution, the anomaly finder will need to be periodically retrained so that it stays up to date with any changes that occur in the environment. Some changes may include the following:

* Schema changes in your SIEM
* Software/hardware updates on the server that break the anomaly finder
* Data drift

QFunction can work with you to ensure that your anomaly finder stays operational.

In [None]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)