# Introduction

Stratosphere Linux IPS (Slips) accepts a number of input formats, such as pcap files and Zeek flows, to train and test its ML detection algorithm. The issue with pcaps is that Slips first turns them into flows and then labels each flow, rather than each individual packet, as benign or malicious. If a dataset's labelled flows are not congruent with the flows Slips creates, then the labelled flows cannot be used to evaluate correctness.

If the input is flows, then Slips will classify the flows themselves as malicious or benign. This means the output of Slips and the labelled flows are directly comparable. To do this, the labelled flows need to be converted to a format that Slips understands. The way this is done may differ from dataset to dataset, but the general procedure is to extract the relevant columns from each dataset's labelled flows and put them in a Zeek conn.log file.

## Processing CICIDS2017

### Clean the labelled flows

In [69]:
import csv

def scientific_to_decimal(value):
    """Convert a positive-exponent scientific-notation string to an integer.

    Only fields containing ``e+`` or ``E+`` (for example ``"1.5E+3"``) are
    treated as scientific notation; every other field, including one with a
    negative exponent, is returned untouched.

    Args:
        value: A single CSV field as a string.

    Returns:
        The value truncated to an ``int`` when it parses as a finite number in
        scientific notation; otherwise ``value`` unchanged.
    """
    if 'e+' not in value and 'E+' not in value:
        return value
    try:
        return int(float(value))
    except (ValueError, OverflowError):
        # ValueError: the field merely contains "e+" but is not a number.
        # OverflowError: the exponent overflows float to infinity
        # (e.g. "1e+999"), which int() cannot represent.
        return value

# Raw CICIDS2017 labelled flows and the cleaned copy produced by this cell
INPUT_CSV_FILE = "Thursday-WorkingHours-Full.csv"
OUTPUT_CSV_FILE = "Thursday-WorkingHours-Full_cleaned.csv"

# Both files are opened with newline="" so that the csv module handles line
# endings itself; without it, newlines embedded in quoted fields are not
# interpreted correctly by csv.reader.
with open(INPUT_CSV_FILE, "r", newline="") as infile, open(OUTPUT_CSV_FILE, "w", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Every row (the header included) is copied across, with each field passed
    # through scientific_to_decimal so scientific-notation values become
    # plain integers.
    writer.writerows(
        [scientific_to_decimal(field) for field in row] for row in reader
    )

### Convert labelled flows into Zeek/Bro conn.log file

In [70]:
import time
from datetime import datetime
import csv

CICIDS_LABELLED = "Thursday-WorkingHours-Full_cleaned.csv"
ZEEK_OUTPUT = "conn.log"

# This is a standard header of all Zeek conn.log flow files. Note that the
# values for #open and #path do not matter.
# The tab separators are written as explicit "\t" escapes so they cannot be
# silently turned into spaces by an editor.
ZEEK_HEADER = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    "#open\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice"
    "\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp"
    "\tmissed_bytes\thistory\torig_pkts\torig_ip_bytes\tresp_pkts\tresp_ip_bytes"
    "\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring"
    "\tinterval\tcount\tcount\tstring\tbool\tbool"
    "\tcount\tstring\tcount\tcount\tcount\tcount"
    "\tset[string]\n"
)

# conn.log fields mapped to CICIDS flows file.
# Integer values are column indexes into a row of the labelled CSV; string
# values are defaults written verbatim for fields that have no mapping.
TIME_STAMP = 6
FLOW_ID = 0
SOURCE_IP = 1
SOURCE_PORT = 2
DEST_IP = 3
DEST_PORT = 4
PROTOCOL = 5
# The conversion loop writes SERVICE verbatim into the tab-separated line, so
# it must be a string default; an integer here makes the string concatenation
# fail with a TypeError on the first flow.
SERVICE = '-'
DURATION = 7
FWD_BYTES = 10
BWD_BYTES = 11
CONN_STATE = '-'
LOCAL_ORIG = '-'
LOCAL_RESP = '-'
MISSED = '-'
HISTORY = '-'
ORIG_PKTS = 8
ORIG_IP_BYTES = '-'
RESP_PKTS = 9
RESP_IP_BYTES = '-'
TUNNEL_PARENT = '(empty)'

# Protocol number found in the CICIDS2017 "Protocol" column -> lower-case name
# written to the conn.log "proto" field.
# NOTE(review): 139, 143, 161 and 179 look like TCP/UDP port numbers
# (NetBIOS, IMAP, SNMP, BGP) rather than IP protocol numbers - confirm
# against the IANA protocol-number registry before relying on them.
PROTOCOL_DICT = dict([
    (1, "icmp"),
    (2, "igmp"),
    (3, "ggp"),
    (6, "tcp"),
    (8, "egp"),
    (9, "igp"),
    (17, "udp"),
    (20, "hmp"),
    (27, "rdp"),
    (41, "ipv6"),
    (46, "rsvp"),
    (47, "gre"),
    (50, "esp"),
    (51, "ah"),
    (58, "ipv6-icmp"),
    (59, "ipv6-nonxt"),
    (88, "eigrp"),
    (89, "ospf"),
    (115, "l2tp"),
    (132, "sctp"),
    (139, "netbios"),
    (143, "imap"),
    (161, "snmp"),
    (179, "bgp"),
])

# Convert every labelled CICIDS flow into one tab-separated conn.log line.
# newline="" on the input lets the csv module do its own line-ending handling,
# as the csv documentation asks for reader files.
with open(CICIDS_LABELLED, "r", newline="") as data, open(ZEEK_OUTPUT, "w") as output:
    labelled_flows = csv.reader(data)
    next(labelled_flows, None)  # skip header; the default avoids StopIteration on an empty file

    output.write(ZEEK_HEADER)

    # The CSV's own flow-id column is not used; flows are numbered 1, 2, 3, ...
    for fid, flow in enumerate(labelled_flows, start=1):
        # Stored as `ts`, not `time`, so the imported `time` module is not
        # replaced by a string.
        # Thursday flows don't include seconds; Monday flows do and need the
        # format "%d/%m/%Y %H:%M:%S" instead.
        # strptime returns a naive datetime, so .timestamp() interprets it in
        # the local timezone of the machine running this cell.
        ts = str(datetime.strptime(flow[TIME_STAMP], "%d/%m/%Y %H:%M").timestamp())

        # Protocol numbers missing from PROTOCOL_DICT fall back to 'tcp'.
        protocol = PROTOCOL_DICT.get(int(flow[PROTOCOL]), 'tcp')

        # Duration column scaled by 1/1,000,000 and printed with six decimals
        # (presumably microseconds -> seconds; confirm against the dataset).
        duration = format(int(flow[DURATION]) / 1000000, ".6f")

        # Same order as the #fields line of ZEEK_HEADER.
        record = [
            ts,
            str(fid),
            flow[SOURCE_IP],
            flow[SOURCE_PORT],
            flow[DEST_IP],
            flow[DEST_PORT],
            protocol,
            SERVICE,
            duration,
            flow[FWD_BYTES],
            flow[BWD_BYTES],
            CONN_STATE,
            LOCAL_ORIG,
            LOCAL_RESP,
            MISSED,
            HISTORY,
            flow[ORIG_PKTS],
            ORIG_IP_BYTES,
            flow[RESP_PKTS],
            RESP_IP_BYTES,
            TUNNEL_PARENT,
        ]
        output.write("\t".join(record) + "\n")

    output.write("#close")



### Test or train

When conn.log has been obtained, copy the file to the Stratosphere Docker container to test or train the model. If training the model, raw pcaps may be used because Slips identifies malicious traffic by creating a baseline for normal traffic. If testing, a SQLite database containing the labelled flows will be output, and it can be exported to a CSV file. This can be compared directly with the original labelled flows.

### Evaluate CICIDS2017

In [83]:
ORIGINAL_FLOWS = "Thursday-WorkingHours-Full_cleaned.csv"
OUTPUT_FLOWS = "Thursday-Output-Full.csv"


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Compare the ground-truth label (last column of the original flows) with the
# label in column 2 of the Slips export, row by row.
with open(ORIGINAL_FLOWS, "r", newline="") as original, open(OUTPUT_FLOWS, "r", newline="") as processed:
    original_flows = csv.reader(original)
    processed_flows = csv.reader(processed)

    # Only the ground-truth file has its first row skipped.
    # NOTE(review): this assumes the Slips export has no header row - confirm,
    # otherwise every pair below is off by one.
    next(original_flows, None)

    # counter for number of flows compared
    count = 0

    # statistics
    fp = 0
    tp = 0
    fn = 0
    tn = 0

    # zip() stops at the shorter file, so a length mismatch is not reported.
    for original_flow, processed_flow in zip(original_flows, processed_flows):
        truth_benign = original_flow[-1] == "BENIGN"
        predicted_benign = processed_flow[2].upper() == "BENIGN"

        if truth_benign and predicted_benign:
            tn += 1
        elif truth_benign:
            fp += 1
        elif predicted_benign:
            fn += 1
        else:
            tp += 1

        count += 1

    # print results; ratios with an empty denominator print as nan instead of
    # aborting with ZeroDivisionError
    print("Number of flows compared: ", count)
    print("TN: ", tn)
    print("FN: ", fn)
    print("TP: ", tp)
    print("FP: ", fp)
    print("Accuracy: ", _ratio(tp + tn, count))
    print("Recall: ", _ratio(tp, tp + fn))
    print("Precision: ", _ratio(tp, tp + fp))


Number of flows compared:  458968
TN:  430133
FN:  2117
TP:  99
FP:  26619
Accuracy:  0.9373899705426086
Recall:  0.044675090252707585
Precision:  0.0037053671682012127


## Processing UNSWNB15

### Training the model

UNSW-NB15 provides labelled flows for testing and training machine learning models, split into 4 parts. We can use one of these parts for training. Slips is trained by gauging malicious and benign traffic. As a result, the training set will need to be split into two. For training, we will use "UNSWNB15_assets/UNSW-NB15_4.csv".

In [42]:
import csv

TRAINING_SET = "UNSWNB15_assets/UNSW-NB15_4.csv"
TRAIN_MALICIOUS = "UNSWNB15_assets/unswnb15_train_malicious.csv"
TRAIN_BENIGN = "UNSWNB15_assets/unswnb15_train_benign.csv"

# Split the training set into a benign file (label "0" in the last column) and
# a malicious file (any other label). newline="" is used on all three files so
# the csv module handles line endings itself.
with open(TRAINING_SET, "r", newline="") as train, open(TRAIN_MALICIOUS, "w", newline="") as mal, open(TRAIN_BENIGN, "w", newline="") as ben:
    train_in = csv.reader(train)
    ben_out = csv.writer(ben)
    mal_out = csv.writer(mal)

    for index, flow in enumerate(train_in):
        # Drop the first row only when it is a header, i.e. its label column
        # is not numeric. The evaluation cell of this notebook reads a sibling
        # UNSW-NB15 part file without skipping any header, so discarding the
        # first row unconditionally would lose a labelled flow if this part is
        # headerless as well.
        if index == 0 and not (flow and flow[-1].strip().isdigit()):
            continue

        if flow[-1] == "0":
            ben_out.writerow(flow)
        else:
            mal_out.writerow(flow)

Now the benign and malicious training sets can be converted into Zeek flows.

In [76]:
import csv

# Source CSV and destination conn.log - CHANGE THESE AS REQUIRED.
INPUT_CSV = "UNSWNB15_assets/unswnb15_train_benign.csv"
ZEEK_OUTPUT = "UNSWNB15_assets/benign_conn.log"

# Alternative pair for converting the testing set:
#INPUT_CSV = "UNSWNB15_assets/UNSW-NB15_2.csv"
#ZEEK_OUTPUT = "UNSWNB15_assets/test_conn.log"

# Standard header found at the top of every Zeek conn.log file; the values
# given for #open and #path do not matter. Tabs are spelled out as "\t".
ZEEK_HEADER = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    "#open\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice"
    "\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp"
    "\tmissed_bytes\thistory\torig_pkts\torig_ip_bytes\tresp_pkts\tresp_ip_bytes"
    "\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring"
    "\tinterval\tcount\tcount\tstring\tbool\tbool"
    "\tcount\tstring\tcount\tcount\tcount\tcount"
    "\tset[string]\n"
)

# conn.log fields read from the CSV: each value is a column index into a row.
SOURCE_IP = 0
SOURCE_PORT = 1
DEST_IP = 2
DEST_PORT = 3
PROTOCOL = 4
CONN_STATE = 5
DURATION = 6
FWD_BYTES = 7
BWD_BYTES = 8
SERVICE = 13
ORIG_PKTS = 16
RESP_PKTS = 17
TIME_STAMP = 28

# conn.log fields with no source column: each value is a fixed default string.
FLOW_ID = '-'
LOCAL_ORIG = '-'
LOCAL_RESP = '-'
MISSED = '-'
HISTORY = '-'
ORIG_IP_BYTES = '-'
RESP_IP_BYTES = '-'
TUNNEL_PARENT = '(empty)'

def _port_to_decimal(port):
    """Return *port* as a decimal string, converting hexadecimal values such as "0x000b"."""
    return str(int(port, 16)) if 'x' in port.lower() else port


# Convert every row of INPUT_CSV into one tab-separated conn.log line.
# No header row is skipped: every row of the input is treated as a flow.
# newline="" on the input lets the csv module handle line endings itself.
with open(INPUT_CSV, "r", newline="") as data, open(ZEEK_OUTPUT, "w") as output:
    flows = csv.reader(data)

    output.write(ZEEK_HEADER)

    # Flows are given sequential uids 1, 2, 3, ...
    for fid, flow in enumerate(flows, start=1):
        # Same order as the #fields line of ZEEK_HEADER. The timestamp is not
        # stored in a variable called `time`, which would overwrite the `time`
        # module if an earlier cell imported it.
        record = [
            flow[TIME_STAMP],
            str(fid),
            flow[SOURCE_IP],
            # Hexadecimal ports are converted on both sides, matching the
            # BoT-IoT conversion further down in this notebook.
            _port_to_decimal(flow[SOURCE_PORT]),
            flow[DEST_IP],
            _port_to_decimal(flow[DEST_PORT]),
            flow[PROTOCOL],
            flow[SERVICE],
            flow[DURATION],
            flow[FWD_BYTES],
            flow[BWD_BYTES],
            flow[CONN_STATE],
            LOCAL_ORIG,
            LOCAL_RESP,
            MISSED,
            HISTORY,
            flow[ORIG_PKTS],
            ORIG_IP_BYTES,
            flow[RESP_PKTS],
            RESP_IP_BYTES,
            TUNNEL_PARENT,
        ]
        output.write("\t".join(record) + "\n")

    output.write("#close")

### Testing

The above code may be used to convert testing sets to Zeek flows as well. When converted, run Slips in testing mode and it will output the labelled flows in SQLite database format. For testing, "UNSWNB15_assets/UNSW-NB15_2.csv" will be used.

### Evaluating UNSWNB15

In [73]:
ORIGINAL_FLOWS = "UNSWNB15_assets/UNSW-NB15_2.csv"
OUTPUT_FLOWS = "UNSWNB15_assets/UNSW-NB15_2_output.csv"


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Compare the ground-truth label (last column of the original flows, "0" for
# benign) with the label in column 2 of the Slips export, row by row.
# Neither file has a header row skipped here.
with open(ORIGINAL_FLOWS, "r", newline="") as original, open(OUTPUT_FLOWS, "r", newline="") as processed:
    original_flows = csv.reader(original)
    processed_flows = csv.reader(processed)

    # counter for number of flows compared
    count = 0

    # statistics
    fp = 0
    tp = 0
    fn = 0
    tn = 0

    # zip() stops at the shorter file, so a length mismatch is not reported.
    for original_flow, processed_flow in zip(original_flows, processed_flows):
        truth_benign = original_flow[-1] == "0"
        predicted_benign = processed_flow[2].upper() == "BENIGN"

        if truth_benign and predicted_benign:
            tn += 1
        elif truth_benign:
            fp += 1
        elif predicted_benign:
            fn += 1
        else:
            tp += 1

        count += 1

    # print results; ratios with an empty denominator print as nan instead of
    # aborting with ZeroDivisionError
    print("Number of flows compared: ", count)
    print("TN: ", tn)
    print("FN: ", fn)
    print("TP: ", tp)
    print("FP: ", fp)
    print("Accuracy: ", _ratio(tp + tn, count))
    print("Recall: ", _ratio(tp, tp + fn))
    print("Precision: ", _ratio(tp, tp + fp))

Number of flows compared:  513598
TN:  431293
FN:  23190
TP:  0
FP:  59115
Accuracy:  0.839748207742242
Recall:  0.0
Precision:  0.0


## Processing BoT-IOT

### Training

This dataset is structured similarly to UNSWNB15, so the same processing can be applied to create the required Zeek/Bro flows to test and train the machine learning model.

In [74]:
import csv

TRAINING_SET = "BOTIOT_assets/UNSW_2018_IoT_Botnet_Full5pc_4.csv"
TRAIN_MALICIOUS = "BOTIOT_assets/botiot_train_malicious.csv"
TRAIN_BENIGN = "BOTIOT_assets/botiot_train_benign.csv"

# Split the training set into a benign file (label "0" in the third-from-last
# column) and a malicious file (any other label). newline="" is used on all
# three files so the csv module handles line endings itself.
with open(TRAINING_SET, "r", newline="") as train, open(TRAIN_MALICIOUS, "w", newline="") as mal, open(TRAIN_BENIGN, "w", newline="") as ben:
    train_in = csv.reader(train)
    ben_out = csv.writer(ben)
    mal_out = csv.writer(mal)

    # Skip the header row; the default keeps an empty input file from raising
    # StopIteration. The header is not copied to either output file.
    next(train_in, None)

    for flow in train_in:
        target = ben_out if flow[-3] == "0" else mal_out
        target.writerow(flow)

Again, convert the training or testing flows into Zeek/Bro flows.

In [67]:
import csv

# Source CSV and destination conn.log - CHANGE THESE AS REQUIRED, e.g. point
# INPUT_CSV at "BOTIOT_assets/botiot_train_benign.csv" for the benign half.
INPUT_CSV = "BOTIOT_assets/botiot_train_malicious.csv"
ZEEK_OUTPUT = "BOTIOT_assets/malicious_conn.log"

# Standard header found at the top of every Zeek conn.log file; the values
# given for #open and #path do not matter. Tabs are spelled out as "\t".
ZEEK_HEADER = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    "#open\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice"
    "\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp"
    "\tmissed_bytes\thistory\torig_pkts\torig_ip_bytes\tresp_pkts\tresp_ip_bytes"
    "\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring"
    "\tinterval\tcount\tcount\tstring\tbool\tbool"
    "\tcount\tstring\tcount\tcount\tcount\tcount"
    "\tset[string]\n"
)

# conn.log fields read from the CSV: each value is a column index into a row.
TIME_STAMP = 1
PROTOCOL = 4
SOURCE_IP = 6
SOURCE_PORT = 7
DEST_IP = 8
DEST_PORT = 9
CONN_STATE = 12
DURATION = 16
ORIG_PKTS = 22
RESP_PKTS = 23
FWD_BYTES = 24
BWD_BYTES = 25

# conn.log fields with no source column: each value is a fixed default string.
FLOW_ID = '-'
SERVICE = '-'
LOCAL_ORIG = '-'
LOCAL_RESP = '-'
MISSED = '-'
HISTORY = '-'
ORIG_IP_BYTES = '-'
RESP_IP_BYTES = '-'
TUNNEL_PARENT = '(empty)'

def _port_to_decimal(port):
    """Return *port* as a decimal string, converting hexadecimal values such as "0x000b"."""
    return str(int(port, 16)) if 'x' in port.lower() else port


# Convert every row of INPUT_CSV into one tab-separated conn.log line.
# No header row is skipped: every row of the input is treated as a flow.
# newline="" on the input lets the csv module handle line endings itself.
with open(INPUT_CSV, "r", newline="") as data, open(ZEEK_OUTPUT, "w") as output:
    flows = csv.reader(data)

    output.write(ZEEK_HEADER)

    # Flows are given sequential uids 1, 2, 3, ...
    for fid, flow in enumerate(flows, start=1):
        # Same order as the #fields line of ZEEK_HEADER. The timestamp is not
        # stored in a variable called `time`, which would overwrite the `time`
        # module if an earlier cell imported it.
        record = [
            flow[TIME_STAMP],
            str(fid),
            flow[SOURCE_IP],
            _port_to_decimal(flow[SOURCE_PORT]),  # turn hexadecimal into int
            flow[DEST_IP],
            _port_to_decimal(flow[DEST_PORT]),  # turn hexadecimal into int
            flow[PROTOCOL],
            SERVICE,
            flow[DURATION],
            flow[FWD_BYTES],
            flow[BWD_BYTES],
            flow[CONN_STATE],
            LOCAL_ORIG,
            LOCAL_RESP,
            MISSED,
            HISTORY,
            flow[ORIG_PKTS],
            ORIG_IP_BYTES,
            flow[RESP_PKTS],
            RESP_IP_BYTES,
            TUNNEL_PARENT,
        ]
        output.write("\t".join(record) + "\n")

    output.write("#close")