## Processing CICIDS2017

### Clean the labelled flows

In [69]:
import csv

# Function to convert scientific notation to decimal
def scientific_to_decimal(value):
    """Convert a positive-exponent scientific-notation string to an integer.

    Only strings containing ``e+`` or ``E+`` are treated as scientific
    notation; every other value is passed through untouched.

    Args:
        value: A single CSV field, as a string.

    Returns:
        An ``int`` when ``value`` parses as scientific notation (any
        fractional part is truncated by ``int()``), otherwise ``value``
        itself, unchanged.
    """
    # Guard clause: leave plain fields alone.
    if "e+" not in value and "E+" not in value:
        return value

    try:
        return int(float(value))
    except (ValueError, OverflowError):
        # ValueError: the text merely contains "e+" but is not a number.
        # OverflowError: the exponent is so large that float() yields
        # infinity, which int() refuses to convert.
        return value

# Input CSV file and output cleaned CSV file
INPUT_CSV_FILE = "Thursday-WorkingHours-Full.csv"
OUTPUT_CSV_FILE = "Thursday-WorkingHours-Full_cleaned.csv"

# Rewrite the input file row by row, replacing fields written in scientific
# notation with their integer form. Every row (including the header row) goes
# through the same conversion; fields that are not in scientific notation are
# copied as they are.
#
# Both handles are opened with newline="" as the csv module requires, so that
# line endings and newlines embedded in quoted fields are handled by the csv
# reader/writer rather than by the text layer.
with open(INPUT_CSV_FILE, "r", newline="") as infile, open(OUTPUT_CSV_FILE, "w", newline="") as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    writer.writerows(
        [scientific_to_decimal(field) for field in row]
        for row in reader
    )

### Convert labelled flows into Zeek/Bro conn.log file

In [70]:
import time
from datetime import datetime
import csv

CICIDS_LABELLED = "Thursday-WorkingHours-Full_cleaned.csv"
ZEEK_OUTPUT = "conn.log"

# (field name, Zeek type) for every column of conn.log, in output order.
_ZEEK_COLUMNS = (
    ("ts", "time"),
    ("uid", "string"),
    ("id.orig_h", "addr"),
    ("id.orig_p", "port"),
    ("id.resp_h", "addr"),
    ("id.resp_p", "port"),
    ("proto", "enum"),
    ("service", "string"),
    ("duration", "interval"),
    ("orig_bytes", "count"),
    ("resp_bytes", "count"),
    ("conn_state", "string"),
    ("local_orig", "bool"),
    ("local_resp", "bool"),
    ("missed_bytes", "count"),
    ("history", "string"),
    ("orig_pkts", "count"),
    ("orig_ip_bytes", "count"),
    ("resp_pkts", "count"),
    ("resp_ip_bytes", "count"),
    ("tunnel_parents", "set[string]"),
)

# This is the standard header of all Zeek conn.log flow files. Note that the
# values for #open and #path do not matter. Tabs are spelled out as "\t" so
# they stay visible and cannot be turned into spaces by an editor.
ZEEK_HEADER = "\n".join([
    "#separator \\x09",
    "#set_separator\t,",
    "#empty_field\t(empty)",
    "#unset_field\t-",
    "#path\tconn",
    "#open",
    "#fields\t" + "\t".join(name for name, _ in _ZEEK_COLUMNS),
    "#types\t" + "\t".join(kind for _, kind in _ZEEK_COLUMNS),
]) + "\n"

# Mapping from conn.log fields to the CICIDS flows file.
#
# Integer constants are zero-based column indices into a CICIDS row, listed
# here in column order.
FLOW_ID = 0
SOURCE_IP = 1
SOURCE_PORT = 2
DEST_IP = 3
DEST_PORT = 4
PROTOCOL = 5
TIME_STAMP = 6
DURATION = 7
ORIG_PKTS = 8
RESP_PKTS = 9
FWD_BYTES = 10
BWD_BYTES = 11

# NOTE(review): SERVICE is an integer like the column indices above, but the
# conversion code assigns it directly (service = SERVICE) instead of indexing
# a row with it. Confirm whether a column lookup or a default was intended.
SERVICE = 13

# conn.log fields that have no CICIDS counterpart get a fixed default value.
CONN_STATE = LOCAL_ORIG = LOCAL_RESP = MISSED = HISTORY = '-'
ORIG_IP_BYTES = RESP_IP_BYTES = '-'
TUNNEL_PARENT = '(empty)'

# Protocol number -> protocol name mapping for CICIDS2017.
# NOTE(review): 139, 143, 161 and 179 look like well-known port numbers
# rather than IP protocol numbers; verify before relying on those entries.
PROTOCOL_DICT = dict([
    (1, "icmp"),
    (2, "igmp"),
    (3, "ggp"),
    (6, "tcp"),
    (8, "egp"),
    (9, "igp"),
    (17, "udp"),
    (20, "hmp"),
    (27, "rdp"),
    (41, "ipv6"),
    (46, "rsvp"),
    (47, "gre"),
    (50, "esp"),
    (51, "ah"),
    (58, "ipv6-icmp"),
    (59, "ipv6-nonxt"),
    (88, "eigrp"),
    (89, "ospf"),
    (115, "l2tp"),
    (132, "sctp"),
    (139, "netbios"),
    (143, "imap"),
    (161, "snmp"),
    (179, "bgp"),
])

# Thursday flows don't include seconds in their timestamps.
# Monday flows do; use "%d/%m/%Y %H:%M:%S" for those files.
TIMESTAMP_FORMAT = "%d/%m/%Y %H:%M"

# Convert every labelled CICIDS flow into one tab-separated conn.log record.
# The input is opened with newline="" as the csv module requires.
with open(CICIDS_LABELLED, "r", newline="") as data, open(ZEEK_OUTPUT, "w") as output:
    labelled_flows = csv.reader(data)
    next(labelled_flows, None)  # skip header (tolerates an empty file)

    output.write(ZEEK_HEADER)

    # The uid column is a running counter starting at 1 rather than the
    # CICIDS flow id (flow[FLOW_ID]).
    for fid, flow in enumerate(labelled_flows, start=1):
        # Named `ts` so the imported `time` module is not shadowed.
        # strptime() yields a naive datetime, so timestamp() interprets it
        # in the local timezone of the machine running this cell.
        ts = datetime.strptime(flow[TIME_STAMP], TIMESTAMP_FORMAT).timestamp()

        # Unknown protocol numbers fall back to "tcp".
        protocol = PROTOCOL_DICT.get(int(flow[PROTOCOL]), 'tcp')

        # CICIDS durations are divided by 1,000,000 and written with six
        # decimal places.
        duration = format(int(flow[DURATION]) / 1000000, ".6f")

        record = [
            ts,
            fid,
            flow[SOURCE_IP],
            flow[SOURCE_PORT],
            flow[DEST_IP],
            flow[DEST_PORT],
            protocol,
            # SERVICE is an int; concatenating it to a str raised TypeError.
            # Its literal value is written out here.
            # NOTE(review): confirm whether a CICIDS column lookup or the
            # unset marker '-' was intended instead.
            SERVICE,
            duration,
            flow[FWD_BYTES],
            flow[BWD_BYTES],
            CONN_STATE,
            LOCAL_ORIG,
            LOCAL_RESP,
            MISSED,
            HISTORY,
            flow[ORIG_PKTS],
            ORIG_IP_BYTES,
            flow[RESP_PKTS],
            RESP_IP_BYTES,
            TUNNEL_PARENT,
        ]

        # str() every field so numeric values cannot break the join.
        output.write("\t".join(str(field) for field in record) + "\n")

    output.write("#close")



### Test or train

Once conn.log has been generated, copy the file into the Stratosphere Docker container to test the model. A SQLite database containing the labelled flows will be produced, and it can be exported to a CSV file. That CSV can then be compared directly with the original labelled flows.

### Evaluate CICIDS2017

In [83]:
ORIGINAL_FLOWS = "Thursday-WorkingHours-Full_cleaned.csv"
OUTPUT_FLOWS = "Thursday-Output-Full.csv"


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Compare the ground-truth label of each original flow (last column) with the
# label in the processed flows file (third column), row by row, and report
# confusion-matrix counts plus accuracy, recall and precision.
# Anything that is not "BENIGN" counts as a positive (attack) label.
with open(ORIGINAL_FLOWS, "r", newline="") as original, open(OUTPUT_FLOWS, "r", newline="") as processed:
    original_flows = csv.reader(original)
    processed_flows = csv.reader(processed)

    # Only the original file's header is skipped.
    # NOTE(review): assumes the processed file has no header row; verify
    # against the export, otherwise the rows are misaligned by one.
    next(original_flows, None)

    # counter for number of flows compared
    count = 0

    # statistics
    fp = tp = fn = tn = 0

    # zip() stops at the shorter file, so trailing rows of the longer one are
    # not compared.
    for original_flow, processed_flow in zip(original_flows, processed_flows):
        actual_benign = original_flow[-1] == "BENIGN"
        predicted_benign = processed_flow[2].upper() == "BENIGN"

        if actual_benign and predicted_benign:
            tn += 1
        elif actual_benign:
            fp += 1
        elif predicted_benign:
            fn += 1
        else:
            tp += 1

        count += 1

    # print results; a ratio with a zero denominator (e.g. no attack flows in
    # the capture) is reported as nan instead of raising ZeroDivisionError
    print("Number of flows compared: ", count)
    print("TN: ", tn)
    print("FN: ", fn)
    print("TP: ", tp)
    print("FP: ", fp)
    print("Accuracy: ", _ratio(tp + tn, count))
    print("Recall: ", _ratio(tp, tp + fn))
    print("Precision: ", _ratio(tp, tp + fp))


Number of flows compared:  458968
TN:  430133
FN:  2117
TP:  99
FP:  26619
Accuracy:  0.9373899705426086
Recall:  0.044675090252707585
Precision:  0.0037053671682012127
