# DARPA 2000 Dataset

In [29]:
import pandas as pd
import xml.etree.ElementTree as ET

## Labeling Sanity Check

In [41]:
def labeling_sanity_check(labels_file_dir, processed_flows_df):
    """
    Compares the number of true alerts in the labels files with the number of alerts in the processed flows file.
    """

    total_num_true_alerts = 0
    total_num_processed_alerts = 0

    for phase in range(1,6):
        LABELS_FILE = f"{labels_file_dir}/mid-level-phase-{phase}.xml"

        # Number of alerts in labels file
        tree = ET.parse(LABELS_FILE)
        root = tree.getroot()
        items = root.findall('.//Alert')
        num_true_alerts = len(items)
        total_num_true_alerts += num_true_alerts

        # Number of alerts in processed flows file
        num_processed_alerts = (processed_flows_df["phase"] == phase).sum()
        total_num_processed_alerts += num_processed_alerts

        print(f"--- Attack Phase {phase} ---")
        print(f"Number of alerts in labels file: {num_true_alerts}")
        print(f"Number of alerts in processed flows file: {num_processed_alerts}\n")

    print(f"=== Overall Summary ===")
    print(f"Total number of true alerts in labels files: {total_num_true_alerts}")
    print(f"Total number of alerts in processed flows file: {total_num_processed_alerts}")

### Inside Traffic

In [42]:
INSIDE_LABELS_FILE_DIR = "../data/DARPA_2000/inside/labels" # true labels directory
INSIDE_PROCESSED_FLOWS_FILE = "../data/DARPA_2000/inside/inside_labeled_flows.csv" # processed flows with labels

In [43]:
inside_processed_flows_df = pd.read_csv(INSIDE_PROCESSED_FLOWS_FILE)
inside_processed_flows_df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,attack,phase
0,f0,2000-03-07 14:52:00.817878962+00:00,2000-03-07 14:52:00.817878962+00:00,0.0,202.77.162.213,8,172.16.112.207,0,icmp,-,-,-,1,0,0,0
1,f1,2000-03-07 14:51:59.339156032+00:00,2000-03-07 14:51:59.339156032+00:00,0.0,202.77.162.213,8,172.16.112.149,0,icmp,-,-,-,1,0,0,0
2,f2,2000-03-07 14:51:56.814707041+00:00,2000-03-07 14:51:56.814875007+00:00,0.000168,202.77.162.213,8,172.16.112.50,0,icmp,-,10,10,1,1,0,0
3,f3,2000-03-07 14:51:55.797145009+00:00,2000-03-07 14:51:55.797194958+00:00,5e-05,202.77.162.213,8,172.16.112.10,0,icmp,-,10,10,1,1,0,0
4,f4,2000-03-07 14:51:36.142480969+00:00,2000-03-07 14:52:00.818096042+00:00,24.675615,172.16.112.194,0,202.77.162.213,8,icmp,-,110,10,11,1,1,1


In [49]:
labeling_sanity_check(INSIDE_LABELS_FILE_DIR, inside_processed_flows_df)

--- Attack Phase 1 ---
Number of alerts in labels file: 31
Number of alerts in processed flows file: 1

--- Attack Phase 2 ---
Number of alerts in labels file: 32
Number of alerts in processed flows file: 10

--- Attack Phase 3 ---
Number of alerts in labels file: 35
Number of alerts in processed flows file: 6

--- Attack Phase 4 ---
Number of alerts in labels file: 22
Number of alerts in processed flows file: 10

--- Attack Phase 5 ---
Number of alerts in labels file: 33754
Number of alerts in processed flows file: 1

=== Overall Summary ===
Total number of true alerts in labels files: 33874
Total number of alerts in processed flows file: 28


### DMZ Traffic

In [45]:
DMZ_LABELS_FILE_DIR = "../data/DARPA_2000/dmz/labels" # true labels directory
DMZ_PROCESSED_FLOWS_FILE = "../data/DARPA_2000/dmz/dmz_labeled_flows.csv" # processed flows with labels

In [46]:
dmz_processed_flows_df = pd.read_csv(DMZ_PROCESSED_FLOWS_FILE)
dmz_processed_flows_df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,attack,phase
0,f0,2000-03-07 14:52:02.006109953+00:00,2000-03-07 14:52:02.006109953+00:00,0.0,202.77.162.213,8,172.16.112.254,0,icmp,-,-,-,1,0,0,0
1,f1,2000-03-07 14:52:01.980528951+00:00,2000-03-07 14:52:01.980528951+00:00,0.0,202.77.162.213,8,172.16.112.253,0,icmp,-,-,-,1,0,0,0
2,f2,2000-03-07 14:52:01.929620981+00:00,2000-03-07 14:52:01.929620981+00:00,0.0,202.77.162.213,8,172.16.112.251,0,icmp,-,-,-,1,0,0,0
3,f3,2000-03-07 14:52:01.853170037+00:00,2000-03-07 14:52:01.853170037+00:00,0.0,202.77.162.213,8,172.16.112.248,0,icmp,-,-,-,1,0,0,0
4,f4,2000-03-07 14:52:01.802250028+00:00,2000-03-07 14:52:01.802250028+00:00,0.0,202.77.162.213,8,172.16.112.246,0,icmp,-,-,-,1,0,0,0


In [50]:
labeling_sanity_check(DMZ_LABELS_FILE_DIR, dmz_processed_flows_df)

--- Attack Phase 1 ---
Number of alerts in labels file: 785
Number of alerts in processed flows file: 0

--- Attack Phase 2 ---
Number of alerts in labels file: 25
Number of alerts in processed flows file: 10

--- Attack Phase 3 ---
Number of alerts in labels file: 80
Number of alerts in processed flows file: 15

--- Attack Phase 4 ---
Number of alerts in labels file: 19
Number of alerts in processed flows file: 10

--- Attack Phase 5 ---
Number of alerts in labels file: 33910
Number of alerts in processed flows file: 1

=== Overall Summary ===
Total number of true alerts in labels files: 34819
Total number of alerts in processed flows file: 36
