# DARPA 2000 Dataset

In [36]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import pytz

from datetime import datetime
from collections import defaultdict

In [37]:
# Scenario One directory layout (relative paths).
# Each capture point (inside / dmz) has a base directory, a labels
# sub-directory and a per-phase flows sub-directory.
INSIDE_DIR = "Scenario_One/inside"
INSIDE_LABELS_FILE_DIR = f"{INSIDE_DIR}/labels"
INSIDE_FLOWS_FILE_DIR = f"{INSIDE_DIR}/per_phase_flows"

DMZ_DIR = "Scenario_One/dmz"
DMZ_LABELS_FILE_DIR = f"{DMZ_DIR}/labels"
DMZ_FLOWS_FILE_DIR = f"{DMZ_DIR}/per_phase_flows"

## Exploration - How to Label?

### XML Labels

In [38]:
def parse_idmef_xml(xml_path, phase):
    '''
    Parse a DARPA 2000 IDMEF XML file into a table of alerts.

    Args:
        xml_path (str): Path to the IDMEF XML file.
        phase (int): Phase number of the attack; copied into every row.
    Returns:
        pd.DataFrame: One row per <Alert> element with the columns
            alert_id, alert, phase, start_time, end_time, duration,
            src_ip, dst_ip, sport, dport, proto, ts_floor.
            A file without any <Alert> element yields an empty DataFrame
            that still carries these columns.
    '''
    columns = [
        "alert_id", "alert", "phase",
        "start_time", "end_time", "duration",
        "src_ip", "dst_ip", "sport", "dport",
        "proto", "ts_floor",
    ]

    # The timezone object does not depend on the alert, so build it once
    # instead of once per <Alert>.
    eastern = pytz.timezone("US/Eastern")

    root = ET.parse(xml_path).getroot()
    alerts = []

    for alert in root.findall(".//Alert"):
        alert_id = int(alert.get("alertid"))

        # ---- Timestamp ----
        date_str = alert.findtext(".//Time/date")
        time_str = alert.findtext(".//Time/time")
        duration_str = alert.findtext(".//Time/sessionduration")

        # Convert timestamp from US/Eastern → UTC epoch seconds
        dt = eastern.localize(
            datetime.strptime(f"{date_str} {time_str}", "%m/%d/%Y %H:%M:%S")
        ).astimezone(pytz.UTC)

        start_time = dt.timestamp()

        # A missing or empty <sessionduration> is treated as a zero-length
        # session instead of crashing on None.split().
        if duration_str:
            h, m, s = map(int, duration_str.split(":"))
            session_seconds = h*3600 + m*60 + s
        else:
            session_seconds = 0

        end_time = start_time + session_seconds
        duration = end_time - start_time

        # ---- IPs ----
        src_ip = alert.findtext(".//Source//address")
        dst_ip = alert.findtext(".//Target//address")

        # ---- Ports ----
        # Both ports are read from the Target/Service element; a missing or
        # empty value becomes 0.
        sport = alert.findtext(".//Target/Service/sport")
        dport = alert.findtext(".//Target/Service/dport")

        sport = int(sport) if sport else 0
        dport = int(dport) if dport else 0

        # ---- Protocol ----
        # Normalise the service name to one of: icmp, tcp, udp, other.
        service_name = (alert.findtext(".//Target/Service/name") or "").lower()

        if service_name.startswith("icmp"):
            proto = "icmp"
        elif service_name in ("tcp", "udp"):
            proto = service_name
        else:
            proto = "other"

        alerts.append({
            "alert_id": alert_id,
            "alert": 1,
            "phase": phase,
            "start_time": start_time,
            "end_time": end_time,
            "duration": duration,
            "src_ip": src_ip,
            "dst_ip": dst_ip,
            "sport": sport,
            "dport": dport,
            "proto": proto,
            "ts_floor": int(start_time)  # integer second
        })

    # Passing the column list keeps the schema stable even when no alerts
    # were found.
    return pd.DataFrame(alerts, columns=columns)

In [39]:
# Attack phase whose label file and flow file are loaded by the exploration cells below.
# NOTE: later cells use `phase` as a loop variable at notebook level, which rebinds this name.
phase = 5

In [40]:
# Parse the IDMEF labels of the selected phase (inside capture) into a DataFrame.
inside_alerts_df = parse_idmef_xml(f"{INSIDE_LABELS_FILE_DIR}/mid-level-phase-{phase}.xml", phase=phase)

# # Save to CSV
# NOTE(review): build_dataset reads "phase{N}_alerts.csv" while the line below would write
# "alerts_phase{N}.csv" — confirm the intended file name before re-enabling.
# output_path = f"{INSIDE_LABELS_FILE_DIR}/alerts_phase{phase}.csv"
# inside_alerts_df.to_csv(output_path, index=False)
# print("Saved to:", output_path)

# Named so that it does not collide with the num_alerts() function defined later in
# this notebook (re-running this cell would otherwise replace that function with an int).
num_inside_alerts = len(inside_alerts_df)
print(f"Number of alerts: {num_inside_alerts}\n")
inside_alerts_df.head()

Number of alerts: 33754



Unnamed: 0,alert_id,alert,phase,start_time,end_time,duration,src_ip,dst_ip,sport,dport,proto,ts_floor
0,1,1,5,952446375.0,952446861.0,486.0,202.77.162.213,172.16.115.20,49212,23,tcp,952446375
1,2,1,5,952446424.0,952446424.0,0.0,202.77.162.213,172.16.115.20,1022,1020,tcp,952446424
2,3,1,5,952446429.0,952446429.0,0.0,172.16.115.20,255.255.255.255,33799,9325,udp,952446429
3,4,1,5,952446471.0,952446471.0,0.0,172.16.115.20,172.16.112.50,33800,7983,udp,952446471
4,5,1,5,952446471.0,952446471.0,0.0,172.16.115.20,172.16.112.10,33800,7983,udp,952446471


In [41]:
# Comparing with flows
inside_flows_df = pd.read_csv(
    f"{INSIDE_FLOWS_FILE_DIR}/phase{phase}_flows.csv",
)

num_flows = len(inside_flows_df)
print(f"Number of flows: {num_flows}\n")
print(inside_flows_df.head())
print()

Number of flows: 73472

  flow_id    start_time      end_time  duration         src_ip  sport  \
0      f0  9.524464e+08  9.524464e+08  0.000658  172.16.115.20   1020   
1      f1  9.524465e+08  9.524465e+08  0.000000    8.138.161.2   4845   
2      f2  9.524465e+08  9.524465e+08  0.000000    8.138.161.2   4939   
3      f3  9.524465e+08  9.524465e+08  0.000000    8.138.161.2  17060   
4      f4  9.524465e+08  9.524465e+08  0.000000   61.56.80.155   4940   

           dst_ip  dport proto service  ...  local_orig  local_resp  \
0  202.77.162.213   1022   tcp       -  ...           T           F   
1     131.84.1.31  12277   tcp       -  ...           F           F   
2     131.84.1.31   4047   tcp       -  ...           F           F   
3     131.84.1.31   3462   tcp       -  ...           F           F   
4     131.84.1.31  31938   tcp       -  ...           F           F   

  missed_bytes history orig_pkts  orig_ip_bytes resp_pkts  resp_ip_bytes  \
0            0    FafA         2  

### Debugging Individual Flow-Alert Pair

In [42]:
# Set phase to 1
# print(phase==1)
# flow = 3
# alert = 0

# Set phase to 1
# print(phase==1)
# flow = 5
# alert = 3

# Set phase to 4
# f19,952444207.044505,952444207.044856,0.000351,172.16.115.20,1021,202.77.162.213,1022,tcp,-,0,0,2,1,S1,T,F,1,1,4
# print(phase==4)
# flow = 19
# alert = 6

In [43]:
# # Making sure that the values are comparable
# print("Time types:")
# print(type(inside_alerts_df["start_time"][alert]))
# print(type(inside_flows_df["start_time"][flow]))
# print(type(inside_alerts_df["end_time"][alert]))
# print(type(inside_flows_df["end_time"][flow]))
# print()

# print("Port types:")
# print(type(inside_alerts_df["sport"][alert]))
# print(type(inside_flows_df["sport"][flow]))
# print(type(inside_alerts_df["dport"][alert]))
# print(type(inside_flows_df["dport"][flow]))
# print()

# print("IP types:")
# print(type(inside_alerts_df["src_ip"][alert]))
# print(type(inside_flows_df["src_ip"][flow]))
# print(type(inside_alerts_df["dst_ip"][alert]))
# print(type(inside_flows_df["dst_ip"][flow]))
# print()

In [44]:
# # Step-by-step debugging

# flow_id = inside_flows_df["flow_id"][flow]
# alert_id = inside_alerts_df["alert_id"][alert]
# print(f"Comparing flow {flow_id} and alert {alert_id}:\n")

# # IP matching
# print("IP matching:")

# f_sip = inside_flows_df["src_ip"][flow]
# f_dip = inside_flows_df["dst_ip"][flow]
# a_sip = inside_alerts_df["src_ip"][alert]
# a_dip = inside_alerts_df["dst_ip"][alert]

# print(f"Flow: Source and destination IPs:")
# print(f_sip, f_dip)
# print(f"Alert: Source and destination IPs:")
# print(a_sip, a_dip)

# print("Source IPs equal:", end=" ")
# print(f_sip == a_sip)
# print("Destination IPs equal:", end=" ")
# print(f_dip == a_dip)
# print(f_dip)
# print(a_dip)

# same_direction = (
#     (f_sip == a_sip) &
#     (f_dip == a_dip)
# )
# reverse_direction = (
#     (f_sip == a_dip) &
#     (f_dip == a_sip)
# )
# print("Same direction:", same_direction)
# print("Reverse direction:", reverse_direction)
# print()

# # Port matching
# print("Port matching:")
# f_sport = inside_flows_df["sport"][flow]
# a_sport = inside_alerts_df["sport"][alert]
# f_dport = inside_flows_df["dport"][flow]
# a_dport = inside_alerts_df["dport"][alert]
# print("Flow: Source and destination ports:")
# print(f_sport, f_dport)
# print("Alert: Source and destination ports:")
# print(a_sport, a_dport)

# icmp_match = (
#     (inside_flows_df["proto"][flow] == "icmp") &
#     (inside_alerts_df["proto"][alert] == "icmp")
# )
# port_same_dir = ((f_sport == a_sport) & (f_dport == a_dport)) | icmp_match
# port_reverse_dir = ((f_dport == a_sport) & (f_sport == a_dport)) | icmp_match

# print("ICMP protocol match:", icmp_match)
# print("Ports match same direction:", port_same_dir)
# print("Ports match reverse direction:", port_reverse_dir)
# print()

# # Time overlap
# print("Time overlap:")
# f_start = inside_flows_df["start_time"][flow]
# f_end = inside_flows_df["end_time"][flow]
# a_start = inside_alerts_df["start_time"][alert]
# a_end = inside_alerts_df["end_time"][alert]
# print("Flow: Start and end time:")
# print(f_start, f_end)
# print("Alert: Start and end time:")
# print(a_start, a_end)

# time_match = (
#         (np.floor(f_start) == np.floor(a_start))
#     )
# print("Time match:", time_match)
# print()

# # Final matching
# print("Final matching:")
# mask = ((same_direction & port_same_dir) | (reverse_direction & port_reverse_dir)) & time_match
# print(mask)

### Labeling Flows

In [45]:
# ---------------------------------------------------------------------------
# 1. BUILD FAST INDEX FOR ALERTS
# ---------------------------------------------------------------------------

def build_alert_index(labels_df):
    """
    Build a lookup table from flow-style keys to matching alert rows.

    Every alert is registered under
        (src_ip, dst_ip, sport, dport, ts_floor)
    and under the mirrored key with addresses and ports swapped. Alerts whose
    proto is "icmp" are additionally registered (in both directions) under keys
    that carry the literal "icmp" in both port slots.

    Returns a defaultdict(list) mapping each key to the labels_df index labels
    of the alerts registered under it.
    """
    lookup = defaultdict(list)

    for label, alert in labels_df.iterrows():
        src, dst = alert["src_ip"], alert["dst_ip"]
        src_port, dst_port = alert["sport"], alert["dport"]
        second = alert["ts_floor"]

        # forward direction first, then the reverse one (TCP response flows swap ports)
        keys = [
            (src, dst, src_port, dst_port, second),
            (dst, src, dst_port, src_port, second),
        ]

        # ICMP matching (no ports): placeholder string in both port slots
        if alert["proto"] == "icmp":
            keys.append((src, dst, "icmp", "icmp", second))
            keys.append((dst, src, "icmp", "icmp", second))

        for key in keys:
            lookup[key].append(label)

    return lookup

In [46]:
# ---------------------------------------------------------------------------
# 2. LABEL FLOWS USING THE INDEX
# ---------------------------------------------------------------------------

def label_flows(flows_df, labels_df, alert_index):
    """
    Attach attack labels to flows by looking each flow up in the alert index.

    For every flow a key (src_ip, dst_ip, sport, dport, whole second of
    start_time) is built; ICMP flows use the literal "icmp" in both port slots.
    If the key has candidates in alert_index, the alert whose start_time is
    closest to the flow's start_time is chosen.

    Three columns are added to flows_df in place, and the same frame is returned:
        attack_id: alert_id of the chosen alert, 0 when nothing matched
        attack:    1 when an alert matched, otherwise 0
        phase:     phase of the chosen alert, 0 when nothing matched
    """
    matched_ids, flags, phases = [], [], []

    for _, record in flows_df.iterrows():
        started = record["start_time"]
        second = int(started)

        if record["proto"] == "icmp":
            ports = ("icmp", "icmp")
        else:
            ports = (int(record["sport"]), int(record["dport"]))
        lookup_key = (record["src_ip"], record["dst_ip"], *ports, second)

        hits = alert_index.get(lookup_key, [])

        if not hits:
            matched_ids.append(0)
            flags.append(0)
            phases.append(0)
            continue

        # nearest alert in time wins; on ties min() keeps the earliest candidate in the list
        nearest = min(
            hits,
            key=lambda i: abs(started - labels_df.loc[i, "start_time"])
        )

        matched_ids.append(labels_df.loc[nearest, "alert_id"])
        flags.append(1)
        phases.append(labels_df.loc[nearest, "phase"])

    flows_df["attack_id"] = matched_ids
    flows_df["attack"] = flags
    flows_df["phase"] = phases

    return flows_df

In [47]:
# ---------------------------------------------------------------------------
# 3. MAIN DATASET BUILDER
# ---------------------------------------------------------------------------

def build_dataset(flows_dir, labels_dir, out_csv):
    """
    Label the per-phase flow files for phases 1-5 and save them as one CSV.

    For each phase N this reads {flows_dir}/phase{N}_flows.csv and
    {labels_dir}/phase{N}_alerts.csv, labels the flows against the alerts,
    and finally writes the concatenation of all labeled frames to out_csv.
    """
    labeled_frames = []

    for phase_no in range(1, 6):
        print(f"\n=== Processing Phase {phase_no} ===")

        phase_flows = pd.read_csv(f"{flows_dir}/phase{phase_no}_flows.csv")
        phase_alerts = pd.read_csv(f"{labels_dir}/phase{phase_no}_alerts.csv")

        print(f"Flows: {len(phase_flows)}  | Alerts: {len(phase_alerts)}")

        phase_index = build_alert_index(phase_alerts)
        labeled_frames.append(label_flows(phase_flows, phase_alerts, phase_index))

    print("\n=== Saving final combined dataset ===")
    combined = pd.concat(labeled_frames, ignore_index=True)
    combined.to_csv(out_csv, index=False)
    print(f"Saved → {out_csv}")


def build_dataset_all_flows(flows_dir, labels_dir, out_csv):
    """
    Label a single combined flow file against a single combined alerts file.

    Reads {flows_dir}/flows.csv and {labels_dir}/xml_alerts_combined.csv,
    labels the flows against the alerts and writes the result to out_csv.
    Progress messages are printed along the way.
    """
    print("\n=== Processing All Flows ===")

    print("Loading flows and alerts...")
    all_flows = pd.read_csv(f"{flows_dir}/flows.csv")
    all_alerts = pd.read_csv(f"{labels_dir}/xml_alerts_combined.csv")
    print(f"Flows: {len(all_flows)}  | Alerts: {len(all_alerts)}")

    print("Building alert index...")
    lookup = build_alert_index(all_alerts)
    print("Done building alert index.")

    print("Labeling flows...")
    labeled = label_flows(all_flows, all_alerts, lookup)
    print("Done labeling flows.")

    print("\n=== Saving final combined dataset ===")
    labeled.to_csv(out_csv, index=False)
    print(f"Saved → {out_csv}")


In [48]:
# Label the per-phase inside flows (phases 1-5) and write the combined result to one CSV.
output_path = f"{INSIDE_DIR}/inside_labeled_flows_notebook.csv"
build_dataset(INSIDE_FLOWS_FILE_DIR, INSIDE_LABELS_FILE_DIR, output_path)


=== Processing Phase 1 ===
Flows: 20  | Alerts: 31

=== Processing Phase 2 ===
Flows: 22  | Alerts: 32

=== Processing Phase 3 ===
Flows: 35  | Alerts: 35

=== Processing Phase 4 ===
Flows: 20  | Alerts: 22

=== Processing Phase 5 ===
Flows: 73472  | Alerts: 33754

=== Saving final combined dataset ===
Saved → Scenario_One/inside/inside_labeled_flows_notebook.csv


## Analysis - Of the Labeled Attack Flows

Zeek aggregates packets into flows keyed by the 5-tuple (src IP, dst IP, src port, dst port, protocol), so several alerts from the original XML file may correspond to a single Zeek flow. As a result, the number of flows labeled as an attack in the resulting CSV file can be smaller than the number of alerts in the original XML file.

### Flow Aggregation Analysis

Number of packets in original PCAP files (manual counting):

phase 1 | phase 2 | phase 3 | phase 4 | phase 5 | total
--- | --- | --- | --- | --- | ---
40 | 158 | 225 | 521 | 74477 | 75421

In [49]:
# Number of flows after Zeek aggregation
total_number_of_flows = 0
for phase in range(1,6):
    flows_file = f"{INSIDE_FLOWS_FILE_DIR}/phase{phase}_flows.csv"
    flows_df = pd.read_csv(flows_file)
    num_flows = len(flows_df)
    total_number_of_flows += num_flows
    print(f"Phase {phase} - Number of flows: {num_flows}")

print(f"Total number of flows: {total_number_of_flows}")

Phase 1 - Number of flows: 20
Phase 2 - Number of flows: 22
Phase 3 - Number of flows: 35
Phase 4 - Number of flows: 20
Phase 5 - Number of flows: 73472
Total number of flows: 73569


### Number of Alerts in XML Files

In [50]:
def num_alerts(labels_file_dir):
    '''
    Print the number of <Alert> elements in each phase's label file and the total.

    Reads mid-level-phase-1.xml through mid-level-phase-5.xml from
    labels_file_dir. Nothing is returned; the counts are only printed.
    '''
    running_total = 0

    for phase_no in range(1, 6):
        xml_root = ET.parse(f"{labels_file_dir}/mid-level-phase-{phase_no}.xml").getroot()

        # every <Alert> element anywhere below the root counts as one alert
        alert_count = len(xml_root.findall('.//Alert'))
        running_total += alert_count

        print(f"--- Attack Phase {phase_no} ---")
        print(f"Number of alerts in labels file: {alert_count}\n")

    print("=== Overall Summary ===")
    print(f"Total number of alerts in labels files: {running_total}")

In [51]:
# Count the <Alert> entries in the inside label files (phases 1-5).
num_alerts(INSIDE_LABELS_FILE_DIR)

--- Attack Phase 1 ---
Number of alerts in labels file: 31

--- Attack Phase 2 ---
Number of alerts in labels file: 32

--- Attack Phase 3 ---
Number of alerts in labels file: 35

--- Attack Phase 4 ---
Number of alerts in labels file: 22

--- Attack Phase 5 ---
Number of alerts in labels file: 33754

=== Overall Summary ===
Total number of alerts in labels files: 33874


### Labeling Sanity Check

In [52]:
def labeling_sanity_check(flows_file_dir, labeled_flows_df):
    """
    Print, for phases 1-5, the flow count next to the labeled-flow count.

    The flow count is the number of rows in {flows_file_dir}/phase{N}_flows.csv;
    the labeled count is the number of rows in labeled_flows_df whose "phase"
    column equals N. Overall totals are printed at the end. Returns nothing.
    """
    flows_seen = 0
    labeled_seen = 0

    for phase_no in range(1, 6):
        # Number of flows
        flow_count = len(pd.read_csv(f"{flows_file_dir}/phase{phase_no}_flows.csv"))
        flows_seen += flow_count

        # Number of flows carrying this phase label
        labeled_count = (labeled_flows_df["phase"] == phase_no).sum()
        labeled_seen += labeled_count

        print(f"--- Attack Phase {phase_no} ---")
        print(f"Number of flows: {flow_count}")
        print(f"Number of alerts in flows file: {labeled_count}\n")

    print("=== Overall Summary ===")
    print(f"Total number of flows: {flows_seen}")
    print(f"Total number of alerts in processed flows file: {labeled_seen}")

In [53]:
# Reload the labeled inside flows from the CSV written by the build_dataset call above
# (same path as `output_path`).
labeled_flows_file_name = f"{INSIDE_DIR}/inside_labeled_flows_notebook.csv"
inside_labeled_flows_df = pd.read_csv(labeled_flows_file_name)
inside_labeled_flows_df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,952440700.0,952440700.0,0.0,202.77.162.213,8,172.16.112.149,0,icmp,-,...,-,1,38,0,0,-,1,29,1,1
1,f1,952440700.0,952440700.0,0.003379,202.77.162.213,8,172.16.112.105,0,icmp,-,...,-,1,38,1,38,-,1,27,1,1
2,f2,952440700.0,952440700.0,0.000153,202.77.162.213,8,172.16.112.100,0,icmp,-,...,-,1,38,1,38,-,1,25,1,1
3,f3,952440700.0,952440700.0,0.0,202.77.162.213,8,172.16.112.20,0,icmp,-,...,-,1,38,0,0,-,1,22,1,1
4,f4,952440700.0,952440700.0,5e-05,202.77.162.213,8,172.16.112.10,0,icmp,-,...,-,1,38,1,38,-,1,20,1,1


In [54]:
# Compare the per-phase flow counts with the number of flows labeled with each phase.
labeling_sanity_check(INSIDE_FLOWS_FILE_DIR, inside_labeled_flows_df)

--- Attack Phase 1 ---
Number of flows: 20
Number of alerts in flows file: 20

--- Attack Phase 2 ---
Number of flows: 22
Number of alerts in flows file: 22

--- Attack Phase 3 ---
Number of flows: 35
Number of alerts in flows file: 35

--- Attack Phase 4 ---
Number of flows: 20
Number of alerts in flows file: 19

--- Attack Phase 5 ---
Number of flows: 73472
Number of alerts in flows file: 33751

=== Overall Summary ===
Total number of flows: 73569
Total number of alerts in processed flows file: 33847


## Analysis - Final Labeled Dataset

In [55]:
def final_dataset_statistics(file_path):
    """
    Print per-phase labeled-flow counts and overall totals for a labeled flows CSV.

    The CSV at file_path must contain a "phase" column. For each phase 1-5 the
    number of rows with that phase value is printed, followed by the total row
    count and the sum of the per-phase counts. Returns nothing.
    """
    labeled = pd.read_csv(file_path)
    flow_total = len(labeled)

    labeled_total = 0
    for phase_no in range(1, 6):
        in_phase = labeled["phase"].eq(phase_no).sum()
        labeled_total += in_phase
        print(f"Number of alerts in phase {phase_no}: {in_phase}")

    print("=== Overall Summary ===")
    print(f"Total number of flows: {flow_total}")
    print(f"Total number of alerts in processed flows file: {labeled_total}")

### Scenario One

In [56]:
# Final statistics for inside labeled flows
labeled_flows_file_path = f"{INSIDE_DIR}/inside_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 20
Number of alerts in phase 2: 22
Number of alerts in phase 3: 35
Number of alerts in phase 4: 22
Number of alerts in phase 5: 33754
=== Overall Summary ===
Total number of flows: 125825
Total number of alerts in processed flows file: 33853


In [57]:
# Final statistics for dmz labeled flows
labeled_flows_file_path = f"{DMZ_DIR}/dmz_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 767
Number of alerts in phase 2: 25
Number of alerts in phase 3: 80
Number of alerts in phase 4: 19
Number of alerts in phase 5: 33909
=== Overall Summary ===
Total number of flows: 45441
Total number of alerts in processed flows file: 34800


### Scenario Two

In [58]:
# Final statistics for Scenario Two inside labeled flows
SCENARIO_TWO_INSIDE_DIR = "Scenario_Two/inside"
labeled_flows_file_path = f"{SCENARIO_TWO_INSIDE_DIR}/inside_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 2
Number of alerts in phase 2: 4
Number of alerts in phase 3: 2
Number of alerts in phase 4: 9
Number of alerts in phase 5: 5
=== Overall Summary ===
Total number of flows: 78775
Total number of alerts in processed flows file: 22


In [59]:
# Final statistics for Scenario Two dmz labeled flows
# No trailing slash on the directory constant: the f-string below adds the separator,
# which keeps the path free of a doubled "/" and matches the other *_DIR constants.
SCENARIO_TWO_DMZ_DIR = "Scenario_Two/dmz"
labeled_flows_file_path = f"{SCENARIO_TWO_DMZ_DIR}/dmz_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 2
Number of alerts in phase 2: 4
Number of alerts in phase 3: 2
Number of alerts in phase 4: 1
Number of alerts in phase 5: 1
=== Overall Summary ===
Total number of flows: 58666
Total number of alerts in processed flows file: 10
