# DARPA 2000 Dataset

In [6]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import pytz

from datetime import datetime

In [None]:
# File paths
# Base directories carry no trailing slash: every consumer builds paths as
# f"{BASE}/name", so a trailing slash here would yield "inside//name".
INSIDE_DIR = "../data/DARPA_2000/Scenario_One/inside"
INSIDE_LABELS_FILE_DIR = f"{INSIDE_DIR}/labels"
INSIDE_FLOWS_FILE_DIR = f"{INSIDE_DIR}/per_phase_flows"

DMZ_DIR = "../data/DARPA_2000/Scenario_One/dmz"
DMZ_LABELS_FILE_DIR = f"{DMZ_DIR}/labels"
DMZ_FLOWS_FILE_DIR = f"{DMZ_DIR}/per_phase_flows"

## Exploration - How to Label?

### XML Labels

In [8]:
# Note: The old processing logic.

def parse_idmef_xml(xml_path, phase):
    '''
    Parse a DARPA 2000 IDMEF XML file into one row per <Alert> element.

    Args:
        xml_path (str): Path to the IDMEF XML file.
        phase (int): Phase number of the attack; copied into every row.
    Returns:
        pd.DataFrame: One row per alert with the columns
            alert_id, alert, phase, start_time, duration, end_time,
            src_ip, sport, dst_ip, dport, proto.
            start_time / end_time are UTC epoch seconds. The columns are
            present even when the file contains no alerts, so downstream
            code can rely on the schema.
    '''
    columns = [
        "alert_id", "alert", "phase", "start_time", "duration", "end_time",
        "src_ip", "sport", "dst_ip", "dport", "proto",
    ]

    root = ET.parse(xml_path).getroot()
    alert_nodes = root.findall(".//Alert")
    if not alert_nodes:
        # pd.DataFrame([]) would have no columns at all; keep the schema stable.
        return pd.DataFrame(columns=columns)

    # Loop-invariant: build the timezone object once instead of once per alert.
    eastern = pytz.timezone("US/Eastern")

    alerts = []
    for alert in alert_nodes:

        alert_id = alert.get("alertid")

        # ---- Timestamp parsing ----
        date_str = alert.findtext(".//Time/date")
        time_str = alert.findtext(".//Time/time")
        duration_str = alert.findtext(".//Time/sessionduration")

        # From US/Eastern local time -> UTC
        start_dt = eastern.localize(
            datetime.strptime(f"{date_str} {time_str}", "%m/%d/%Y %H:%M:%S")
        ).astimezone(pytz.UTC)
        start_ts = start_dt.timestamp()

        # sessionduration is "HH:MM:SS"; a missing/empty element is treated as
        # a zero-length alert instead of crashing on None.split().
        if duration_str:
            h, m, s = map(int, duration_str.split(":"))
            end_ts = start_ts + h*3600 + m*60 + s
        else:
            end_ts = start_ts

        # ---- IPs ----
        src_ip = alert.findtext(".//Source//address")
        dst_ip = alert.findtext(".//Target//address")

        # ---- Ports ----
        sport = alert.findtext(".//Target/Service/sport")
        dport = alert.findtext(".//Target/Service/dport")

        # ---- Protocol ----
        # The service name is lower-cased once so that icmp, tcp and udp are
        # all matched case-insensitively.
        service_name = alert.findtext(".//Target/Service/name")
        if service_name is None:
            proto = None
        else:
            service_key = service_name.lower()
            if service_key.startswith("icmp"):
                proto = "icmp"
            elif service_key in ("tcp", "udp"):
                proto = service_key
            else:
                proto = "other"

        alerts.append({
            "alert_id": int(alert_id),
            "alert": 1,
            "phase": phase,
            "start_time": start_ts,
            "duration": end_ts - start_ts,
            "end_time": end_ts,
            "src_ip": src_ip,
            # Missing or empty port elements are stored as 0.
            "sport": int(sport) if sport else 0,
            "dst_ip": dst_ip,
            "dport": int(dport) if dport else 0,
            "proto": proto
        })

    return pd.DataFrame(alerts, columns=columns)

In [9]:
# Attack phase whose label XML and flow CSV are loaded by the exploration cells below.
phase = 4

In [10]:
# Parse the inside-network label file of the selected phase into a DataFrame.
inside_alerts_df = parse_idmef_xml(f"{INSIDE_LABELS_FILE_DIR}/mid-level-phase-{phase}.xml", phase=phase)

# Optional export of the parsed alerts (disabled):
# # Save to CSV
# output_path = f"{INSIDE_LABELS_FILE_DIR}/alerts_phase{phase}.csv"
# inside_alerts_df.to_csv(output_path, index=False)
# print("Saved to:", output_path)

# Report how many alerts were parsed, then show the first rows.
num_alerts = len(inside_alerts_df)
print(f"Number of alerts: {num_alerts}\n")
inside_alerts_df.head()

Number of alerts: 22



Unnamed: 0,alert_id,alert,phase,start_time,duration,end_time,src_ip,sport,dst_ip,dport,proto
0,1,1,4,952444201.0,5.0,952444206.0,202.77.162.213,47496,172.16.115.20,23,tcp
1,2,1,4,952444202.0,0.0,952444202.0,172.16.115.20,1023,202.77.162.213,514,tcp
2,3,1,4,952444202.0,1.0,952444203.0,172.16.115.20,1022,202.77.162.213,514,tcp
3,4,1,4,952444203.0,1.0,952444204.0,172.16.115.20,1022,202.77.162.213,514,tcp
4,5,1,4,952444204.0,1.0,952444205.0,172.16.115.20,1021,202.77.162.213,514,tcp


In [11]:
# Comparing with flows
# Load the per-phase flow CSV for the same phase so it can be compared with the alerts.
inside_flows_df = pd.read_csv(
    f"{INSIDE_FLOWS_FILE_DIR}/phase{phase}_flows.csv",
)

# Report the number of flows and preview the first rows.
num_flows = len(inside_flows_df)
print(f"Number of flows: {num_flows}\n")
print(inside_flows_df.head())
print()

Number of flows: 20

  flow_id    start_time      end_time  duration          src_ip  sport  \
0      f0  9.524442e+08  9.524442e+08  1.235002   172.16.115.20   1022   
1      f1  9.524442e+08  9.524442e+08  0.012167  202.77.162.213  47014   
2      f2  9.524442e+08  9.524442e+08  0.184623   172.16.115.20   1023   
3      f3  9.524442e+08  9.524442e+08  1.112859   172.16.115.20   1022   
4      f4  9.524442e+08  9.524442e+08  1.253746   172.16.115.20   1021   

           dst_ip  dport proto service  orig_bytes  resp_bytes  orig_pkts  \
0  202.77.162.213    514   tcp       -          75       44201         26   
1   172.16.112.50     23   tcp       -           0           0          2   
2  202.77.162.213    514   tcp       -          86          52          7   
3  202.77.162.213    514   tcp       -          86          52          7   
4  202.77.162.213    514   tcp       -          75       59373         31   

   resp_pkts conn_state local_orig local_resp  
0         38         SF

### Debugging Individual Flow-Alert Pair

In [12]:
# Select one (flow, alert) pair to debug. `flow` indexes a row of
# inside_flows_df and `alert` a row of inside_alerts_df; the cells below
# read both variables. Earlier pairs are kept commented out for reference.

# Set phase to 1
# print(phase==1)
# flow = 3
# alert = 0

# Set phase to 1
# print(phase==1)
# flow = 5
# alert = 3

# Set phase to 4
# Row of the flow under inspection, copied from the labeled CSV:
# f19,952444207.044505,952444207.044856,0.000351,172.16.115.20,1021,202.77.162.213,1022,tcp,-,0,0,2,1,S1,T,F,1,1,4
print(phase==4)  # prints True when the notebook-level `phase` matches this pair
flow = 19
alert = 6

True


In [13]:
# Making sure that the values are comparable
# Prints the scalar type of every field that the matching logic compares,
# for the selected flow row and alert row.

# This heading used to read "IP types:" although the values are timestamps.
print("Time types:")
print(type(inside_alerts_df["start_time"][alert]))
print(type(inside_flows_df["start_time"][flow]))
print(type(inside_alerts_df["end_time"][alert]))
print(type(inside_flows_df["end_time"][flow]))
print()

print("Port types:")
print(type(inside_alerts_df["sport"][alert]))
print(type(inside_flows_df["sport"][flow]))
print(type(inside_alerts_df["dport"][alert]))
print(type(inside_flows_df["dport"][flow]))
print()

print("IP types:")
print(type(inside_alerts_df["src_ip"][alert]))
print(type(inside_flows_df["src_ip"][flow]))
print(type(inside_alerts_df["dst_ip"][alert]))
print(type(inside_flows_df["dst_ip"][flow]))
print()

IP types:
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>

Port types:
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.int64'>

IP types:
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>



In [14]:
# Step-by-step debugging
# Walks through the matching criteria (IPs, ports, start second) for the
# single flow row `flow` and alert row `alert` chosen above, printing every
# intermediate result.

flow_id = inside_flows_df.at[flow, "flow_id"]
alert_id = inside_alerts_df.at[alert, "alert_id"]
print(f"Comparing flow {flow_id} and alert {alert_id}:\n")

# ---- IP matching ----
print("IP matching:")

f_sip = inside_flows_df.at[flow, "src_ip"]
f_dip = inside_flows_df.at[flow, "dst_ip"]
a_sip = inside_alerts_df.at[alert, "src_ip"]
a_dip = inside_alerts_df.at[alert, "dst_ip"]

print("Flow: Source and destination IPs:")
print(f_sip, f_dip)
print("Alert: Source and destination IPs:")
print(a_sip, a_dip)

print("Source IPs equal:", f_sip == a_sip)
print("Destination IPs equal:", f_dip == a_dip)
print(f_dip)
print(a_dip)

# The alert may describe the connection in either orientation.
same_direction = (f_sip == a_sip) & (f_dip == a_dip)
reverse_direction = (f_sip == a_dip) & (f_dip == a_sip)
print("Same direction:", same_direction)
print("Reverse direction:", reverse_direction)
print()

# ---- Port matching ----
print("Port matching:")
f_sport = inside_flows_df.at[flow, "sport"]
a_sport = inside_alerts_df.at[alert, "sport"]
f_dport = inside_flows_df.at[flow, "dport"]
a_dport = inside_alerts_df.at[alert, "dport"]
print("Flow: Source and destination ports:")
print(f_sport, f_dport)
print("Alert: Source and destination ports:")
print(a_sport, a_dport)

# When both sides are ICMP the port comparison is bypassed.
flow_is_icmp = inside_flows_df.at[flow, "proto"] == "icmp"
alert_is_icmp = inside_alerts_df.at[alert, "proto"] == "icmp"
icmp_match = flow_is_icmp & alert_is_icmp

ports_equal = (f_sport == a_sport) & (f_dport == a_dport)
ports_swapped = (f_dport == a_sport) & (f_sport == a_dport)
port_same_dir = ports_equal | icmp_match
port_reverse_dir = ports_swapped | icmp_match

print("ICMP protocol match:", icmp_match)
print("Ports match same direction:", port_same_dir)
print("Ports match reverse direction:", port_reverse_dir)
print()

# ---- Time overlap ----
print("Time overlap:")
f_start = inside_flows_df.at[flow, "start_time"]
f_end = inside_flows_df.at[flow, "end_time"]
a_start = inside_alerts_df.at[alert, "start_time"]
a_end = inside_alerts_df.at[alert, "end_time"]
print("Flow: Start and end time:")
print(f_start, f_end)
print("Alert: Start and end time:")
print(a_start, a_end)

# Flow and alert must start within the same whole second.
time_match = np.floor(f_start) == np.floor(a_start)
print("Time match:", time_match)
print()

# ---- Final matching ----
print("Final matching:")
forward_hit = same_direction & port_same_dir
backward_hit = reverse_direction & port_reverse_dir
mask = (forward_hit | backward_hit) & time_match
print(mask)

Comparing flow f19 and alert 7:

IP matching:
Flow: Source and destination IPs:
172.16.115.20 202.77.162.213
Alert: Source and destination IPs:
202.77.162.213 172.16.115.20
Source IPs equal: False
Destination IPs equal: False
202.77.162.213
172.16.115.20
Same direction: False
Reverse direction: True

Port matching:
Flow: Source and destination ports:
1021 1022
Alert: Source and destination ports:
1022 1021
ICMP protocol match: False
Ports match same direction: False
Ports match reverse direction: True

Time overlap:
Flow: Start and end time:
952444207.044505 952444207.044856
Alert: Start and end time:
952444207.0 952444207.0
Time match: True

Final matching:
True


### Labeling Flows

In [15]:
# Note: Again the old logic.

def merge_flows_labels(flows_df, labels_df):
    '''
    Label Zeek flows with IDMEF alerts.

    Every flow is compared against every alert (cross join). A flow matches
    an alert when all of the following hold:
      * the (src_ip, dst_ip) pair is identical or swapped,
      * the (sport, dport) pair agrees in that same orientation; ports are
        ignored when both the flow and the alert are ICMP,
      * flow and alert start within the same whole second
        (floor of start_time) - this is not a full interval-overlap test.

    Args:
        flows_df (pd.DataFrame): Zeek flow data; needs the columns start_time,
            src_ip, sport, dst_ip, dport and proto.
        labels_df (pd.DataFrame): IDMEF alert data as produced by
            parse_idmef_xml. May be empty (with or without columns).
    Returns:
        pd.DataFrame: One row per input flow, in input order, with the flow
            columns plus attack_id, attack and attack_phase (all 0 when no
            alert matched). The inputs are not modified.
    '''
    label_columns = ["attack_id", "attack", "attack_phase"]

    # Without alerts (or flows) the cross join is empty, which would silently
    # drop every flow - or raise KeyError when labels_df has no columns.
    # Everything is unlabeled in that case.
    if len(labels_df) == 0 or len(flows_df) == 0:
        unlabeled = flows_df.reset_index(drop=True).copy()
        for column in label_columns:
            unlabeled[column] = 0
        return unlabeled

    # Positional helper id so we can reduce back to one row per original flow.
    # Built from the row position rather than the input index, so a named or
    # non-unique index cannot break or merge flows.
    flows_with_idx = flows_df.reset_index(drop=True)
    flows_with_idx.insert(0, "_flow_idx", range(len(flows_with_idx)))

    # Cross-join flows and labels; alert columns that clash with flow columns
    # receive the "_label" suffix.
    merged = flows_with_idx.merge(labels_df, how="cross", suffixes=("", "_label"))

    # ==== Matching criteria ====

    # IP matching: same or reverse
    same_direction = (
        (merged["src_ip"] == merged["src_ip_label"]) &
        (merged["dst_ip"] == merged["dst_ip_label"])
    )
    reverse_direction = (
        (merged["src_ip"] == merged["dst_ip_label"]) &
        (merged["dst_ip"] == merged["src_ip_label"])
    )

    # Port matching (skipped for ICMP <-> ICMP)
    icmp_match = (
        (merged["proto"] == "icmp") &
        (merged["proto_label"] == "icmp")
    )
    port_same_dir = (
        (merged["sport"] == merged["sport_label"]) &
        (merged["dport"] == merged["dport_label"])
    ) | icmp_match
    port_reverse_dir = (
        (merged["dport"] == merged["sport_label"]) &
        (merged["sport"] == merged["dport_label"])
    ) | icmp_match

    # Same start second
    time_match = (
        np.floor(merged["start_time"]) == np.floor(merged["start_time_label"])
    )

    # Final matching
    mask = ((same_direction & port_same_dir) | (reverse_direction & port_reverse_dir)) & time_match

    # Label columns: alert values where the pair matched, 0 otherwise
    merged["attack_id"] = merged["alert_id"].where(mask, 0)
    merged["attack"] = merged["alert"].where(mask, 0)
    merged["attack_phase"] = merged["phase"].where(mask, 0)

    # For each original flow, keep a single row. Prefer rows where attack==1.
    merged = merged.sort_values(["_flow_idx", "attack"], ascending=[True, False])
    dedup = merged.drop_duplicates(subset=["_flow_idx"], keep="first")

    # Restore original flow column order, then drop the helper column
    flow_columns = flows_with_idx.columns.tolist()
    labeled_flows = dedup[flow_columns + label_columns].copy()
    labeled_flows = labeled_flows.drop(columns=["_flow_idx"]).reset_index(drop=True)

    return labeled_flows

In [16]:
def build_dataset(flows_dir, labels_dir, out_csv, phases=range(1, 5)):
    '''
    Build the labeled dataset by merging Zeek flows with IDMEF labels.

    For every phase, reads "<flows_dir>/phase<N>_flows.csv" and
    "<labels_dir>/mid-level-phase-<N>.xml", labels the flows, then
    concatenates all phases and writes them to a single CSV.

    Args:
        flows_dir (str or Path): Directory containing Zeek flow CSVs.
        labels_dir (str or Path): Directory containing IDMEF XML labels.
        out_csv (str or Path): Path to save the final labeled dataset CSV.
        phases (iterable of int): Phase numbers to process. Defaults to
            phases 1-4; phase 5 is left out by default because it is not
            possible to process it in the notebook.
    Returns:
        None
    '''
    all_labeled = []
    for phase in phases:
        print(f"Processing Phase {phase} ...")

        # Load flows
        flows_file = f"{flows_dir}/phase{phase}_flows.csv"
        flows_df = pd.read_csv(flows_file)

        # Load XML labels
        xml_file = f"{labels_dir}/mid-level-phase-{phase}.xml"
        labels_df = parse_idmef_xml(xml_file, phase)

        # Merge
        labeled = merge_flows_labels(flows_df, labels_df)
        all_labeled.append(labeled)

    print("Finished processing all phases. Combining and saving dataset...")
    final_df = pd.concat(all_labeled, ignore_index=True)
    final_df.to_csv(out_csv, index=False)
    print(f"Saved dataset to: {out_csv}")


In [17]:
# Label the inside-network flows and write the combined result to CSV.
output_path = f"{INSIDE_DIR}/inside_labeled_flows_notebook.csv"
build_dataset(INSIDE_FLOWS_FILE_DIR, INSIDE_LABELS_FILE_DIR, output_path)

Processing Phase 1 ...
Processing Phase 2 ...
Processing Phase 3 ...
Processing Phase 4 ...
Finished processing all phases. Combining and saving dataset...
Saved dataset to: ../data/DARPA_2000/Scenario_One/inside//inside_labeled_flows_notebook.csv


## Analysis - Of the Labeled Attack Flows

Zeek aggregates packets into flows based on the 5-tuple (src IP, dst IP, src port, dst port, protocol), so several alerts from the original XML file may correspond to a single Zeek flow. As a result, the number of flows labeled as attacks in the resulting CSV file can be smaller than the number of alerts in the original XML file.

### Flow Aggregation Analysis

Number of packets in original PCAP files (manual counting):

phase 1 | phase 2 | phase 3 | phase 4 | phase 5 | total
--- | --- | --- | --- | --- | ---
40 | 158 | 225 | 521 | 74477 | 75421

In [18]:
# Number of flows after Zeek aggregation
# The loop uses its own variable names so that running this cell does not
# overwrite the notebook-level `phase` and `num_flows` set by the
# exploration cells above.
total_number_of_flows = 0
for phase_no in range(1, 6):
    phase_flows_file = f"{INSIDE_FLOWS_FILE_DIR}/phase{phase_no}_flows.csv"
    phase_flow_count = len(pd.read_csv(phase_flows_file))
    total_number_of_flows += phase_flow_count
    print(f"Phase {phase_no} - Number of flows: {phase_flow_count}")

print(f"Total number of flows: {total_number_of_flows}")

Phase 1 - Number of flows: 20
Phase 2 - Number of flows: 22
Phase 3 - Number of flows: 35
Phase 4 - Number of flows: 20
Phase 5 - Number of flows: 73472
Total number of flows: 73569


### Number of Alerts in XML Files

In [19]:
def num_alerts(labels_file_dir):
    '''
    Print the number of <Alert> elements in each phase's label file
    (mid-level-phase-1.xml ... mid-level-phase-5.xml inside
    labels_file_dir), followed by the total over all five phases.
    '''
    per_phase_counts = []
    for phase_no in range(1, 6):
        xml_path = f"{labels_file_dir}/mid-level-phase-{phase_no}.xml"

        # Every <Alert> element below the document root counts as one alert
        alert_nodes = ET.parse(xml_path).getroot().findall('.//Alert')
        per_phase_counts.append(len(alert_nodes))

        print(f"--- Attack Phase {phase_no} ---")
        print(f"Number of alerts in labels file: {per_phase_counts[-1]}\n")

    print("=== Overall Summary ===")
    print(f"Total number of alerts in labels files: {sum(per_phase_counts)}")

In [20]:
# Count the alerts in the inside-network label files, per phase and in total.
num_alerts(INSIDE_LABELS_FILE_DIR)

--- Attack Phase 1 ---
Number of alerts in labels file: 31

--- Attack Phase 2 ---
Number of alerts in labels file: 32

--- Attack Phase 3 ---
Number of alerts in labels file: 35

--- Attack Phase 4 ---
Number of alerts in labels file: 22

--- Attack Phase 5 ---
Number of alerts in labels file: 33754

=== Overall Summary ===
Total number of alerts in labels files: 33874


### Labeling Sanity Check

In [21]:
def labeling_sanity_check(flows_file_dir, labeled_flows_df):
    '''
    Print, for phases 1-5, the number of rows in
    "<flows_file_dir>/phase<N>_flows.csv" next to the number of rows of
    labeled_flows_df whose attack_phase equals N, followed by both totals.
    '''
    flow_total = 0
    alert_total = 0

    for phase_no in range(1, 6):
        # Flows recorded for this phase
        phase_flow_count = len(pd.read_csv(f"{flows_file_dir}/phase{phase_no}_flows.csv"))
        flow_total += phase_flow_count

        # Labeled flows attributed to this phase
        phase_alert_count = labeled_flows_df["attack_phase"].eq(phase_no).sum()
        alert_total += phase_alert_count

        print(f"--- Attack Phase {phase_no} ---")
        print(f"Number of flows: {phase_flow_count}")
        print(f"Number of alerts in flows file: {phase_alert_count}\n")

    print("=== Overall Summary ===")
    print(f"Total number of flows: {flow_total}")
    print(f"Total number of alerts in processed flows file: {alert_total}")

In [22]:
# Reload the labeled flows written by build_dataset and preview the first rows.
labeled_flows_file_name = f"{INSIDE_DIR}/inside_labeled_flows_notebook.csv"
inside_labeled_flows_df = pd.read_csv(labeled_flows_file_name)
inside_labeled_flows_df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,attack_phase
0,f0,952440700.0,952440700.0,0.0,202.77.162.213,8,172.16.112.149,0,icmp,-,0,0,1,0,OTH,F,T,29,1,1
1,f1,952440700.0,952440700.0,0.003379,202.77.162.213,8,172.16.112.105,0,icmp,-,10,10,1,1,OTH,F,T,27,1,1
2,f2,952440700.0,952440700.0,0.000153,202.77.162.213,8,172.16.112.100,0,icmp,-,10,10,1,1,OTH,F,T,25,1,1
3,f3,952440700.0,952440700.0,0.0,202.77.162.213,8,172.16.112.20,0,icmp,-,0,0,1,0,OTH,F,T,22,1,1
4,f4,952440700.0,952440700.0,5e-05,202.77.162.213,8,172.16.112.10,0,icmp,-,10,10,1,1,OTH,F,T,20,1,1


In [23]:
# Compare per-phase flow counts with the number of flows labeled for that phase.
labeling_sanity_check(INSIDE_FLOWS_FILE_DIR, inside_labeled_flows_df)

--- Attack Phase 1 ---
Number of flows: 20
Number of alerts in flows file: 20

--- Attack Phase 2 ---
Number of flows: 22
Number of alerts in flows file: 22

--- Attack Phase 3 ---
Number of flows: 35
Number of alerts in flows file: 35

--- Attack Phase 4 ---
Number of flows: 20
Number of alerts in flows file: 19

--- Attack Phase 5 ---
Number of flows: 73472
Number of alerts in flows file: 0

=== Overall Summary ===
Total number of flows: 73569
Total number of alerts in processed flows file: 96


## Analysis - Final Labeled Dataset

In [24]:
def final_dataset_statistics(file_path):
    '''
    Read a labeled-flows CSV and print, for phases 1-5, how many rows have
    that value in the "phase" column, followed by the total row count and
    the sum of the five per-phase counts.
    '''
    labeled = pd.read_csv(file_path)
    phase_column = labeled["phase"]

    per_phase_counts = []
    for phase_no in range(1, 6):
        count = (phase_column == phase_no).sum()
        per_phase_counts.append(count)
        print(f"Number of alerts in phase {phase_no}: {count}")

    print("=== Overall Summary ===")
    print(f"Total number of flows: {len(labeled)}")
    print(f"Total number of alerts in processed flows file: {sum(per_phase_counts)}")

In [25]:
# Final statistics for inside labeled flows
# Reads inside_labeled_flows_all.csv, which is not written by this notebook.
labeled_flows_file_path = f"{INSIDE_DIR}/inside_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 20
Number of alerts in phase 2: 22
Number of alerts in phase 3: 35
Number of alerts in phase 4: 22
Number of alerts in phase 5: 33754
=== Overall Summary ===
Total number of flows: 125825
Total number of alerts in processed flows file: 33853


In [26]:
# Final statistics for dmz labeled flows
# Reads dmz_labeled_flows_all.csv, which is not written by this notebook.
labeled_flows_file_path = f"{DMZ_DIR}/dmz_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 767
Number of alerts in phase 2: 25
Number of alerts in phase 3: 80
Number of alerts in phase 4: 19
Number of alerts in phase 5: 33909
=== Overall Summary ===
Total number of flows: 45441
Total number of alerts in processed flows file: 34800


In [None]:
# For Scenario Two dataset
# Statistics over all labeled inside flows of Scenario Two.
# SCENARIO_TWO_INSIDE_DIR is also used by the next cell.
SCENARIO_TWO_INSIDE_DIR = "../data/DARPA_2000/Scenario_Two/inside"
labeled_flows_file_path = f"{SCENARIO_TWO_INSIDE_DIR}/inside_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 2
Number of alerts in phase 2: 4
Number of alerts in phase 3: 2
Number of alerts in phase 4: 9
Number of alerts in phase 5: 5
=== Overall Summary ===
Total number of flows: 78775
Total number of alerts in processed flows file: 22


In [30]:
# Statistics for the Scenario Two inside "attack" CSV.
# Relies on SCENARIO_TWO_INSIDE_DIR defined in the previous cell.
labeled_flows_file_path = f"{SCENARIO_TWO_INSIDE_DIR}/inside_labeled_flows_attack.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 2
Number of alerts in phase 2: 4
Number of alerts in phase 3: 2
Number of alerts in phase 4: 9
Number of alerts in phase 5: 5
=== Overall Summary ===
Total number of flows: 22
Total number of alerts in processed flows file: 22


In [31]:
# Statistics over all labeled DMZ flows of Scenario Two.
# No trailing slash on the directory: the path below already inserts "/",
# and this matches how SCENARIO_TWO_INSIDE_DIR is written.
SCENARIO_TWO_DMZ_DIR = "../data/DARPA_2000/Scenario_Two/dmz"
labeled_flows_file_path = f"{SCENARIO_TWO_DMZ_DIR}/dmz_labeled_flows_all.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 2
Number of alerts in phase 2: 4
Number of alerts in phase 3: 2
Number of alerts in phase 4: 1
Number of alerts in phase 5: 1
=== Overall Summary ===
Total number of flows: 58666
Total number of alerts in processed flows file: 10


In [32]:
# Statistics for the Scenario Two DMZ "attack" CSV.
# No trailing slash on the directory: the path below already inserts "/",
# and this matches how SCENARIO_TWO_INSIDE_DIR is written.
SCENARIO_TWO_DMZ_DIR = "../data/DARPA_2000/Scenario_Two/dmz"
labeled_flows_file_path = f"{SCENARIO_TWO_DMZ_DIR}/dmz_labeled_flows_attack.csv"
final_dataset_statistics(labeled_flows_file_path)

Number of alerts in phase 1: 2
Number of alerts in phase 2: 4
Number of alerts in phase 3: 2
Number of alerts in phase 4: 1
Number of alerts in phase 5: 1
=== Overall Summary ===
Total number of flows: 10
Total number of alerts in processed flows file: 10
