# Data Prep

In [656]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump
from scipy.sparse import save_npz

In [657]:
# Random seed passed as `random_state` to train_test_split so the splits are reproducible.
SEED = 123

## Preprocessing Functions

In [658]:
def encode_ip_none(df):
    """Return the frame without its IP address columns.

    The result is a ``(frame, ip_feature_cols)`` pair; ``ip_feature_cols`` is
    ``None`` because this encoder produces no IP-derived features. The input
    frame is left untouched.
    """
    without_ips = df.copy().drop(columns=["src_ip", "dst_ip"])
    return without_ips, None

def encode_ip_integer(df):
    """Convert IPv4 x.x.x.x to an integer: a*256^3 + b*256^2 + c*256 + d.

    Values that cannot be parsed as four octets in the range 0-255 (missing
    values, malformed strings, out-of-range octets) are encoded as 0.

    Args:
        df: DataFrame containing "src_ip" and "dst_ip" columns.
    Returns:
        A copy of ``df`` in which "src_ip"/"dst_ip" are replaced by
        "src_ip_int"/"dst_ip_int", and the list of those new column names.
    """
    def ip_to_int(ip):
        try:
            a, b, c, d = map(int, ip.split("."))
        except (AttributeError, TypeError, ValueError):
            # Non-string input (e.g. NaN/None), wrong number of octets,
            # or an octet that is not an integer.
            return 0
        if not all(0 <= octet <= 255 for octet in (a, b, c, d)):
            # Out-of-range octets would otherwise overlap other addresses.
            return 0
        return (a << 24) + (b << 16) + (c << 8) + d

    df = df.copy()
    df["src_ip_int"] = df["src_ip"].apply(ip_to_int)
    df["dst_ip_int"] = df["dst_ip"].apply(ip_to_int)

    return df.drop(columns=["src_ip", "dst_ip"]), ["src_ip_int", "dst_ip_int"]

# Registry of supported IP encodings: encoding name -> encoder function.
# Each encoder returns a (frame, ip_feature_cols) pair.
IP_ENCODERS = dict(
    none=encode_ip_none,
    integer=encode_ip_integer,
)

In [659]:
def prepare_data(df, feature_set, ip_encoding="none"):
    '''
    Prepare data by encoding IP addresses and filtering on selected features.

    Args:
        df: Input DataFrame with flow data. It must contain the metadata
            columns "flow_id", "attack_id", "phase" and "attack" (dropped
            here) and the "src_ip"/"dst_ip" columns used by the IP encoders.
        feature_set: List of features to retain.
        ip_encoding: IP address encoding method; must be a key of
            IP_ENCODERS ("none" or "integer").
    Returns:
        df: Prepared DataFrame with the selected features, followed by any
            IP feature columns produced by the encoder.
        numeric_cols: List of numeric feature column names, including the
            IP feature columns.
        categorical_cols: List of categorical feature column names.
        ip_feature_cols: List of IP feature column names (empty when the
            encoder produces none).
    Raises:
        ValueError: If ip_encoding is not a key of IP_ENCODERS.
    '''
    # Drop metadata fields
    df = df.drop(columns=["flow_id", "attack_id", "phase", "attack"])

    # Handle IP address fields
    if ip_encoding not in IP_ENCODERS:
        raise ValueError(f"Unknown IP encoding: {ip_encoding}")

    df, ip_feature_cols = IP_ENCODERS[ip_encoding](df)
    ip_feature_cols = list(ip_feature_cols) if ip_feature_cols else []

    # Categorical features
    categorical_cols = [
        "proto",      # categorical: "tcp", "udp", "icmp", etc.
        "service",    # categorical: "http", "ftp", "dns", etc.
        "conn_state", # categorical: "S0", "S1", "SF", etc.
        "local_orig", # binary flags
        "local_resp", # binary flags
    ]

    # Numerical features
    numeric_cols = [
        "start_time",
        "end_time",
        "duration",
        "sport",
        "dport",
        "orig_bytes",
        "resp_bytes",
        "orig_pkts",
        "resp_pkts",
    ]

    # Accept any sequence of names and make membership tests cheap.
    feature_set = list(feature_set)
    selected = set(feature_set)

    # Filter on features; IP feature columns are always kept (without
    # duplicating one that the caller already listed in feature_set).
    extra_ip_cols = [col for col in ip_feature_cols if col not in selected]
    df = df[feature_set + extra_ip_cols]
    categorical_cols = [col for col in categorical_cols if col in selected]
    # IP feature columns are not part of feature_set, so they are appended
    # after the filter; filtering them by feature_set would leave them in the
    # frame but absent from every column list handed to the transformer.
    numeric_cols = [col for col in numeric_cols if col in selected] + ip_feature_cols

    return df, numeric_cols, categorical_cols, ip_feature_cols

In [660]:
def construct_pipeline(numeric_cols, categorical_cols):
    '''
    Build the preprocessing pipeline: scale the numerical columns and
    one-hot encode the categorical columns.
    Args:
        numeric_cols: List of numerical feature column names.
        categorical_cols: List of categorical feature column names.
    Returns:
        pipeline: sklearn Pipeline with a single "transform" step wrapping
            the ColumnTransformer.
    '''
    # with_mean=False: scale without centering.
    scaler = StandardScaler(with_mean=False)
    # handle_unknown="ignore": categories not seen during fit do not raise at transform time.
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

    column_steps = [
        ("numerical", scaler, numeric_cols),
        ("categorical", encoder, categorical_cols),
    ]
    # sparse_threshold=1.0: per the scikit-learn docs the stacked output is
    # sparse when some transformer emits sparse data and the overall density
    # is below this threshold.
    transformer = ColumnTransformer(transformers=column_steps, sparse_threshold=1.0)

    return Pipeline(steps=[("transform", transformer)])

In [661]:
def save_processed_data(output_dir, X_train, y_train, X_test, y_test, pipeline, numeric_cols, categorical_cols, ip_encoding):
    '''
    Save processed data and preprocessing pipeline to disk.

    Files written into output_dir: y_train.npy, y_test.npy, X_train.npz,
    X_test.npz, feature_pipeline.joblib and feature_info.txt.

    Args:
        output_dir: Directory to save processed data (created if missing).
        X_train: Processed training feature matrix (sparse, or dense array-like).
        y_train: Training labels.
        X_test: Processed testing feature matrix (sparse, or dense array-like).
        y_test: Testing labels.
        pipeline: Preprocessing pipeline.
        numeric_cols: List of numerical feature column names.
        categorical_cols: List of categorical feature column names.
        ip_encoding: IP encoding method used.
    Returns:
        None
    '''
    # Local import: only needed for the dense-matrix fallback below.
    from scipy.sparse import csr_matrix, issparse

    os.makedirs(output_dir, exist_ok=True)

    np.save(os.path.join(output_dir, "y_train.npy"), y_train)
    np.save(os.path.join(output_dir, "y_test.npy"), y_test)

    for file_name, matrix in (("X_train.npz", X_train), ("X_test.npz", X_test)):
        # save_npz only accepts SciPy sparse matrices, but the preprocessing
        # pipeline can return a dense array (e.g. when no transformer emits
        # sparse output), so coerce dense input to CSR first.
        if not issparse(matrix):
            matrix = csr_matrix(matrix)
        save_npz(os.path.join(output_dir, file_name), matrix)

    dump(pipeline, os.path.join(output_dir, "feature_pipeline.joblib"))

    # Explicit encoding so the file does not depend on the platform default.
    with open(os.path.join(output_dir, "feature_info.txt"), "w", encoding="utf-8") as f:
        f.write("Numerical features:\n")
        f.write(str(numeric_cols) + "\n\n")
        f.write("Categorical features:\n")
        f.write(str(categorical_cols) + "\n\n")
        f.write(f"IP encoding: {ip_encoding}\n")

    print(f"Saved X, y, and preprocessing pipeline to {output_dir}/")

## Choice of Features

In [662]:
# IP address encoding passed to prepare_data; must be a key of IP_ENCODERS ("none" or "integer").
ip_encoding = "none"
# ip_encoding = "integer"

Possible Zeek features to consider:
- Time-based features: start_time, end_time, duration
- Ports: sport and dport
- Protocol: proto
- Service: service
- Connection state: conn_state
- Packet-based features: orig_bytes, resp_bytes, orig_pkts, resp_pkts
- Origin: local_orig, local_resp

In [663]:
# Exactly one `feature_set` assignment below should be active (uncommented).

# time, port, protocol and packet features
# (service, conn_state, local_orig and local_resp are not part of any set listed here)
# feature_set = ["start_time", "end_time", "duration", "sport", "dport", "proto", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts"]

# without time (start_time, end_time); duration is kept
feature_set = ["duration", "sport", "dport", "proto", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts"]

# without ports (sport, dport)
# feature_set = ["start_time", "end_time", "duration", "proto", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts"]

# without time and ports
# feature_set = ["duration", "proto", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts"]



## Split Mode: inside

### Scenario One

In [664]:
# Directory that save_processed_data writes the Scenario One inside-split artifacts to.
output_dir = "processed_data/inside_split/scenario_one"

In [665]:
# Load the labeled Scenario One "inside" flows and preview the first rows
SCENARIO_ONE_INSIDE_CSV = "../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv"
df = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


In [666]:
# Split the flows 80/20 into train/test, stratified on the "phase" column
train_size = 0.8
test_size = 1 - train_size
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df["phase"],
    random_state=SEED,
)

print(f"Train set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")

Train set shape: (100660, 20)
Test set shape: (25165, 20)


In [667]:
# Column lists come from the training call; the test call only contributes its feature frame.
df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(df_train, feature_set, ip_encoding=ip_encoding)
df_test_features, _, _, _ = prepare_data(df_test, feature_set, ip_encoding=ip_encoding)

print(f"Numeric cols: {numeric_cols}")
print(f"Categorical cols: {categorical_cols}")
print(f"IP feature cols: {ip_feature_cols}")

df_train_features.head()

Numeric cols: ['duration', 'sport', 'dport', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto']
IP feature cols: []


Unnamed: 0,duration,sport,dport,proto,orig_bytes,resp_bytes,orig_pkts,resp_pkts
56074,0.0,12786,24344,tcp,0,0,1,0
16363,0.000979,3047,53,udp,156,131,2,1
67672,0.0,15983,2719,tcp,0,0,1,0
22914,0.000851,1343,53,udp,44,131,1,1
124029,0.031023,3110,80,tcp,232,1922,5,5


In [668]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the pipeline on the training features only
y_train = df_train["attack"]
X_train = pipeline.fit_transform(df_train_features)

print("--- Training Data ---")
print(f"Feature matrix shape: {X_train.shape}")
print(f"Labels shape: {y_train.shape}")

# Apply the already-fitted pipeline to the test features
y_test = df_test["attack"]
X_test = pipeline.transform(df_test_features)

print("--- Test Data ---")
print(f"Feature matrix shape: {X_test.shape}")
print(f"Labels shape: {y_test.shape}")

--- Training Data ---
Feature matrix shape: (100660, 10)
Labels shape: (100660,)
--- Test Data ---
Feature matrix shape: (25165, 10)
Labels shape: (25165,)


In [669]:
# Persist matrices, labels, fitted pipeline and feature info
save_processed_data(
    output_dir=output_dir,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=pipeline,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    ip_encoding=ip_encoding,
)

Saved X, y, and preprocessing pipeline to processed_data/inside_split/scenario_one/


### Scenario Two

In [670]:
# Directory that save_processed_data writes the Scenario Two inside-split artifacts to.
output_dir = "processed_data/inside_split/scenario_two"

In [671]:
# Load the labeled Scenario Two "inside" flows and preview the first rows
SCENARIO_TWO_INSIDE = "../data/DARPA_2000/Scenario_Two/inside/inside_labeled_flows_all.csv"
df = pd.read_csv(SCENARIO_TWO_INSIDE)
df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,955910700.0,955910700.0,0.195749,172.16.112.149,2104,195.73.151.50,79,tcp,finger,8,141,6,5,SF,T,F,0,0,0
1,f1,955910700.0,955910700.0,0.011438,172.16.112.149,1472,172.16.115.20,53,udp,dns,35,94,1,1,SF,T,T,0,0,0
2,f2,955910700.0,955910700.0,0.003681,172.16.115.20,32780,192.168.1.50,53,udp,dns,35,94,1,1,SF,T,T,0,0,0
3,f3,955910700.0,955910700.0,0.119006,172.16.113.207,2105,207.25.71.200,80,tcp,http,261,24153,11,20,SF,T,F,0,0,0
4,f4,955910700.0,955910700.0,0.027521,172.16.113.207,2106,207.25.71.200,80,tcp,http,329,2445,5,5,SF,T,F,0,0,0


In [672]:
# Split the flows 80/20 into train/test, stratified on the "phase" column
train_size = 0.8
test_size = 1 - train_size
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df["phase"],
    random_state=SEED,
)

print(f"Train set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")

Train set shape: (63020, 20)
Test set shape: (15755, 20)


In [673]:
# Column lists come from the training call; the test call only contributes its feature frame.
df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(df_train, feature_set, ip_encoding=ip_encoding)
df_test_features, _, _, _ = prepare_data(df_test, feature_set, ip_encoding=ip_encoding)

print(f"Numeric cols: {numeric_cols}")
print(f"Categorical cols: {categorical_cols}")
print(f"IP feature cols: {ip_feature_cols}")

df_train_features.head()

Numeric cols: ['duration', 'sport', 'dport', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto']
IP feature cols: []


Unnamed: 0,duration,sport,dport,proto,orig_bytes,resp_bytes,orig_pkts,resp_pkts
15054,0.007237,1987,53,udp,34,94,1,1
10151,0.039624,5693,80,tcp,198,1084,5,4
49896,0.0,54036,27179,tcp,0,0,1,0
53277,0.0,57403,18099,tcp,0,0,1,0
46698,0.0,50851,14626,tcp,0,0,1,0


In [674]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the pipeline on the training features only
y_train = df_train["attack"]
X_train = pipeline.fit_transform(df_train_features)

print("--- Training Data ---")
print(f"Feature matrix shape: {X_train.shape}")
print(f"Labels shape: {y_train.shape}")

# Apply the already-fitted pipeline to the test features
y_test = df_test["attack"]
X_test = pipeline.transform(df_test_features)

print("--- Test Data ---")
print(f"Feature matrix shape: {X_test.shape}")
print(f"Labels shape: {y_test.shape}")

--- Training Data ---
Feature matrix shape: (63020, 10)
Labels shape: (63020,)
--- Test Data ---
Feature matrix shape: (15755, 10)
Labels shape: (15755,)


In [675]:
# Persist matrices, labels, fitted pipeline and feature info
save_processed_data(
    output_dir=output_dir,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=pipeline,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    ip_encoding=ip_encoding,
)

Saved X, y, and preprocessing pipeline to processed_data/inside_split/scenario_two/


## Split Mode: "insidedmz"

In [676]:
# NOTE(review): `mode` is not read by any cell visible in this notebook — confirm whether it is still needed.
mode = "insidedmz"

### Scenario One

In [677]:
# Directory that save_processed_data writes the Scenario One inside/DMZ-split artifacts to.
output_dir = "processed_data/insidedmz_split/scenario_one"

In [678]:
# Load Data (Inside and DMZ) -- inside flows first
INSIDE_LABELED_FLOWS_CSV = "../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv"
df_inside = pd.read_csv(INSIDE_LABELED_FLOWS_CSV)
print(f"Inside set shape: {df_inside.shape}")
df_inside.head()

Inside set shape: (125825, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


In [679]:
# Load the labeled Scenario One DMZ flows
DMZ_LABELED_FLOWS_CSV = "../data/DARPA_2000/Scenario_One/dmz/dmz_labeled_flows_all.csv"
df_dmz = pd.read_csv(DMZ_LABELED_FLOWS_CSV)
print(f"DMZ set shape: {df_dmz.shape}")
df_dmz.head()

DMZ set shape: (45441, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.071043,172.16.112.50,43703,172.16.114.50,80,tcp,http,285,5952,7,18,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.008217,172.16.114.50,1965,172.16.115.20,53,udp,dns,88,131,2,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.021633,172.16.113.204,43704,199.95.74.90,80,tcp,http,235,406,5,5,SF,T,F,0,0,0
3,f3,952438900.0,952438900.0,0.067433,172.16.113.204,43705,199.95.74.90,80,tcp,http,236,17635,8,17,SF,T,F,0,0,0
4,f4,952438900.0,952438900.0,0.020937,172.16.113.204,43706,199.95.74.97,80,tcp,http,297,282,5,4,SF,T,F,0,0,0


In [680]:
# Train on the inside flows, test on the DMZ flows; copies leave the loaded frames untouched.
df_train = df_inside.copy()
df_test = df_dmz.copy()

In [681]:
# Column lists come from the training call; the test call only contributes its feature frame.
df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(df_train, feature_set, ip_encoding=ip_encoding)
df_test_features, _, _, _ = prepare_data(df_test, feature_set, ip_encoding=ip_encoding)

print(f"Numeric cols: {numeric_cols}")
print(f"Categorical cols: {categorical_cols}")
print(f"IP feature cols: {ip_feature_cols}")

df_train_features.head()

Numeric cols: ['duration', 'sport', 'dport', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto']
IP feature cols: []


Unnamed: 0,duration,sport,dport,proto,orig_bytes,resp_bytes,orig_pkts,resp_pkts
0,0.1018,33354,80,tcp,290,7897,8,9
1,0.001107,45135,53,udp,44,131,1,1
2,0.064488,22641,25,tcp,623,247,12,22
3,0.001167,1061,53,udp,90,131,2,1
4,0.000861,1438,53,udp,45,131,1,1


In [682]:
# Prep data

pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the pipeline on the training (inside) features only
y_train = df_train["attack"]
X_train = pipeline.fit_transform(df_train_features)

print("--- Training Data ---")
print(f"Feature matrix shape: {X_train.shape}")
print(f"Labels shape: {y_train.shape}")

# Apply the already-fitted pipeline to the test (DMZ) features
y_test = df_test["attack"]
X_test = pipeline.transform(df_test_features)

print("--- Test Data ---")
print(f"Feature matrix shape: {X_test.shape}")
print(f"Labels shape: {y_test.shape}")

--- Training Data ---
Feature matrix shape: (125825, 10)
Labels shape: (125825,)
--- Test Data ---
Feature matrix shape: (45441, 10)
Labels shape: (45441,)


In [683]:
# Persist matrices, labels, fitted pipeline and feature info
save_processed_data(
    output_dir=output_dir,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=pipeline,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    ip_encoding=ip_encoding,
)

Saved X, y, and preprocessing pipeline to processed_data/insidedmz_split/scenario_one/


## Split Mode: "scenario"

Train on Scenario One, Test on Scenario Two.

Note: For now, only the "inside" data from both scenarios is used.

In [None]:
# Directory that save_processed_data writes the scenario-split artifacts to
# (train: Scenario One inside flows, test: Scenario Two inside flows).
output_dir = "processed_data/scenario_split/scenario_one"

In [685]:
# Load Data (Scenario One, inside flows)
SCENARIO_ONE_INSIDE_DATA_CSV = "../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv"
df_scenario_one_inside = pd.read_csv(SCENARIO_ONE_INSIDE_DATA_CSV)
print(f"Inside set shape: {df_scenario_one_inside.shape}")
df_scenario_one_inside.head()

Inside set shape: (125825, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


In [686]:
# Load Data (Scenario Two, inside flows)
SCENARIO_TWO_INSIDE_DATA_CSV = "../data/DARPA_2000/Scenario_Two/inside/inside_labeled_flows_all.csv"
df_scenario_two_inside = pd.read_csv(SCENARIO_TWO_INSIDE_DATA_CSV)
print(f"Inside set shape: {df_scenario_two_inside.shape}")
df_scenario_two_inside.head()

Inside set shape: (78775, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,955910700.0,955910700.0,0.195749,172.16.112.149,2104,195.73.151.50,79,tcp,finger,8,141,6,5,SF,T,F,0,0,0
1,f1,955910700.0,955910700.0,0.011438,172.16.112.149,1472,172.16.115.20,53,udp,dns,35,94,1,1,SF,T,T,0,0,0
2,f2,955910700.0,955910700.0,0.003681,172.16.115.20,32780,192.168.1.50,53,udp,dns,35,94,1,1,SF,T,T,0,0,0
3,f3,955910700.0,955910700.0,0.119006,172.16.113.207,2105,207.25.71.200,80,tcp,http,261,24153,11,20,SF,T,F,0,0,0
4,f4,955910700.0,955910700.0,0.027521,172.16.113.207,2106,207.25.71.200,80,tcp,http,329,2445,5,5,SF,T,F,0,0,0


In [687]:
# Train on Scenario One, test on Scenario Two; copies leave the loaded frames untouched.
df_train = df_scenario_one_inside.copy()
df_test = df_scenario_two_inside.copy()

In [688]:
# Column lists come from the training call; the test call only contributes its feature frame.
df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(df_train, feature_set, ip_encoding=ip_encoding)
df_test_features, _, _, _ = prepare_data(df_test, feature_set, ip_encoding=ip_encoding)

print(f"Numeric cols: {numeric_cols}")
print(f"Categorical cols: {categorical_cols}")
print(f"IP feature cols: {ip_feature_cols}")

df_train_features.head()

Numeric cols: ['duration', 'sport', 'dport', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto']
IP feature cols: []


Unnamed: 0,duration,sport,dport,proto,orig_bytes,resp_bytes,orig_pkts,resp_pkts
0,0.1018,33354,80,tcp,290,7897,8,9
1,0.001107,45135,53,udp,44,131,1,1
2,0.064488,22641,25,tcp,623,247,12,22
3,0.001167,1061,53,udp,90,131,2,1
4,0.000861,1438,53,udp,45,131,1,1


In [689]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the pipeline on the training (Scenario One) features only
y_train = df_train["attack"]
X_train = pipeline.fit_transform(df_train_features)

print("--- Training Data ---")
print(f"Feature matrix shape: {X_train.shape}")
print(f"Labels shape: {y_train.shape}")

# Apply the already-fitted pipeline to the test (Scenario Two) features
y_test = df_test["attack"]
X_test = pipeline.transform(df_test_features)

print("--- Test Data ---")
print(f"Feature matrix shape: {X_test.shape}")
print(f"Labels shape: {y_test.shape}")

--- Training Data ---
Feature matrix shape: (125825, 10)
Labels shape: (125825,)
--- Test Data ---
Feature matrix shape: (78775, 10)
Labels shape: (78775,)


In [690]:
# Persist matrices, labels, fitted pipeline and feature info
save_processed_data(
    output_dir=output_dir,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=pipeline,
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    ip_encoding=ip_encoding,
)

Saved X, y, and preprocessing pipeline to processed_data/scenario_split/scenario_one/
