# Prepare Datasets for Model Training/Testing

In [123]:
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump

from scipy.sparse import save_npz

## Data Prep Helper Functions

### IP Encoding Methods

In [124]:
def encode_ip_none(df):
    """Discard both IP address columns; no IP-derived features are produced.

    Returns a tuple ``(frame, None)`` where ``frame`` is a copy of ``df``
    without ``src_ip`` and ``dst_ip``. The input frame is not modified.
    """
    ip_columns = ["src_ip", "dst_ip"]
    without_ips = df.copy().drop(columns=ip_columns)
    return without_ips, None

def encode_ip_integer(df):
    """Convert IPv4 x.x.x.x to an integer: a*256^3 + b*256^2 + c*256 + d.

    Args:
        df: DataFrame containing ``src_ip`` and ``dst_ip`` columns.
    Returns:
        A tuple ``(frame, ip_feature_cols)``: a copy of ``df`` in which the two
        IP columns are replaced by ``src_ip_int`` / ``dst_ip_int``, and the
        list of those new column names.

    Values that cannot be parsed as a dotted-quad IPv4 address (missing values,
    non-strings, wrong number of parts, non-integer parts, octets outside
    0-255) are encoded as 0. Note that 0 is also the encoding of "0.0.0.0".
    """
    def ip_to_int(ip):
        try:
            a, b, c, d = map(int, ip.split("."))
        except (AttributeError, TypeError, ValueError):
            # AttributeError/TypeError: value is not a str (e.g. NaN, None, bytes).
            # ValueError: wrong number of parts or a part that is not an integer.
            # Only these are caught so that e.g. KeyboardInterrupt still propagates.
            return 0
        # Out-of-range octets would overflow into neighbouring bit fields and
        # collide with the encodings of other, valid addresses.
        if not all(0 <= octet <= 255 for octet in (a, b, c, d)):
            return 0
        return (a << 24) + (b << 16) + (c << 8) + d

    df = df.copy()
    df["src_ip_int"] = df["src_ip"].apply(ip_to_int)
    df["dst_ip_int"] = df["dst_ip"].apply(ip_to_int)

    return df.drop(columns=["src_ip", "dst_ip"]), ["src_ip_int", "dst_ip_int"]

def encode_ip_onehot(df):
    """
    Keep the raw IP address columns so they can be one-hot encoded downstream.

    Returns a copy of ``df`` (IP columns untouched) together with the names of
    the two IP columns.
    WARNING: Very large dimensionality. Use only for small subsets.
    """
    ip_columns = ["src_ip", "dst_ip"]
    return df.copy(), ip_columns

# Registry mapping each supported IP-encoding option name to the function
# that implements it; looked up by name when preparing the data.
IP_ENCODERS = dict(
    none=encode_ip_none,
    integer=encode_ip_integer,
    onehot=encode_ip_onehot,
)

In [125]:
def prepare_data(df, ip_encoding="none", feature_set=None):
    '''
    Prepare data by dropping metadata, encoding IPs and filtering on selected features.

    Args:
        df: Input DataFrame with flow data. Must contain the metadata columns
            "flow_id", "attack_id", "phase" and "attack" (they are dropped here).
        ip_encoding: IP address encoding method ("none", "integer", "onehot").
        feature_set: Optional list of features to retain. IP feature columns
            produced by the chosen encoding are always retained in addition.
    Returns:
        df: Prepared DataFrame with selected features.
        numeric_cols: List of numeric feature column names.
        categorical_cols: List of categorical feature column names.
        ip_feature_cols: List of IP feature column names (empty if none).
    Raises:
        ValueError: If ip_encoding is not a key of IP_ENCODERS.
    Warns:
        UserWarning: If a selected column is in neither numeric_cols nor
            categorical_cols; a transformer built only from those two lists
            will not use it.
    '''
    # Drop metadata fields
    df = df.drop(columns=["flow_id", "attack_id", "phase", "attack"])

    # Handle IP address fields
    if ip_encoding not in IP_ENCODERS:
        raise ValueError(f"Unknown IP encoding: {ip_encoding}")

    df, ip_feature_cols = IP_ENCODERS[ip_encoding](df)
    ip_feature_cols = list(ip_feature_cols) if ip_feature_cols else []

    # Categorical features
    categorical_cols = [
        "proto",      # categorical: "tcp", "udp", "icmp", etc.
        "service",    # categorical: "http", "ftp", "dns", etc.
        "conn_state", # categorical: "S0", "S1", "SF", etc.
        "local_orig", # binary flags
        "local_resp", # binary flags
    ]

    # Numerical features
    numeric_cols = [
        "duration",
        "orig_bytes",
        "resp_bytes",
        "orig_pkts",
        "resp_pkts",
    ]

    # Route each IP feature by its dtype: integer-encoded IPs are numeric, while
    # raw address strings (the "onehot" encoding) must be treated as categorical
    # rather than being handed to a numeric scaler.
    for col in ip_feature_cols:
        if pd.api.types.is_numeric_dtype(df[col]):
            numeric_cols.append(col)
        else:
            categorical_cols.append(col)

    # Filter on features if feature set is provided
    if feature_set:
        requested = list(feature_set)
        # dict.fromkeys de-duplicates while preserving order, in case an IP
        # column is also named explicitly in feature_set.
        selected = list(dict.fromkeys(requested + ip_feature_cols))
        df = df[selected]

        # IP features are kept in the frame, so they must also survive in the
        # column lists even though they are not part of feature_set.
        keep = set(selected)
        categorical_cols = [col for col in categorical_cols if col in keep]
        numeric_cols = [col for col in numeric_cols if col in keep]

        unassigned = [
            col for col in selected
            if col not in numeric_cols and col not in categorical_cols
        ]
        if unassigned:
            warnings.warn(
                f"Selected features {unassigned} are neither numeric nor categorical "
                "columns and will not be encoded by the preprocessing pipeline.",
                UserWarning,
                stacklevel=2,
            )

    return df, numeric_cols, categorical_cols, ip_feature_cols

In [126]:
def construct_pipeline(numeric_cols, categorical_cols):
    '''
    Build the preprocessing pipeline: scale numeric columns, one-hot encode categorical ones.

    Args:
        numeric_cols: List of numerical feature column names.
        categorical_cols: List of categorical feature column names.
    Returns:
        pipeline: sklearn Pipeline with a single "transform" step wrapping a
            ColumnTransformer.
    '''
    # Scale without centering (with_mean=False).
    numeric_step = ("numerical", StandardScaler(with_mean=False), numeric_cols)

    # Categories not seen during fit are ignored at transform time instead of raising.
    categorical_step = (
        "categorical",
        OneHotEncoder(handle_unknown="ignore", sparse_output=True),
        categorical_cols,
    )

    column_transformer = ColumnTransformer(
        transformers=[numeric_step, categorical_step],
        sparse_threshold=1.0,  # intent per original author: always return sparse matrix
    )

    return Pipeline(steps=[("transform", column_transformer)])

## Split Mode: Split (60/40)

In [127]:
# Configuration for the row-order split run.
mode = "split"  # options: split (60/40), insidedmz, scenarios
split_ratio = 0.6  # Used if mode is "split"; fraction of rows assigned to the training set
# No trailing slash, matching the "insidedmz" configuration; paths are built with
# os.path.join, and the save message appends its own "/" (a trailing slash here
# would print ".../split//").
output_dir = "../processed_data/split"

### Load Labeled Data

In [128]:
# Location of the labeled flow CSV (DARPA 2000, "inside" capture per the path).
LABELED_FLOWS_CSV = "../../data/DARPA_2000/inside/inside_labeled_flows_all.csv"

# Read every labeled flow into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=LABELED_FLOWS_CSV)
df.head(5)

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


### Split Data into Train/Test

In [129]:
# Positional split with no shuffling: the first `split_ratio` share of the rows
# becomes the training set and the remainder becomes the test set.
split_index = int(len(df) * split_ratio)

# Each partition gets a fresh 0-based index.
df_train, df_test = (
    partition.reset_index(drop=True)
    for partition in (df.iloc[:split_index], df.iloc[split_index:])
)

print(f"Train set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")

Train set shape: (75495, 20)
Test set shape: (50330, 20)


### Choice of Features

Possible features to consider:
- Time-based features: start_time, end_time, duration
- Ports: sport and dport
- Protocol: proto
- Service: service
- Packet-based features: orig_bytes, resp_bytes, orig_pkts, resp_pkts
- Origin: local_orig, local_resp

In [130]:
# Active feature selection for the 60/40 split run.
# Alternatives that were considered:
#   - replace "sport"/"dport" with "service"
#   - additionally include "start_time" and "end_time"
#   - feature_set = None (no filtering)
# NOTE: "sport" and "dport" are kept in the prepared frame, but prepare_data lists
# them as neither numeric nor categorical, so they do not reach the feature matrix.
feature_set = [
    "duration",
    "sport", "dport",
    "proto",
    "orig_bytes", "resp_bytes",
    "orig_pkts", "resp_pkts",
]

# IP handling; must be one of the keys of IP_ENCODERS.
ip_encoding = "none"

In [131]:
# Use identical preparation settings for both partitions so their columns line up.
prepare_kwargs = {"ip_encoding": ip_encoding, "feature_set": feature_set}

df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(
    df_train, **prepare_kwargs
)
# Only the frame is needed for the test partition; the column lists from the
# training call are the ones used afterwards.
df_test_features, _, _, _ = prepare_data(df_test, **prepare_kwargs)

for label, cols in (
    ("Numeric cols:", numeric_cols),
    ("Categorical cols:", categorical_cols),
    ("IP feature cols:", ip_feature_cols),
):
    print(label, cols)

df_train_features.head()

Numeric cols: ['duration', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto']
IP feature cols: []


Unnamed: 0,duration,sport,dport,proto,orig_bytes,resp_bytes,orig_pkts,resp_pkts
0,0.1018,33354,80,tcp,290,7897,8,9
1,0.001107,45135,53,udp,44,131,1,1
2,0.064488,22641,25,tcp,623,247,12,22
3,0.001167,1061,53,udp,90,131,2,1
4,0.000861,1438,53,udp,45,131,1,1


### Prep Data

In [132]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the preprocessing on the training partition only, then reuse the fitted
# pipeline unchanged on the test partition.
X_train = pipeline.fit_transform(df_train_features)
X_test = pipeline.transform(df_test_features)

# Labels come from the unfiltered frames: "attack" is dropped during preparation.
y_train, y_test = df_train["attack"], df_test["attack"]

for banner, features, labels in (
    ("--- Training Data ---", X_train, y_train),
    ("--- Test Data ---", X_test, y_test),
):
    print(banner)
    print("Feature matrix shape:", features.shape)
    print("Labels shape:", labels.shape)

--- Training Data ---
Feature matrix shape: (75495, 8)
Labels shape: (75495,)
--- Test Data ---
Feature matrix shape: (50330, 8)
Labels shape: (50330,)


In [133]:
# Output column names of the fitted "transform" step, in feature-matrix order.
transform_step = pipeline.named_steps["transform"]
feature_names = transform_step.get_feature_names_out()
print(feature_names)

['numerical__duration' 'numerical__orig_bytes' 'numerical__resp_bytes'
 'numerical__orig_pkts' 'numerical__resp_pkts' 'categorical__proto_icmp'
 'categorical__proto_tcp' 'categorical__proto_udp']


### Save Processed Data

In [134]:
os.makedirs(output_dir, exist_ok=True)

# Label vectors -> .npy
for file_name, labels in (("y_train.npy", y_train), ("y_test.npy", y_test)):
    np.save(os.path.join(output_dir, file_name), labels)

# Sparse feature matrices -> .npz
for file_name, matrix in (("X_train.npz", X_train), ("X_test.npz", X_test)):
    save_npz(os.path.join(output_dir, file_name), matrix)

# Fitted preprocessing pipeline, so the same transform can be re-applied later.
dump(pipeline, os.path.join(output_dir, "feature_pipeline.joblib"))

# Plain-text record of which columns and which IP encoding produced these files.
feature_info = (
    "Numerical features:\n"
    + str(numeric_cols) + "\n\n"
    + "Categorical features:\n"
    + str(categorical_cols) + "\n\n"
    + f"IP encoding: {ip_encoding}\n"
)
with open(os.path.join(output_dir, "feature_info.txt"), "w") as f:
    f.write(feature_info)

print(f"Saved X, y, and preprocessing pipeline to {output_dir}/")

Saved X, y, and preprocessing pipeline to ../processed_data/split//


## Split Mode: "insidedmz"

In [135]:
# Configuration for the run that trains on the inside capture and tests on the DMZ capture.
# Directory that receives the processed artifacts for this run.
output_dir = "../processed_data/insidedmz"
# Run label; options: split (60/40), insidedmz, scenarios
mode = "insidedmz"

### Load Data (Inside and DMZ)

In [136]:
# Labeled flows from the "inside" capture (same file as used for the 60/40 split).
INSIDE_LABELED_FLOWS_CSV = "../../data/DARPA_2000/inside/inside_labeled_flows_all.csv"

df_inside = pd.read_csv(filepath_or_buffer=INSIDE_LABELED_FLOWS_CSV)
print(f"Inside set shape: {df_inside.shape}")
df_inside.head(5)

Inside set shape: (125825, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


In [137]:
# Labeled flows from the "dmz" capture.
DMZ_LABELED_FLOWS_CSV = "../../data/DARPA_2000/dmz/dmz_labeled_flows_all.csv"

df_dmz = pd.read_csv(filepath_or_buffer=DMZ_LABELED_FLOWS_CSV)
print(f"DMZ set shape: {df_dmz.shape}")
df_dmz.head(5)

DMZ set shape: (45441, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.071043,172.16.112.50,43703,172.16.114.50,80,tcp,http,285,5952,7,18,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.008217,172.16.114.50,1965,172.16.115.20,53,udp,dns,88,131,2,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.021633,172.16.113.204,43704,199.95.74.90,80,tcp,http,235,406,5,5,SF,T,F,0,0,0
3,f3,952438900.0,952438900.0,0.067433,172.16.113.204,43705,199.95.74.90,80,tcp,http,236,17635,8,17,SF,T,F,0,0,0
4,f4,952438900.0,952438900.0,0.020937,172.16.113.204,43706,199.95.74.97,80,tcp,http,297,282,5,4,SF,T,F,0,0,0


In [138]:
# Train on the inside capture, test on the DMZ capture. Copies are taken so the
# loaded frames stay untouched by later steps.
df_train, df_test = df_inside.copy(), df_dmz.copy()

In [139]:
# Active feature selection for the inside/DMZ run ("service" instead of ports).
# Alternatives that were considered:
#   - replace "service" with "sport"/"dport"
#   - ports plus "start_time" and "end_time"
#   - feature_set = None (no filtering)
feature_set = [
    "duration",
    "proto", "service",
    "orig_bytes", "resp_bytes",
    "orig_pkts", "resp_pkts",
]

# IP handling; must be one of the keys of IP_ENCODERS.
ip_encoding = "none"

In [140]:
# Use identical preparation settings for both captures so their columns line up.
prepare_kwargs = {"ip_encoding": ip_encoding, "feature_set": feature_set}

df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(
    df_train, **prepare_kwargs
)
# Only the frame is needed for the test capture; the column lists from the
# training call are the ones used afterwards.
df_test_features, _, _, _ = prepare_data(df_test, **prepare_kwargs)

for label, cols in (
    ("Numeric cols:", numeric_cols),
    ("Categorical cols:", categorical_cols),
    ("IP feature cols:", ip_feature_cols),
):
    print(label, cols)

df_train_features.head()

Numeric cols: ['duration', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto', 'service']
IP feature cols: []


Unnamed: 0,duration,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts
0,0.1018,tcp,http,290,7897,8,9
1,0.001107,udp,dns,44,131,1,1
2,0.064488,tcp,smtp,623,247,12,22
3,0.001167,udp,dns,90,131,2,1
4,0.000861,udp,dns,45,131,1,1


### Prep Data

In [141]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the preprocessing on the training capture only, then reuse the fitted
# pipeline unchanged on the test capture.
X_train = pipeline.fit_transform(df_train_features)
X_test = pipeline.transform(df_test_features)

# Labels come from the unfiltered frames: "attack" is dropped during preparation.
y_train, y_test = df_train["attack"], df_test["attack"]

for banner, features, labels in (
    ("--- Training Data ---", X_train, y_train),
    ("--- Test Data ---", X_test, y_test),
):
    print(banner)
    print("Feature matrix shape:", features.shape)
    print("Labels shape:", labels.shape)

--- Training Data ---
Feature matrix shape: (125825, 22)
Labels shape: (125825,)
--- Test Data ---
Feature matrix shape: (45441, 22)
Labels shape: (45441,)


In [142]:
os.makedirs(output_dir, exist_ok=True)

# Label vectors -> .npy
for file_name, labels in (("y_train.npy", y_train), ("y_test.npy", y_test)):
    np.save(os.path.join(output_dir, file_name), labels)

# Sparse feature matrices -> .npz
for file_name, matrix in (("X_train.npz", X_train), ("X_test.npz", X_test)):
    save_npz(os.path.join(output_dir, file_name), matrix)

# Fitted preprocessing pipeline, so the same transform can be re-applied later.
dump(pipeline, os.path.join(output_dir, "feature_pipeline.joblib"))

# Plain-text record of which columns and which IP encoding produced these files.
feature_info = (
    "Numerical features:\n"
    + str(numeric_cols) + "\n\n"
    + "Categorical features:\n"
    + str(categorical_cols) + "\n\n"
    + f"IP encoding: {ip_encoding}\n"
)
with open(os.path.join(output_dir, "feature_info.txt"), "w") as f:
    f.write(feature_info)

print(f"Saved X, y, and preprocessing pipeline to {output_dir}/")

Saved X, y, and preprocessing pipeline to ../processed_data/insidedmz/


## Future Remarks and Next Steps

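Possible feature-engineering follow-ups:
- Ports: bucket ports into ranges (e.g., well-known, registered, dynamic). Note that `sport`/`dport` are currently assigned to neither the numeric nor the categorical column list in `prepare_data`, so they are not part of the encoded feature matrix even when listed in `feature_set`.
- Timestamps: decide how `start_time` and `end_time` should be treated before using them as features.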