# Preprocess (Unbalanced) Data

In [26]:
import sys
import os

# Put the parent directory on the import path (presumably where the
# `helper_func` package lives -- confirm against the repository layout).
# The path is resolved relative to the kernel's current working directory.
# Guarded so that re-running this cell does not append a duplicate entry.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [27]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

from helper_func.preprocess_func import (
    prepare_data, 
    construct_pipeline, 
    save_processed_data, 
    preprocess_data
)

In [28]:
# Config

# Seed passed as `random_state` to the train/test splits in this notebook.
SEED = 123

# JSON file holding the list of flow feature names to keep.
FEATURE_LIST_FILE_NAME = "../features_list.json"
# Explicit encoding: the default for open() depends on the platform locale.
with open(FEATURE_LIST_FILE_NAME, encoding="utf-8") as f:
    feature_list = json.load(f)
print(feature_list)

# IP-encoding mode forwarded to the preprocessing helpers.
# NOTE(review): "none" presumably disables IP-derived features -- confirm in
# helper_func.preprocess_func.
ip_encoding = "none"

['duration', 'proto', 'service', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'ip_proto']


## Scenario One

In [29]:
# Labeled flow CSVs for DARPA 2000 Scenario One: inside-network and DMZ captures.
SCENARIO_ONE_INSIDE_CSV, SCENARIO_ONE_DMZ_CSV = (
    "../../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv",
    "../../data/DARPA_2000/Scenario_One/dmz/dmz_labeled_flows_all.csv",
)

In [30]:
# Root output directory for all processed scenario-one datasets.
scenario_one_dir: str = "../processed_data/scenario_one"

### Split Mode: Inside (Stratified Split)

In [31]:
# Output root for this section: scenario one, inside traffic, stratified split, unbalanced.
data_out_root_dir = scenario_one_dir + "/inside/stratified/unbalanced"

In [32]:
# Read the labeled scenario-one inside flows, then preview the first rows
# as the cell output.
df = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
df.head(n=5)

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,...,ShADadfF,8,614,9,8261,-,6,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,...,Dd,1,72,1,159,-,17,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,...,ShAdtDFaf,12,1107,22,1382,-,6,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,...,Dd,2,146,1,159,-,17,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,...,Dd,1,73,1,159,-,17,0,0,0


In [33]:
# Split Data into Train/Test
train_size = 0.6
test_size = 1 - train_size

# Stratify on `phase` so both splits keep the same phase proportions;
# SEED makes the split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=SEED,
    stratify=df["phase"],
)

for _label, _frame in (("Train set shape:", df_train), ("Test set shape:", df_test)):
    print(_label, _frame.shape)

Train set shape: (75495, 26)
Test set shape: (50330, 26)


In [34]:
# Build the feature frames for both splits with the same feature list and
# IP-encoding mode.
df_train_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_data(
    df_train,
    feature_list,
    ip_encoding=ip_encoding,
)
# Only the feature frame is kept for the test split; the column groupings
# used downstream are the ones returned for the training split.
df_test_features, _, _, _ = prepare_data(
    df_test,
    feature_list,
    ip_encoding=ip_encoding,
)

for _label, _cols in (
    ("Numeric cols:", numeric_cols),
    ("Categorical cols:", categorical_cols),
    ("IP feature cols:", ip_feature_cols),
):
    print(_label, _cols)

df_train_features.head()

Numeric cols: ['duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes']
Categorical cols: ['proto', 'service', 'conn_state', 'local_orig', 'local_resp', 'history', 'tunnel_parents', 'ip_proto']
IP feature cols: []


Unnamed: 0,duration,proto,service,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto
30301,0.000797,udp,dns,35,95,SF,T,T,0,Dd,1,63,1,123,-,17
121252,0.113402,tcp,http,186,29495,SF,T,F,0,ShADadfF,13,710,24,30459,-,6
103980,0.0,tcp,-,0,0,OTH,F,F,0,A,1,40,0,0,-,6
112109,0.0,tcp,-,0,0,OTH,F,F,0,A,1,40,0,0,-,6
83648,0.0,tcp,-,0,0,OTH,F,F,0,A,1,40,0,0,-,6


In [35]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit on the training features only; the test features go through the
# already-fitted pipeline.
X_train = pipeline.fit_transform(df_train_features)
X_test = pipeline.transform(df_test_features)

# `attack` and `phase` label columns, taken from the unprocessed splits.
y_train, y_phase_train = df_train["attack"], df_train["phase"]
y_test, y_phase_test = df_test["attack"], df_test["phase"]

# Report the shapes of each split.
for _title, _X, _y, _y_phase in (
    ("--- Training Data ---", X_train, y_train, y_phase_train),
    ("--- Test Data ---", X_test, y_test, y_phase_test),
):
    print(_title)
    print("Feature matrix shape:", _X.shape)
    print("Labels shape:", _y.shape)
    print("Phase labels shape:", _y_phase.shape)

--- Training Data ---
Feature matrix shape: (75495, 101)
Labels shape: (75495,)
Phase labels shape: (75495,)
--- Test Data ---
Feature matrix shape: (50330, 101)
Labels shape: (50330,)
Phase labels shape: (50330,)


In [36]:
# Save processed data
save_processed_data(
    X_train, y_train, y_phase_train,
    X_test, y_test, y_phase_test,
    pipeline, numeric_cols, categorical_cols, ip_encoding,
    output_dir=f"{data_out_root_dir}/all_phases_binary",)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_one/inside/stratified/unbalanced/all_phases_binary/


### Split Mode: "insidedmz"

Train on scenario one (inside traffic), test on scenario one (DMZ traffic).

In [37]:
# Output root for this section: scenario one, inside -> DMZ split, unbalanced.
data_out_root_dir = scenario_one_dir + "/inside_dmz/unbalanced"

In [38]:
# Read both scenario-one captures: inside-network flows and DMZ flows.
df_inside = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
df_dmz = pd.read_csv(SCENARIO_ONE_DMZ_CSV)

print("Inside set shape:", df_inside.shape)
print("DMZ set shape:", df_dmz.shape)

# Preview the DMZ flows as the cell output.
df_dmz.head()

Inside set shape: (125825, 26)
DMZ set shape: (45441, 26)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.071043,172.16.112.50,43703,172.16.114.50,80,tcp,http,...,ShADadtfF,7,585,18,12632,-,6,0,0,0
1,f1,952438900.0,952438900.0,0.008217,172.16.114.50,1965,172.16.115.20,53,udp,dns,...,Dd,2,144,1,159,-,17,0,0,0
2,f2,952438900.0,952438900.0,0.021633,172.16.113.204,43704,199.95.74.90,80,tcp,http,...,ShADadfF,5,455,5,610,-,6,0,0,0
3,f3,952438900.0,952438900.0,0.067433,172.16.113.204,43705,199.95.74.90,80,tcp,http,...,ShADadfF,8,576,17,18319,-,6,0,0,0
4,f4,952438900.0,952438900.0,0.020937,172.16.113.204,43706,199.95.74.97,80,tcp,http,...,ShADdfFa,5,517,4,446,-,6,0,0,0


In [39]:
# Inside flows become the training set and DMZ flows the test set.
# Copies are used so the loaded frames are not shared with the splits.
df_train, df_test = df_inside.copy(), df_dmz.copy()

for _label, _frame in (("Train set shape:", df_train), ("Test set shape:", df_test)):
    print(_label, _frame.shape)

Train set shape: (125825, 26)
Test set shape: (45441, 26)


In [40]:
# Run the preprocessing helper on the train/test frames; its results are
# written under the "all_phases_binary" directory of this section's output root.
preprocess_data(
    df_train,
    df_test,
    feature_list,
    ip_encoding,
    output_dir=data_out_root_dir + "/all_phases_binary",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_one/inside_dmz/unbalanced/all_phases_binary/


## Scenario Two

In [41]:
# Labeled flow CSVs for DARPA 2000 Scenario Two: inside-network and DMZ captures.
SCENARIO_TWO_INSIDE_CSV, SCENARIO_TWO_DMZ_CSV = (
    "../../data/DARPA_2000/Scenario_Two/inside/inside_labeled_flows_all.csv",
    "../../data/DARPA_2000/Scenario_Two/dmz/dmz_labeled_flows_all.csv",
)

# Root output directory for all processed scenario-two datasets.
scenario_two_dir: str = "../processed_data/scenario_two"


### Split Mode: Inside (Stratified Split)

In [42]:
# Output root for this section: scenario two, inside traffic, stratified split, unbalanced.
data_out_root_dir = scenario_two_dir + "/inside/stratified/unbalanced"

In [43]:
# Read the labeled scenario-two inside flows, then preview the first rows
# as the cell output.
df = pd.read_csv(SCENARIO_TWO_INSIDE_CSV)
df.head(n=5)

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,955910700.0,955910700.0,0.195749,172.16.112.149,2104,195.73.151.50,79,tcp,finger,...,ShADadfF,6,252,5,345,-,6,0,0,0
1,f1,955910700.0,955910700.0,0.011438,172.16.112.149,1472,172.16.115.20,53,udp,dns,...,Dd,1,63,1,122,-,17,0,0,0
2,f2,955910700.0,955910700.0,0.003681,172.16.115.20,32780,192.168.1.50,53,udp,dns,...,Dd,1,63,1,122,-,17,0,0,0
3,f3,955910700.0,955910700.0,0.119006,172.16.113.207,2105,207.25.71.200,80,tcp,http,...,ShADadfF,11,705,20,24957,-,6,0,0,0
4,f4,955910700.0,955910700.0,0.027521,172.16.113.207,2106,207.25.71.200,80,tcp,http,...,ShADadfF,5,533,5,2649,-,6,0,0,0


In [44]:
# Split Data into Train/Test
train_size = 0.6
test_size = 1 - train_size

# Stratify on `phase` so both splits keep the same phase proportions;
# SEED makes the split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=SEED,
    stratify=df["phase"],
)

for _label, _frame in (("Train set shape:", df_train), ("Test set shape:", df_test)):
    print(_label, _frame.shape)

Train set shape: (47265, 26)
Test set shape: (31510, 26)


In [45]:
# Run the preprocessing helper on the stratified scenario-two splits; its
# results are written under the "all_phases_binary" directory of this
# section's output root.
preprocess_data(
    df_train,
    df_test,
    feature_list,
    ip_encoding,
    output_dir=data_out_root_dir + "/all_phases_binary",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_two/inside/stratified/unbalanced/all_phases_binary/


## Both Scenarios

In [46]:
# Root output directory for datasets that combine both scenarios.
scenarios_dir: str = "../processed_data/both_scenarios"

### Split Mode: "inside"

Train on scenario one (inside traffic), test on scenario two (inside traffic).

In [47]:
# Output root for this section: both scenarios, inside traffic, unbalanced.
data_out_root_dir = scenarios_dir + "/inside/unbalanced"

In [48]:
# Load the inside-network flows of both scenarios.
# The printed labels name the scenario so the two shapes can be told apart
# in the cell output.
df_scenario_one_inside = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
print("Scenario one inside set shape:", df_scenario_one_inside.shape)

df_scenario_two_inside = pd.read_csv(SCENARIO_TWO_INSIDE_CSV)
print("Scenario two inside set shape:", df_scenario_two_inside.shape)


Inside set shape: (125825, 26)
Inside set shape: (78775, 26)


In [49]:
# Train on the scenario-one inside flows and test on the scenario-two inside flows.
# Copies are used so the loaded frames are not shared with the splits.
df_train, df_test = df_scenario_one_inside.copy(), df_scenario_two_inside.copy()

In [50]:
# Run the preprocessing helper on the cross-scenario train/test frames; its
# results are written under the "all_phases_binary" directory of this
# section's output root.
preprocess_data(
    df_train,
    df_test,
    feature_list,
    ip_encoding,
    output_dir=data_out_root_dir + "/all_phases_binary",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/both_scenarios/inside/unbalanced/all_phases_binary/
