# Preprocess (Unbalanced) Data

In [1]:
import sys
import os
# Append the absolute path of the parent directory to the module search path.
# NOTE(review): presumably this is what lets the project-local `helper_func`
# package be imported by the next cell — confirm against the repo layout.
sys.path.append(os.path.abspath(".."))

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

from helper_func.preprocess_func import (
    prepare_data, 
    construct_pipeline, 
    save_processed_data, 
    preprocess_data
)

In [3]:
# Fixed seed, passed as `random_state` to every train/test split in this
# notebook so the splits are reproducible across runs.
SEED = 123

## Choice of Features

In [4]:
# IP-encoding mode forwarded to the preprocessing helpers (prepare_data,
# save_processed_data, preprocess_data).
# NOTE(review): only "none" and "integer" appear in this notebook — confirm the
# full set of accepted values against helper_func.preprocess_func.
ip_encoding = "none"
# ip_encoding = "integer"

In [5]:
# Named feature subsets. Each entry lists column names of the labeled-flow
# CSVs. Switch subsets by changing FEATURE_SET_NAME rather than by commenting
# and uncommenting whole list literals.
FEATURE_SETS = {
    "all": [
        "start_time", "end_time", "duration", "sport", "dport", "proto",
        "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts",
    ],
    "without_time": [
        "duration", "sport", "dport", "proto",
        "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts",
    ],
    "without_ports": [
        "start_time", "end_time", "duration", "proto",
        "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts",
    ],
    "without_time_and_ports": [
        "duration", "proto",
        "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts",
    ],
}

# Key of the subset used for every preprocessing run in this notebook.
FEATURE_SET_NAME = "without_time_and_ports"

# Copy the selected list so later in-place edits cannot alter FEATURE_SETS.
feature_set = list(FEATURE_SETS[FEATURE_SET_NAME])


## Scenario One

In [6]:
# Labeled flow CSVs for DARPA 2000 scenario one (inside and DMZ captures).
# The paths are relative, so they resolve against the notebook's working directory.
SCENARIO_ONE_INSIDE_CSV = "../../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv"
SCENARIO_ONE_DMZ_CSV = "../../data/DARPA_2000/Scenario_One/dmz/dmz_labeled_flows_all.csv"

In [7]:
# Root directory under which all processed scenario-one outputs are written.
scenario_one_dir = "../processed_data/scenario_one"

### Split Mode: Inside (Stratified Split)

In [8]:
# Output root for the stratified inside split of scenario one; the processed
# data is saved into its "unbalanced" subdirectory.
data_out_root_dir = f"{scenario_one_dir}/inside/stratified"

In [9]:
# Load the labeled scenario-one inside flows and preview the first rows.
df = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


In [10]:
# 60/40 train/test split. Stratifying on "phase" keeps the phase proportions
# the same in both partitions; SEED makes the split reproducible.
train_size = 0.6
test_size = 1 - train_size
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df["phase"],
    random_state=SEED,
)

print("Train set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

Train set shape: (75495, 20)
Test set shape: (50330, 20)


In [11]:
# Build the feature frames for both splits. The column-type lists come from
# the training call only; the ones returned for the test split are discarded.
(
    df_train_features,
    numeric_cols,
    categorical_cols,
    ip_feature_cols,
) = prepare_data(df_train, feature_set, ip_encoding=ip_encoding)
df_test_features, _, _, _ = prepare_data(df_test, feature_set, ip_encoding=ip_encoding)

print("Numeric cols:", numeric_cols)
print("Categorical cols:", categorical_cols)
print("IP feature cols:", ip_feature_cols)

df_train_features.head()

Numeric cols: ['duration', 'orig_bytes', 'resp_bytes', 'orig_pkts', 'resp_pkts']
Categorical cols: ['proto']
IP feature cols: []


Unnamed: 0,duration,proto,orig_bytes,resp_bytes,orig_pkts,resp_pkts
30301,0.000797,udp,35,95,1,1
121252,0.113402,tcp,186,29495,13,24
103980,0.0,tcp,0,0,1,0
112109,0.0,tcp,0,0,1,0
83648,0.0,tcp,0,0,1,0


In [12]:
pipeline = construct_pipeline(numeric_cols, categorical_cols)

# Fit the pipeline on the training features only, then apply the already
# fitted pipeline to the test features.
X_train = pipeline.fit_transform(df_train_features)
X_test = pipeline.transform(df_test_features)

# Two label vectors per split: the "attack" column and the "phase" column.
y_train, y_phase_train = df_train["attack"], df_train["phase"]
y_test, y_phase_test = df_test["attack"], df_test["phase"]

print("--- Training Data ---")
print("Feature matrix shape:", X_train.shape)
print("Labels shape:", y_train.shape)
print("Phase labels shape:", y_phase_train.shape)

print("--- Test Data ---")
print("Feature matrix shape:", X_test.shape)
print("Labels shape:", y_test.shape)
print("Phase labels shape:", y_phase_test.shape)

--- Training Data ---
Feature matrix shape: (75495, 8)
Labels shape: (75495,)
Phase labels shape: (75495,)
--- Test Data ---
Feature matrix shape: (50330, 8)
Labels shape: (50330,)
Phase labels shape: (50330,)


In [13]:
# Hand the transformed splits, both label vectors, the fitted pipeline and the
# column/encoding settings to the save helper, targeting the "unbalanced"
# subdirectory of this split's output root.
save_processed_data(
    X_train,
    y_train,
    y_phase_train,
    X_test,
    y_test,
    y_phase_test,
    pipeline,
    numeric_cols,
    categorical_cols,
    ip_encoding,
    output_dir=f"{data_out_root_dir}/unbalanced",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_one/inside/stratified/unbalanced/


### Split Mode: "inside_dmz"

Train on scenario one (inside traffic), test on scenario one (DMZ traffic).

In [14]:
# Output root for the inside-train / DMZ-test split of scenario one.
data_out_root_dir = f"{scenario_one_dir}/inside_dmz"

In [15]:
# Read the scenario-one inside and DMZ flow tables, reporting each shape.
df_inside = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
print("Inside set shape:", df_inside.shape)

df_dmz = pd.read_csv(SCENARIO_ONE_DMZ_CSV)
print("DMZ set shape:", df_dmz.shape)

# Preview the DMZ table.
df_dmz.head()

Inside set shape: (125825, 20)
DMZ set shape: (45441, 20)


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.071043,172.16.112.50,43703,172.16.114.50,80,tcp,http,285,5952,7,18,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.008217,172.16.114.50,1965,172.16.115.20,53,udp,dns,88,131,2,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.021633,172.16.113.204,43704,199.95.74.90,80,tcp,http,235,406,5,5,SF,T,F,0,0,0
3,f3,952438900.0,952438900.0,0.067433,172.16.113.204,43705,199.95.74.90,80,tcp,http,236,17635,8,17,SF,T,F,0,0,0
4,f4,952438900.0,952438900.0,0.020937,172.16.113.204,43706,199.95.74.97,80,tcp,http,297,282,5,4,SF,T,F,0,0,0


In [16]:
# Inside traffic is the training set and DMZ traffic is the test set.
# Both are copied so the loaded frames stay untouched.
df_train, df_test = df_inside.copy(), df_dmz.copy()

print("Train set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

Train set shape: (125825, 20)
Test set shape: (45441, 20)


In [17]:
# One-call preprocessing for this split; results go to the "unbalanced"
# subdirectory of the split's output root.
# NOTE(review): presumably this wraps the prepare -> pipeline -> save steps
# spelled out for the stratified split — confirm in helper_func.preprocess_func.
preprocess_data(
    df_train,
    df_test,
    feature_set,
    ip_encoding,
    output_dir=f"{data_out_root_dir}/unbalanced",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_one/inside_dmz/unbalanced/


## Scenario Two

In [18]:
# Labeled flow CSVs for DARPA 2000 scenario two (inside and DMZ captures).
# The paths are relative, so they resolve against the notebook's working directory.
SCENARIO_TWO_INSIDE_CSV = "../../data/DARPA_2000/Scenario_Two/inside/inside_labeled_flows_all.csv"
SCENARIO_TWO_DMZ_CSV = "../../data/DARPA_2000/Scenario_Two/dmz/dmz_labeled_flows_all.csv"

# Root directory under which all processed scenario-two outputs are written.
scenario_two_dir = "../processed_data/scenario_two"


### Split Mode: Inside (Stratified Split)

In [19]:
# Output root for the stratified inside split of scenario two.
data_out_root_dir = f"{scenario_two_dir}/inside/stratified"

In [20]:
# Load the labeled scenario-two inside flows and preview the first rows.
# Note that this rebinds `df`, which previously held the scenario-one table.
df = pd.read_csv(SCENARIO_TWO_INSIDE_CSV)
df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,955910700.0,955910700.0,0.195749,172.16.112.149,2104,195.73.151.50,79,tcp,finger,8,141,6,5,SF,T,F,0,0,0
1,f1,955910700.0,955910700.0,0.011438,172.16.112.149,1472,172.16.115.20,53,udp,dns,35,94,1,1,SF,T,T,0,0,0
2,f2,955910700.0,955910700.0,0.003681,172.16.115.20,32780,192.168.1.50,53,udp,dns,35,94,1,1,SF,T,T,0,0,0
3,f3,955910700.0,955910700.0,0.119006,172.16.113.207,2105,207.25.71.200,80,tcp,http,261,24153,11,20,SF,T,F,0,0,0
4,f4,955910700.0,955910700.0,0.027521,172.16.113.207,2106,207.25.71.200,80,tcp,http,329,2445,5,5,SF,T,F,0,0,0


In [21]:
# 60/40 train/test split of the scenario-two inside flows. Stratifying on
# "phase" keeps the phase proportions the same in both partitions; SEED makes
# the split reproducible.
train_size = 0.6
test_size = 1 - train_size
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df["phase"],
    random_state=SEED,
)

print("Train set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

Train set shape: (47265, 20)
Test set shape: (31510, 20)


In [22]:
# One-call preprocessing of the stratified scenario-two split; results go to
# the "unbalanced" subdirectory of the split's output root.
preprocess_data(
    df_train,
    df_test,
    feature_set,
    ip_encoding,
    output_dir=f"{data_out_root_dir}/unbalanced",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_two/inside/stratified/unbalanced/


## Both Scenarios

In [23]:
# Root directory for outputs of splits that combine both scenarios.
scenarios_dir = "../processed_data/both_scenarios"

### Split Mode: "inside"

Train on scenario one (inside traffic), test on scenario two (inside traffic).

In [24]:
# Output root for the cross-scenario split built from inside traffic only.
data_out_root_dir = f"{scenarios_dir}/inside"

In [25]:
# Load the inside-traffic flow tables of both scenarios. Each shape is printed
# with a label naming its scenario so the two output lines can be told apart.
df_scenario_one_inside = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
print("Scenario one inside set shape:", df_scenario_one_inside.shape)

df_scenario_two_inside = pd.read_csv(SCENARIO_TWO_INSIDE_CSV)
print("Scenario two inside set shape:", df_scenario_two_inside.shape)


Inside set shape: (125825, 20)
Inside set shape: (78775, 20)


In [26]:
# Scenario one is the training set and scenario two is the test set.
# Both are copied so the loaded frames stay untouched.
df_train, df_test = df_scenario_one_inside.copy(), df_scenario_two_inside.copy()

In [27]:
# One-call preprocessing of the cross-scenario split; results go to the
# "unbalanced" subdirectory of the split's output root.
preprocess_data(
    df_train,
    df_test,
    feature_set,
    ip_encoding,
    output_dir=f"{data_out_root_dir}/unbalanced",
)

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/both_scenarios/inside/unbalanced/
