# Preprocess (Per-Phase) Data

When running this notebook, the output will be 5 different datasets, each corresponding to a different phase of the multi-step attack present in DARPA 2000. Each dataset will contain only the attack samples related to that specific phase, and all the benign samples present in the dataset.

In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [2]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from collections import Counter

from helper_func.preprocess_func import preprocess_data
from helper_func.sampling_func import (
    check_class_distribution,
    sample_classes_random
)

In [3]:
# Config

SEED = 123

FEATURE_LIST_FILE_NAME = f"../features_list.json"
with open(FEATURE_LIST_FILE_NAME) as f:
    feature_list = json.load(f)
print(feature_list)

ip_encoding = "none"

['duration', 'proto', 'service', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'ip_proto']


## Load Data

In [4]:
SCENARIO_ONE_INSIDE_CSV = "../../../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv"
data_out_root_dir = "../processed_data_notebooks/scenario_one/inside/stratified/unbalanced"

In [5]:
df = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,...,ShADadfF,8,614,9,8261,-,6,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,...,Dd,1,72,1,159,-,17,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,...,ShAdtDFaf,12,1107,22,1382,-,6,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,...,Dd,2,146,1,159,-,17,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,...,Dd,1,73,1,159,-,17,0,0,0


## Prepare New Label Columns

In [6]:
df_w_labels = df.copy()
df_w_labels.head()
for phase in range(1,6): 
    column_name = f'is_phase_{phase}'
    df_w_labels[column_name] = (df_w_labels['phase'] == phase).astype(int)

In [7]:
df_w_labels.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,tunnel_parents,ip_proto,attack_id,attack,phase,is_phase_1,is_phase_2,is_phase_3,is_phase_4,is_phase_5
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,...,-,6,0,0,0,0,0,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,...,-,17,0,0,0,0,0,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,...,-,6,0,0,0,0,0,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,...,-,17,0,0,0,0,0,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,...,-,17,0,0,0,0,0,0,0,0


## Create Per-Phase Datasets

In [8]:
train_size = 0.6
test_size = 1 - train_size

In [9]:
for phase in range(1,6):
    label_name = f"is_phase_{phase}"

    # Split data
    df_train, df_test = train_test_split(
        df_w_labels, test_size=test_size, stratify=df_w_labels[label_name], random_state=SEED 
    )

    preprocess_data(
        df_train, df_test, feature_list, ip_encoding, 
        output_dir=f"{data_out_root_dir}/phase_{phase}", 
        save=True,
        label_name=label_name
    )

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/unbalanced/phase_1/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/unbalanced/phase_2/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/unbalanced/phase_3/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/unbalanced/phase_4/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/unbalanced/phase_5/


## Dataset Balancing 

### Data Exploration

In [10]:
# Check class distributions
for phase in range(1,6):
    print(f"Phase {phase}:")
    labels = df_w_labels[f"is_phase_{phase}"]
    counts = Counter(labels)
    print(counts)
    print()

Phase 1:
Counter({0: 125805, 1: 20})

Phase 2:
Counter({0: 125803, 1: 22})

Phase 3:
Counter({0: 125790, 1: 35})

Phase 4:
Counter({0: 125803, 1: 22})

Phase 5:
Counter({0: 92071, 1: 33754})



### Sample Data

In [11]:
data_out_root_dir = "../processed_data_notebooks/scenario_one/inside/stratified/sampled"

In [12]:
desired_target = 10000
print(f'Desired target: {desired_target}\n')

Desired target: 10000



In [13]:
for phase in range(1,6):
    print(f"Phase {phase}:")

    label_name = f"is_phase_{phase}"
    labels = df_train[label_name]

    counts = Counter(labels)
    print(f"Class distribution before sampling: {counts}")

    # Determine balancing mode
    if counts[1] < desired_target:
        mode = 'upsample'
    else:
        mode = 'downsample'
    print(f'Balancing mode: {mode}')

    df_train_upsampled, _ = sample_classes_random(
        mode=mode, 
        X=df_train, 
        y=labels,
        desired_target=desired_target,
        attack_phases=[1] # only upsample attack class
    )

    # check_class_distribution(df_train_upsampled[label_name], phases=[0,phase], plot=True)
    
    preprocess_data(
        df_train_upsampled, df_test, feature_list, ip_encoding,
        output_dir=f"{data_out_root_dir}/phase_{phase}", 
        label_name=label_name
    )
    
    print()

Phase 1:
Class distribution before sampling: Counter({0: 75485, 1: 10})
Balancing mode: upsample
Before sampling: Counter({0: 75485, 1: 10})
Sampling strategy (per-phase target): {1: 10000}
After sampling: Counter({0: 75485, 1: 10000})
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/sampled/phase_1/

Phase 2:
Class distribution before sampling: Counter({0: 75480, 1: 15})
Balancing mode: upsample
Before sampling: Counter({0: 75480, 1: 15})
Sampling strategy (per-phase target): {1: 10000}
After sampling: Counter({0: 75480, 1: 10000})
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/stratified/sampled/phase_2/

Phase 3:
Class distribution before sampling: Counter({0: 75474, 1: 21})
Balancing mode: upsample
Before sampling: Counter({0: 75474, 1: 21})
Sampling strategy (per-phase target): {1: 10000}
After sampling: Counter({0: 75474, 1: 10000})
Saved X, y, y_phase, and preprocessi