# Host Temporal Split

In [2]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [3]:
import pandas as pd
from helper_func.preprocess_func import preprocess_data
import json

## Load Data

In [4]:
SCENARIO_ONE_INSIDE_CSV = "../../../data/DARPA_2000/Scenario_One/inside/inside_labeled_flows_all.csv"
data_out_root_dir = "../processed_data_notebooks/scenario_one/inside/stratified/sampled"

In [5]:
df = pd.read_csv(SCENARIO_ONE_INSIDE_CSV)
df.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,...,ShADadfF,8,614,9,8261,-,6,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,...,Dd,1,72,1,159,-,17,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,...,ShAdtDFaf,12,1107,22,1382,-,6,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,...,Dd,2,146,1,159,-,17,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,...,Dd,1,73,1,159,-,17,0,0,0


## Split Data

In [6]:
def host_based_temporal_split(df, train_ratio=0.8):
    """
    Performs a host-based temporal split.
    For each (src_ip, dst_ip) pair:
        - Sort by start_time
        - First train_ratio fraction -> train
        - Remaining -> test
    """
    train_parts = []
    test_parts = []

    # Group by host pair
    grouped = df.groupby(["src_ip", "dst_ip"])

    for (src, dst), group in grouped:
        # Sort by timestamp
        group_sorted = group.sort_values("start_time")

        # Compute split index
        split_idx = int(len(group_sorted) * train_ratio)

        # Append to lists
        train_parts.append(group_sorted.iloc[:split_idx])
        test_parts.append(group_sorted.iloc[split_idx:])

    # Concatenate all host groups
    df_train = pd.concat(train_parts).reset_index(drop=True)
    df_test = pd.concat(test_parts).reset_index(drop=True)

    return df_train, df_test


In [7]:
df_train, df_test = host_based_temporal_split(df, train_ratio=0.8)

print("Train phase counts:")
print(df_train["phase"].value_counts())

print("\nTest phase counts:")
print(df_test["phase"].value_counts())


Train phase counts:
phase
0    49908
5    30743
3       32
4       10
1       10
2        8
Name: count, dtype: int64

Test phase counts:
phase
0    42064
5     3011
2       14
4       12
1       10
3        3
Name: count, dtype: int64


In [8]:
print("Number of samples before splitting:")
print(len(df))
print("Number of samples in train set:")
print(len(df_train))
print("Number of samples in test set:")
print(len(df_test))
print("Total samples after splitting:")
print(len(df_train) + len(df_test))

Number of samples before splitting:
125825
Number of samples in train set:
80711
Number of samples in test set:
45114
Total samples after splitting:
125825


## Process Data After Split

In [9]:
# Config

SEED = 123

FEATURE_LIST_FILE_NAME = f"../features_list.json"
with open(FEATURE_LIST_FILE_NAME) as f:
    feature_list = json.load(f)
print(feature_list)

ip_encoding = "none"

['duration', 'proto', 'service', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'ip_proto']


In [10]:
data_out_root_dir = "../processed_data_notebooks/scenario_one/inside/host_split/unbalanced"
output_dir=f"{data_out_root_dir}/all_phases_binary"

In [11]:
preprocess_data(df_train, df_test, feature_list, ip_encoding, output_dir, label_name="attack", save=True)
    

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/host_split/unbalanced/all_phases_binary/


## Per Phase Preprocessing

In [12]:
for phase in range(1,6): 
    column_name = f'is_phase_{phase}'
    df_train[column_name] = (df_train['phase'] == phase).astype(int)
    df_test[column_name] = (df_test['phase'] == phase).astype(int)

In [13]:
for phase in range(1,6):
    label_name = f"is_phase_{phase}"

    preprocess_data(
        df_train, df_test, feature_list, ip_encoding, 
        output_dir=f"{data_out_root_dir}/phase_{phase}", 
        save=True,
        label_name=label_name
    )

Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/host_split/unbalanced/phase_1/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/host_split/unbalanced/phase_2/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/host_split/unbalanced/phase_3/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/host_split/unbalanced/phase_4/
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/scenario_one/inside/host_split/unbalanced/phase_5/


## Balanced

In [14]:
data_out_root_dir = "../processed_data_notebooks/scenario_one/inside/host_split/sampled"

In [15]:
desired_target = 10000
print(f'Desired target: {desired_target}\n')

Desired target: 10000



In [16]:
from collections import Counter
from helper_func.sampling_func import sample_classes_random

In [17]:
for phase in range(1,6):
    print(f"Phase {phase}:")

    label_name = f"is_phase_{phase}"
    labels = df_train[label_name]

    counts = Counter(labels)
    print(f"Class distribution before sampling: {counts}")

    # Determine balancing mode
    if counts[1] < desired_target:
        mode = 'upsample'
    else:
        mode = 'downsample'
    print(f'Balancing mode: {mode}')

    df_train_upsampled, _ = sample_classes_random(
        mode=mode, 
        X=df_train, 
        y=labels, 
        phases_to_sample=[1], # phase corresponds to positive class
        desired_target=desired_target
    )

    # check_class_distribution(df_train_upsampled[label_name], phases=[0,phase], plot=True)
    
    preprocess_data(
        df_train_upsampled, df_test, feature_list, ip_encoding,
        output_dir=f"{data_out_root_dir}/phase_{phase}", 
        label_name=label_name
    )
    
    print()

Phase 1:
Class distribution before sampling: Counter({0: 80701, 1: 10})
Balancing mode: upsample


TypeError: sample_classes_random() got an unexpected keyword argument 'phases_to_sample'

## Pre-Phases Classification

In [None]:
pre_phases = [1, 2, 3, 4]
# pre_phases = [1, 2, 3]
pre_column_name = f'pre_phase'

df_train[pre_column_name] = df_train['phase'].isin(pre_phases).astype(int)
df_test[pre_column_name] = df_test['phase'].isin(pre_phases).astype(int)


In [None]:
print("Train Data:\n")
print("Pre-Phases class distribution before sampling:")
print(df_train[pre_column_name].value_counts())
print()

print("Test Data:\n")
print("Pre-Phases class distribution before sampling:")
print(df_test[pre_column_name].value_counts())
print()

Train Data:

Pre-Phases class distribution before sampling:
pre_phase
0    80651
1       60
Name: count, dtype: int64

Test Data:

Pre-Phases class distribution before sampling:
pre_phase
0    45075
1       39
Name: count, dtype: int64



In [None]:
desired_target = 1000

In [None]:
# Sample data
for phase in range(1,6): 
    print(f"Phase {phase}:")

    label_name = f"is_phase_{phase}"
    labels = df_train[label_name]

    counts = Counter(labels)
    print(f"Class distribution before sampling: {counts}")

    # Determine balancing mode
    if counts[1] < desired_target:
        mode = 'upsample'
    else:
        mode = 'downsample'
    print(f'Balancing mode: {mode}')

    df_train_sampled, _ = sample_classes_random(
        mode=mode, 
        X=df_train, 
        y=labels, 
        phases_to_sample=[1], # phase corresponds to positive class
        desired_target=desired_target
    )

Phase 1:
Class distribution before sampling: Counter({0: 80701, 1: 10})
Balancing mode: upsample
Before sampling: Counter({0: 80701, 1: 10})
Sampling strategy (per-phase target): {1: 1000}
After sampling: Counter({0: 80701, 1: 1000})
Phase 2:
Class distribution before sampling: Counter({0: 80703, 1: 8})
Balancing mode: upsample
Before sampling: Counter({0: 80703, 1: 8})
Sampling strategy (per-phase target): {1: 1000}
After sampling: Counter({0: 80703, 1: 1000})
Phase 3:
Class distribution before sampling: Counter({0: 80679, 1: 32})
Balancing mode: upsample
Before sampling: Counter({0: 80679, 1: 32})
Sampling strategy (per-phase target): {1: 1000}
After sampling: Counter({0: 80679, 1: 1000})
Phase 4:
Class distribution before sampling: Counter({0: 80701, 1: 10})
Balancing mode: upsample
Before sampling: Counter({0: 80701, 1: 10})
Sampling strategy (per-phase target): {1: 1000}
After sampling: Counter({0: 80701, 1: 1000})
Phase 5:
Class distribution before sampling: Counter({0: 49968, 1:

In [None]:
# Pre-Phases
preprocess_data(
    df_train_sampled, df_test, feature_list, ip_encoding,
    output_dir=f"{data_out_root_dir}/pre_phases", 
    label_name=pre_column_name
)


Saved X, y, y_phase, and preprocessing pipeline to ../processed_data/scenario_one/inside/host_split/sampled/pre_phases/
