# Temporal Split

In [1]:
import sys
import os
# Append the absolute path of the parent directory to the module search path.
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)

In [None]:
import pandas as pd
import json

In [None]:
from helper_func.preprocess_func import prepare_features
from helper_func.data_split_func import prepare_phase_dataset, build_sequences, temporal_split_with_attack
from helper_func.preprocess_func import construct_pipeline, save_dense_data

In [4]:
# Config
# NOTE(review): SEED is defined here but is not referenced by any other cell
# shown in this notebook — confirm whether a helper is meant to receive it.
SEED = 123

In [5]:
# Read the labeled flows CSV into a DataFrame and preview its first five rows.
DATASET_PATH = (
    "../../../data/DARPA_2000/Scenario_One/inside/"
    "inside_labeled_flows_all.csv"
)
df = pd.read_csv(filepath_or_buffer=DATASET_PATH)
df.head(n=5)

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,...,ShADadfF,8,614,9,8261,-,6,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,...,Dd,1,72,1,159,-,17,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,...,ShAdtDFaf,12,1107,22,1382,-,6,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,...,Dd,2,146,1,159,-,17,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,...,Dd,1,73,1,159,-,17,0,0,0


In [6]:
# Load the feature list from JSON and print it for reference.
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
FEATURE_LIST_FILE_NAME = "../features_list.json"
# Explicit encoding so the read does not depend on the platform's locale default.
with open(FEATURE_LIST_FILE_NAME, encoding="utf-8") as f:
    feature_list = json.load(f)
print(feature_list)

# IP-encoding option passed to prepare_features and save_dense_data in later cells.
ip_encoding = "none"

['duration', 'proto', 'service', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'ip_proto']


## Test for phase 2

In [7]:
# Phase number used by the single-phase walkthrough in the cells that follow.
target_phase = 2

In [8]:
# Build the dataset for the selected phase; the preview shows an added `y` column.
# NOTE(review): `y` presumably flags flows that belong to `target_phase` —
# confirm against prepare_phase_dataset.
df_phase = prepare_phase_dataset(df, target_phase)
df_phase.head()

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,...,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto,attack_id,attack,phase,y
0,f12289,952438900.0,952439500.0,639.014688,172.16.113.168,21582,172.16.112.50,23,tcp,-,...,652,26396,359,37644,-,6,0,0,0,0
1,f2358,952438900.0,952439100.0,156.104433,172.16.113.204,22585,197.218.177.69,21,tcp,ftp,...,41,1981,30,2173,-,6,0,0,0,0
2,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,...,8,614,9,8261,-,6,0,0,0,0
3,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,...,1,72,1,159,-,17,0,0,0,0
4,f23526,952438900.0,952441200.0,2245.862151,194.7.248.153,43459,172.16.115.20,23,tcp,-,...,199,10463,103,5600,-,6,0,0,0,0


In [None]:
# Prepare data
# Restrict the phase dataset to the columns in feature_list (see preview) and
# collect the column groupings consumed by construct_pipeline / save_dense_data.
# ip_feature_cols is not used by any later cell shown in this notebook.
df_phase_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_features(df_phase, feature_list, ip_encoding)
df_phase_features.head()

Unnamed: 0,duration,proto,service,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto
0,639.014688,tcp,-,316,23284,SF,T,T,0,DdAafF,652,26396,359,37644,-,6
1,156.104433,tcp,ftp,341,973,SF,T,F,0,DdAFaf,41,1981,30,2173,-,6
2,0.1018,tcp,http,290,7897,SF,T,T,0,ShADadfF,8,614,9,8261,-,6
3,0.001107,udp,dns,44,131,SF,T,T,0,Dd,1,72,1,159,-,17
4,2245.862151,tcp,-,115,244,OTH,F,T,0,DdA,199,10463,103,5600,-,6


In [10]:
# Build the preprocessing pipeline and fit/transform the whole phase dataset.
# NOTE(review): the pipeline is fit on every row before the train/test split is
# made later in the notebook. If it learns statistics from the data (scaling,
# category vocabularies — not visible here), the test portion influences the
# fitted preprocessing. Confirm whether fitting on the training portion only is intended.
pipeline = construct_pipeline(numeric_cols, categorical_cols)
X = pipeline.fit_transform(df_phase_features)

In [11]:
# Target labels taken from the `y` column; show how many rows fall in each class.
y = df_phase["y"]
y.value_counts()

y
0    125803
1        22
Name: count, dtype: int64

In [12]:
# Per-row phase labels taken from the `phase` column; show the row count per phase.
y_phase = df_phase["phase"]
y_phase.value_counts()

phase
0    91972
5    33754
3       35
2       22
4       22
1       20
Name: count, dtype: int64

In [13]:
# Build sequence-level features and labels from X, y and y_phase with a window of 5.
# Keep window_size in sync with the value passed to temporal_split_with_attack.
X_sequences, y_sequences, y_phase_sequences = build_sequences(X, y, y_phase, window_size=5)

In [14]:
# Sum of the sequence-level labels (the number of positive sequences if labels are 0/1).
print(y_sequences.sum())

22


In [15]:
# Split the sequences into train and test parts with test_ratio=0.4, using the
# same window_size that was given to build_sequences.
X_train, X_test, y_train, y_test, y_phase_train, y_phase_test = temporal_split_with_attack(X_sequences, y_sequences, y_phase_sequences, test_ratio=0.4, window_size=5)

In [16]:
# Label sums for the train split (first line) and the test split (second line).
print(y_train.sum())
print(y_test.sum())

14
8


In [17]:
# Output directory for this phase's processed data.
# No trailing slash: this matches the path built in the all-phases loop and
# avoids the doubled "//" in the path reported when the data is saved.
output_dir = f"../processed_data_notebooks/temporal_split/phase_{target_phase}"

In [18]:
# Write the train/test splits, their labels and the preprocessing pipeline to output_dir.
save_dense_data(
    X_train, y_train, y_phase_train,
    X_test, y_test, y_phase_test,
    pipeline, numeric_cols, categorical_cols, ip_encoding,
    output_dir=output_dir,
)


Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/temporal_split/phase_2//


## All datasets

In [19]:
# Root directory under which each phase's processed data is written.
# Plain string literal: there are no placeholders, so no f-prefix is needed.
root_out_dir = "../processed_data_notebooks/temporal_split"

In [20]:
# Sequence window length and test fraction shared by every phase in the loop.
# One definition keeps build_sequences and temporal_split_with_attack in agreement.
WINDOW_SIZE = 5
TEST_RATIO = 0.4

# Repeat the single-phase preparation for phases 1 through 5 and save each result.
for target_phase in range(1, 6):
    print(f"Preparing dataset for phase {target_phase}...")

    df_phase = prepare_phase_dataset(df, target_phase)

    # Prepare data.
    # Uses prepare_features, the helper this notebook imports; the earlier call to
    # `prepare_data` referenced a name that is never imported or defined here and
    # fails with NameError on a fresh kernel.
    df_phase_features, numeric_cols, categorical_cols, ip_feature_cols = prepare_features(
        df_phase, feature_list, ip_encoding
    )

    # NOTE(review): the pipeline is fit on all rows before the train/test split;
    # confirm whether fitting on the training portion only is intended.
    pipeline = construct_pipeline(numeric_cols, categorical_cols)
    X = pipeline.fit_transform(df_phase_features)

    y = df_phase["y"]
    y_phase = df_phase["phase"]

    X_sequences, y_sequences, y_phase_sequences = build_sequences(
        X, y, y_phase, window_size=WINDOW_SIZE
    )

    X_train, X_test, y_train, y_test, y_phase_train, y_phase_test = temporal_split_with_attack(
        X_sequences, y_sequences, y_phase_sequences,
        test_ratio=TEST_RATIO, window_size=WINDOW_SIZE,
    )
    # Label sums for the train split, then the test split.
    print(y_train.sum())
    print(y_test.sum())

    # Save processed data to disk
    save_dense_data(
        X_train, y_train, y_phase_train, X_test, y_test, y_phase_test,
        pipeline, numeric_cols, categorical_cols, ip_encoding,
        output_dir=f"{root_out_dir}/phase_{target_phase}"
    )

Preparing dataset for phase 1...
12
8
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/temporal_split/phase_1/
Preparing dataset for phase 2...
14
8
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/temporal_split/phase_2/
Preparing dataset for phase 3...
19
16
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/temporal_split/phase_3/
Preparing dataset for phase 4...
15
7
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/temporal_split/phase_4/
Preparing dataset for phase 5...
20221
13533
Saved X, y, y_phase, and preprocessing pipeline to ../processed_data_notebooks/temporal_split/phase_5/
