# Feature Exploration

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Data Preparation

### Load Labeled Data

In [49]:
# Relative path to the CSV file holding the labeled flow records.
LABELED_FLOWS_CSV = "../../data/DARPA_2000/inside/inside_labeled_flows_all.csv"

# Read all labeled flows into one frame, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=LABELED_FLOWS_CSV)
df.head(n=5)

Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


### Split Data into Train and Test Sets

In [71]:
# For now: positional 60/40 train-test split
# (the first 60% of the rows become the train set, the rest the test set; no shuffling).
split_ratio = 0.6
split_index = int(split_ratio * len(df))

# Cut the frame at split_index and renumber each part from 0.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in (df.iloc[:split_index], df.iloc[split_index:])
)

print("Train set shape:", df_train.shape)
print("Test set shape:", df_test.shape)

# Preview both parts, train first.
print(df_train.head(), df_test.head(), sep="\n")

Train set shape: (75495, 20)
Test set shape: (50330, 20)
  flow_id    start_time      end_time  duration          src_ip  sport  \
0      f0  9.524389e+08  9.524389e+08  0.101800   172.16.112.50  33354   
1      f1  9.524389e+08  9.524389e+08  0.001107   172.16.114.50  45135   
2      f2  9.524389e+08  9.524389e+08  0.064488  172.16.113.204  22641   
3      f3  9.524389e+08  9.524389e+08  0.001167  172.16.112.100   1061   
4      f4  9.524389e+08  9.524389e+08  0.000861  172.16.113.204   1438   

           dst_ip  dport proto service  orig_bytes  resp_bytes  orig_pkts  \
0   172.16.114.50     80   tcp    http         290        7897          8   
1   172.16.115.20     53   udp     dns          44         131          1   
2  172.16.112.100     25   tcp    smtp         623         247         12   
3   172.16.115.20     53   udp     dns          90         131          2   
4   172.16.115.20     53   udp     dns          45         131          1   

   resp_pkts conn_state local_orig 

### IP Encoding Methods

In [72]:
def encode_ip_none(df):
    """Strip the raw IP columns; this strategy contributes no IP features.

    Returns a ``(frame, None)`` pair: a new frame without ``src_ip`` and
    ``dst_ip``, and ``None`` in place of a list of IP feature columns.
    The input frame is left untouched.
    """
    ip_columns = ["src_ip", "dst_ip"]
    without_ips = df.copy().drop(columns=ip_columns)
    return without_ips, None

def encode_ip_integer(df):
    """Replace the dotted-quad IP columns with integer columns.

    Each IPv4 address ``a.b.c.d`` becomes ``a*256^3 + b*256^2 + c*256 + d``.
    Values that are not valid IPv4 dotted quads (missing values, non-strings,
    the wrong number of parts, non-numeric parts, or an octet outside 0-255)
    are encoded as 0, so a single malformed row does not abort the encoding.
    Note that the valid address ``0.0.0.0`` also maps to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``src_ip`` and ``dst_ip`` columns. It is not modified.

    Returns
    -------
    tuple
        ``(frame, ["src_ip_int", "dst_ip_int"])`` where ``frame`` is a copy of
        ``df`` with the two integer columns added and the original
        ``src_ip`` / ``dst_ip`` columns removed.
    """
    def ip_to_int(ip):
        try:
            octets = [int(part) for part in ip.split(".")]
        except (AttributeError, TypeError, ValueError):
            # Not a string (e.g. NaN / None) or a part is not an integer.
            # Only these expected failures are mapped to 0; anything else
            # (KeyboardInterrupt, MemoryError, ...) propagates.
            return 0
        # Reject malformed addresses instead of producing a number that
        # does not correspond to any IPv4 address.
        if len(octets) != 4 or any(not 0 <= octet <= 255 for octet in octets):
            return 0
        a, b, c, d = octets
        return (a << 24) | (b << 16) | (c << 8) | d

    df = df.copy()
    df["src_ip_int"] = df["src_ip"].apply(ip_to_int)
    df["dst_ip_int"] = df["dst_ip"].apply(ip_to_int)

    return df.drop(columns=["src_ip", "dst_ip"]), ["src_ip_int", "dst_ip_int"]

def encode_ip_onehot(df):
    """
    Select the raw IP columns as one-hot candidates.

    No encoding is performed here: the frame comes back as a copy with
    ``src_ip`` / ``dst_ip`` still in place, together with the names of those
    two columns.
    WARNING: Very large dimensionality. Use only for small subsets.
    """
    onehot_columns = ["src_ip", "dst_ip"]
    return df.copy(), onehot_columns

# Registry mapping each IP-encoding name to the function implementing it.
# Every encoder takes a frame and returns (new_frame, ip_feature_columns_or_None).
IP_ENCODERS = dict(
    none=encode_ip_none,
    integer=encode_ip_integer,
    onehot=encode_ip_onehot,
)

In [73]:
# Demo of the "none" strategy: both IP columns are removed from the frame.
ip_encoding = "none"  # Example setting
if ip_encoding in IP_ENCODERS:
    df_ip_none, ip_feature_cols = IP_ENCODERS[ip_encoding](df)
else:
    raise ValueError(f"Unknown IP encoding: {ip_encoding}")

print("IP feature columns:", ip_feature_cols)
df_ip_none.head()

IP feature columns: None


Unnamed: 0,flow_id,start_time,end_time,duration,sport,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,33354,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,45135,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,22641,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,1061,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,1438,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


In [74]:
# Demo of the "integer" strategy: each IP column is replaced by an integer column.
ip_encoding = "integer"  # Example setting
if ip_encoding in IP_ENCODERS:
    df_ip_int, ip_feature_cols = IP_ENCODERS[ip_encoding](df)
else:
    raise ValueError(f"Unknown IP encoding: {ip_encoding}")

print("IP feature columns:", ip_feature_cols)
df_ip_int.head()

IP feature columns: ['src_ip_int', 'dst_ip_int']


Unnamed: 0,flow_id,start_time,end_time,duration,sport,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase,src_ip_int,dst_ip_int
0,f0,952438900.0,952438900.0,0.1018,33354,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0,2886758450,2886758962
1,f1,952438900.0,952438900.0,0.001107,45135,53,udp,dns,44,131,1,1,SF,T,T,0,0,0,2886758962,2886759188
2,f2,952438900.0,952438900.0,0.064488,22641,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0,2886758860,2886758500
3,f3,952438900.0,952438900.0,0.001167,1061,53,udp,dns,90,131,2,1,SF,T,T,0,0,0,2886758500,2886759188
4,f4,952438900.0,952438900.0,0.000861,1438,53,udp,dns,45,131,1,1,SF,T,T,0,0,0,2886758860,2886759188


In [75]:
# Demo of the "onehot" strategy: the IP columns stay in the frame and are only
# named as the columns to be one-hot encoded later.
ip_encoding = "onehot"  # Example setting
if ip_encoding not in IP_ENCODERS:
    raise ValueError(f"Unknown IP encoding: {ip_encoding}")

# Store the result under its own name so the "none" result (df_ip_none) is not
# overwritten by a frame that still contains the IP columns.
df_ip_onehot, ip_feature_cols = IP_ENCODERS[ip_encoding](df)
print("IP feature columns:", ip_feature_cols)
df_ip_onehot.head()

IP feature columns: ['src_ip', 'dst_ip']


Unnamed: 0,flow_id,start_time,end_time,duration,src_ip,sport,dst_ip,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp,attack_id,attack,phase
0,f0,952438900.0,952438900.0,0.1018,172.16.112.50,33354,172.16.114.50,80,tcp,http,290,7897,8,9,SF,T,T,0,0,0
1,f1,952438900.0,952438900.0,0.001107,172.16.114.50,45135,172.16.115.20,53,udp,dns,44,131,1,1,SF,T,T,0,0,0
2,f2,952438900.0,952438900.0,0.064488,172.16.113.204,22641,172.16.112.100,25,tcp,smtp,623,247,12,22,SF,T,T,0,0,0
3,f3,952438900.0,952438900.0,0.001167,172.16.112.100,1061,172.16.115.20,53,udp,dns,90,131,2,1,SF,T,T,0,0,0
4,f4,952438900.0,952438900.0,0.000861,172.16.113.204,1438,172.16.115.20,53,udp,dns,45,131,1,1,SF,T,T,0,0,0


### Build Feature Sets

In [76]:
def build_feature_matrix(df, ip_encoding="none"):
    """
    Construct ML-ready features from labeled flow data.

    Parameters
    ----------
    df : pandas.DataFrame
        Labeled flows. Must contain the label column ``attack``, the metadata
        columns ``flow_id``, ``attack_id`` and ``phase``, the IP columns
        ``src_ip`` / ``dst_ip``, and every column named in the categorical and
        numerical lists below. The input frame is not modified.
    ip_encoding : str, default "none"
        Key into ``IP_ENCODERS`` selecting how the IP columns are handled.

    Returns
    -------
    tuple
        ``(X, y, pipeline, numeric_cols, categorical_cols, df)`` where
        ``X`` is the output of ``pipeline.fit_transform``, ``y`` is the
        ``attack`` column cast to int, ``pipeline`` is the fitted pipeline,
        the two lists name the columns that were scaled / one-hot encoded, and
        ``df`` is the frame the pipeline was fitted on (label and metadata
        columns dropped, IP encoding applied).

    Raises
    ------
    ValueError
        If ``ip_encoding`` is not a key of ``IP_ENCODERS``.

    Notes
    -----
    The IP encoding runs before the returned pipeline, not inside it. A frame
    passed to ``pipeline.transform`` must therefore already contain the columns
    in ``numeric_cols`` and ``categorical_cols``; for ``"integer"`` this means
    applying ``IP_ENCODERS["integer"]`` to it first.
    """

    df = df.copy()

    # Label = binary attack classification (1 = attack, 0 = benign)
    y = df["attack"].astype(int)

    # Drop the label and the metadata fields so they are not used as features
    df = df.drop(columns=["flow_id", "attack_id", "phase", "attack"])

    # ----------------------------------------------------------------------
    # 1) Handle IP address fields
    # ----------------------------------------------------------------------
    if ip_encoding not in IP_ENCODERS:
        raise ValueError(f"Unknown IP encoding: {ip_encoding}")

    df, ip_feature_cols = IP_ENCODERS[ip_encoding](df)

    # ----------------------------------------------------------------------
    # 2) Categorical features
    # ----------------------------------------------------------------------
    categorical_cols = [
        "proto",      # categorical: "tcp", "udp", "icmp", etc.
        "service",    # categorical: "http", "ftp", "dns", etc.
        "conn_state", # categorical: "S0", "S1", "SF", etc.
        "local_orig", # binary flags
        "local_resp", # binary flags
    ]

    # ----------------------------------------------------------------------
    # 3) Numerical features
    # ----------------------------------------------------------------------
    numeric_cols = [
        "duration",
        "orig_bytes",
        "resp_bytes",
        "orig_pkts",
        "resp_pkts",
    ]

    port_cols = ["sport", "dport"] # should ports be scaled or not?
    numeric_cols += port_cols

    # ----------------------------------------------------------------------
    # 4) IP features: route each column by its dtype
    # ----------------------------------------------------------------------
    # Integer-encoded IPs can be scaled (does it make sense to scale IP
    # features?). Raw string IPs (the "onehot" strategy) cannot go through
    # StandardScaler, so they are sent to the one-hot encoder instead.
    for col in ip_feature_cols or []:
        if pd.api.types.is_numeric_dtype(df[col]):
            numeric_cols.append(col)
        else:
            categorical_cols.append(col)

    # ----------------------------------------------------------------------
    # ColumnTransformer: one-hot + normalization
    # ----------------------------------------------------------------------
    # Columns not listed in either group are left out of X by the
    # ColumnTransformer's default remainder handling.
    transformer = ColumnTransformer(
        transformers=[
            ("numerical", StandardScaler(), numeric_cols),
            ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ]
    )

    pipeline = Pipeline(steps=[("transform", transformer)])

    X = pipeline.fit_transform(df)

    return X, y, pipeline, numeric_cols, categorical_cols, df

In [79]:
ip_encoding = "none"
# The data was split beforehand, so the feature pipeline is fitted on the
# training split only.
(
    X_train,
    y_train,
    pipeline,
    numeric_cols,
    categorical_cols,
    df_train_unprocessed,
) = build_feature_matrix(df_train, ip_encoding=ip_encoding)

print("Feature matrix shape:", X_train.shape)
print("Labels shape:", y_train.shape)

df_train_unprocessed.head()

Feature matrix shape: (75495, 36)
Labels shape: (75495,)


Unnamed: 0,start_time,end_time,duration,sport,dport,proto,service,orig_bytes,resp_bytes,orig_pkts,resp_pkts,conn_state,local_orig,local_resp
0,952438900.0,952438900.0,0.1018,33354,80,tcp,http,290,7897,8,9,SF,T,T
1,952438900.0,952438900.0,0.001107,45135,53,udp,dns,44,131,1,1,SF,T,T
2,952438900.0,952438900.0,0.064488,22641,25,tcp,smtp,623,247,12,22,SF,T,T
3,952438900.0,952438900.0,0.001167,1061,53,udp,dns,90,131,2,1,SF,T,T
4,952438900.0,952438900.0,0.000861,1438,53,udp,dns,45,131,1,1,SF,T,T


In [83]:
# Validation or test sets are processed with the pipeline returned above.
# Passing df_test directly works here because ip_encoding is `none` for both
# fitting and transforming.
X_test, y_test = pipeline.transform(df_test), df_test["attack"].astype(int)

print("Test feature matrix shape:", X_test.shape)
print("Test labels shape:", y_test.shape)

Test feature matrix shape: (50330, 36)
Test labels shape: (50330,)


## Future Remarks and Next Steps

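Possible follow-ups for feature engineering:
- Ports: bucket ports into ranges (e.g., well-known, registered, dynamic) instead of scaling the raw port numbers
- Timestamps: decide how to treat `start_time` and `end_time`, which are not yet part of the feature matrix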