In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===============================
# File paths
# ===============================
# DGA captures: one "HKD" set plus two sets stored under DGA_Google and DGA_CF.
dga_malware_file = "/home/ubuntu/DoH_DGA_training/datasets/DGA_HKD/all_pcaps_allN.csv"
doh_tunnel_file = "/home/ubuntu/DoH_DGA_training/datasets/DoH_HKD/DoH-Pcaps/all_pcaps_allN.csv"
dga_malware_synthetic1 = "/home/ubuntu/DoH_DGA_training/datasets/DGA_Google/all_pcaps_allN.csv"
dga_malware_synthetic2 = "/home/ubuntu/DoH_DGA_training/datasets/DGA_CF/all_pcaps_allN.csv"

# Benign captures, split into non-DoH and DoH traffic.
nondoh_benign_file = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/all_nondoh.csv"
doh_benign_file = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/all_doh.csv"

# ===============================
# Load datasets and tag each with its class label
# ===============================
# `.assign(label=...)` adds the constant class label as a column right after loading.
df_dga_malware = pd.read_csv(dga_malware_file).assign(label="DGA_MALWARE")

# The two synthetic DGA files are stacked into a single frame before labelling.
df_dga_synthetic1 = pd.read_csv(dga_malware_synthetic1)
df_dga_synthetic2 = pd.read_csv(dga_malware_synthetic2)
df_dga_synthetic = pd.concat(
    [df_dga_synthetic1, df_dga_synthetic2],
    ignore_index=True,
).assign(label="DGA_Synthetic")

df_doh_tunnel = pd.read_csv(doh_tunnel_file).assign(label="DoH_Tunnel")
df_nondoh_benign = pd.read_csv(nondoh_benign_file).assign(label="NonDoH_Benign")
df_doh_benign = pd.read_csv(doh_benign_file).assign(label="DoH_Benign")

# ===============================
# Combine all into one DataFrame
# ===============================
# Insertion order of this dict fixes both the row order of df_all and the
# order of the per-class shape printout below.
labelled_frames = {
    "DGA_MALWARE": df_dga_malware,
    "DGA_Synthetic": df_dga_synthetic,
    "DoH_Tunnel": df_doh_tunnel,
    "NonDoH_Benign": df_nondoh_benign,
    "DoH_Benign": df_doh_benign,
}
df_all = pd.concat(list(labelled_frames.values()), ignore_index=True)

for class_name, class_frame in labelled_frames.items():
    print(f"{class_name} shape:", class_frame.shape)

print("\nFinal dataset shape:", df_all.shape)
print("Labels distribution:\n", df_all["label"].value_counts())
print("Columns:\n", list(df_all.columns))


DGA_MALWARE shape: (9012, 35)
DGA_Synthetic shape: (203428, 35)
DoH_Tunnel shape: (6008, 36)
NonDoH_Benign shape: (2139204, 36)
DoH_Benign shape: (116412, 36)

Final dataset shape: (2474064, 37)
Labels distribution:
 label
NonDoH_Benign    2139204
DGA_Synthetic     203428
DoH_Benign        116412
DGA_MALWARE         9012
DoH_Tunnel          6008
Name: count, dtype: int64
Columns:
 ['session_id', 'client_ip', 'server_ip', 'client_port', 'server_port', 'protocol', 'N', 'n_client', 'client_bytes', 'client_pkt_min', 'client_pkt_mean', 'client_pkt_max', 'client_iat_min', 'client_iat_mean', 'client_iat_max', 'n_server', 'server_bytes', 'server_pkt_min', 'server_pkt_mean', 'server_pkt_max', 'server_iat_min', 'server_iat_mean', 'server_iat_max', 'pkt_fraction_client', 'bytes_fraction_client', 'flow_duration', 'time_first_response', 'dir_switches', 'size_min', 'size_mean', 'size_max', 'iat_min', 'iat_mean', 'iat_max', 'label', 'pcap_file', 'source_pcap']


In [2]:
N_value = 16   # <-- change this to 8, 16, 32, 64 as needed
# Work on an independent copy of the rows recorded for the chosen N.
df_subset = df_all[df_all["N"] == N_value].copy()

print(f"\nTraining on N={N_value}, dataset size={df_subset.shape[0]}")

# Separate features and target
y = df_subset["label"]
# Drop the target plus identifier/bookkeeping columns; errors="ignore" tolerates
# columns that are absent from the frame.
X = df_subset.drop(columns=[
    "label", 'session_id', 'client_ip', 'server_ip', 
    'client_port', 'server_port', 'protocol', 'N',
    'pcap_file', 'source_pcap'
], errors="ignore")
# Keep only numeric features
X = X.select_dtypes(include=["int64", "float64"])
# Capture the feature names only AFTER the numeric filter so the list always
# matches the columns of X (taking it earlier could include non-numeric
# columns that select_dtypes subsequently removes).
feature_names = X.columns.tolist()

print("Remaining feature columns:\n", list(X.columns))
print("Target classes:", y.unique())


Training on N=16, dataset size=618516
Remaining feature columns:
 ['n_client', 'client_bytes', 'client_pkt_min', 'client_pkt_mean', 'client_pkt_max', 'client_iat_min', 'client_iat_mean', 'client_iat_max', 'n_server', 'server_bytes', 'server_pkt_min', 'server_pkt_mean', 'server_pkt_max', 'server_iat_min', 'server_iat_mean', 'server_iat_max', 'pkt_fraction_client', 'bytes_fraction_client', 'flow_duration', 'time_first_response', 'dir_switches', 'size_min', 'size_mean', 'size_max', 'iat_min', 'iat_mean', 'iat_max']
Target classes: ['DGA_MALWARE' 'DGA_Synthetic' 'DoH_Tunnel' 'NonDoH_Benign' 'DoH_Benign']


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

# ---- Config ----
# Each value of N is run as a separate experiment.
N_values = [8, 16, 32, 64]
# Fraction of each stage's rows held out for evaluation.
test_size = 0.2
# Shared seed for the train/test split and the MLP.
random_state = 42

# MLP config (match your reduced model)
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver="sgd"; confirm whether "adaptive" has any effect with solver="adam".
mlp_params = {
    "hidden_layer_sizes": (64, 32),
    "activation": "relu",
    "solver": "adam",
    "alpha": 1e-4,
    "batch_size": 64,
    "learning_rate": "adaptive",
    "learning_rate_init": 0.001,
    "max_iter": 500,
    "shuffle": True,
    "early_stopping": True,
    "validation_fraction": 0.1,
    "n_iter_no_change": 20,
    "random_state": random_state,
    "verbose": False,
}

# columns to drop from features (identifiers etc)
DROP_COLS = [
    "label",
    "session_id",
    "client_ip",
    "server_ip",
    "client_port",
    "server_port",
    "protocol",
    "N",
    "pcap_file",
    "source_pcap",
]

# ---- Functions to prepare datasets for each stage ----
def make_stage_A(df):
    """Stage A: DoH vs NonDoH.

    Keeps the rows whose ``label`` is one of the five known classes and adds
    a ``stage_label`` column: ``"DoH"`` for labels starting with ``"DoH_"``,
    ``"NonDoH"`` for everything else.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string ``label`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected rows (original index preserved)
        with the extra ``stage_label`` column.
    """
    known_labels = [
        "DoH_Benign", "DoH_Tunnel", "NonDoH_Benign", "DGA_Synthetic", "DGA_MALWARE"
    ]
    # Filter first and copy only the surviving rows, the same way the stage B
    # and stage C builders do, instead of duplicating the whole input frame
    # before filtering.
    df_stage = df[df["label"].isin(known_labels)].copy()
    df_stage["stage_label"] = [
        "DoH" if lbl.startswith("DoH_") else "NonDoH"
        for lbl in df_stage["label"]
    ]
    return df_stage

def make_stage_B(df):
    """Stage B: DGA vs benign, restricted to the non-DoH classes.

    Selects the rows labelled ``NonDoH_Benign``, ``DGA_Synthetic`` or
    ``DGA_MALWARE`` and returns a copy of them with a ``stage_label`` column:
    ``"DGA"`` when the label starts with ``"DGA_"``, otherwise
    ``"NonDGA_Benign"``. The input frame is left untouched.
    """
    stage_classes = ["NonDoH_Benign", "DGA_Synthetic", "DGA_MALWARE"]
    in_stage = df["label"].isin(stage_classes)
    selected = df[in_stage].copy()
    selected["stage_label"] = [
        "DGA" if lbl.startswith("DGA_") else "NonDGA_Benign"
        for lbl in selected["label"]
    ]
    return selected

def make_stage_C(df):
    """Stage C: DoH_Tunnel vs DoH_Benign, restricted to the DoH classes.

    Returns a copy of the rows labelled ``DoH_Benign`` or ``DoH_Tunnel`` with
    a ``stage_label`` column that is ``"DoH_Tunnel"`` for tunnel rows and
    ``"DoH_Benign"`` for all others. The input frame is left untouched.
    """
    def _to_stage_label(lbl):
        # Anything that is not exactly the tunnel label counts as benign.
        if lbl == "DoH_Tunnel":
            return "DoH_Tunnel"
        return "DoH_Benign"

    doh_rows = df["label"].isin(["DoH_Benign", "DoH_Tunnel"])
    selected = df[doh_rows].copy()
    selected["stage_label"] = selected["label"].apply(_to_stage_label)
    return selected

# Registry mapping each stage name to the function that builds that stage's
# labelled frame. Iteration follows insertion order: A, then B, then C.
STAGES = dict([
    ("A_DoH_vs_NonDoH", make_stage_A),
    ("B_DGA_vs_NonDGA", make_stage_B),
    ("C_DoH_Tunnel_vs_Benign", make_stage_C),
])

# ---- storage for results ----
# Filled by the main loop with one dict per (N, stage, class).
results = list()

# ---- main loop ----
# For every N in N_values and every stage in STAGES: build the stage dataset,
# train an imputer -> scaler -> MLP pipeline on a stratified split, print the
# evaluation, and append one summary row per class to `results`.
for N in N_values:
    print(f"\n\n======== Running experiments for N = {N} ========")
    df_subset = df_all[df_all["N"] == N].copy()
    print(f"Dataset size for N={N}: {df_subset.shape[0]}")
    if df_subset.shape[0] == 0:
        print("No rows for this N, skipping.")
        continue

    for stage_name, stage_fn in STAGES.items():
        df_stage = stage_fn(df_subset)
        if df_stage.shape[0] == 0:
            print(f"Stage {stage_name}: no data, skipping")
            continue

        # keep only rows with non-null stage_label
        df_stage = df_stage.dropna(subset=["stage_label"])
        # features and target
        y = df_stage["stage_label"].copy()
        X = df_stage.drop(columns=DROP_COLS, errors="ignore").copy()

        # keep numeric columns only; this also removes the string-valued
        # "stage_label" column, which is not listed in DROP_COLS
        X = X.select_dtypes(include=["int64", "float64"])
        print(f"\nStage {stage_name} (N={N}) - samples: {X.shape[0]}, numeric features: {X.shape[1]}")
        label_counts = Counter(y)
        print("Label distribution:", label_counts)

        # ensure we have at least two classes in this stage
        if len(label_counts) < 2:
            print(f"Stage {stage_name} (N={N}) has <2 classes, skipping.")
            continue

        # a stratified split cannot place a class that has a single sample,
        # so skip the stage instead of letting train_test_split raise
        if min(label_counts.values()) < 2:
            print(f"Stage {stage_name} (N={N}) has a class with <2 samples, skipping.")
            continue

        # nothing to learn from if no numeric feature survived the filter
        if X.shape[1] == 0:
            print(f"Stage {stage_name} (N={N}) has no numeric features, skipping.")
            continue

        # encode string labels as integers (classes sorted as in le.classes_)
        le = LabelEncoder()
        y_enc = le.fit_transform(y)
        # integer ids in the same order as le.classes_, passed explicitly to the
        # metric functions so report rows always line up with the class names
        class_ids = le.transform(le.classes_)

        # train/test split with stratify
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_enc, test_size=test_size, stratify=y_enc, random_state=random_state
        )

        # pipeline: imputer -> scaler -> MLP (all fitted on the training split only)
        pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("mlp", MLPClassifier(**mlp_params))
        ])

        # Train
        print("Training MLP...")
        pipeline.fit(X_train, y_train)

        # Predict + report
        y_pred = pipeline.predict(X_test)
        print(f"\nClassification report for Stage={stage_name}, N={N}:")
        print(classification_report(
            y_test, y_pred, labels=class_ids, target_names=le.classes_, digits=4
        ))
        print("Confusion matrix:\n", confusion_matrix(y_test, y_pred, labels=class_ids))

        # store summary metrics (you can expand with more metrics)
        # per-class precision/recall/f1/support come from the dict form of the report
        report_dict = classification_report(
            y_test, y_pred, labels=class_ids, target_names=le.classes_, output_dict=True
        )
        for cls in le.classes_:
            entry = {
                "N": N,
                "stage": stage_name,
                "class": cls,
                "precision": report_dict[cls]["precision"],
                "recall": report_dict[cls]["recall"],
                "f1": report_dict[cls]["f1-score"],
                "support": report_dict[cls]["support"]
            }
            results.append(entry)

# ---- collect results to DataFrame ----
# `results` holds one dict per (N, stage, class); turn it into a table.
results_df = pd.DataFrame(results)
print("\n\nSummary results (first rows):")
print(results_df.head())

# optionally save to csv
# The "Saved" message is printed only when the file is actually written;
# previously it was printed even though the to_csv call was commented out.
SAVE_RESULTS = False  # set to True to write the CSV
RESULTS_CSV = "hierarchical_mlp_results_by_N_and_stage.csv"
if SAVE_RESULTS:
    results_df.to_csv(RESULTS_CSV, index=False)
    print(f"\nSaved results to {RESULTS_CSV}")
else:
    print(f"\nResults not saved (set SAVE_RESULTS = True to write {RESULTS_CSV})")






Dataset size for N=8: 618516

Stage A_DoH_vs_NonDoH (N=8) - samples: 618516, numeric features: 27
Label distribution: Counter({'NonDoH': 587911, 'DoH': 30605})
Training MLP...

Classification report for Stage=A_DoH_vs_NonDoH, N=8:
              precision    recall  f1-score   support

         DoH     0.9717    0.9822    0.9769      6121
      NonDoH     0.9991    0.9985    0.9988    117583

    accuracy                         0.9977    123704
   macro avg     0.9854    0.9904    0.9879    123704
weighted avg     0.9977    0.9977    0.9977    123704

Confusion matrix:
 [[  6012    109]
 [   175 117408]]

Stage B_DGA_vs_NonDGA (N=8) - samples: 587911, numeric features: 27
Label distribution: Counter({'NonDGA_Benign': 534801, 'DGA': 53110})
Training MLP...

Classification report for Stage=B_DGA_vs_NonDGA, N=8:
               precision    recall  f1-score   support

          DGA     0.9992    0.9987    0.9990     10622
NonDGA_Benign     0.9999    0.9999    0.9999    106961

     accurac