In [6]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ---- Config ----
# One full A -> B -> C pipeline run is executed per value in this list.
N_values = [8, 16, 32, 64]
# Fraction of each stage's samples held out for evaluation.
test_size = 0.2
# Shared seed for the train/test split and the MLP.
random_state = 42

# MLP config (example - match what you used earlier).
# Passed verbatim as keyword arguments to MLPClassifier.
mlp_params = {
    "hidden_layer_sizes": (64, 32),
    "activation": "relu",
    "solver": "adam",
    "alpha": 1e-4,
    "batch_size": 64,
    "learning_rate": "adaptive",
    "learning_rate_init": 0.001,
    "max_iter": 500,
    "shuffle": True,
    "early_stopping": True,
    "validation_fraction": 0.1,
    "n_iter_no_change": 20,
    "random_state": random_state,
    "verbose": False,
}

# ===============================
# File paths
# ===============================
# Three CSVs under DoHMalicious/ (one per tool directory).
doh_tunnel_iodine = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/iodine/all_pcaps_allN.csv"
doh_tunnel_dns2tcp = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dns2tcp/all_pcaps_allN.csv"
doh_tunnel_dnscat2 = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dnscat2/all_pcaps_allN.csv"

# Four DGA CSVs (one per DGA_* directory).
# NOTE: the "clouflare" spelling is kept so existing references keep working.
dga_malware_google = "/home/ubuntu/DoH_DGA_training/datasets/DGA_Google/all_pcaps_allN.csv"
dga_malware_clouflare = "/home/ubuntu/DoH_DGA_training/datasets/DGA_CF/all_pcaps_allN.csv"
dga_malware_adguard = "/home/ubuntu/DoH_DGA_training/datasets/DGA_ADGuard/all_pcaps_allN.csv"
dga_malware_quad9 = "/home/ubuntu/DoH_DGA_training/datasets/DGA_Quad9/all_pcaps_allN.csv"

# Two benign CSVs from the DoHBenign-NonDoH directory.
nondoh_benign_file = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/all_nondoh.csv"
doh_benign_file = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/all_doh.csv"

# ===============================
# Load datasets
# ===============================
# Read the tunnel CSVs in order, then stack them into one frame.
df_doh_tunnel_iodine, df_doh_tunnel_dns2tcp, df_doh_tunnel_dnscat2 = (
    pd.read_csv(csv_path)
    for csv_path in (doh_tunnel_iodine, doh_tunnel_dns2tcp, doh_tunnel_dnscat2)
)
df_doh_tunnel = pd.concat(
    [df_doh_tunnel_iodine, df_doh_tunnel_dns2tcp, df_doh_tunnel_dnscat2],
    ignore_index=True,
)

# Same for the four DGA CSVs.
df_dga_google, df_dga_clouflare, df_dga_adguard, df_dga_quad9 = (
    pd.read_csv(csv_path)
    for csv_path in (
        dga_malware_google,
        dga_malware_clouflare,
        dga_malware_adguard,
        dga_malware_quad9,
    )
)
df_dga = pd.concat(
    [df_dga_google, df_dga_clouflare, df_dga_adguard, df_dga_quad9],
    ignore_index=True,
)

# The benign sets are single files each.
df_nondoh_benign, df_doh_benign = (
    pd.read_csv(csv_path) for csv_path in (nondoh_benign_file, doh_benign_file)
)

# ===============================
# Assign labels (consistent)
# ===============================
# Tag each source frame in place with its ground-truth class name.
for _frame, _tag in (
    (df_dga, "DGA"),
    (df_doh_tunnel, "DoH_Tunnel"),
    (df_nondoh_benign, "NonDoH_Benign"),
    (df_doh_benign, "DoH_Benign"),
):
    _frame["label"] = _tag
del _frame, _tag  # keep loop temporaries out of the module namespace

# ===============================
# Combine all into one DataFrame (each source frame appears exactly once)
# ===============================
df_all = pd.concat(
    [df_dga, df_doh_tunnel, df_nondoh_benign, df_doh_benign],
    ignore_index=True,
)

# Quick sanity summary: per-class shapes, combined shape, class balance.
print("DGA shape:", df_dga.shape)
print("DoH_Tunnel shape:", df_doh_tunnel.shape)
print("NonDoH_Benign shape:", df_nondoh_benign.shape)
print("DoH_Benign shape:", df_doh_benign.shape)
print("\nFinal dataset shape:", df_all.shape)
print("Labels distribution:\n", df_all["label"].value_counts())
# Only the first 40 column names are shown.
print("Columns (sample):", df_all.columns[:40].tolist())

# ===============================
# Feature columns used as model inputs
# ===============================
# NOTE: df_all itself is not trimmed here; this list is only a whitelist.
# run_stage() picks whichever of these columns exist in a stage frame.
FEATURE_COLS = [
    'client_pkt_max',
    'server_bytes',
    'n_server',
    'client_bytes',
    'size_max',
    'n_client',
    'server_pkt_max',
    'pkt_fraction_client',
    'bytes_fraction_client',
    'dir_switches',
    'size_mean',
    'client_pkt_mean',
    'size_min',
    'client_pkt_min',
    'server_pkt_min',
    'server_pkt_mean',
]
# ---------- Updated stage definition functions (pipeline style) ----------
def make_stage_A(df):
    """Stage A: NonDoH vs (DGA, DoH_Benign, DoH_Tunnel).

    Args:
        df: DataFrame with a ``label`` column holding the raw class names.

    Returns:
        A new DataFrame containing only rows whose ``label`` is one of the
        four known classes, with an added ``stage_label`` column set to
        ``"NonDoH"`` for ``NonDoH_Benign`` rows and ``"DoH_or_DGA"`` for all
        others. The input frame is not modified.
    """
    known_labels = ["NonDoH_Benign", "DGA", "DoH_Benign", "DoH_Tunnel"]
    # Filter first and copy once, so the full input is never duplicated.
    df_stage = df[df["label"].isin(known_labels)].copy()
    # Vectorised labelling; after the filter every label is an exact string,
    # so a plain equality test is enough.
    df_stage["stage_label"] = np.where(
        df_stage["label"] == "NonDoH_Benign", "NonDoH", "DoH_or_DGA"
    )
    return df_stage

def make_stage_B(df):
    """Stage B: DoH_Benign vs (DGA, DoH_Tunnel).

    The input should be the output of Stage A filtered to ``DoH_or_DGA``.

    Args:
        df: DataFrame with a ``label`` column holding the raw class names.

    Returns:
        A new DataFrame containing only rows whose ``label`` is ``DGA``,
        ``DoH_Benign`` or ``DoH_Tunnel``, with ``stage_label`` set to
        ``"DoH_Benign"`` or ``"DGA_or_Tunnel"``. An existing ``stage_label``
        column is overwritten. The input frame is not modified.
    """
    # Keep only relevant raw labels; filter first so only one copy is made.
    df_stage = df[df["label"].isin(["DGA", "DoH_Benign", "DoH_Tunnel"])].copy()
    # Vectorised labelling instead of a per-row Python lambda.
    df_stage["stage_label"] = np.where(
        df_stage["label"] == "DoH_Benign", "DoH_Benign", "DGA_or_Tunnel"
    )
    return df_stage

def make_stage_C(df):
    """Stage C: DGA vs DoH_Tunnel.

    The input should be the output of Stage B filtered to ``DGA_or_Tunnel``.

    Args:
        df: DataFrame with a ``label`` column holding the raw class names.

    Returns:
        A new DataFrame containing only rows whose ``label`` is ``DGA`` or
        ``DoH_Tunnel``, with ``stage_label`` set to that same class name.
        An existing ``stage_label`` column is overwritten. The input frame
        is not modified.
    """
    # Filter first so only one copy is made.
    df_stage = df[df["label"].isin(["DGA", "DoH_Tunnel"])].copy()
    # Only two labels survive the filter, so equality with "DGA" separates
    # them without a per-row Python call.
    df_stage["stage_label"] = np.where(
        df_stage["label"] == "DGA", "DGA", "DoH_Tunnel"
    )
    return df_stage

# ---------- helper to run one stage (train/eval/record metrics) ----------
# One dict per (N, stage, class) is appended here by run_stage().
results = []


def run_stage(df_stage, stage_name, N):
    """Train and evaluate one stage, recording per-class metrics.

    Fits an impute -> scale -> MLP pipeline on a train split of ``df_stage``
    and evaluates it on the held-out split. Progress messages, the
    classification report and the confusion matrix are printed, and one
    summary dict per class is appended to the module-level ``results`` list.

    Args:
        df_stage: DataFrame that already contains a ``stage_label`` column
            (the target) and some or all of the ``FEATURE_COLS`` columns.
        stage_name: Name used in printed messages and stored in the
            ``stage`` field of each recorded result.
        N: Value shown in messages and stored in the ``N`` field of each
            recorded result.

    Returns:
        None. The stage is skipped (a message is printed and nothing is
        recorded) when there are no rows, no numeric feature columns, or
        fewer than two classes.
    """
    if df_stage is None or df_stage.shape[0] == 0:
        print(f"Stage {stage_name} (N={N}): no data, skipping")
        return

    # dropna already returns a new frame, so no extra copy is needed.
    df_stage = df_stage.dropna(subset=["stage_label"])
    if df_stage.shape[0] == 0:
        print(f"Stage {stage_name} (N={N}): all rows had null stage_label, skipping")
        return

    y = df_stage["stage_label"]

    # select only the feature columns we keep (safe intersection)
    available = [c for c in FEATURE_COLS if c in df_stage.columns]
    missing = [c for c in FEATURE_COLS if c not in df_stage.columns]
    if missing:
        print(f"Warning: missing feature columns for Stage={stage_name}, N={N}: {missing}")

    # Never let a target column leak into the features, even if it was
    # added to FEATURE_COLS by mistake; then keep numeric columns only.
    feature_cols = [c for c in available if c not in ("label", "stage_label")]
    X = df_stage[feature_cols].select_dtypes(include=[np.number])
    if X.shape[1] == 0:
        print(f"Stage {stage_name} (N={N}): no numeric features available after selection, skipping.")
        return

    print(f"\nStage {stage_name} (N={N}) - samples: {X.shape[0]}, numeric features used: {X.shape[1]}")
    print("Label distribution:", Counter(y))

    # ensure we have at least two classes
    if y.nunique() < 2:
        print(f"Stage {stage_name} (N={N}) has <2 classes, skipping.")
        return

    # encode labels
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    # guard for stratify requirements: every class needs at least two
    # samples, and enough samples to land in the test split.
    min_samples_needed = max(2, int(np.ceil(1.0 / test_size)))
    smallest_class = min(Counter(y_enc).values())
    if smallest_class < min_samples_needed:
        print("Warning: not enough samples for stratified split; using non-stratified split.")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_enc, test_size=test_size, random_state=random_state, shuffle=True
        )
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y_enc, test_size=test_size, stratify=y_enc, random_state=random_state
        )

    pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("mlp", MLPClassifier(**mlp_params))
    ])

    print(f"Training MLP for {stage_name} (N={N})...")
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    # Pass the full label set explicitly. After a non-stratified split a rare
    # class may be missing from both y_test and y_pred; without `labels`,
    # classification_report raises because target_names no longer matches
    # the labels it infers from the data.
    label_ids = np.arange(len(le.classes_))

    print(f"\nClassification report for Stage={stage_name}, N={N}:")
    print(classification_report(
        y_test, y_pred, labels=label_ids, target_names=le.classes_, digits=4
    ))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred, labels=label_ids))

    # store summary metrics
    report_dict = classification_report(
        y_test, y_pred, labels=label_ids, target_names=le.classes_, output_dict=True
    )
    for cls in le.classes_:
        cls_metrics = report_dict[cls]
        results.append({
            "N": N,
            "stage": stage_name,
            "class": cls,
            "precision": cls_metrics["precision"],
            "recall": cls_metrics["recall"],
            "f1": cls_metrics["f1-score"],
            "support": cls_metrics["support"],
        })

# ---------- Main loop (pipeline order: A -> B -> C) ----------
# Each later stage is fed the ground-truth group from the previous stage's
# frame (selected by stage_label), not the previous model's predictions.
for N in N_values:
    print(f"\n\n======== Running pipeline for N = {N} ========")

    # Restrict to the rows for this N when the column exists.
    if "N" not in df_all.columns:
        print("Column 'N' not found in df_all — using entire dataset (no per-N filtering).")
        df_subset = df_all.copy()
    else:
        df_subset = df_all.loc[df_all["N"].eq(N)].copy()
        print(f"Dataset size for N={N}: {df_subset.shape[0]}")
        if len(df_subset) == 0:
            print("No rows for this N, skipping.")
            continue

    # ---------- Stage A: NonDoH vs everything else ----------
    df_A = make_stage_A(df_subset)
    run_stage(df_A, "A_NonDoH_vs_DoH+DGA", N)

    # ---------- Stage B: only the "DoH_or_DGA" group from Stage A ----------
    df_for_B = df_A.loc[df_A["stage_label"].eq("DoH_or_DGA")].copy()
    if not len(df_for_B):
        print(f"Stage B (N={N}): no samples passed from Stage A -> skipping Stage B and C.")
        continue

    df_B = make_stage_B(df_for_B)
    run_stage(df_B, "B_DoHBenign_vs_DGA+Tunnel", N)

    # ---------- Stage C: only the "DGA_or_Tunnel" group from Stage B ----------
    df_for_C = df_B.loc[df_B["stage_label"].eq("DGA_or_Tunnel")].copy()
    if not len(df_for_C):
        print(f"Stage C (N={N}): no samples passed from Stage B -> skipping Stage C.")
        continue

    df_C = make_stage_C(df_for_C)
    run_stage(df_C, "C_DGA_vs_Tunnel", N)

# ---- collect results to DataFrame ----
# One row per (N, stage, class) as accumulated in `results`.
results_df = pd.DataFrame(results)
print("\n\nSummary results (first rows):")
print(results_df.head(6))

DGA shape: (406808, 35)
DoH_Tunnel shape: (707228, 35)
NonDoH_Benign shape: (2139204, 36)
DoH_Benign shape: (116412, 36)

Final dataset shape: (3369652, 36)
Labels distribution:
 label
NonDoH_Benign    2139204
DoH_Tunnel        707228
DGA               406808
DoH_Benign        116412
Name: count, dtype: int64
Columns (sample): ['session_id', 'client_ip', 'server_ip', 'client_port', 'server_port', 'protocol', 'N', 'n_client', 'client_bytes', 'client_pkt_min', 'client_pkt_mean', 'client_pkt_max', 'client_iat_min', 'client_iat_mean', 'client_iat_max', 'n_server', 'server_bytes', 'server_pkt_min', 'server_pkt_mean', 'server_pkt_max', 'server_iat_min', 'server_iat_mean', 'server_iat_max', 'pkt_fraction_client', 'bytes_fraction_client', 'flow_duration', 'time_first_response', 'dir_switches', 'size_min', 'size_mean', 'size_max', 'iat_min', 'iat_mean', 'iat_max', 'label', 'source_pcap']


Dataset size for N=8: 842413

Stage A_NonDoH_vs_DoH+DGA (N=8) - samples: 842413, numeric features used: 16