In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ===============================
# File paths
# ===============================

doh_tunnel_iodine = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/iodine/all_pcaps_allN.csv"
doh_tunnel_dns2tcp = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dns2tcp/all_pcaps_allN.csv"
doh_tunnel_dnscat2 = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHMalicious/dnscat2/all_pcaps_allN.csv"

#doh_tunnel_hkd = "/home/ubuntu/DoH_DGA_training/datasets/DoH_HKD/DoH-Pcaps/all_pcaps_allN.csv"

dga_malware_google = "/home/ubuntu/DoH_DGA_training/datasets/DGA_Google/all_pcaps_allN.csv"
dga_malware_clouflare = "/home/ubuntu/DoH_DGA_training/datasets/DGA_CF/all_pcaps_allN.csv"
dga_malware_adguard = "/home/ubuntu/DoH_DGA_training/datasets/DGA_ADGuard/all_pcaps_allN.csv"
dga_malware_quad9 = "/home/ubuntu/DoH_DGA_training/datasets/DGA_Quad9/all_pcaps_allN.csv"

dga_malware_hkd = "/home/ubuntu/DoH_DGA_training/datasets/DGA_HKD/all_pcaps_allN.csv"

nondoh_benign_file = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/all_nondoh.csv"

doh_benign_file = "/home/ubuntu/DoH_DGA_training/datasets/PCAPs/DoHBenign-NonDoH/all_doh.csv"

# ===============================
# Load datasets
# ===============================
df_doh_tunnel_iodine = pd.read_csv(doh_tunnel_iodine)
df_doh_tunnel_dns2tcp = pd.read_csv(doh_tunnel_dns2tcp)
df_doh_tunnel_dnscat2 = pd.read_csv(doh_tunnel_dnscat2)

#df_doh_tunnel_hkd = pd.read_csv(doh_tunnel_hkd)

df_doh_tunnel = pd.concat([df_doh_tunnel_iodine, df_doh_tunnel_dns2tcp, df_doh_tunnel_dnscat2], ignore_index=True)

df_dga_google = pd.read_csv(dga_malware_google)
df_dga_clouflare = pd.read_csv(dga_malware_clouflare)
df_dga_adguard = pd.read_csv(dga_malware_adguard)
df_dga_quad9 = pd.read_csv(dga_malware_quad9)
df_dga_hkd = pd.read_csv(dga_malware_hkd)
df_dga = pd.concat([df_dga_google, df_dga_clouflare, df_dga_adguard, df_dga_quad9, df_dga_hkd], ignore_index=True)

df_nondoh_benign = pd.read_csv(nondoh_benign_file)
df_doh_benign = pd.read_csv(doh_benign_file)

# ===============================
# Assign labels
# ===============================
df_dga["label"] = "DGA"
df_doh_tunnel["label"] = "DoH_Tunnel"
df_nondoh_benign["label"] = "NonDoH_Benign"
df_doh_benign["label"] = "DoH_Benign"

# ===============================
# Combine all into one DataFrame
# ===============================
df_all = pd.concat(
    [df_dga, df_doh_tunnel, df_nondoh_benign, df_doh_benign, df_doh_benign],
    ignore_index=True
)


print("DGA_shape:", df_dga.shape)
print("DoH_Tunnel shape:", df_doh_tunnel.shape)
print("NonDoH_Benign shape:", df_nondoh_benign.shape)
print("DoH_Benign shape:", df_doh_benign.shape)

print("\nFinal dataset shape:", df_all.shape)
print("Labels distribution:\n", df_all["label"].value_counts())
print("Columns:\n", list(df_all.columns))


DGA_shape: (415820, 35)
DoH_Tunnel shape: (702676, 35)
NonDoH_Benign shape: (2139204, 36)
DoH_Benign shape: (116412, 36)

Final dataset shape: (3490524, 36)
Labels distribution:
 label
NonDoH_Benign    2139204
DoH_Tunnel        702676
DGA               415820
DoH_Benign        232824
Name: count, dtype: int64
Columns:
 ['session_id', 'client_ip', 'server_ip', 'client_port', 'server_port', 'protocol', 'N', 'n_client', 'client_bytes', 'client_pkt_min', 'client_pkt_mean', 'client_pkt_max', 'client_iat_min', 'client_iat_mean', 'client_iat_max', 'n_server', 'server_bytes', 'server_pkt_min', 'server_pkt_mean', 'server_pkt_max', 'server_iat_min', 'server_iat_mean', 'server_iat_max', 'pkt_fraction_client', 'bytes_fraction_client', 'flow_duration', 'time_first_response', 'dir_switches', 'size_min', 'size_mean', 'size_max', 'iat_min', 'iat_mean', 'iat_max', 'label', 'source_pcap']


In [40]:
N_value = 16   # <-- change this to 8, 16, 32, 64 as needed
df_subset = df_all[df_all["N"] == N_value].copy()
top_features = [
    "client_pkt_max",
    "n_client",
    "bytes_fraction_client",
    "n_server",
    "pkt_fraction_client",
    "client_bytes",
    "server_pkt_max",
    "size_min",
    "size_mean",
    "server_pkt_mean",
    "dir_switches",
    "server_bytes",
    "size_max",
    "client_pkt_min",
    "server_pkt_min",
    "client_pkt_mean"
]

print(f"\nTraining on N={N_value}, dataset size={df_subset.shape[0]}")

# Separate features and target
y = df_subset["label"]
X = df_subset.drop(columns=[
    "label", 'session_id', 'client_ip', 'server_ip', 
    'client_port', 'server_port', 'protocol', 'N',
    'pcap_file', 'source_pcap'
], errors="ignore")
feature_names = X.columns.tolist()
# Keep only numeric features
X = X.select_dtypes(include=["int64", "float64"])

# Keep only the predefined top features (in the exact order)
available_features = [f for f in top_features if f in X.columns]
missing_features = [f for f in top_features if f not in X.columns]

if missing_features:
    print("Warning: Missing features:", missing_features)

X = X[available_features].copy()
print(f"Selected {len(available_features)} top features: {available_features}")

print("Remaining feature columns:\n", list(X.columns))
print("Target classes:", y.unique())


Training on N=16, dataset size=872631
Selected 16 top features: ['client_pkt_max', 'n_client', 'bytes_fraction_client', 'n_server', 'pkt_fraction_client', 'client_bytes', 'server_pkt_max', 'size_min', 'size_mean', 'server_pkt_mean', 'dir_switches', 'server_bytes', 'size_max', 'client_pkt_min', 'server_pkt_min', 'client_pkt_mean']
Remaining feature columns:
 ['client_pkt_max', 'n_client', 'bytes_fraction_client', 'n_server', 'pkt_fraction_client', 'client_bytes', 'server_pkt_max', 'size_min', 'size_mean', 'server_pkt_mean', 'dir_switches', 'server_bytes', 'size_max', 'client_pkt_min', 'server_pkt_min', 'client_pkt_mean']
Target classes: ['DGA' 'DoH_Tunnel' 'NonDoH_Benign' 'DoH_Benign']


In [41]:
# Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.3, random_state=42, stratify=y_enc
)

# Standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Train MLP
# Extended MLP with more options
mlp = MLPClassifier(
    hidden_layer_sizes=(32, 16),   # deeper network with 3 hidden layers
    activation="relu",                  # alternatives: 'tanh', 'logistic'
    solver="adam",                      # alternatives: 'sgd', 'lbfgs'
    alpha=1e-4,                         # L2 regularization (weight decay)
    batch_size=64,                      # mini-batch size
    learning_rate="adaptive",           # 'constant', 'invscaling', 'adaptive'
    learning_rate_init=0.001,           # initial learning rate
    max_iter=200,                       # train longer
    shuffle=True,                       # shuffle samples every epoch
    early_stopping=True,                # stop if validation score doesn’t improve
    validation_fraction=0.1,            # use 10% of training for validation
    n_iter_no_change=20,                # patience for early stopping
    random_state=42,                    # reproducibility
    verbose=True                        # print progress during training
)

mlp.fit(X_train, y_train)

# Evaluate
y_pred = mlp.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_, digits=4))

Iteration 1, loss = 0.06138728
Validation score: 0.990538
Iteration 2, loss = 0.03127899
Validation score: 0.994107
Iteration 3, loss = 0.02687173
Validation score: 0.990931
Iteration 4, loss = 0.02408647
Validation score: 0.991062
Iteration 5, loss = 0.02250228
Validation score: 0.996038
Iteration 6, loss = 0.02113137
Validation score: 0.995547
Iteration 7, loss = 0.01802479
Validation score: 0.996087
Iteration 8, loss = 0.01843537
Validation score: 0.992748
Iteration 9, loss = 0.01740346
Validation score: 0.996955
Iteration 10, loss = 0.01711664
Validation score: 0.993435
Iteration 11, loss = 0.01565428
Validation score: 0.995940
Iteration 12, loss = 0.01444269
Validation score: 0.997413
Iteration 13, loss = 0.01522190
Validation score: 0.993665
Iteration 14, loss = 0.01524454
Validation score: 0.996890
Iteration 15, loss = 0.01461721
Validation score: 0.997397
Iteration 16, loss = 0.01529560
Validation score: 0.997512
Iteration 17, loss = 0.01366326
Validation score: 0.997348
Iterat

# Export model

In [43]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

output_folder = "./mlp_multi_models"
os.makedirs(output_folder, exist_ok=True)

# --- Adjust these to match your environment ----
# 1) If you used a pipeline similar to your example, set pipe = results["pipeline"]
pipe = None               # <-- set this if you have a pipeline object
# 2) If you trained the MLP directly, ensure mlp_reduced is available in the namespace
mlp = mlp                # <-- will be set below
# 3) Provide the scaler if you used it standalone, or let the script extract it from the pipeline
scaler = scaler            # <-- set to your StandardScaler instance if not using a pipeline
# 4) Provide your feature list (order must match training columns used)
#    If you used top_features when fitting, set feature_cols = top_features
feature_cols = top_features       # <-- e.g. feature_cols = list(top_features) or results["feature_cols"]

# ----------------- Helper: write arrays to C -----------------
def write_array_to_c(name, array, f, add_f_suffix=False):
    shape = array.shape
    flat  = array.flatten()
    f.write(f"// Shape: {shape}\n")
    f.write(f"static const float {name}[{len(flat)}] ALIGN16 = {{\n    ")
    if add_f_suffix:
        f.write(", ".join(f"{float(x):.6f}f" for x in flat))
    else:
        f.write(", ".join(f"{float(x):.6f}" for x in flat))
    f.write("\n};\n\n")

def export_mlp_to_c(model, out_path):
    with open(out_path, "w") as f:
        f.write("// Auto-generated MLP weights + pointers for C inference\n")
        f.write("// Generated from scikit-learn MLPClassifier\n\n")
        f.write("#pragma once\n\n")
        f.write("#define ALIGN16 __attribute__((aligned(16)))\n\n")

        n_layers = len(model.coefs_)  # number of weight matrices (hidden layers + output)
        f.write(f"#define NUM_LAYERS {n_layers}\n\n")
        # number of outputs = neurons in final layer
        output_size = model.coefs_[-1].shape[1]

        # if binary classifier (single output neuron) -> 2 classes
        num_classes = 2 if output_size == 1 else output_size

        f.write(f"#define NUM_CLASSES {num_classes}\n\n")
        in_size = model.coefs_[0].shape[0]
        sizes   = [in_size] + [w.shape[1] for w in model.coefs_]
        f.write("static const int LAYER_SIZES[NUM_LAYERS+1] = { " +
                ", ".join(str(int(s)) for s in sizes) +
                " };\n\n")

        # determine output size (number of outputs from final layer)
        output_size = int(model.coefs_[-1].shape[1])
        f.write(f"#define OUTPUT_SIZE {output_size}\n\n")

        # Write conditional compile flags for binary vs multiclass
        f.write("// Convenience macros for selecting code paths at compile time\n")
        f.write("#if OUTPUT_SIZE == 1\n")
        f.write("    #define IS_BINARY_CLASSIFICATION 1\n")
        f.write("    #define IS_MULTICLASS_CLASSIFICATION 0\n")
        f.write("#else\n")
        f.write("    #define IS_BINARY_CLASSIFICATION 0\n")
        f.write("    #define IS_MULTICLASS_CLASSIFICATION 1\n")
        f.write("#endif\n\n")

        for idx, (W, b) in enumerate(zip(model.coefs_, model.intercepts_)):
            # sklearn W is (size_in, size_out)
            W = W.astype(np.float32, copy=False)
            B = b.astype(np.float32, copy=False)
            write_array_to_c(f"W{idx}", W, f)
            write_array_to_c(f"B{idx}", B, f)

        f.write("// Weight & bias pointers per layer\n")
        f.write("static const float *WEIGHTS[NUM_LAYERS] = { " +
                ", ".join(f"W{j}" for j in range(n_layers)) +
                " };\n")
        f.write("static const float *BIASES[NUM_LAYERS]  = { " +
                ", ".join(f"B{j}" for j in range(n_layers)) +
                " };\n\n")

        f.write("#undef ALIGN16\n")

# ----------------- Determine mlp, scaler, feature names -----------------
# Case A: user provides a pipeline object in variable `pipe`
if pipe is not None:
    try:
        mlp = pipe.named_steps["clf"]
    except Exception:
        raise RuntimeError("Pipeline provided but 'clf' not found in named_steps")

    # try to locate scaler inside a ColumnTransformer inside preproc
    try:
        preproc = pipe.named_steps["preproc"]
        # assumes ColumnTransformer named transformer 'num' containing a pipeline with scaler step named 'scaler'
        scaler = preproc.named_transformers_["num"].named_steps["scaler"]
    except Exception:
        # fallback: look for any StandardScaler in pipeline
        for name, step in preproc.named_steps.items():
            if isinstance(step, StandardScaler):
                scaler = step
                break

# Case B: pipeline not used, user trained `mlp_reduced` directly
if mlp is None:
    # try to use provided mlp_reduced from user's scope
    try:
        mlp = globals().get("mlp_reduced", None) or globals().get("mlp", None)
    except Exception:
        mlp = None

if mlp is None:
    raise RuntimeError("Could not locate trained MLP. Make sure `mlp_reduced` or `pipe` is defined.")

# If scaler not found, maybe user passed a plain StandardScaler variable; if not, attempt to compute mean/std from training data if available
if scaler is None:
    # try to find StandardScaler in globals
    scaler = globals().get("scaler", None)

# If still None, but feature columns & X_train are available, compute scaler from training data (this reproduces what you'd do for inference stats)
if scaler is None:
    X_train = globals().get("X_train", None)
    if X_train is not None and feature_cols is not None:
        print("[WARN] No StandardScaler instance found — computing mean/std from X_train[feature_cols]. This must match the scaler used at training time.")
        tmp_scaler = StandardScaler()
        tmp_scaler.fit(X_train[feature_cols])
        scaler = tmp_scaler
    else:
        raise RuntimeError("No scaler found and cannot compute it (X_train or feature_cols missing). Provide the scaler or set X_train/feature_cols.")

# If feature_cols not provided and you have a pipeline with feature names available in e.g. results dict
if feature_cols is None:
    # try common names used in examples
    feature_cols = globals().get("feature_cols", None) or globals().get("top_features", None)
    if feature_cols is None:
        # try to infer from scaler if it's a named transformer (rare)
        raise RuntimeError("feature_cols is not set. Provide the ordered list of features used to train the model.")

# ----------------- Compute means/stds and safe-guard -----------------
means = scaler.mean_.astype(np.float32)
stds  = scaler.scale_.astype(np.float32)

# safety to avoid divide-by-zero in C inference
stds[stds == 0.0] = 1.0

# ----------------- Export MLP weights -----------------
mlp_header_path = os.path.join(output_folder, "mlp128_weights.h")
export_mlp_to_c(mlp, mlp_header_path)
print(f"[OK] Wrote weights to {mlp_header_path}")

# ----------------- Export feature stats header -----------------
feat_header_path = os.path.join(output_folder, "feature128_stats.h")
with open(feat_header_path, "w") as f:
    f.write("// Auto-generated feature stats for z-score normalization\n\n")
    f.write("#pragma once\n\n")
    f.write("#define ALIGN16 __attribute__((aligned(16)))\n\n")
    f.write(f"#define NUM_FEATURES {len(means)}\n\n")

    f.write("static const float FEATURE_MEAN[NUM_FEATURES] ALIGN16 = {\n    ")
    f.write(", ".join(f"{float(m):.6f}f" for m in means))
    f.write("\n};\n\n")

    f.write("static const float FEATURE_STD[NUM_FEATURES] ALIGN16 = {\n    ")
    f.write(", ".join(f"{float(s):.6f}f" for s in stds))
    f.write("\n};\n\n")
    f.write("#undef ALIGN16\n")
print(f"[OK] Wrote scaler stats to {feat_header_path}")

# ----------------- Save feature order for verification -----------------
names_txt = os.path.join(output_folder, "feature_names.txt")
with open(names_txt, "w") as f:
    for c in feature_cols:
        f.write(str(c) + "\n")
print(f"[OK] Wrote feature names to {names_txt}")

# ----------------- Sanity print -----------------
print("\n=== Sanity ===")
print("Feature count:", len(feature_cols))
layer_sizes = [mlp.coefs_[0].shape[0]] + [w.shape[1] for w in mlp.coefs_]
print("Layer sizes  :", layer_sizes)
for i, (W, b) in enumerate(zip(mlp.coefs_, mlp.intercepts_)):
    print(f"Layer {i}: W {W.shape}, B {b.shape}")

print("\nAll done — check folder:", output_folder)

[OK] Wrote weights to ./mlp_multi_models/mlp128_weights.h
[OK] Wrote scaler stats to ./mlp_multi_models/feature128_stats.h
[OK] Wrote feature names to ./mlp_multi_models/feature_names.txt

=== Sanity ===
Feature count: 16
Layer sizes  : [16, 128, 64, 4]
Layer 0: W (16, 128), B (128,)
Layer 1: W (128, 64), B (64,)
Layer 2: W (64, 4), B (4,)

All done — check folder: ./mlp_multi_models


## Debugging flows

In [44]:
from pprint import pprint

print("LabelEncoder class mapping:")
for i, c in enumerate(le.classes_):
    print(f"  {i} → {c}")

LabelEncoder class mapping:
  0 → DGA
  1 → DoH_Benign
  2 → DoH_Tunnel
  3 → NonDoH_Benign
