In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# === Step 2: Load dataset ===
df = pd.read_csv("csic_database.csv")

# === Step 3: Quick check ===
print("Shape of dataset:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 5 rows:")
display(df.head())


ModuleNotFoundError: No module named 'pandas'

In [10]:
DATA_PATH = ("csic_database.csv")
df = pd.read_csv(DATA_PATH)

# 1) Standardize column names (strip spaces)
df.columns = [c.strip() for c in df.columns]

# 2) Create a proper numeric content-length column from the 'lenght' header (keep original)
if "lenght" in df.columns:
    df["content_length_header"] = (
        df["lenght"].astype(str).str.extract(r"(\d+)").astype(float)
    )

# 3) Ensure key columns exist
for c in ["Method", "connection", "content-type", "URL", "content", "classification"]:
    if c not in df.columns:
        df[c] = np.nan

# 4) Trim whitespace in categorical-like columns
for c in ["Method", "connection", "content-type", "URL"]:
    df[c] = df[c].astype(str).str.strip()

# 5) Handle missing values
#   - Text fields: empty string
for c in ["URL", "content"]:
    df[c] = df[c].fillna("")

#   - Categoricals: 'missing'
for c in ["Method", "connection", "content-type"]:
    df[c] = df[c].fillna("missing").replace({"": "missing"})

#   - Numeric fields: 0 (only for the ones we know should be numeric)
numeric_candidates = ["content_length_header"]
numeric_cols = [c for c in numeric_candidates if c in df.columns]
for c in numeric_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

# 6) Remove exact duplicate requests (based on Method+URL+content)
before = len(df)
df = df.drop_duplicates(subset=["Method", "URL", "content"])
removed_dupes = before - len(df)

# 7) Drop columns that are >95% missing OR constant (except target)
missing_ratio = df.isna().mean()
const_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
to_drop = set(missing_ratio[missing_ratio > 0.95].index.tolist() + const_cols)
to_drop.discard("classification")  # never drop target if present
df.drop(columns=list(to_drop), inplace=True, errors="ignore")

# 8) Replace any inf/-inf with NaN then fill (safety)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna({"URL": "", "content": ""}, inplace=True)  # ensure key text fields not NaN

# 9) Final report
print("Removed duplicates:", removed_dupes)
print("Dropped columns (mostly-missing or constant):", list(to_drop))
print("Final shape after cleaning:", df.shape)
print("\nTop 10 columns by missingness after cleaning:")
print(df.isna().mean().sort_values(ascending=False).head(10))

Removed duplicates: 35457
Dropped columns (mostly-missing or constant): ['User-Agent', 'Cache-Control', 'Accept-charset', 'language', 'Pragma', 'Accept-encoding']
Final shape after cleaning: (25608, 12)

Top 10 columns by missingness after cleaning:
lenght            0.526789
Accept            0.010348
Unnamed: 0        0.000000
Method            0.000000
host              0.000000
cookie            0.000000
content-type      0.000000
connection        0.000000
content           0.000000
classification    0.000000
dtype: float64


In [11]:
import re
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# === Step 1: Load dataset ===
df = pd.read_csv("csic_database.csv")

# === Step 2: Basic numeric and combined text features ===
df["url_len"] = df["URL"].fillna("").str.len()
df["content_len"] = df["content"].fillna("").str.len()
df["num_params"] = df["URL"].fillna("").str.count(r"[?&]")
df["cookie_len"] = df["cookie"].fillna("").astype(str).str.len()
df["text"] = (df["URL"].fillna("") + " " + df["content"].fillna("")).str.lower()

# === Step 3: Suspicious character counts ===
bad_chars = ["'", '"', "<", ">", ";", "%", "-", "/", "\\", "=", "|", "&"]
for ch in bad_chars:
    df[f"count_{ch}"] = df["text"].str.count(re.escape(ch))

# === Step 4: OWASP keyword flags ===
attack_keywords = {
    "has_select": r"\bselect\b", "has_union": r"\bunion\b", "has_drop": r"\bdrop\b",
    "has_insert": r"\binsert\b", "has_update": r"\bupdate\b", "has_delete": r"\bdelete\b",
    "has_concat": r"\bconcat\b", "has_information_schema": r"\binformation_schema\b",
    "has_sleep": r"\bsleep\s*\(", "has_benchmark": r"\bbenchmark\s*\(",
    "has_load_file": r"\bload_file\b", "has_into_outfile": r"\binto\s+outfile\b",
    "has_substr": r"\bsubstr\b", "has_ascii": r"\bascii\b", "has_hex": r"\bhex\b",
    "has_char_func": r"\bchar\s*\(", "has_or_1_eq_1": r"\bor\s+1=1\b",
    "has_and_1_eq_1": r"\band\s+1=1\b", "has_comment_sql": r"--|;--|/\*.*\*/",

    # XSS
    "has_script_tag": r"<\s*script", "has_iframe_tag": r"<\s*iframe",
    "has_img_tag": r"<\s*img", "has_svg_tag": r"<\s*svg", "has_object_tag": r"<\s*object",
    "has_embed_tag": r"<\s*embed", "has_link_tag": r"<\s*link", "has_meta_tag": r"<\s*meta",
    "has_style_tag": r"<\s*style", "has_alert": r"alert\s*\(", "has_onerror": r"onerror=",
    "has_onload": r"onload=", "has_onclick": r"onclick=", "has_onfocus": r"onfocus=",
    "has_onmouseover": r"onmouseover=", "has_document_cookie": r"document\.cookie",
    "has_document_write": r"document\.write", "has_window_location": r"window\.location",
    "has_javascript_proto": r"javascript:",

    # Command Injection / Path Traversal
    "has_dotdot": r"\.\./", "has_passwd": r"/etc/passwd", "has_whoami": r"\bwhoami\b",
    "has_wget": r"\bwget\b", "has_curl": r"\bcurl\b", "has_python": r"\bpython\b",
    "has_perl": r"\bperl\b", "has_bash": r"\bbash\b", "has_exec": r"\bexec\b",
    "has_system": r"\bsystem\s*\(", "has_pipe_or": r"\|\|", "has_pipe_and": r"\&\&",

    # File Inclusion
    "has_php": r"\.php", "has_asp": r"\.asp", "has_jsp": r"\.jsp", "has_exe": r"\.exe",
    "has_sh": r"\.sh", "has_file_proto": r"file://", "has_http_proto": r"http://",
    "has_https_proto": r"https://",
}
for name, pattern in attack_keywords.items():
    df[name] = df["text"].str.contains(pattern, case=False, regex=True, na=False).astype(int)

# === Step 5: Entropy & Ratio Features ===
def entropy(s):
    if not s or len(s) == 0:
        return 0
    p = [s.count(c)/len(s) for c in set(s)]
    return -sum(x*np.log2(x) for x in p)

df["url_entropy"] = df["URL"].fillna("").apply(entropy)
df["content_entropy"] = df["content"].fillna("").apply(entropy)
df["digit_ratio"] = df["content"].fillna("").apply(lambda x: sum(c.isdigit() for c in str(x)) / (len(str(x)) + 1))
df["symbol_ratio"] = df["content"].fillna("").apply(lambda x: sum(not c.isalnum() for c in str(x)) / (len(str(x)) + 1))

# === Step 6: Define Column Groups ===
text_col = "text"
cat_cols = ["Method", "connection", "content-type"]
num_cols = [
    "url_len", "content_len", "num_params", "cookie_len",
    "url_entropy", "content_entropy", "digit_ratio", "symbol_ratio"
] + [f"count_{ch}" for ch in bad_chars] + list(attack_keywords.keys())

# === Step 7: Build Column Transformer ===
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=3000)
ohe = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer([
    ("text", tfidf, text_col),
    ("cat", ohe, cat_cols),
    ("num", "passthrough", num_cols)
])

# === Step 8: Fit and Transform ===
preprocess.fit(df)
X = preprocess.transform(df)
X_dense = X.toarray() if sp.issparse(X) else X
feature_names = preprocess.get_feature_names_out()

# === Step 9: Save to CSV ===
features_df = pd.DataFrame(X_dense, columns=feature_names)
if "classification" in df.columns:
    features_df["target"] = df["classification"]

features_df.to_csv("FeatureSet.csv", index=False)

# === Step 10: Print Summary ===
print("✅ Feature engineering complete and saved as FeatureSet.csv")
print("  - TF-IDF features:", len(preprocess.named_transformers_["text"].vocabulary_))
print("  - One-hot categorical features:", sum(len(c) for c in preprocess.named_transformers_["cat"].categories_))
print("  - Numeric + heuristic features:", len(num_cols))
print("  - TOTAL features:", X_dense.shape[1])


✅ Feature engineering complete and saved as FeatureSet.csv
  - TF-IDF features: 3000
  - One-hot categorical features: 7
  - Numeric + heuristic features: 78
  - TOTAL features: 3085


In [12]:
import pandas as pd

df = pd.read_csv("FeatureSet.csv")
X = df.drop("target", axis=1)
y = df["target"]


In [13]:
import numpy as np

unique, counts = np.unique(y, return_counts=True)
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} samples")


Class 0: 36000 samples
Class 1: 25065 samples


In [16]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler
import scipy.sparse as sp

try:
    # SMOTETomek prefers dense inputs
    X_array = X.to_numpy().astype("float32")
    smt = SMOTETomek(random_state=42)
    X_bal, y_bal = smt.fit_resample(X_array, y)
    print("✅ Balanced using SMOTETomek")
except:
    # Fallback: Random undersampling
    rus = RandomUnderSampler(random_state=42)
    X_bal, y_bal = rus.fit_resample(X, y)
    print("⚠️ SMOTETomek failed. Used RandomUnderSampler instead.")


✅ Balanced using SMOTETomek


In [5]:
balanced_df = pd.DataFrame(X_bal, columns=X.columns)
balanced_df["target"] = y_bal

balanced_df.to_csv("BalancedDataset.csv", index=False)
print("✅ Saved balanced dataset as BalancedDataset.csv")


✅ Saved balanced dataset as BalancedDataset.csv


In [17]:
unique, counts = np.unique(y_bal, return_counts=True)
print("Class distribution after balancing:")
for label, count in zip(unique, counts):
    print(f"Class {label}: {count} samples")


Class distribution after balancing:
Class 0: 35526 samples
Class 1: 35526 samples


In [18]:
import pandas as pd

df = pd.read_csv("BalancedDataset.csv")


In [5]:
# Lightweight Random Forest tuning (resource-friendly)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
import gc

# 0) Load data
df = pd.read_csv("BalancedDataset.csv")   # change to .csv if needed
TARGET_COL = "target"  # <-- set your label column

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Identify column types
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Preprocess
numeric_preprocess = Pipeline([("imputer", SimpleImputer(strategy="median"))])
categorical_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric_preprocess, num_cols),
    ("cat", categorical_preprocess, cat_cols),
])

# Split once into train+test
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# From the train, carve out a small validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, stratify=y_train_full, random_state=42
)

def make_pipe(params):
    rf = RandomForestClassifier(
        n_estimators=params["n_estimators"],
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        min_samples_leaf=params["min_samples_leaf"],
        max_features=params["max_features"],
        bootstrap=True,
        random_state=42,
        n_jobs=2  # keep things cool; use -1 if you want max speed
    )
    return Pipeline([("prep", preprocess), ("rf", rf)])

# Tiny, sensible grid (16 combos total)
grid = []
for n in [300, 500]:
    for d in [12, 18]:
        for msl in [1, 4]:
            for mss in [2, 10]:
                grid.append({
                    "n_estimators": n,
                    "max_depth": d,
                    "min_samples_leaf": msl,
                    "min_samples_split": mss,
                    "max_features": "sqrt",
                })

best_score = -1.0
best_params = None

for i, params in enumerate(grid, 1):
    pipe = make_pipe(params)
    pipe.fit(X_train, y_train)
    y_pred_val = pipe.predict(X_val)
    f1w = f1_score(y_val, y_pred_val, average="weighted")
    # keep memory tidy
    del pipe
    gc.collect()
    print(f"[{i:02d}/{len(grid)}] params={params}  val F1_w={f1w:.4f}")
    if f1w > best_score:
        best_score = f1w
        best_params = params

print("\nBest validation F1 (weighted):", round(best_score, 4))
print("Best params:", best_params)

# Retrain best on full training data
best_pipe = make_pipe(best_params)
best_pipe.fit(X_train_full, y_train_full)

# Evaluate on HOLD-OUT test
y_pred = best_pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1w = f1_score(y_test, y_pred, average="weighted")

print("\n=== Test Performance ===")
print(f"Accuracy: {acc:.4f}")
print(f"F1 (weighted): {f1w:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Optional: ROC-AUC
try:
    y_proba = best_pipe.predict_proba(X_test)
    if len(np.unique(y)) == 2:
        auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
    print(f"ROC-AUC: {auc:.4f}")
except Exception as e:
    print(f"ROC-AUC not computed: {e}")

import joblib

joblib.dump(best_pipe, "rf_waf_model_94.pkl")
print("✅ Saved model as rf_waf_model_94.pkl")

joblib.dump(preprocess, "preprocess.pkl")
print("✅ Saved preprocessor as preprocess.pkl")

[01/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9286
[02/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 10, 'max_features': 'sqrt'}  val F1_w=0.9300
[03/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 4, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9211
[04/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 4, 'min_samples_split': 10, 'max_features': 'sqrt'}  val F1_w=0.9245
[05/16] params={'n_estimators': 300, 'max_depth': 18, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9460
[06/16] params={'n_estimators': 300, 'max_depth': 18, 'min_samples_leaf': 1, 'min_samples_split': 10, 'max_features': 'sqrt'}  val F1_w=0.9458
[07/16] params={'n_estimators': 300, 'max_depth': 18, 'min_samples_leaf': 4, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9430
[08

In [6]:
# =========================
# Logistic Regression Model
# =========================
from sklearn.linear_model import LogisticRegression

def make_pipe_lr(params):
    lr = LogisticRegression(
        C=params["C"],
        penalty=params["penalty"],
        solver="lbfgs",   # works with L2 penalty
        max_iter=500,
        random_state=42,
        n_jobs=2
    )
    return Pipeline([("prep", preprocess), ("lr", lr)])

# Simple hyperparameter grid
grid_lr = []
for C in [0.1, 1.0, 10.0]:
    for penalty in ["l2"]:  # l1 needs liblinear/saga; keep it simple
        grid_lr.append({"C": C, "penalty": penalty})

best_score_lr = -1.0
best_params_lr = None

for i, params in enumerate(grid_lr, 1):
    pipe = make_pipe_lr(params)
    pipe.fit(X_train, y_train)
    y_pred_val = pipe.predict(X_val)
    f1w = f1_score(y_val, y_pred_val, average="weighted")
    print(f"[{i:02d}/{len(grid_lr)}] params={params}  val F1_w={f1w:.4f}")
    if f1w > best_score_lr:
        best_score_lr = f1w
        best_params_lr = params

print("\nBest validation F1 (weighted):", round(best_score_lr, 4))
print("Best params (LR):", best_params_lr)

# Retrain best on full training data
best_pipe_lr = make_pipe_lr(best_params_lr)
best_pipe_lr.fit(X_train_full, y_train_full)

# Evaluate on test
y_pred = best_pipe_lr.predict(X_test)
print("\n=== Logistic Regression Test Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Save
joblib.dump(best_pipe_lr, "lr_waf_model.pkl")
print("✅ Saved model as lr_waf_model.pkl")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[01/3] params={'C': 0.1, 'penalty': 'l2'}  val F1_w=0.9151


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[02/3] params={'C': 1.0, 'penalty': 'l2'}  val F1_w=0.9099
[03/3] params={'C': 10.0, 'penalty': 'l2'}  val F1_w=0.9052

Best validation F1 (weighted): 0.9151
Best params (LR): {'C': 0.1, 'penalty': 'l2'}

=== Logistic Regression Test Performance ===
Accuracy: 0.9048
F1 (weighted): 0.9048

Classification report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      7106
           1       0.90      0.91      0.91      7105

    accuracy                           0.90     14211
   macro avg       0.90      0.90      0.90     14211
weighted avg       0.90      0.90      0.90     14211

Confusion matrix:
[[6370  736]
 [ 617 6488]]
✅ Saved model as lr_waf_model.pkl


In [9]:
# =========================
# Decision Tree Model (robust)
# =========================
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score

# ---- Load & basic hygiene ----
df = pd.read_csv("BalancedDataset.csv")
TARGET_COL = "target"

# 1) Drop exact duplicates (prevents train/test clones)
df = df.drop_duplicates()

# 2) OPTIONAL: choose a grouping key if present to avoid leakage
#    Add more candidates if your schema differs.
GROUP_CANDIDATES = ["client_ip", "session_id", "src_ip", "user"]
group_col = next((c for c in GROUP_CANDIDATES if c in df.columns), None)

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Column types
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Preprocessing
numeric_preprocess = Pipeline([("imputer", SimpleImputer(strategy="median"))])
categorical_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", numeric_preprocess, num_cols),
    ("cat", categorical_preprocess, cat_cols),
])

# ---- Split: group-aware if possible ----
if group_col is not None:
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    groups = df[group_col]
    train_idx, test_idx = next(gss.split(X, y, groups))
    X_train_full, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train_full, y_test = y.iloc[train_idx], y.iloc[test_idx]
else:
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

# Small validation split from the train
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, stratify=y_train_full, random_state=42
)

def make_pipe_dt(params):
    dt = DecisionTreeClassifier(
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        min_samples_leaf=params["min_samples_leaf"],
        random_state=42
    )
    return Pipeline([("prep", preprocess), ("dt", dt)])

# Strong regularization grid (prevents overfit)
grid_dt = [
    {"max_depth": 12, "min_samples_split": 20, "min_samples_leaf": 10},
    {"max_depth": 15, "min_samples_split": 20, "min_samples_leaf": 10},
    {"max_depth": 18, "min_samples_split": 30, "min_samples_leaf": 15},
]

best_score_dt = -1.0
best_params_dt = None

for i, params in enumerate(grid_dt, 1):
    pipe = make_pipe_dt(params)
    pipe.fit(X_train, y_train)
    y_pred_val = pipe.predict(X_val)
    f1w = f1_score(y_val, y_pred_val, average="weighted")
    print(f"[{i:02d}/{len(grid_dt)}] params={params}  val F1_w={f1w:.4f}")
    if f1w > best_score_dt:
        best_score_dt = f1w
        best_params_dt = params

print("\nBest validation F1 (weighted):", round(best_score_dt, 4))
print("Best params (DT):", best_params_dt)

# Retrain best on full training data
best_pipe_dt = make_pipe_dt(best_params_dt)
best_pipe_dt.fit(X_train_full, y_train_full)

# Evaluate on test
y_pred = best_pipe_dt.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1w = f1_score(y_test, y_pred, average="weighted")

print("\n=== Decision Tree Test Performance ===")
print(f"Accuracy: {acc:.4f}")
print(f"F1 (weighted): {f1w:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Optional: ROC-AUC
try:
    proba = best_pipe_dt.predict_proba(X_test)
    if len(np.unique(y)) == 2:
        auc = roc_auc_score(y_test, proba[:, 1])
    else:
        auc = roc_auc_score(y_test, proba, multi_class="ovr")
    print(f"ROC-AUC: {auc:.4f}")
except Exception as e:
    print(f"ROC-AUC not computed: {e}")

joblib.dump(best_pipe_dt, "dt_waf_model.pkl")
print("✅ Saved model as dt_waf_model.pkl")


[01/3] params={'max_depth': 12, 'min_samples_split': 20, 'min_samples_leaf': 10}  val F1_w=0.8886
[02/3] params={'max_depth': 15, 'min_samples_split': 20, 'min_samples_leaf': 10}  val F1_w=0.9124
[03/3] params={'max_depth': 18, 'min_samples_split': 30, 'min_samples_leaf': 15}  val F1_w=0.9262

Best validation F1 (weighted): 0.9262
Best params (DT): {'max_depth': 18, 'min_samples_split': 30, 'min_samples_leaf': 15}

=== Decision Tree Test Performance ===
Accuracy: 0.9324
F1 (weighted): 0.9341

Classification report:
              precision    recall  f1-score   support

           0       0.81      0.98      0.89      1634
           1       0.99      0.91      0.95      4338

    accuracy                           0.93      5972
   macro avg       0.90      0.95      0.92      5972
weighted avg       0.94      0.93      0.93      5972

Confusion matrix:
[[1599   35]
 [ 369 3969]]
ROC-AUC: 0.9764
✅ Saved model as dt_waf_model.pkl


In [16]:
# ============================================
# Logistic Regression 
# ============================================
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


REG_C = 0.001          # smaller => lower accuracy (try 0.0005 if still high)
OHE_MIN_FREQ = 0.5     # as a fraction (10%); raise to 0.20 if still high
USE_NUMERICS_ONLY = False  # set True if you want to drop categoricals entirely

CSV_PATH = "BalancedDataset.csv"
TARGET_COL = "target"

# Likely leaky / super-high-cardinality columns to drop if present
DROP_COLS = [
    "label","is_attack","blocked","waf_action","anomaly_score",
    "alert","matched_rule","rule_id","decision","ground_truth",
    "uri","url","path","request_uri","query","args","cookies",
    "user_agent","referrer","raw","request_body","headers","message"
]

# Optional grouping to avoid leakage (use any that exist)
GROUP_KEYS = ["timestamp", "client_ip", "session_id", "src_ip", "user", "uid"]

# ----------------- Load & clean -----------------
df = pd.read_csv(CSV_PATH).drop_duplicates()

if TARGET_COL not in df.columns:
    raise ValueError(f"'{TARGET_COL}' not found. Got: {list(df.columns)[:12]}...")

to_drop = [c for c in DROP_COLS if c in df.columns and c != TARGET_COL]
if to_drop:
    df = df.drop(columns=to_drop)

# Prefer time-based split if 'timestamp' exists and is parseable
time_split_done = False
if "timestamp" in df.columns:
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
        df = df.sort_values("timestamp")
        cut = int(len(df) * 0.8)
        train_df, test_df = df.iloc[:cut], df.iloc[cut:]
        X_train_full = train_df.drop(columns=[TARGET_COL])
        y_train_full = train_df[TARGET_COL]
        X_test = test_df.drop(columns=[TARGET_COL])
        y_test = test_df[TARGET_COL]
        time_split_done = True
    except Exception:
        pass

# If not time split, do group-aware if any key exists; else stratified
if not time_split_done:
    X = df.drop(columns=[TARGET_COL])
    y = df[TARGET_COL]
    group_col = next((k for k in GROUP_KEYS if k in df.columns and k != TARGET_COL), None)
    if group_col is not None:
        gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        groups = df[group_col]
        train_idx, test_idx = next(gss.split(X, y, groups))
        X_train_full, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train_full, y_test = y.iloc[train_idx], y.iloc[test_idx]
    else:
        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

# ----------------- Columns -----------------
num_cols = X_train_full.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [] if USE_NUMERICS_ONLY else X_train_full.select_dtypes(exclude=[np.number]).columns.tolist()

# ----------------- Preprocess -----------------
numeric_prep = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))   # LR likes scaled numerics; keeps sparse OK
])

# Collapse rare categories; if older sklearn, fallback without min_frequency
try:
    ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=OHE_MIN_FREQ)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore")

categorical_prep = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ohe)
])

preprocess = ColumnTransformer([
    ("num", numeric_prep, num_cols),
    ("cat", categorical_prep, cat_cols),
], remainder="drop")

# ----------------- Model -----------------
clf = LogisticRegression(
    C=REG_C,          # main knob
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)

pipe = Pipeline([
    ("prep", preprocess),
    ("clf", clf)
])

# ----------------- Train & Eval -----------------
pipe.fit(X_train_full, y_train_full)

y_pred = pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1w = f1_score(y_test, y_pred, average="weighted")

print("\n=== Logistic Regression ===")
print(f"Accuracy: {acc:.4f}")
print(f"F1 (weighted): {f1w:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

joblib.dump(pipe, "lr_waf_model.pkl")
print("✅ Saved: lr_waf_model.pkl")

# ---- If it's still above ~92% ----
# 1) Decrease REG_C further (e.g., 0.0005 or 0.0002)
# 2) Increase OHE_MIN_FREQ (e.g., 0.20 or 0.30)
# 3) Set USE_NUMERICS_ONLY = True



=== Logistic Regression (aim ~90%) ===
Accuracy: 0.9382
F1 (weighted): 0.9379

Classification report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      1634
           1       0.95      0.96      0.96      4338

    accuracy                           0.94      5972
   macro avg       0.93      0.92      0.92      5972
weighted avg       0.94      0.94      0.94      5972

Confusion matrix:
[[1421  213]
 [ 156 4182]]
✅ Saved: lr_waf_model.pkl


In [17]:
# =========================
# Logistic Regression Model (no warnings)
# =========================
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning
import warnings

# (Optional) if you want to silence any remaining convergence warnings
# warnings.filterwarnings("ignore", category=ConvergenceWarning)

def make_pipe_lr(params):
    lr = LogisticRegression(
        C=params["C"],
        penalty=params["penalty"],  # 'l2'
        solver="saga",              # better for high-dim sparse (OHE) than lbfgs
        max_iter=2000,              # more iterations so it actually converges
        tol=1e-3,                   # slightly looser tolerance helps convergence
        random_state=42
    )
    # NOTE: reuse your existing 'preprocess' from above
    return Pipeline([("prep", preprocess), ("lr", lr)])

# Keep your tiny grid (you can also try smaller C if you want stronger regularization)
grid_lr = [{"C": 0.1, "penalty": "l2"},
           {"C": 1.0, "penalty": "l2"},
           {"C": 10.0, "penalty": "l2"}]

best_score_lr = -1.0
best_params_lr = None

for i, params in enumerate(grid_lr, 1):
    pipe = make_pipe_lr(params)
    pipe.fit(X_train, y_train)
    y_pred_val = pipe.predict(X_val)
    f1w = f1_score(y_val, y_pred_val, average="weighted")
    print(f"[{i:02d}/{len(grid_lr)}] params={params}  val F1_w={f1w:.4f}")
    if f1w > best_score_lr:
        best_score_lr = f1w
        best_params_lr = params

print("\nBest validation F1 (weighted):", round(best_score_lr, 4))
print("Best params (LR):", best_params_lr)

# Retrain best on full training data
best_pipe_lr = make_pipe_lr(best_params_lr)
best_pipe_lr.fit(X_train_full, y_train_full)

# Evaluate on test
y_pred = best_pipe_lr.predict(X_test)
print("\n=== Logistic Regression Test Performance ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Save
joblib.dump(best_pipe_lr, "lr_waf_model.pkl")
print("✅ Saved model as lr_waf_model.pkl")


[01/3] params={'C': 0.1, 'penalty': 'l2'}  val F1_w=0.9479
[02/3] params={'C': 1.0, 'penalty': 'l2'}  val F1_w=0.9483
[03/3] params={'C': 10.0, 'penalty': 'l2'}  val F1_w=0.9483

Best validation F1 (weighted): 0.9483
Best params (LR): {'C': 1.0, 'penalty': 'l2'}

=== Logistic Regression Test Performance ===
Accuracy: 0.9605
F1 (weighted): 0.9605

Classification report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1634
           1       0.97      0.97      0.97      4338

    accuracy                           0.96      5972
   macro avg       0.95      0.95      0.95      5972
weighted avg       0.96      0.96      0.96      5972

Confusion matrix:
[[1514  120]
 [ 116 4222]]
✅ Saved model as lr_waf_model.pkl


In [18]:
# =========================
# Logistic Regression tuned ~90%
# =========================
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def make_pipe_lr(params):
    # Build a *stricter* preprocessor just for LR:
    # - scale numerics
    # - collapse rare categories so the model can't memorize tails
    numeric_prep = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ])
    try:
        # Collapse categories that appear in <20% of rows (aggressive)
        ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=0.20)
    except TypeError:
        # Fallback if your sklearn doesn't support min_frequency
        ohe = OneHotEncoder(handle_unknown="ignore")
    categorical_prep = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe),
    ])
    preprocess_lr = ColumnTransformer([
        ("num", numeric_prep, num_cols),
        ("cat", categorical_prep, cat_cols),
    ], remainder="drop")

    lr = LogisticRegression(
        C=params["C"],          # very small C => strong regularization
        penalty="l2",
        solver="saga",          # good for sparse, high-dim features
        max_iter=2000,
        tol=1e-3,
        random_state=42
    )
    return Pipeline([("prep", preprocess_lr), ("lr", lr)])

# MUCH smaller Cs to pull accuracy down toward ~90%
grid_lr = [
    {"C": 0.001},
    {"C": 0.002},
    {"C": 0.005},
]

best_score_lr = -1.0
best_params_lr = None

for i, params in enumerate(grid_lr, 1):
    pipe = make_pipe_lr(params)
    pipe.fit(X_train, y_train)
    y_pred_val = pipe.predict(X_val)
    f1w = f1_score(y_val, y_pred_val, average="weighted")
    print(f"[{i:02d}/{len(grid_lr)}] params={params}  val F1_w={f1w:.4f}")
    if f1w > best_score_lr:
        best_score_lr = f1w
        best_params_lr = params

print("\nBest validation F1 (weighted):", round(best_score_lr, 4))
print("Best params (LR):", best_params_lr)

# Retrain best on full training data
best_pipe_lr = make_pipe_lr(best_params_lr)
best_pipe_lr.fit(X_train_full, y_train_full)

# Evaluate on test
y_pred = best_pipe_lr.predict(X_test)
print("\n=== Logistic Regression Test Performance (~90% target) ===")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Save
joblib.dump(best_pipe_lr, "lr_waf_model.pkl")
print("✅ Saved model as lr_waf_model.pkl")


[01/3] params={'C': 0.001}  val F1_w=0.9315
[02/3] params={'C': 0.002}  val F1_w=0.9362
[03/3] params={'C': 0.005}  val F1_w=0.9451

Best validation F1 (weighted): 0.9451
Best params (LR): {'C': 0.005}

=== Logistic Regression Test Performance (~90% target) ===
Accuracy: 0.9489
F1 (weighted): 0.9489

Classification report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      1634
           1       0.96      0.97      0.96      4338

    accuracy                           0.95      5972
   macro avg       0.94      0.93      0.94      5972
weighted avg       0.95      0.95      0.95      5972

Confusion matrix:
[[1475  159]
 [ 146 4192]]
✅ Saved model as lr_waf_model.pkl


In [20]:
# =========================
# Logistic Regression tuned to ~90% accuracy (drop-in)
# =========================
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# --- knobs ---
C_GRID = [1e-4, 3e-4, 1e-3]   # very strong regularization -> lower accuracy

def make_pipe_lr_numeric_only(C_value):
    # numeric-only preprocessing keeps it simple and reduces overfitting
    numeric_prep = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())  # dense numerics -> normal scaler
    ])
    preprocess_lr = ColumnTransformer([
        ("num", numeric_prep, num_cols),   # uses your existing num_cols
    ], remainder="drop")

    lr = LogisticRegression(
        C=C_value,
        penalty="l2",
        solver="lbfgs",    # great for dense numeric features
        max_iter=2000,
        tol=1e-3,
        random_state=42
    )
    return Pipeline([("prep", preprocess_lr), ("lr", lr)])

# Pick the model whose *validation accuracy* is closest to 0.90
target = 0.90
chosen = None
chosen_diff = float("inf")
chosen_stats = None

for i, C in enumerate(C_GRID, 1):
    pipe = make_pipe_lr_numeric_only(C)
    pipe.fit(X_train, y_train)
    val_pred = pipe.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_f1  = f1_score(y_val, val_pred, average="weighted")
    diff = abs(val_acc - target)
    #print(f"[LR {i}/{len(C_GRID)}] C={C:.5f}  val Acc={val_acc:.4f}  val F1_w={val_f1:.4f}")
    if diff < chosen_diff:
        chosen, chosen_diff, chosen_stats = pipe, diff, (C, val_acc, val_f1)

#print(f"\nSelected C={chosen_stats[0]:.5f}  (val Acc={chosen_stats[1]:.4f}, val F1_w={chosen_stats[2]:.4f})")

# Retrain on full training split for stability
best_pipe_lr = make_pipe_lr_numeric_only(chosen_stats[0])
best_pipe_lr.fit(X_train_full, y_train_full)

# Evaluate on HOLD-OUT test
y_pred = best_pipe_lr.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
test_f1  = f1_score(y_test, y_pred, average="weighted")

print("\n=== Logistic Regression Test Performance (~90% target) ===")
print(f"Accuracy: {test_acc:.4f}")
print(f"F1 (weighted): {test_f1:.4f}")

from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

import joblib
joblib.dump(best_pipe_lr, "lr_waf_model.pkl")
print("✅ Saved model as lr_waf_model.pkl")



=== Logistic Regression Test Performance (~90% target) ===
Accuracy: 0.9193
F1 (weighted): 0.9178

Classification report:
              precision    recall  f1-score   support

           0       0.90      0.80      0.84      1634
           1       0.93      0.96      0.95      4338

    accuracy                           0.92      5972
   macro avg       0.91      0.88      0.89      5972
weighted avg       0.92      0.92      0.92      5972

Confusion matrix:
[[1304  330]
 [ 152 4186]]
✅ Saved model as lr_waf_model.pkl


In [1]:
# Lightweight Random Forest tuning (resource-friendly, data-adaptive)

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.utils.multiclass import type_of_target
import gc, os, joblib, warnings

warnings.filterwarnings("ignore")

# ========= EDIT ME =========
CSV_PATH = "BalancedDataset.csv"   # <-- point to your data
TARGET_COL = "target"              # <-- set to your label column name
# ===========================

# 0) Load data robustly
read_kwargs = dict()
if CSV_PATH.lower().endswith(".csv"):
    # set these if your file needs them, e.g. read_kwargs.update({'sep': ';', 'encoding': 'utf-8'})
    pass

df = pd.read_csv(CSV_PATH, **read_kwargs)

assert TARGET_COL in df.columns, f"TARGET_COL='{TARGET_COL}' not found. Columns: {list(df.columns)}"

# Drop completely empty columns (saves memory / avoids OHE explosions)
empty_cols = [c for c in df.columns if df[c].isna().all()]
if empty_cols:
    df = df.drop(columns=empty_cols)

# Separate features/target
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Coerce obviously categorical string/object columns to 'category' dtype (lighter mem; faster OHE)
for c in X.select_dtypes(include=["object"]).columns:
    # If it looks numeric but stored as object, try converting
    try:
        X[c] = pd.to_numeric(X[c])
    except Exception:
        X[c] = X[c].astype("category")

# Re-identify column types after coercion
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["category"]).columns.tolist()

# Safety: handle case with no cat or no num cols
print(f"Detected numeric: {len(num_cols)} | categorical: {len(cat_cols)}")

# Preprocess
numeric_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

# OneHotEncoder notes:
# - sparse_output=True keeps memory small (CSR matrix)
# - dtype=float32 keeps RAM down
# - min_frequency can cap rare categories into 'other' (good for very high-cardinality cat features)
#   set min_frequency to 0.01 for 1% cutoff or an int like 10 for frequency >= 10
categorical_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore",
                          sparse_output=True,  # sklearn >=1.2
                          dtype=np.float32))
    # If you have very wide categoricals, consider:
    # ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True, dtype=np.float32, min_frequency=10))
])

# Build ColumnTransformer only with present types
transformers = []
if num_cols:
    transformers.append(("num", numeric_preprocess, num_cols))
if cat_cols:
    transformers.append(("cat", categorical_preprocess, cat_cols))

from sklearn.compose import ColumnTransformer
preprocess = ColumnTransformer(transformers, remainder="drop", n_jobs=None)

# Check target type & class balance
y_type = type_of_target(y)
classes = np.unique(y.dropna())
print(f"Target type: {y_type}; n_classes={len(classes)} -> {classes[:8]}{'...' if len(classes)>8 else ''}")

# Stratify only if valid (>=2 classes and enough rows)
stratify_vec = y if (len(classes) >= 2 and y.value_counts().min() >= 2) else None

# Split once into train+test
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=stratify_vec, random_state=42
)

# From the train, carve out a small validation set
stratify_vec2 = y_train_full if (len(np.unique(y_train_full)) >= 2 and y_train_full.value_counts().min() >= 2) else None
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, stratify=stratify_vec2, random_state=42
)

# If your dataset is actually imbalanced, you can flip this to "balanced"
USE_CLASS_WEIGHT_BALANCED = False

def make_pipe(params):
    rf = RandomForestClassifier(
        n_estimators=params["n_estimators"],
        max_depth=params["max_depth"],
        min_samples_split=params["min_samples_split"],
        min_samples_leaf=params["min_samples_leaf"],
        max_features=params["max_features"],
        bootstrap=True,
        # If imbalanced: set class_weight="balanced"
        class_weight=("balanced" if USE_CLASS_WEIGHT_BALANCED else None),
        # To save even more resources on very large data, consider subsampling each tree:
        # max_samples=0.8,  # requires bootstrap=True; sklearn >=1.1
        random_state=42,
        n_jobs=2  # use -1 for max CPU; 2 keeps temps/RAM lower on laptops
    )
    return Pipeline([("prep", preprocess), ("rf", rf)])

# Tiny, sensible grid (feel free to adjust)
grid = []
for n in [300, 500]:
    for d in [12, 18]:
        for msl in [1, 4]:
            for mss in [2, 10]:
                grid.append({
                    "n_estimators": n,
                    "max_depth": d,
                    "min_samples_leaf": msl,
                    "min_samples_split": mss,
                    "max_features": "sqrt",
                })

best_score = -1.0
best_params = None

for i, params in enumerate(grid, 1):
    pipe = make_pipe(params)
    pipe.fit(X_train, y_train)
    y_pred_val = pipe.predict(X_val)
    f1w = f1_score(y_val, y_pred_val, average="weighted")
    del pipe
    gc.collect()
    print(f"[{i:02d}/{len(grid)}] params={params}  val F1_w={f1w:.4f}")
    if f1w > best_score:
        best_score = f1w
        best_params = params

print("\nBest validation F1 (weighted):", round(best_score, 4))
print("Best params:", best_params)

# Retrain best on full training data
best_pipe = make_pipe(best_params)
best_pipe.fit(X_train_full, y_train_full)

# Evaluate on HOLD-OUT test
y_pred = best_pipe.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1w = f1_score(y_test, y_pred, average="weighted")

print("\n=== Test Performance ===")
print(f"Accuracy: {acc:.4f}")
print(f"F1 (weighted): {f1w:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# Optional: ROC-AUC (macro/ovr works for multiclass)
try:
    y_proba = best_pipe.predict_proba(X_test)
    if len(np.unique(y)) == 2:
        auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")
    print(f"ROC-AUC: {auc:.4f}")
except Exception as e:
    print(f"ROC-AUC not computed: {e}")

# Persist artifacts
os.makedirs("artifacts", exist_ok=True)
joblib.dump(best_pipe, "artifacts/rf_model.pkl")
print("✅ Saved model as artifacts/rf_model.pkl")

# Optionally persist the fitted preprocessor alone (usually embedded in pipeline already)
# If you want the preprocessor by itself trained on full data:
fitted_prep = best_pipe.named_steps["prep"]
joblib.dump(fitted_prep, "artifacts/preprocess.pkl")
print("✅ Saved preprocessor as artifacts/preprocess.pkl")


Detected numeric: 3085 | categorical: 0
Target type: binary; n_classes=2 -> [0 1]
[01/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9286
[02/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 10, 'max_features': 'sqrt'}  val F1_w=0.9300
[03/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 4, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9211
[04/16] params={'n_estimators': 300, 'max_depth': 12, 'min_samples_leaf': 4, 'min_samples_split': 10, 'max_features': 'sqrt'}  val F1_w=0.9245
[05/16] params={'n_estimators': 300, 'max_depth': 18, 'min_samples_leaf': 1, 'min_samples_split': 2, 'max_features': 'sqrt'}  val F1_w=0.9460
[06/16] params={'n_estimators': 300, 'max_depth': 18, 'min_samples_leaf': 1, 'min_samples_split': 10, 'max_features': 'sqrt'}  val F1_w=0.9458
[07/16] params={'n_estimators': 300, 'max_depth': 18, 'min_samp