In [1]:
import os
import glob
import time
import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    log_loss,
    brier_score_loss,
    classification_report,
    average_precision_score,
)

# -----------------------------
# Config
# -----------------------------
# Folder that is globbed for *.csv input files.
data_folder = "Processed_UE_Datasets_unscaled"

# Target column; it is coerced to numeric and cast to int before training.
label_column = "binary_label"   # <-- change if needed

# Model inputs. The order is stored in the saved artifact alongside the model,
# so keep it stable.
feature_columns = [
    "epre",
    "pusch_snr",
    "p_ue",
    "ul_mcs",
    "cqi",
    "ul_bitrate",
    "dl_mcs",
    "dl_retx",
    "ul_tx",
    "dl_tx",
    "ul_retx",
    "dl_bitrate",
    "dl_err",
    "ul_err",
]

# Destination of the joblib artifact written at the end of the script.
model_path = "threat_score_model.joblib"

# -----------------------------
# Load TRAIN files only
# -----------------------------
# glob returns paths in filesystem-dependent order. The concatenated row order
# feeds the seeded train/val split, so sort the listing to keep runs
# reproducible across machines.
csv_files = sorted(glob.glob(os.path.join(data_folder, "*.csv")))

# Any CSV whose file name contains "test" (case-insensitive) is held out.
train_files = [f for f in csv_files if "test" not in os.path.basename(f).lower()]

if not train_files:
    raise RuntimeError("No training files found (no CSVs without 'test' in filename).")

required_columns = feature_columns + [label_column]

dfs = []
for f in train_files:
    df = pd.read_csv(f)
    # Fail fast, naming the offending file, instead of a KeyError after concat.
    missing = [c for c in required_columns if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns {missing} in file: {os.path.basename(f)}")
    dfs.append(df)

data = pd.concat(dfs, axis=0, ignore_index=True)

# Ensure numeric: unparseable feature cells become NaN.
X = data[feature_columns].apply(pd.to_numeric, errors="coerce")
y = pd.to_numeric(data[label_column], errors="coerce")

# A missing or unparseable label must not be turned into class 0: that would
# silently mix unlabelled rows into the negatives. Drop those rows and report
# how many were removed. `data` itself is left untouched.
has_label = y.notna()
n_unlabelled = int((~has_label).sum())
if n_unlabelled:
    print(f"Dropping {n_unlabelled} row(s) with missing/non-numeric '{label_column}'")
    X = X.loc[has_label]
    y = y.loc[has_label]

if y.empty:
    raise RuntimeError(f"No rows with a usable '{label_column}' value were found.")

y = y.astype(int)

# -----------------------------
# Split: train / val
# -----------------------------
# 80/20 hold-out, stratified on the label, with a fixed seed.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

n_train, n_val = len(X_train), len(X_val)
print(f"Train: {n_train} | Val: {n_val}")

train_rate, val_rate = y_train.mean(), y_val.mean()
print(f"Attack rate (train): {train_rate:.4f} | (val): {val_rate:.4f}")

# -----------------------------
# Train base model (Threat Score)
# -----------------------------
# Hyperparameters are gathered in one dict so they can be read at a glance.
hgb_params = {
    "max_depth": 10,
    "learning_rate": 0.05,
    "max_iter": 300,
    "class_weight": "balanced",
    "random_state": 42,
}

# fit() returns the estimator itself, so construction and training chain.
model = HistGradientBoostingClassifier(**hgb_params).fit(X_train, y_train)

# Threat Score = uncalibrated proba for class 1 (second predict_proba column)
val_proba = model.predict_proba(X_val)
score_val = val_proba[:, 1]

# -----------------------------
# Metrics (imbalance-friendly)
# -----------------------------
def report_scores(name, probs, y_true=None, thresholds=(0.5, 0.2, 0.1, 0.05, 0.02, 0.01)):
    """Print ranking, probability-quality and threshold diagnostics for scores.

    Args:
        name: Title printed in the section header.
        probs: Array-like of predicted scores for the positive class (label 1),
            one per sample.
        y_true: Array-like of 0/1 ground-truth labels aligned with ``probs``.
            When omitted, the module-level validation labels ``y_val`` are
            used, which keeps the original two-argument call working.
        thresholds: Cut-offs evaluated in the threshold sweep.

    Returns:
        None. Everything is written to stdout.

    Raises:
        Whatever the scikit-learn metric functions raise for invalid input
        (for example, inconsistent lengths).
    """
    if y_true is None:
        y_true = y_val.values

    # Coerce to ndarrays so lists and pandas Series behave the same as the
    # predict_proba output in the comparisons and boolean masking below.
    y_true = np.asarray(y_true)
    probs = np.asarray(probs, dtype=float)

    print(f"\n=== {name} ===")
    print("ROC AUC:   ", roc_auc_score(y_true, probs))
    print("PR AUC:    ", average_precision_score(y_true, probs))
    print("Log Loss:  ", log_loss(y_true, probs))
    print("Brier:     ", brier_score_loss(y_true, probs))

    # Reference report at 0.5 (usually useless in heavy imbalance, but keep for context)
    preds_05 = (probs >= 0.5).astype(int)
    print("\n--- Classification report @ 0.5 (reference) ---")
    print(classification_report(y_true, preds_05, digits=4))

    # Threshold sweep for operational feel: share of samples alerted on, and
    # how many true positives are caught / missed at each cut-off.
    print("\n--- Threshold sweep ---")
    is_pos = y_true == 1
    for t in thresholds:
        preds = (probs >= t).astype(int)
        tp = ((preds == 1) & is_pos).sum()
        fn = ((preds == 0) & is_pos).sum()
        alert_rate = preds.mean() * 100
        print(f"t={t:>4} | alerts={alert_rate:7.3f}% | TP={tp:5d} | FN={fn:5d}")

    # Quick distribution check: score quantiles per class. An empty class is
    # skipped because np.quantile cannot handle an empty array.
    quantile_levels = [0.5, 0.9, 0.99, 0.999, 1.0]
    print("\n--- Score quantiles ---")
    for tag, class_scores in (("NEG", probs[y_true == 0]), ("POS", probs[is_pos])):
        if len(class_scores):
            print(
                f"{tag} [p50,p90,p99,p99.9,max]:",
                np.round(np.quantile(class_scores, quantile_levels), 6),
            )

report_scores("Threat Score (uncalibrated proba)", score_val)

# -----------------------------
# Save model artifact (for score.py)
# -----------------------------
# The fitted estimator is bundled with the metadata needed to rebuild the
# input frame (column names) and to identify the artifact.
artifact = {
    "model": model,
    "feature_columns": feature_columns,
    "label_column": label_column,
    "model_type": "HistGradientBoostingClassifier",
    "output": "threat_score",
    "calibrated": False,
    "trained_at_unix": int(time.time()),
}
joblib.dump(artifact, model_path)

print(f"\n✅ Saved Threat Score model: {model_path}")


Train: 427228 | Val: 106807
Attack rate (train): 0.0127 | (val): 0.0127

=== Threat Score (uncalibrated proba) ===
ROC AUC:    0.9986799668613267
PR AUC:     0.962283079358411
Log Loss:   0.02912435662172654
Brier:      0.00997746210933732

--- Classification report @ 0.5 (reference) ---
              precision    recall  f1-score   support

           0     0.9999    0.9848    0.9923    105450
           1     0.4571    0.9926    0.6259      1357

    accuracy                         0.9849    106807
   macro avg     0.7285    0.9887    0.8091    106807
weighted avg     0.9930    0.9849    0.9877    106807


--- Threshold sweep ---
t= 0.5 | alerts=  2.759% | TP= 1347 | FN=   10
t= 0.2 | alerts=  3.213% | TP= 1353 | FN=    4
t= 0.1 | alerts=  3.394% | TP= 1353 | FN=    4
t=0.05 | alerts=  3.544% | TP= 1354 | FN=    3
t=0.02 | alerts=  3.679% | TP= 1354 | FN=    3
t=0.01 | alerts=  3.862% | TP= 1355 | FN=    2

--- Score quantiles ---
NEG [p50,p90,p99,p99.9,max]: [5.69000e-04 7.31000e-0