# Model Training

In [1]:
pip install scikit-learn xgboost joblib


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   -- ------------------------------------- 10.5/150.0 MB 59.8 MB/s eta 0:00:03
   ---- ----------------------------------- 18.1/150.0 MB 47.5 MB/s eta 0:00:03
   ------ --------------------------------- 23.6/150.0 MB 39.3 MB/s eta 0:00:04
   ------ --------------------------------- 24.9/150.0 MB 30.3 MB/s eta 0:00:05
   ------- -------------------------------- 26.5/150.0 MB 25.4 MB/s eta 0:00:05
   ------- -------------------------------- 28.3/150.0 MB 23.0 MB/s eta 0:00:06
   -------- ------------------------------- 30.7/150.0 MB 20.7 MB/s eta 0:00:06
   -------- ------------------------------- 33.6/150.0 MB 19.7 MB/s eta 0:00:06
   --------- ------------------------------ 36.4/150.0 MB 19.1 MB/s eta 0:00:06
   ---------- ----------------------------- 39.6/150.0 MB 18

DEPRECATION: Loading egg at c:\python312\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install pandas numpy scikit-learn xgboost lightgbm imbalanced-learn joblib

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 15.2 MB/s eta 0:00:00
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: lightgbm, sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 lightgbm-4.6.0 sklearn-compat-0.1.3


DEPRECATION: Loading egg at c:\python312\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, 
                            precision_score, recall_score, confusion_matrix)
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

# --- Load processed data ---
DATA_DIR = r"C:\prateek\projects\linkload\backend\ml_models\phishing_detection\data"

X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").values.ravel()
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv").values.ravel()

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# --- Handle class imbalance ---
print("\nClass distribution before SMOTE:")
print(pd.Series(y_train).value_counts())

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_res).value_counts())

# --- Define candidate models with balanced class weights ---
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200,
        scale_pos_weight=sum(y_train == 0)/sum(y_train == 1),  # Handle imbalance
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=200,
        random_state=42
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    "SVM": SVC(
        kernel='rbf',
        class_weight='balanced',
        probability=True,
        random_state=42
    )
}

# --- Evaluation function ---
def evaluate_model(model, X_test, y_test):
    """Evaluate model and return comprehensive metrics"""
    start_time = time.time()
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]
    end_time = time.time()
    
    return {
        "accuracy": accuracy_score(y_test, preds),
        "precision": precision_score(y_test, preds),
        "recall": recall_score(y_test, preds),
        "f1": f1_score(y_test, preds),
        "roc_auc": roc_auc_score(y_test, proba),
        "inference_time": end_time - start_time,
        "confusion_matrix": confusion_matrix(y_test, preds)
    }

# --- Train and evaluate with cross-validation ---
results = {}
trained_models = {}

print("\nStarting model training...")
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Cross-validation
    cv_scores = cross_val_score(
        model, X_train_res, y_train_res, 
        cv=5, scoring='f1', n_jobs=-1
    )
    
    # Train final model
    start_time = time.time()
    model.fit(X_train_res, y_train_res)
    train_time = time.time() - start_time
    
    # Evaluate
    metrics = evaluate_model(model, X_test, y_test)
    
    # Store results
    results[name] = {
        "cv_mean_f1": np.mean(cv_scores),
        "cv_std_f1": np.std(cv_scores),
        "train_time": train_time,
        **metrics
    }
    
    trained_models[name] = model
    
    print(f"{name} results:")
    print(f"   F1 (CV): {results[name]['cv_mean_f1']:.4f} ± {results[name]['cv_std_f1']:.4f}")
    print(f"   Test F1: {results[name]['f1']:.4f}, ROC-AUC: {results[name]['roc_auc']:.4f}")
    print(f"   Precision: {results[name]['precision']:.4f}, Recall: {results[name]['recall']:.4f}")
    print(f"   Inference Time: {results[name]['inference_time']:.4f}s")

# --- Model selection ---
print("\nModel Comparison:")
results_df = pd.DataFrame(results).T.sort_values('roc_auc', ascending=False)
print(results_df[['roc_auc', 'f1', 'precision', 'recall', 'cv_mean_f1']])

# Select best model
best_model_name = results_df.index[0]
best_model = trained_models[best_model_name]

print(f"\nBest model: {best_model_name}")
print(f"ROC-AUC: {results[best_model_name]['roc_auc']:.4f}")
print(f"F1 Score: {results[best_model_name]['f1']:.4f}")
print(f"Confusion Matrix:\n{results[best_model_name]['confusion_matrix']}")

# --- Save best model ---
OUTPUT_MODEL_PATH = r"C:\prateek\projects\linkload\backend\ml_models\phishing_detection\phishing_detector_model.pkl"
joblib.dump(best_model, OUTPUT_MODEL_PATH)

print(f"\nBest model saved to: {OUTPUT_MODEL_PATH}")


Train shape: (84049, 15), Test shape: (21013, 15)

Class distribution before SMOTE:
1    42025
0    42024
Name: count, dtype: int64

Class distribution after SMOTE:
1    42025
0    42025
Name: count, dtype: int64

Starting model training...

Training RandomForest...
RandomForest results:
   F1 (CV): 0.9945 ± 0.0006
   Test F1: 0.9947, ROC-AUC: 0.9982
   Precision: 0.9979, Recall: 0.9914
   Inference Time: 0.1637s

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost results:
   F1 (CV): 0.9946 ± 0.0006
   Test F1: 0.9947, ROC-AUC: 0.9982
   Precision: 0.9981, Recall: 0.9912
   Inference Time: 0.0312s

Training LogisticRegression...
LogisticRegression results:
   F1 (CV): 0.9913 ± 0.0010
   Test F1: 0.9914, ROC-AUC: 0.9973
   Precision: 0.9982, Recall: 0.9848
   Inference Time: 0.0220s

Training GradientBoosting...
GradientBoosting results:
   F1 (CV): 0.9949 ± 0.0007
   Test F1: 0.9947, ROC-AUC: 0.9981
   Precision: 0.9981, Recall: 0.9914
   Inference Time: 0.1407s

Training LightGBM...
[LightGBM] [Info] Number of positive: 42025, number of negative: 42025
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 84050, number of used features: 14
[LightGBM] [Info] [binary:BoostFromSco