In [None]:
# Attach Google Drive so files under /content/drive (dataset, saved models) are reachable.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Stacking ensemble: CatBoost + LightGBM + XGBoost -> LogisticRegression meta-learner
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import joblib

# Libraries (ensure installed)
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping # Import early_stopping callback
from xgboost import XGBClassifier
# Removed: from xgboost.callback import EarlyStopping as XGBEarlyStopping # Import XGBoost's EarlyStopping

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
import pandas as pd

# Load the dataset CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Phishing_Detection_Dataset/Dataset.csv')  # for Colab just use "Dataset.csv"

# Quick inspection: first rows, column dtypes / non-null counts, missing values per column.
print(df.head())
# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df.info()
print(df.isnull().sum())


   Type  url_length  number_of_dots_in_url  having_repeated_digits_in_url  \
0     0          37                      2                              0   
1     1          70                      5                              0   
2     0          42                      2                              0   
3     0          46                      2                              0   
4     0          51                      3                              0   

   number_of_digits_in_url  number_of_special_char_in_url  \
0                        0                              8   
1                        0                             12   
2                        6                              8   
3                        0                              7   
4                        0                              9   

   number_of_hyphens_in_url  number_of_underline_in_url  \
0                         0                           0   
1                         0                         

In [None]:
# NOTE(review): catboost is already installed by an earlier cell of this notebook,
# so this repeated install looks redundant — confirm and drop the cell if so.
!pip install catboost



In [None]:
!pip install catboost lightgbm xgboost scikit-learn pandas numpy




In [None]:
# Separate the target column from the predictor columns.
label_col = "Type"
y = df[label_col].astype(int)      # target, cast to int
X = df.drop(columns=[label_col])   # every remaining column is used as a feature


In [None]:
# All columns except the "Type" label; the shape is printed as a sanity check.
# NOTE(review): no dtype filtering happens here, so "numeric" is an assumption
# about the dataset's columns — confirm.
numeric_features = df.drop("Type", axis=1)

print("Numeric feature shape:", numeric_features.shape)

Numeric feature shape: (247950, 41)


In [None]:


# --------- Train / Test split ----------
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)


In [None]:
# --------- Stacking (OOF) setup ----------
# Scaler for the meta-features; it is only instantiated here and fitted later.
scaler_meta = StandardScaler()

# Stratified, shuffled K-fold splitter that drives the out-of-fold predictions.
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Row counts of the train and test partitions.
n_train, n_test = len(X_train), len(X_test)

# One column per base model (3 of them); each column holds a class-1 probability.
oof_train = np.zeros(shape=(n_train, 3))
oof_test = np.zeros(shape=(n_test, 3))

# Per-fold test predictions indexed as (row, fold, model); averaged over folds afterwards.
oof_test_folds = np.zeros(shape=(n_test, n_folds, 3))

In [None]:
# Keyword arguments forwarded to CatBoostClassifier(**cb_params).
cb_params = dict(
    iterations=1000,
    learning_rate=0.03,
    depth=8,
    eval_metric="AUC",
    random_seed=42,
    verbose=0,                  # silent training
    early_stopping_rounds=100,
)

In [None]:
# Keyword arguments forwarded to LGBMClassifier(**lgb_params).
lgb_params = dict(
    learning_rate=0.07551248062419097,
    num_leaves=101,
    max_depth=-1,
    feature_fraction=0.7645370255329039,
    bagging_fraction=0.900104184717939,
    bagging_freq=10,
    lambda_l1=1.7241496739273474,
    lambda_l2=1.544580545532398,
    n_estimators=5000,          # boosted for performance + early stopping
    objective='binary',
    metric='binary_logloss',
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Keyword arguments forwarded to XGBClassifier(**xgb_params).
# "use_label_encoder" was dropped: the XGBoost version this notebook ran with ignores it
# and warns 'Parameters: { "use_label_encoder" } are not used.' on every fit.
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.03,
    "max_depth": 8,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "eval_metric": "auc",
    "random_state": 42,
    "n_jobs": -1
}


In [None]:
# NumPy arrays of the splits; the fold loop indexes these positionally.
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
y_train_np = y_train.to_numpy()


In [None]:
# One unfitted instance of each base learner, built from its parameter dict.
cb, lgb, xgb = (
    CatBoostClassifier(**cb_params),
    LGBMClassifier(**lgb_params),
    XGBClassifier(**xgb_params),
)

In [None]:
# Out-of-fold stacking loop.
# For every fold: fit each base learner on the fold's training part, write its class-1
# probabilities for the held-out part into oof_train, and store its predictions for the
# whole test set in that fold's slot of oof_test_folds.
XGB_EARLY_STOPPING_ROUNDS = 100  # same patience CatBoost and LightGBM use in this loop

for fold_idx, (train_idx, valid_idx) in enumerate(skf.split(X_train_np, y_train_np)):
    print(f"\n--- Fold {fold_idx+1}/{n_folds} ---")
    X_tr, X_val = X_train_np[train_idx], X_train_np[valid_idx]
    y_tr, y_val = y_train_np[train_idx], y_train_np[valid_idx]

    # ----- CatBoost -----
    # A fresh model per fold; early stopping comes from cb_params together with eval_set.
    # CatBoost can accept pandas so convert back to DataFrame if you want feature names
    cb = CatBoostClassifier(**cb_params)
    cb.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=0)
    oof_train[valid_idx, 0] = cb.predict_proba(X_val)[:, 1]
    oof_test_folds[:, fold_idx, 0] = cb.predict_proba(X_test_np)[:, 1]

    # ----- LightGBM -----
    # Early stopping is supplied through the callback, monitored on the held-out part.
    lgb = LGBMClassifier(**lgb_params)
    lgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[early_stopping(100, verbose=False)])
    oof_train[valid_idx, 1] = lgb.predict_proba(X_val)[:, 1]
    oof_test_folds[:, fold_idx, 1] = lgb.predict_proba(X_test_np)[:, 1]

    # ----- XGBoost -----
    # Early stopping is configured on the estimator (constructor argument) instead of being
    # passed to fit(), which is what raised the TypeError that led to it being removed.
    # The default is listed first so an explicit value in xgb_params would take precedence.
    # verbose=False silences the per-iteration evaluation log.
    xgb = XGBClassifier(**{"early_stopping_rounds": XGB_EARLY_STOPPING_ROUNDS, **xgb_params})
    xgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    oof_train[valid_idx, 2] = xgb.predict_proba(X_val)[:, 1]
    oof_test_folds[:, fold_idx, 2] = xgb.predict_proba(X_test_np)[:, 1]


--- Fold 1/5 ---
[LightGBM] [Info] Number of positive: 76422, number of negative: 82266
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1567
[LightGBM] [Info] Number of data points in the train set: 158688, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.481587 -> initscore=-0.073687
[LightGBM] [Info] Start training from score -0.073687








[0]	validation_0-auc:0.90363
[1]	validation_0-auc:0.92237
[2]	validation_0-auc:0.92506
[3]	validation_0-auc:0.92632
[4]	validation_0-auc:0.92834
[5]	validation_0-auc:0.92897


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[6]	validation_0-auc:0.93195
[7]	validation_0-auc:0.93194
[8]	validation_0-auc:0.93177
[9]	validation_0-auc:0.93142
[10]	validation_0-auc:0.93225
[11]	validation_0-auc:0.93326
[12]	validation_0-auc:0.93436
[13]	validation_0-auc:0.93434
[14]	validation_0-auc:0.93440
[15]	validation_0-auc:0.93533
[16]	validation_0-auc:0.93540
[17]	validation_0-auc:0.93532
[18]	validation_0-auc:0.93549
[19]	validation_0-auc:0.93637
[20]	validation_0-auc:0.93686
[21]	validation_0-auc:0.93672
[22]	validation_0-auc:0.93703
[23]	validation_0-auc:0.93735
[24]	validation_0-auc:0.93751
[25]	validation_0-auc:0.93745
[26]	validation_0-auc:0.93765
[27]	validation_0-auc:0.93799
[28]	validation_0-auc:0.93812
[29]	validation_0-auc:0.93857
[30]	validation_0-auc:0.93896
[31]	validation_0-auc:0.93938
[32]	validation_0-auc:0.93971
[33]	validation_0-auc:0.94010
[34]	validation_0-auc:0.94032
[35]	validation_0-auc:0.94053
[36]	validation_0-auc:0.94058
[37]	validation_0-auc:0.94078
[38]	validation_0-auc:0.94087
[39]	validatio







[0]	validation_0-auc:0.90953
[1]	validation_0-auc:0.92803
[2]	validation_0-auc:0.92908


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[3]	validation_0-auc:0.93088
[4]	validation_0-auc:0.93067
[5]	validation_0-auc:0.93112
[6]	validation_0-auc:0.93198
[7]	validation_0-auc:0.93141
[8]	validation_0-auc:0.93275
[9]	validation_0-auc:0.93282
[10]	validation_0-auc:0.93268
[11]	validation_0-auc:0.93330
[12]	validation_0-auc:0.93318
[13]	validation_0-auc:0.93292
[14]	validation_0-auc:0.93317
[15]	validation_0-auc:0.93452
[16]	validation_0-auc:0.93465
[17]	validation_0-auc:0.93478
[18]	validation_0-auc:0.93498
[19]	validation_0-auc:0.93496
[20]	validation_0-auc:0.93542
[21]	validation_0-auc:0.93532
[22]	validation_0-auc:0.93577
[23]	validation_0-auc:0.93599
[24]	validation_0-auc:0.93608
[25]	validation_0-auc:0.93614
[26]	validation_0-auc:0.93626
[27]	validation_0-auc:0.93676
[28]	validation_0-auc:0.93700
[29]	validation_0-auc:0.93727
[30]	validation_0-auc:0.93770
[31]	validation_0-auc:0.93843
[32]	validation_0-auc:0.93873
[33]	validation_0-auc:0.93905
[34]	validation_0-auc:0.93931
[35]	validation_0-auc:0.93959
[36]	validation_0







[0]	validation_0-auc:0.90391
[1]	validation_0-auc:0.92323
[2]	validation_0-auc:0.92514
[3]	validation_0-auc:0.92746
[4]	validation_0-auc:0.92769
[5]	validation_0-auc:0.92875


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[6]	validation_0-auc:0.93171
[7]	validation_0-auc:0.93239
[8]	validation_0-auc:0.93429
[9]	validation_0-auc:0.93424
[10]	validation_0-auc:0.93467
[11]	validation_0-auc:0.93554
[12]	validation_0-auc:0.93563
[13]	validation_0-auc:0.93549
[14]	validation_0-auc:0.93563
[15]	validation_0-auc:0.93689
[16]	validation_0-auc:0.93704
[17]	validation_0-auc:0.93721
[18]	validation_0-auc:0.93706
[19]	validation_0-auc:0.93707
[20]	validation_0-auc:0.93750
[21]	validation_0-auc:0.93752
[22]	validation_0-auc:0.93774
[23]	validation_0-auc:0.93806
[24]	validation_0-auc:0.93837
[25]	validation_0-auc:0.93847
[26]	validation_0-auc:0.93868
[27]	validation_0-auc:0.93907
[28]	validation_0-auc:0.93942
[29]	validation_0-auc:0.93991
[30]	validation_0-auc:0.94008
[31]	validation_0-auc:0.94047
[32]	validation_0-auc:0.94084
[33]	validation_0-auc:0.94099
[34]	validation_0-auc:0.94121
[35]	validation_0-auc:0.94145
[36]	validation_0-auc:0.94164
[37]	validation_0-auc:0.94178
[38]	validation_0-auc:0.94184
[39]	validatio

KeyboardInterrupt: 

In [None]:
import joblib

# Location of the persisted ensemble bundle on the mounted Drive.
stack_bundle_path = "/content/drive/MyDrive/Phishing_Detection_Dataset/models_stack/stack_ensemble_models.pkl"

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code —
# only load bundles that come from a trusted source.
loaded_models = joblib.load(stack_bundle_path)

# The bundle is a dict; individual objects can be pulled out by key, e.g.:
# cb = loaded_models["catboost"]
# lgb = loaded_models["lgbm"]
# xgb = loaded_models["xgb"]
# meta = loaded_models["meta"]
# scaler_meta = loaded_models["scaler_meta"]


In [None]:


# Guard against an incomplete fold loop (e.g. it was interrupted part-way): a (fold, model)
# slot of oof_test_folds that is still entirely zero was never written, and stacking on it
# would silently fit the meta-learner on placeholder zeros.
filled_slots = oof_test_folds.any(axis=0)   # one flag per (fold, model) slot
if not filled_slots.all():
    raise RuntimeError(
        "Out-of-fold predictions are incomplete: re-run the fold loop to completion "
        "before fitting the meta-learner."
    )

# Average test fold predictions to create stable test meta-features
oof_test = oof_test_folds.mean(axis=1)

# --------- Meta-learner ----------
# Scale meta features; the scaler is fitted on the training OOF matrix only and then
# applied unchanged to the test meta-features.
scaler_meta.fit(oof_train)
oof_train_scaled = scaler_meta.transform(oof_train)
oof_test_scaled = scaler_meta.transform(oof_test)

# Logistic regression over the three base-model probabilities.
meta = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
meta.fit(oof_train_scaled, y_train_np)

# Final predictions: class-1 probability, thresholded at 0.5 for the hard label.
y_pred_proba = meta.predict_proba(oof_test_scaled)[:, 1]
y_pred = (y_pred_proba >= 0.5).astype(int)

# --------- Evaluation ----------
print("\n--- Stacked model evaluation on test set ---")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))
print(classification_report(y_test, y_pred, digits=4))

# Save models if you want
# NOTE: cb, lgb and xgb are reassigned inside the fold loop, so the objects stored here are
# the ones fitted on the last fold that ran, not an average over folds.
# NOTE(review): this writes to the current working directory, while the loading cell reads
# the bundle from a Drive folder — confirm which location is intended.
joblib.dump({
    "catboost": cb,
    "lgbm": lgb,
    "xgb": xgb,
    "meta": meta,
    "scaler_meta": scaler_meta
}, "stack_ensemble_models.pkl")
print("Models saved to stack_ensemble_models.pkl")


In [None]:
import time

In [None]:
# Time one batched predict() call of the meta-learner and report the mean per-row cost.
# NOTE: only the meta-learner step on the precomputed meta-features is timed; inference of
# the base models is not part of this measurement.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short intervals,
# unlike time.time(), which follows the adjustable wall clock.
start = time.perf_counter()
_ = meta.predict(oof_test_scaled)       # predict entire batch
end = time.perf_counter()
batch_latency = end - start                           # seconds
avg_latency_ms = (batch_latency / len(X_test)) * 1000
print(f"⚡ Avg Latency per prediction: {avg_latency_ms:.6f} ms")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the stacked model's thresholded predictions on the held-out test set.
# Relies on y_test (train/test split cell) and y_pred (meta-learner evaluation cell), so
# those cells must have been executed first; running the notebook top to bottom ensures it.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])  # binary labels 0 and 1
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC points and area under the curve for the stacked model's test probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Draw the curve on an explicit figure/axes pair, with the chance diagonal for reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC) Curve',
)
ax.legend(loc='lower right')
ax.grid(True)
plt.show()