# **Detect Phishing Websites Modelling**

---

In [1]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wandb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV


In [2]:
np.random.seed(42)

In [3]:
BASE_DIR = Path(".").resolve()
DATA_DIR = BASE_DIR / "data"
PROCESSED_DIR = DATA_DIR / "processed"

In [4]:
training_data = pd.read_csv(PROCESSED_DIR / "training.csv")
X_train = training_data.drop(columns=["Result"])
y_train = training_data["Result"]

testing_data = pd.read_csv(PROCESSED_DIR / "testing.csv")
X_test = testing_data.drop(columns=["Result"])
y_test = testing_data["Result"]

In [None]:
wandb.init(project="Detect Phishing Websites Modelling", name="logistic-regression")

In [6]:
start_training = time.time()
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
end_training = time.time()

y_pred = logistic_regression.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

wandb.log(
    {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "confusion": wandb.plot.confusion_matrix(
            probs=None,
            y_true=y_test,
            preds=y_pred,
            class_names=["Phishing", "Legitimate"],
        ),
        "training_time": end_training - start_training,
    }
)

In [16]:
print(classification_report(y_test, y_pred, target_names=["Phishing", "Legitimate"]))

              precision    recall  f1-score   support

    Phishing       0.94      0.90      0.92       980
  Legitimate       0.92      0.95      0.94      1231

    accuracy                           0.93      2211
   macro avg       0.93      0.93      0.93      2211
weighted avg       0.93      0.93      0.93      2211



In [17]:
# Hyperparameter Tuning Parameters
penalty = ["l1", "l2"]
C = [.8, .9, 1.0]
tol = [1e-2, 1e-3, 1e-4]
max_iter = [100, 150, 200, 250, 300]

In [None]:
wandb.init(
    project="Detect Phishing Websites Modelling",
    name="logistic-regression-hyperparameter-tuning",
)
start_training = time.time()
randomized_search = RandomizedSearchCV(
    estimator=logistic_regression,
    param_distributions={
        "penalty": penalty,
        "C": C,
        "tol": tol,
        "max_iter": max_iter,
    },
    cv=5,
    random_state=42,
    n_jobs=-1,
)
randomized_search.fit(X_train, y_train)
end_training = time.time()

In [19]:
y_pred = randomized_search.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
wandb.log(
    {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "confusion": wandb.plot.confusion_matrix(
            probs=None,
            y_true=y_test,
            preds=y_pred,
            class_names=["Phishing", "Legitimate"],
        ),
        "training_time": end_training - start_training,
    }
)

In [24]:
print(f"Best Parameters: {randomized_search.best_params_}")
print(f"Best Score: {randomized_search.best_score_ * 100}")

Best Parameters: {'tol': 0.001, 'penalty': 'l2', 'max_iter': 150, 'C': 1.0}
Best Score: 92.67304047330983


In [20]:
print(classification_report(y_test, y_pred, target_names=["Phishing", "Legitimate"]))

              precision    recall  f1-score   support

    Phishing       0.94      0.90      0.92       980
  Legitimate       0.92      0.95      0.94      1231

    accuracy                           0.93      2211
   macro avg       0.93      0.93      0.93      2211
weighted avg       0.93      0.93      0.93      2211

