# Phishing Website Detection Model

In [9]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Load Dataset

In [13]:
# Read the dataset CSV into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv('final_phishing_dataset.csv')

In [14]:
# Preview the first five rows to check the column names and value encoding.
df.head()

Unnamed: 0,having_IP_Address,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Request_URL,URL_of_Anchor,Links_in_tags,SFH,web_traffic,Google_Index,Statistical_report,Result
0,-1,-1,-1,-1,-1,1,-1,1,-1,-1,1,-1,-1
1,1,-1,0,1,-1,1,0,-1,-1,0,1,1,-1
2,1,-1,-1,-1,-1,1,0,-1,-1,1,1,-1,-1
3,1,-1,-1,-1,1,-1,0,0,-1,1,1,1,-1
4,1,-1,1,1,-1,1,0,0,-1,0,1,1,1


# Convert target column

In [15]:
# Recode the target so the -1 class becomes 0; every other value is left unchanged.
df["Result"] = df["Result"].replace(to_replace=-1, value=0)

In [16]:
# Features are every column except the target and the "Unnamed: 0" column.
# In the head() preview "Unnamed: 0" simply counts 0, 1, 2, ... like a row index
# that was written into the CSV; it describes row position, not the website, so
# leaving it in would let the models learn from row order.
# errors="ignore" keeps this cell working if that column is not present
# (a missing "Result" column still fails loudly on the next line).
X = df.drop(columns=["Result", "Unnamed: 0"], errors="ignore")
y = df["Result"]

# Train Test Split

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Define Models

In [18]:
# Candidate classifiers, keyed by the display name used in the evaluation printout.
# The tree-based estimators draw random numbers while fitting, so each one gets a
# fixed random_state (the same seed as the train/test split); without it the
# reported metrics and the selected best model can change from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

In [19]:
results = {}

# Train and evaluate every candidate: fit on the training split, predict the
# held-out split, then print the metrics for that model.
for name, model in models.items():

    print("\n==============================")
    print(f"Training {name}")
    print("==============================")

    # scikit-learn's fit() returns the estimator itself, so predict() can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Only accuracy is recorded; it is the sole criterion used to pick the best model.
    results[name] = accuracy

    for metric_label, metric_value in (
        ("Accuracy :", accuracy),
        ("Precision:", precision),
        ("Recall   :", recall),
        ("F1 Score :", f1),
    ):
        print(metric_label, metric_value)

    print("\nClassification Report:\n", classification_report(y_test, y_pred), sep="\n")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Select best model: highest held-out accuracy wins; on a tie the entry that
# appears first in `models` is kept.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
print("\nBest Model:", best_model_name)

# This is the same estimator object that was fitted inside the loop above.
best_model = models[best_model_name]


Training Logistic Regression
Accuracy : 0.9153846153846154
Precision: 0.8963093145869947
Recall   : 0.9272727272727272
F1 Score : 0.9115281501340482

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.90      0.92       620
           1       0.90      0.93      0.91       550

    accuracy                           0.92      1170
   macro avg       0.91      0.92      0.92      1170
weighted avg       0.92      0.92      0.92      1170

Confusion Matrix:
 [[561  59]
 [ 40 510]]

Training Decision Tree
Accuracy : 0.9102564102564102
Precision: 0.9023508137432188
Recall   : 0.9072727272727272
F1 Score : 0.9048050770625566

Classification Report:

              precision    recall  f1-score   support

           0       0.92      0.91      0.92       620
           1       0.90      0.91      0.90       550

    accuracy                           0.91      1170
   macro avg       0.91      0.91      0.91      1170
weighted avg   

In [21]:
# Show the estimator chosen as best_model in the comparison cell.
best_model