In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Importing dataset from google drive
df = pd.read_csv('Phishing_URL_Dataset.csv')

# Drop target column
X = df.drop(columns=['label'])
y = df['label']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Feature Importance from Random Forest for feature selection
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances from Random Forest
feature_importances_rf = pd.Series(rf.feature_importances_, index=X.columns)
selected_features_rf = feature_importances_rf.sort_values(ascending=False).index

print("\nSelected features using feature importance from Random Forest:")
print(selected_features_rf)


Selected features using feature importance from Random Forest:
Index(['URLSimilarityIndex', 'NoOfExternalRef', 'LineOfCode', 'NoOfSelfRef',
       'NoOfImage', 'NoOfJS', 'HasSocialNet', 'NoOfCSS', 'HasCopyrightInfo',
       'IsHTTPS', 'LargestLineLength', 'HasDescription', 'NoOfDegitsInURL',
       'NoOfOtherSpecialCharsInURL', 'NoOfLettersInURL', 'DegitRatioInURL',
       'DomainTitleMatchScore', 'IsResponsive', 'URLLength',
       'LetterRatioInURL', 'URLTitleMatchScore', 'SpacialCharRatioInURL',
       'HasSubmitButton', 'NoOfSubDomain', 'NoOfEmptyRef', 'URLCharProb',
       'DomainLength', 'HasFavicon', 'CharContinuationRate', 'HasHiddenFields',
       'NoOfQMarkInURL', 'TLDLength', 'TLDLegitimateProb', 'HasPasswordField',
       'NoOfiFrame', 'Bank', 'Pay', 'HasTitle', 'NoOfEqualsInURL',
       'NoOfAmpersandInURL', 'Robots', 'NoOfURLRedirect', 'NoOfPopup',
       'Crypto', 'NoOfObfuscatedChar', 'IsDomainIP', 'NoOfSelfRedirect',
       'HasExternalFormSubmit', 'HasObfuscation', '

In [4]:
# Use selected features for training and testing
X_train_selected = X_train[selected_features_rf]
X_test_selected = X_test[selected_features_rf]

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Define the classifiers
classifiers = {
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=1000),  # Increased max_iter
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Loop through each classifier
for clf_name, clf in classifiers.items():
    # Fit the model on the scaled training data
    clf.fit(X_train_scaled, y_train)

    # Predict on the scaled test data
    y_pred = clf.predict(X_test_scaled)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    cv_scores = cross_val_score(clf, scaler.transform(X[selected_features_rf]), y, cv=5)  # Ensure CV is also done on scaled data

    # Print performance metrics
    print(f"\nPerformance Metrics for {clf_name}:")
    print("Accuracy: {:.2f}".format(accuracy))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    print("F1 Score: {:.2f}".format(f1))
    print("ROC AUC: {:.2f}".format(roc_auc))
    print("Cross-Validated Accuracy Scores:", cv_scores)
    print("Average CV Accuracy: {:.2f}".format(np.mean(cv_scores)))


Performance Metrics for SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99980916 0.99974554 0.99980916 0.99961831 0.99983036]
Average CV Accuracy: 1.00

Performance Metrics for Logistic Regression:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99983036 0.99987277 0.99991518 0.99983036 0.99989398]
Average CV Accuracy: 1.00

Performance Metrics for Decision Tree:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [1. 1. 1. 1. 1.]
Average CV Accuracy: 1.00

Performance Metrics for Random Forest:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.9999788 1.        0.9999788 1.        1.       ]
Average CV Accuracy: 1.00
