In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score

In [19]:
# Load the dataset
df = pd.read_csv('Phishing_URL_Dataset.csv')

# Assuming the last column is the target label
X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Importance from Information Gain for feature selection
information_gain = mutual_info_classif(X_train, y_train)
# Getting feature names from the DataFrame columns
feature_names = X_train.columns.tolist()

# Sort features based on their importance scores
mi_scores_df = pd.DataFrame({'Feature': feature_names, 'Importance': information_gain})
mi_scores_df = mi_scores_df.sort_values(by='Importance', ascending=False)

In [20]:
# Initialize classifiers
classifiers = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=10000),  # Adjust max_iter
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Define the list of top features you want to evaluate
top_features_list = [10, 20, 30, 40, len(feature_names)]

# Loop over different classifiers
for clf_name, clf in classifiers.items():
    print(f"Evaluating {clf_name}:\n")
    # Loop over different numbers of top features
    for num_features in top_features_list:
        if num_features == len(feature_names):
            X_selected = X  # All features
            X_train_subset, X_test_subset = X_train, X_test  # Use all features
        else:
            # Select top features
            selected_features_subset = mi_scores_df['Feature'][:num_features]
            X_selected = X[selected_features_subset]
            X_train_subset, X_test_subset = X_train[selected_features_subset], X_test[selected_features_subset]

        # Train the classifier
        clf.fit(X_train_subset, y_train)
        
        # Predictions
        y_pred = clf.predict(X_test_subset)
        
        # Calculate performance metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=1)
        recall = recall_score(y_test, y_pred, zero_division=1)
        f1 = f1_score(y_test, y_pred, zero_division=1)
        roc_auc = roc_auc_score(y_test, y_pred)
        
        # Cross-validation
        cv_scores = cross_val_score(clf, X_train_subset, y_train, cv=5)
        
        # Print performance metrics
        print(f"Performance Metrics for {num_features} top features using {clf_name}:")
        print("Accuracy: {:.2f}".format(accuracy))
        print("Precision: {:.2f}".format(precision))
        print("Recall: {:.2f}".format(recall))
        print("F1 Score: {:.2f}".format(f1))
        print("ROC AUC: {:.2f}".format(roc_auc))
        print("Cross-Validated Accuracy Scores:", cv_scores)
        print("Average CV Accuracy: {:.2f}".format(np.mean(cv_scores)))
        print()

Evaluating SVM:

Performance Metrics for 10 top features using SVM:
Accuracy: 0.90
Precision: 0.98
Recall: 0.84
F1 Score: 0.90
ROC AUC: 0.91
Cross-Validated Accuracy Scores: [0.88817324 0.89074138 0.88928354 0.87976781 0.87923768]
Average CV Accuracy: 0.89

Performance Metrics for 20 top features using SVM:
Accuracy: 0.90
Precision: 0.98
Recall: 0.84
F1 Score: 0.90
ROC AUC: 0.91
Cross-Validated Accuracy Scores: [0.8894455  0.89227874 0.8912715  0.88117263 0.8810401 ]
Average CV Accuracy: 0.89

Performance Metrics for 30 top features using SVM:
Accuracy: 0.90
Precision: 0.98
Recall: 0.84
F1 Score: 0.90
ROC AUC: 0.91
Cross-Validated Accuracy Scores: [0.88955153 0.8925173  0.89129801 0.88162324 0.88133167]
Average CV Accuracy: 0.89

Performance Metrics for 40 top features using SVM:
Accuracy: 0.90
Precision: 0.98
Recall: 0.84
F1 Score: 0.90
ROC AUC: 0.91
Cross-Validated Accuracy Scores: [0.88955153 0.89243778 0.89132452 0.88162324 0.88130517]
Average CV Accuracy: 0.89

Performance Metrics