In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [3]:
# Load the phishing URL dataset from a CSV file located in the current
# working directory (adjust the path if the file lives elsewhere).
df = pd.read_csv('Phishing_URL_Dataset.csv')

# Separate the target ('label') from the predictors: y holds the target
# column, X holds every remaining column.
y = df['label']
X = df.drop(labels='label', axis='columns')

# print(df.info)
# print(df.isnull().sum())

In [4]:
# Score every feature by its mutual information with the target, then rank
# the features from most to least informative. The original positional index
# of each feature is kept so rows can be traced back to X.columns.
# NOTE(review): the scores are computed on the full X / y, i.e. before any
# train/test split -- confirm this is acceptable for the hold-out evaluation.
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_scores_df = (
    pd.DataFrame({'Feature': X.columns, 'Mutual_Information': mi_scores})
    .sort_values(by='Mutual_Information', ascending=False)
)

# Show the ranking to verify that the table was built as expected.
print(mi_scores_df)

                       Feature  Mutual_Information
3           URLSimilarityIndex            0.680530
22                  LineOfCode            0.600887
49             NoOfExternalRef            0.561239
44                   NoOfImage            0.543638
47                 NoOfSelfRef            0.527161
46                      NoOfJS            0.500278
23           LargestLineLength            0.489187
45                     NoOfCSS            0.447350
36                HasSocialNet            0.417792
13            LetterRatioInURL            0.381220
43            HasCopyrightInfo            0.351523
32              HasDescription            0.306981
21                     IsHTTPS            0.257894
19  NoOfOtherSpecialCharsInURL            0.242362
25       DomainTitleMatchScore            0.216945
37             HasSubmitButton            0.209330
20       SpacialCharRatioInURL            0.205645
5            TLDLegitimateProb            0.197855
26          URLTitleMatchScore 

In [5]:
# Sizes of the "top-k by mutual information" feature subsets to evaluate.
top_features_list = [10, 20, 30, 40]

# Total number of features available (mi_scores_df has one row per feature).
total_features = len(mi_scores_df)

# Finish with the complete feature set. Sizes that are not strictly smaller
# than the feature count are dropped first: head(k) would silently return all
# features for them, producing duplicate or mislabelled evaluation runs.
top_features_list = [k for k in top_features_list if k < total_features]
top_features_list.append(total_features)

# Classifiers to compare, keyed by the display name used in the reports.
# Every estimator gets a fixed random_state so runs are repeatable.
classifiers = {
    "SVM": SVC(kernel='linear', C=1.0, random_state=42),
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

In [6]:
# Evaluate every classifier on each top-k feature subset: point metrics on a
# single 80/20 hold-out split, plus 5-fold cross-validated accuracy.
for clf_name, clf in classifiers.items():
    print(f"Evaluating {clf_name}:")
    for n_features in top_features_list:
        # mi_scores_df is sorted by descending mutual information, so head(k)
        # yields the names of the k highest-scoring features.
        selected_features = mi_scores_df['Feature'].head(n_features)
        X_selected = X[selected_features]

        # Hold-out split; the fixed random_state makes it reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X_selected, y, test_size=0.2, random_state=42
        )

        # Train on the training portion and predict hard labels for the test set.
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # ROC AUC needs a continuous score per sample; feeding it hard 0/1
        # predictions collapses the curve to a single operating point.
        # Prefer decision_function and fall back to the probability of the
        # greater class label for estimators that only offer predict_proba.
        if hasattr(clf, "decision_function"):
            y_score = clf.decision_function(X_test)
        else:
            y_score = clf.predict_proba(X_test)[:, 1]

        # Hold-out metrics.
        # NOTE(review): average='binary' scores the class labelled 1
        # (scikit-learn's default pos_label) -- confirm that is the class of interest.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')
        roc_auc = roc_auc_score(y_test, y_score)

        # Cross-validated accuracy on the whole selected data set (5 folds).
        cv_scores = cross_val_score(clf, X_selected, y, cv=5, scoring='accuracy')

        # Report. NOTE(review): two decimals round every metric in the recorded
        # output to 1.00; consider printing more digits to tell subsets apart.
        feature_description = "all" if n_features == total_features else f"{n_features}"
        print(f"\nPerformance Metrics for {feature_description} top features using {clf_name}:")
        print("Accuracy: {:.2f}".format(accuracy))
        print("Precision: {:.2f}".format(precision))
        print("Recall: {:.2f}".format(recall))
        print("F1 Score: {:.2f}".format(f1))
        print("ROC AUC: {:.2f}".format(roc_auc))
        print("Cross-Validated Accuracy Scores:", cv_scores)
        print("Average CV Accuracy: {:.2f}".format(np.mean(cv_scores)))

Evaluating SVM:

Performance Metrics for 10 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99980916 0.99987277 0.99985157 0.99991518 0.99978795]
Average CV Accuracy: 1.00

Performance Metrics for 20 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99987277 0.99987277 0.99991518 0.99995759 0.99985157]
Average CV Accuracy: 1.00

Performance Metrics for 30 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99993639 0.99991518 0.99993639 0.99985157 0.99987277]
Average CV Accuracy: 1.00

Performance Metrics for 40 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99987277 0.99991518 0.99989398 0.99991518 0.99989398]
Average CV Accuracy: 1.00

Performance Metrics

In [7]:
# Same evaluation as the 5-fold run, but with 10-fold cross-validation:
# every classifier is scored on each top-k feature subset using an 80/20
# hold-out split plus cross-validated accuracy.
for clf_name, clf in classifiers.items():
    print(f"Evaluating {clf_name}:")
    for n_features in top_features_list:
        # mi_scores_df is sorted by descending mutual information, so head(k)
        # yields the names of the k highest-scoring features.
        selected_features = mi_scores_df['Feature'].head(n_features)
        X_selected = X[selected_features]

        # Hold-out split; the fixed random_state makes it reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            X_selected, y, test_size=0.2, random_state=42
        )

        # Train on the training portion and predict hard labels for the test set.
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # ROC AUC needs a continuous score per sample; feeding it hard 0/1
        # predictions collapses the curve to a single operating point.
        # Prefer decision_function and fall back to the probability of the
        # greater class label for estimators that only offer predict_proba.
        if hasattr(clf, "decision_function"):
            y_score = clf.decision_function(X_test)
        else:
            y_score = clf.predict_proba(X_test)[:, 1]

        # Hold-out metrics.
        # NOTE(review): average='binary' scores the class labelled 1
        # (scikit-learn's default pos_label) -- confirm that is the class of interest.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')
        roc_auc = roc_auc_score(y_test, y_score)

        # Cross-validated accuracy on the whole selected data set (10 folds).
        cv_scores = cross_val_score(clf, X_selected, y, cv=10, scoring='accuracy')

        # Report. NOTE(review): two decimals round every metric in the recorded
        # output to 1.00; consider printing more digits to tell subsets apart.
        feature_description = "all" if n_features == total_features else f"{n_features}"
        print(f"\nPerformance Metrics for {feature_description} top features using {clf_name}:")
        print("Accuracy: {:.2f}".format(accuracy))
        print("Precision: {:.2f}".format(precision))
        print("Recall: {:.2f}".format(recall))
        print("F1 Score: {:.2f}".format(f1))
        print("ROC AUC: {:.2f}".format(roc_auc))
        print("Cross-Validated Accuracy Scores:", cv_scores)
        print("Average CV Accuracy: {:.2f}".format(np.mean(cv_scores)))

Evaluating SVM:

Performance Metrics for 10 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99987277 0.99970314 0.99983036 0.99987277 0.99983036 0.99987277
 0.99991518 0.99991518 0.99983036 0.99978795]
Average CV Accuracy: 1.00

Performance Metrics for 20 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99995759 0.99978796 0.99987277 0.99987277 0.99987277 0.99991518
 0.99995759 0.99995759 0.99991518 0.99978795]
Average CV Accuracy: 1.00

Performance Metrics for 30 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99987277 0.99987277 0.99991518 0.99991518 0.99987277 0.99991518
 0.99987277 0.99987277 0.99991518 0.99983036]
Average CV Accuracy: 1.00

Performance Metrics for 40 top features using SVM:
Accuracy: 1.00
Precision: 1.00
Recall: 1.