In [19]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [20]:
# Load the dataset
df = pd.read_csv('Phishing_URL_Dataset.csv')

# Assuming 'label' is the target column in your DataFrame
feature_names = df.columns.drop('label').tolist()
X = df[feature_names]
y = df['label']

In [21]:
# Chi-square feature selection
chi2_scores, p_values = chi2(X, y)
sorted_features = sorted(zip(feature_names, chi2_scores, p_values), key=lambda x: x[1], reverse=True)
selected_features = [feature for feature, _, p in sorted_features if p < 0.05]

# Print Selected features
print("Selected features based on chi-square test:", selected_features)

Selected features based on chi-square test: ['LargestLineLength', 'LineOfCode', 'NoOfSelfRef', 'NoOfExternalRef', 'NoOfImage', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'URLSimilarityIndex', 'NoOfJS', 'NoOfCSS', 'NoOfLettersInURL', 'URLLength', 'NoOfDegitsInURL', 'NoOfEmptyRef', 'NoOfiFrame', 'NoOfOtherSpecialCharsInURL', 'HasSocialNet', 'DomainLength', 'HasCopyrightInfo', 'HasDescription', 'HasSubmitButton', 'HasHiddenFields', 'HasFavicon', 'NoOfPopup', 'Robots', 'IsResponsive', 'Pay', 'NoOfEqualsInURL', 'IsHTTPS', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfObfuscatedChar', 'DegitRatioInURL', 'Bank', 'HasTitle', 'HasExternalFormSubmit', 'HasPasswordField', 'CharContinuationRate', 'Crypto', 'NoOfSelfRedirect', 'SpacialCharRatioInURL', 'LetterRatioInURL', 'IsDomainIP', 'HasObfuscation', 'TLDLegitimateProb', 'NoOfURLRedirect', 'TLDLength', 'URLCharProb', 'ObfuscationRatio']


In [24]:
# For demonstration purposes, let's select features randomly
selected_features = feature_names[:20]  # Assuming you have selected 20 features

# Initialize classifiers
classifiers = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=10000),  # Adjust max_iter
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Define function to evaluate classifier
def evaluate_classifier(clf, X_train, X_test, y_train, y_test, num_features, name, print_result=True):
    # Train classifier
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]) if hasattr(clf, "predict_proba") else 0

    # Cross-validation
    cv_scores = cross_val_score(clf, X_train, y_train, cv=5)

    # Print results only when specified
    if print_result:
        print(f"Performance Metrics for {num_features} top features using {name}:")
        print("Accuracy: {:.2f}".format(accuracy))
        print("Precision: {:.2f}".format(precision))
        print("Recall: {:.2f}".format(recall))
        print("F1 Score: {:.2f}".format(f1))
        print("ROC AUC: {:.2f}".format(roc_auc))
        print("Cross-Validated Accuracy Scores:", cv_scores)
        print("Average CV Accuracy: {:.2f}\n".format(np.mean(cv_scores)))

# Assuming you have total_features defined somewhere in your code
total_features = len(feature_names)

# Loop over different classifiers
for name, clf in classifiers.items():
    print(f"Evaluating {name}:\n")
    # Loop over different numbers of top features
    for num_features in [10, 20, 30, 40, total_features]:
        if num_features == total_features:
            X_selected = X  # All features
        else:
            # Select top features
            selected_features_subset = selected_features[:num_features]
            X_selected = X[selected_features_subset]

        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Evaluate classifier
        # Set print_result to False for num_features not in [10, 20, 30, 40, total_features]
        print_result = num_features in [10, 20, 30, 40, total_features]
        evaluate_classifier(clf, X_train_scaled, X_test_scaled, y_train, y_test, num_features, name, print_result)

Evaluating SVM:

Performance Metrics for 10 top features using SVM:
Accuracy: 1.00
Precision: 0.99
Recall: 1.00
F1 Score: 1.00
ROC AUC: 0.00
Cross-Validated Accuracy Scores: [0.99691021 0.99663748 0.99645573 0.99651631 0.99718276]
Average CV Accuracy: 1.00

Performance Metrics for 20 top features using SVM:
Accuracy: 1.00
Precision: 0.99
Recall: 1.00
F1 Score: 1.00
ROC AUC: 0.00
Cross-Validated Accuracy Scores: [0.99694051 0.99678895 0.99648602 0.9965769  0.99721305]
Average CV Accuracy: 1.00

Performance Metrics for 30 top features using SVM:
Accuracy: 1.00
Precision: 0.99
Recall: 1.00
F1 Score: 1.00
ROC AUC: 0.00
Cross-Validated Accuracy Scores: [0.99694051 0.99678895 0.99648602 0.9965769  0.99721305]
Average CV Accuracy: 1.00

Performance Metrics for 40 top features using SVM:
Accuracy: 1.00
Precision: 0.99
Recall: 1.00
F1 Score: 1.00
ROC AUC: 0.00
Cross-Validated Accuracy Scores: [0.99694051 0.99678895 0.99648602 0.9965769  0.99721305]
Average CV Accuracy: 1.00

Performance Metrics