In [1]:
# import libaries
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
# Features selction libaries
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.linear_model import Lasso
from sklearn.exceptions import NotFittedError
from sklearn.datasets import make_classification

In [2]:
# importing dataset from google drive
df = pd.read_csv('Phishing_URL_Dataset.csv')

# Droping target column
X = df.drop(columns=['label'])
y = df['label']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# L1 Regularization (Lasso Regression) for feature selection
lasso = Lasso(alpha=0.1)  # You can adjust the regularization strength (alpha) as needed
lasso.fit(X_train, y_train)

In [3]:
# Get selected features
lasso_coefficients = pd.Series(lasso.coef_, index=X.columns)
selected_features_lasso = lasso_coefficients[lasso_coefficients != 0].index.tolist()
print("Selected features using L1 regularization (Lasso Regression): ")
print(selected_features_lasso)

Selected features using L1 regularization (Lasso Regression): 
['URLLength', 'DomainLength', 'URLSimilarityIndex', 'NoOfLettersInURL', 'LineOfCode', 'LargestLineLength', 'URLTitleMatchScore', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']


In [4]:
# Define classifiers
classifiers = {
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=1000),  # Increase max_iter to avoid convergence warning
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# Iterate over classifiers
for clf_name, clf in classifiers.items():
    clf.fit(X[selected_features_lasso], y)  # Assuming X is your feature matrix and y is your target variable
    y_pred = clf.predict(X[selected_features_lasso])
    
    # Calculate performance metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred)
    
    # Cross-validated accuracy
    cv_scores = cross_val_score(clf, X[selected_features_lasso], y, cv=5)
    
    # Print results
    feature_description = "selected"
    print(f"\nPerformance Metrics for {feature_description} features using {clf_name}:")
    print("Accuracy: {:.2f}".format(accuracy))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    print("F1 Score: {:.2f}".format(f1))
    print("ROC AUC: {:.2f}".format(roc_auc))
    print("Cross-Validated Accuracy Scores:", cv_scores)
    print("Average CV Accuracy: {:.2f}".format(np.mean(cv_scores)))


Performance Metrics for selected features using SVM:
Accuracy: 0.91
Precision: 0.99
Recall: 0.86
F1 Score: 0.92
ROC AUC: 0.92
Cross-Validated Accuracy Scores: [0.89599016 0.89813185 0.89533281 0.90727115 0.89806824]
Average CV Accuracy: 0.90


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Performance Metrics for selected features using Logistic Regression:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99887614 0.99906699 0.99864289 0.99864289 0.99889735]
Average CV Accuracy: 1.00

Performance Metrics for selected features using Decision Tree:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99985157 0.99993639 0.99987277 0.99991518 0.99985157]
Average CV Accuracy: 1.00

Performance Metrics for selected features using Random Forest:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00
ROC AUC: 1.00
Cross-Validated Accuracy Scores: [0.99991518 0.9999788  0.99991518 0.99995759 0.99991518]
Average CV Accuracy: 1.00
