In [145]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [146]:
df = pd.read_csv('phishing.csv') 

In [147]:
df

Unnamed: 0,Index,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,HTTPS,DomainRegLen,...,UsingPopupWindow,IframeRedirection,AgeofDomain,DNSRecording,WebsiteTraffic,PageRank,GoogleIndex,LinksPointingToPage,StatsReport,class
0,0,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
1,1,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
2,2,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
3,3,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1
4,4,-1,0,-1,1,-1,-1,1,1,-1,...,1,1,1,1,1,-1,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11049,11049,1,-1,1,-1,1,1,1,1,-1,...,-1,-1,1,1,-1,-1,1,1,1,1
11050,11050,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,1,1,1,1,1,1,-1,1,-1
11051,11051,1,-1,1,1,1,-1,1,-1,-1,...,1,1,1,1,1,-1,1,0,1,-1
11052,11052,-1,-1,1,1,1,-1,-1,-1,1,...,-1,1,1,1,1,-1,1,1,1,-1


In [148]:
df.isnull().sum()

Index                  0
UsingIP                0
LongURL                0
ShortURL               0
Symbol@                0
Redirecting//          0
PrefixSuffix-          0
SubDomains             0
HTTPS                  0
DomainRegLen           0
Favicon                0
NonStdPort             0
HTTPSDomainURL         0
RequestURL             0
AnchorURL              0
LinksInScriptTags      0
ServerFormHandler      0
InfoEmail              0
AbnormalURL            0
WebsiteForwarding      0
StatusBarCust          0
DisableRightClick      0
UsingPopupWindow       0
IframeRedirection      0
AgeofDomain            0
DNSRecording           0
WebsiteTraffic         0
PageRank               0
GoogleIndex            0
LinksPointingToPage    0
StatsReport            0
class                  0
dtype: int64

In [149]:
class_counts = df['class'].value_counts()

In [150]:
class_counts

 1    6157
-1    4897
Name: class, dtype: int64

In [151]:
df = pd.DataFrame(df)

In [152]:
# Splitting the dataset into features (X) and target variable (y)
X = df.drop('class', axis=1)
y = df['class']

In [153]:
# Splitting the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision Tree

In [154]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,f1_score

In [155]:
# Define different values for max_depth
max_depth_values = [1, 5, 10, 20, 30]

In [156]:
# Initialize lists to store performance metrics
accuracies = []
precisions = []
recalls = []
specificities = []

In [157]:
# Train and evaluate the decision tree classifier for each max_depth value
for depth in max_depth_values:
    # Initialize the decision tree classifier with the current max_depth
    dt_classifier = DecisionTreeClassifier(max_depth=depth, random_state=42)
    
    # Train the classifier
    dt_classifier.fit(X_train, y_train)
    
    # Predict the target variable on the test set
    y_pred = dt_classifier.predict(X_test)
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    # Append the metrics to the lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    specificities.append(specificity) 

In [158]:
# Print the results
for i, depth in enumerate(max_depth_values):              
    print(f"Max Depth: {depth}, Accuracy: {accuracies[i]}, Precision: {precisions[i]}, Recall: {recalls[i]}, Specificity: {specificities[i]}")

Max Depth: 1, Accuracy: 0.8896593307205306, Precision: 0.8920335429769392, Recall: 0.9140708915145005, Specificity: 0.8584192439862542
Max Depth: 5, Accuracy: 0.9225203497135966, Precision: 0.8974739970282318, Recall: 0.9731471535982814, Specificity: 0.8577319587628865
Max Depth: 10, Accuracy: 0.9403075067832378, Precision: 0.9546448087431694, Recall: 0.9382384532760473, Specificity: 0.9429553264604811
Max Depth: 20, Accuracy: 0.9484473922218872, Precision: 0.9553042541733979, Recall: 0.9527389903329753, Specificity: 0.9429553264604811
Max Depth: 30, Accuracy: 0.9484473922218872, Precision: 0.9553042541733979, Recall: 0.9527389903329753, Specificity: 0.9429553264604811


#  NAIVE BAYES

In [159]:
from sklearn.naive_bayes import GaussianNB

In [160]:
# Initialize Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

In [161]:
# Train the classifier
nb_classifier.fit(X_train, y_train)

In [162]:
# Predict the target variable on the test set
y_pred = nb_classifier.predict(X_test)


In [163]:
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [164]:
# Calculate specificity
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)

# Print the results
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, Specificity: {specificity}, F1 Score: {f1}")

Accuracy: 0.8760928549894483, Precision: 0.9431887599266952, Recall: 0.8292158968850698, Specificity: 0.9360824742268041, F1 Score: 0.882537867962275


#   KNeighborsClassifier

In [165]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [166]:
# Define the function to calculate specificity
def specificity_score(y_true, y_pred):
    tn, fp, _, _ = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

In [167]:
# Define different values for max_iter
max_iter_values = [10, 20, 30, 40, 50]

In [168]:
# Perform experiments with different values of n_neighbors
for n_neighbors in [2, 3, 4]:
    # Create and train the K-Neighbors Classifier
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = knn_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    specificity = specificity_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Print the results
    print(f"Results for n_neighbors = {n_neighbors}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"Specificity: {specificity:.2f}")
    print(f"F1-score: {f1:.2f}")
    print()

Results for n_neighbors = 2:
Accuracy: 0.61
Precision: 0.76
Recall: 0.44
Specificity: 0.83
F1-score: 0.56

Results for n_neighbors = 3:
Accuracy: 0.62
Precision: 0.67
Recall: 0.65
Specificity: 0.58
F1-score: 0.66

Results for n_neighbors = 4:
Accuracy: 0.59
Precision: 0.69
Recall: 0.47
Specificity: 0.73
F1-score: 0.56



# SUPPORT VECTOR MACHINE

In [169]:
from sklearn.svm import SVC

In [170]:
# Define max_iter values
max_iter_values = [10, 20, 30, 40, 50]

In [171]:
# Initialize lists to store evaluation metrics
accuracies = []
precisions = []
recalls = []
specificities = []
f1_scores = []

In [172]:
# Iterate over max_iter values
for max_iter in max_iter_values:
    # Initialize SVM model with current max_iter value
    svm_model = SVC(max_iter=max_iter, random_state=42)
    
    # Train the model
    svm_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = svm_model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    f1 = f1_score(y_test, y_pred)
    
    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    specificities.append(specificity)
    f1_scores.append(f1)



In [173]:
# Print results
for i, max_iter in enumerate(max_iter_values):
    print(f"Max_iter={max_iter}:")
    print(f"  Accuracy: {accuracies[i]:.2f}")
    print(f"  Precision: {precisions[i]:.2f}")
    print(f"  Recall: {recalls[i]:.2f}")
    print(f"  Specificity: {specificities[i]:.2f}")
    print(f"  F1-score: {f1_scores[i]:.2f}")
    print()

Max_iter=10:
  Accuracy: 0.47
  Precision: 0.55
  Recall: 0.34
  Specificity: 0.64
  F1-score: 0.42

Max_iter=20:
  Accuracy: 0.51
  Precision: 0.56
  Recall: 0.63
  Specificity: 0.36
  F1-score: 0.59

Max_iter=30:
  Accuracy: 0.51
  Precision: 0.56
  Recall: 0.63
  Specificity: 0.36
  F1-score: 0.59

Max_iter=40:
  Accuracy: 0.51
  Precision: 0.55
  Recall: 0.68
  Specificity: 0.31
  F1-score: 0.61

Max_iter=50:
  Accuracy: 0.51
  Precision: 0.56
  Recall: 0.63
  Specificity: 0.36
  F1-score: 0.59



# RANDOM FOREST MODEL

In [174]:
from sklearn.ensemble import RandomForestClassifier

In [175]:
# Define max_depth values
max_depth_values = [10, 20, 30, 40, 50]

In [176]:
# Initialize lists to store evaluation metrics
accuracies = []
precisions = []
recalls = []
specificities = []
f1_scores = []

In [177]:
# Iterate over max_depth values
for max_depth in max_depth_values:
    # Initialize Random Forest model with current max_depth value
    rf_model = RandomForestClassifier(max_depth=max_depth, random_state=42)
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = rf_model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    f1 = f1_score(y_test, y_pred)
    
    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    specificities.append(specificity)
    f1_scores.append(f1)

In [178]:
# Print results
for i, max_depth in enumerate(max_depth_values):
    print(f"Max_depth={max_depth}:")
    print(f"  Accuracy: {accuracies[i]:.2f}")
    print(f"  Precision: {precisions[i]:.2f}")
    print(f"  Recall: {recalls[i]:.2f}")
    print(f"  Specificity: {specificities[i]:.2f}")
    print(f"  F1-score: {f1_scores[i]:.2f}")
    print()

Max_depth=10:
  Accuracy: 0.95
  Precision: 0.95
  Recall: 0.97
  Specificity: 0.93
  F1-score: 0.96

Max_depth=20:
  Accuracy: 0.97
  Precision: 0.97
  Recall: 0.97
  Specificity: 0.96
  F1-score: 0.97

Max_depth=30:
  Accuracy: 0.97
  Precision: 0.97
  Recall: 0.98
  Specificity: 0.96
  F1-score: 0.97

Max_depth=40:
  Accuracy: 0.97
  Precision: 0.97
  Recall: 0.98
  Specificity: 0.96
  F1-score: 0.97

Max_depth=50:
  Accuracy: 0.97
  Precision: 0.97
  Recall: 0.98
  Specificity: 0.96
  F1-score: 0.97



# GRADIENT BOOSTING MODEL

In [179]:
from sklearn.ensemble import GradientBoostingClassifier

In [180]:
# Define max_depth values
max_depth_values = [2, 5, 8, 10, 12]

In [181]:
# Initialize lists to store evaluation metrics
accuracies = []
precisions = []
recalls = []
specificities = []
f1_scores = []

In [182]:
# Iterate over max_depth values
for max_depth in max_depth_values:
    # Initialize Gradient Boosting model with current max_depth value
    gb_model = GradientBoostingClassifier(max_depth=max_depth, random_state=42)
    
    # Train the model
    gb_model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = gb_model.predict(X_test)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    f1 = f1_score(y_test, y_pred)
    
    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    specificities.append(specificity)
    f1_scores.append(f1)

In [183]:
# Print results
for i, max_depth in enumerate(max_depth_values):
    print(f"Max_depth={max_depth}:")
    print(f"  Accuracy: {accuracies[i]:.2f}")
    print(f"  Precision: {precisions[i]:.2f}")
    print(f"  Recall: {recalls[i]:.2f}")
    print(f"  Specificity: {specificities[i]:.2f}")
    print(f"  F1-score: {f1_scores[i]:.2f}")
    print()

Max_depth=2:
  Accuracy: 0.94
  Precision: 0.94
  Recall: 0.96
  Specificity: 0.92
  F1-score: 0.95

Max_depth=5:
  Accuracy: 0.96
  Precision: 0.96
  Recall: 0.97
  Specificity: 0.95
  F1-score: 0.97

Max_depth=8:
  Accuracy: 0.96
  Precision: 0.96
  Recall: 0.97
  Specificity: 0.95
  F1-score: 0.97

Max_depth=10:
  Accuracy: 0.96
  Precision: 0.96
  Recall: 0.97
  Specificity: 0.95
  F1-score: 0.97

Max_depth=12:
  Accuracy: 0.96
  Precision: 0.97
  Recall: 0.96
  Specificity: 0.96
  F1-score: 0.96



# hybrid ensemble model (LR+SVC+DT)


In [184]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [185]:
# Perform canopy-based feature selection
canopy_selector = SelectFromModel(DecisionTreeClassifier())
X_selected = canopy_selector.fit_transform(X, y)

In [186]:
# Define the base models
lr = LogisticRegression()
svc = SVC(probability=True)
dt = DecisionTreeClassifier()

In [187]:
# Define the ensemble model with soft voting
soft_voting_clf = VotingClassifier(estimators=[('lr', lr), ('svc', svc), ('dt', dt)], voting='soft')

In [188]:
# Define the ensemble model with hard voting
hard_voting_clf = VotingClassifier(estimators=[('lr', lr), ('svc', svc), ('dt', dt)], voting='hard')

In [189]:
# Perform cross-fold validation
cv_scores_soft = cross_val_score(soft_voting_clf, X_train, y_train, cv=5)
cv_scores_hard = cross_val_score(hard_voting_clf, X_train, y_train, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [190]:
# Tune hyperparameters using grid search
param_grid = {'lr__C': [0.1, 1, 10]}  # Example hyperparameters for Logistic Regression
grid_search_soft = GridSearchCV(soft_voting_clf, param_grid, cv=5)
grid_search_hard = GridSearchCV(hard_voting_clf, param_grid, cv=5)

In [191]:
# Train the models
grid_search_soft.fit(X_train, y_train)
grid_search_hard.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [192]:
# Make predictions
y_pred_soft = grid_search_soft.predict(X_test)
y_pred_hard = grid_search_hard.predict(X_test)

In [193]:
# Evaluate the performance
accuracy_soft = accuracy_score(y_test, y_pred_soft)
precision_soft = precision_score(y_test, y_pred_soft)
recall_soft = recall_score(y_test, y_pred_soft)
f1_soft = f1_score(y_test, y_pred_soft)

In [194]:
accuracy_hard = accuracy_score(y_test, y_pred_hard)
precision_hard = precision_score(y_test, y_pred_hard)
recall_hard = recall_score(y_test, y_pred_hard)
f1_hard = f1_score(y_test, y_pred_hard)


In [195]:
# Print the evaluation metrics
print("Soft Voting - Accuracy:", accuracy_soft)
print("Soft Voting - Precision:", precision_soft)
print("Soft Voting - Recall:", recall_soft)
print("Soft Voting - F1-score:", f1_soft)


Soft Voting - Accuracy: 0.9442267108833283
Soft Voting - Precision: 0.9453000531067446
Soft Voting - Recall: 0.9559613319011815
Soft Voting - F1-score: 0.9506008010680909


In [196]:
print("Hard Voting - Accuracy:", accuracy_hard)
print("Hard Voting - Precision:", precision_hard)
print("Hard Voting - Recall:", recall_hard)
print("Hard Voting - F1-score:", f1_hard)

Hard Voting - Accuracy: 0.9354838709677419
Hard Voting - Precision: 0.9091360476663356
Hard Voting - Recall: 0.9833512352309345
Hard Voting - F1-score: 0.9447884416924665


# Proposed Approach(DT+RF+XGB)

In [197]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [198]:
# Perform canopy-based feature selection
canopy_selector = SelectFromModel(DecisionTreeClassifier())
X_selected = canopy_selector.fit_transform(X, y)

In [199]:
# Define the base models
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [200]:
# Define the ensemble model with soft voting
soft_voting_clf = VotingClassifier(estimators=[('dt', dt), ('rf', rf), ('xgb', xgb)], voting='soft')

In [201]:
# Define the ensemble model with hard voting
hard_voting_clf = VotingClassifier(estimators=[('dt', dt), ('rf', rf), ('xgb', xgb)], voting='hard')

In [202]:
# Perform cross-fold validation
cv_scores_soft = cross_val_score(soft_voting_clf, X_train, y_train, cv=5)
cv_scores_hard = cross_val_score(hard_voting_clf, X_train, y_train, cv=5)

In [205]:
# Tune hyperparameters using grid search
param_grid = {'dt__max_depth': [None, 5, 10], 'rf__n_estimators': [50, 100, 200], 'xgb__max_depth': [3, 5, 7]}
grid_search_soft = GridSearchCV(soft_voting_clf, param_grid, cv=5)
grid_search_hard = GridSearchCV(hard_voting_clf, param_grid, cv=5)

In [206]:
# Train the models
grid_search_soft.fit(X_train, y_train)
grid_search_hard.fit(X_train, y_train)


In [207]:
# Make predictions
y_pred_soft = grid_search_soft.predict(X_test)
y_pred_hard = grid_search_hard.predict(X_test)


In [208]:
# Evaluate the performance
accuracy_soft = accuracy_score(y_test, y_pred_soft)
precision_soft = precision_score(y_test, y_pred_soft)
recall_soft = recall_score(y_test, y_pred_soft)
f1_soft = f1_score(y_test, y_pred_soft)


In [209]:
accuracy_hard = accuracy_score(y_test, y_pred_hard)
precision_hard = precision_score(y_test, y_pred_hard)
recall_hard = recall_score(y_test, y_pred_hard)
f1_hard = f1_score(y_test, y_pred_hard)

In [210]:
# Print the evaluation metrics
print("Soft Voting - Accuracy:", accuracy_soft)
print("Soft Voting - Precision:", precision_soft)
print("Soft Voting - Recall:", recall_soft)
print("Soft Voting - F1-score:", f1_soft)


Soft Voting - Accuracy: 0.965330117576123
Soft Voting - Precision: 0.9683646112600536
Soft Voting - Recall: 0.9699248120300752
Soft Voting - F1-score: 0.9691440837134424


In [211]:
print("Hard Voting - Accuracy:", accuracy_hard)
print("Hard Voting - Precision:", precision_hard)
print("Hard Voting - Recall:", recall_hard)
print("Hard Voting - F1-score:", f1_hard)

Hard Voting - Accuracy: 0.96593307205306
Hard Voting - Precision: 0.9689008042895443
Hard Voting - Recall: 0.9704618689581096
Hard Voting - F1-score: 0.9696807083445131


In [None]:
c

# try and error

In [229]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [233]:
# Function for Canopy feature selection
def canopy_feature_selection(X, threshold):
    canopy_centers = []
    canopy_points = []
    for i, point in enumerate(X):
        if len(canopy_centers) == 0:
            canopy_centers.append(point)
        else:
            distances = [((c - point) ** 2).sum() for c in canopy_centers]
            min_distance = min(distances)
            closest_center = distances.index(min_distance)
            if min_distance < threshold:
                canopy_points[closest_center].append(point)
            else:
                canopy_centers.append(point)
                canopy_points.append([point])
    return canopy_centers

In [235]:
# Load dataset
dataset = pd.read_csv('phishing.csv')
dataset.dropna(inplace=True)
X = dataset.drop(columns=['class'])
y = dataset['class']

In [236]:
# Perform Canopy feature selection
X_canopy = canopy_feature_selection(X.values, threshold=0.5)

In [237]:
# Convert Canopy feature selection result back to DataFrame
X_canopy_df = pd.DataFrame(X_canopy, columns=X.columns)

In [238]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_canopy_df, y, test_size=0.3, random_state=42)

In [239]:
# Initialize models
dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
xgb_model = XGBClassifier()

In [240]:
# Combine models using voting
voting_model = VotingClassifier([('dt', dt_model), ('rf', rf_model), ('xgb', xgb_model)], voting='hard')

In [241]:
# Define parameter grid for grid search
param_grid = {
    'dt__max_depth': [5, 10, 15],
    'rf__n_estimators': [50, 100, 200],
    'xgb__n_estimators': [50, 100, 200]
}

In [242]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(voting_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [243]:
# Evaluate model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


In [244]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.9638227313837805
Precision: 0.9652777777777778
Recall: 0.9704618689581096
F1-score: 0.9678628816282807
