In [1]:
# Importing modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import *


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the phishing dataset; the relative path is resolved against the
# directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer="phishing.csv")

In [5]:
# Keep every column whose name does not contain 'class'. Despite the variable
# name, this frame holds the model inputs; the 'class' column itself is
# extracted separately as the prediction target.
is_class_column = df.columns.str.contains('class')
labels = df.loc[:, ~is_class_column]
labels.shape





(11054, 31)

In [10]:
# Prediction target: the 'class' column of the raw frame.
target = df.loc[:, 'class']

In [6]:

# Absolute pairwise correlations between all columns of `labels`.
corr_matrix = labels.corr().abs()

# The matrix is symmetric, so keep only the entries strictly above the main
# diagonal (k=1 excludes the diagonal); everything else becomes NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(above_diagonal)
print(pd.DataFrame(upper))

                     Index  UsingIP   LongURL  ShortURL   Symbol@  \
Index                  NaN  0.38862  0.006441  0.006221  0.169437   
UsingIP                NaN      NaN  0.052159  0.403547  0.158766   
LongURL                NaN      NaN       NaN  0.097976  0.075205   
ShortURL               NaN      NaN       NaN       NaN  0.104433   
Symbol@                NaN      NaN       NaN       NaN       NaN   
Redirecting//          NaN      NaN       NaN       NaN       NaN   
PrefixSuffix-          NaN      NaN       NaN       NaN       NaN   
SubDomains             NaN      NaN       NaN       NaN       NaN   
HTTPS                  NaN      NaN       NaN       NaN       NaN   
DomainRegLen           NaN      NaN       NaN       NaN       NaN   
Favicon                NaN      NaN       NaN       NaN       NaN   
NonStdPort             NaN      NaN       NaN       NaN       NaN   
HTTPSDomainURL         NaN      NaN       NaN       NaN       NaN   
RequestURL             NaN      Na

In [7]:
# Positional indices into labels.columns of the columns to keep.
# NOTE(review): presumably chosen from the correlation matrix above — confirm
# and record the selection rule.
relevant = [
    1, 2, 3, 4, 5, 6, 7,
    9, 10,
    12, 13, 14, 15, 16, 17, 18,
    23, 24, 25,
    28,
    30,
]
# Translate the positions into column names.
features = np.asarray(labels.columns)[relevant]
features

array(['UsingIP', 'LongURL', 'ShortURL', 'Symbol@', 'Redirecting//',
       'PrefixSuffix-', 'SubDomains', 'DomainRegLen', 'Favicon',
       'HTTPSDomainURL', 'RequestURL', 'AnchorURL', 'LinksInScriptTags',
       'ServerFormHandler', 'InfoEmail', 'AbnormalURL',
       'IframeRedirection', 'AgeofDomain', 'DNSRecording', 'GoogleIndex',
       'StatsReport'], dtype=object)

In [8]:
# Preview the frame restricted to the selected feature columns.
labels.loc[:, features]


Unnamed: 0,UsingIP,LongURL,ShortURL,Symbol@,Redirecting//,PrefixSuffix-,SubDomains,DomainRegLen,Favicon,HTTPSDomainURL,...,AnchorURL,LinksInScriptTags,ServerFormHandler,InfoEmail,AbnormalURL,IframeRedirection,AgeofDomain,DNSRecording,GoogleIndex,StatsReport
0,1,1,1,1,1,-1,0,-1,1,-1,...,0,-1,-1,1,1,1,-1,-1,1,1
1,1,0,1,1,1,-1,-1,-1,1,-1,...,0,-1,-1,-1,-1,1,1,-1,1,-1
2,1,0,1,1,1,-1,-1,1,1,-1,...,0,0,-1,1,1,1,-1,-1,1,1
3,1,0,-1,1,1,-1,1,-1,1,1,...,0,0,-1,1,1,1,-1,-1,1,1
4,-1,0,-1,1,-1,-1,1,-1,1,-1,...,0,0,-1,-1,-1,1,1,1,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11049,1,-1,1,-1,1,1,1,-1,-1,1,...,1,1,-1,-1,1,-1,1,1,1,1
11050,-1,1,1,-1,-1,-1,1,-1,-1,1,...,-1,-1,0,-1,-1,1,1,1,1,1
11051,1,-1,1,1,1,-1,1,-1,1,1,...,0,-1,-1,1,1,1,1,1,1,1
11052,-1,-1,1,1,1,-1,-1,1,-1,1,...,-1,1,-1,1,1,1,1,1,1,1


In [11]:
# Hold out a test set using train_test_split's default split size; the fixed
# random_state makes the split reproducible.
# NOTE(review): this splits every column of `labels`, not the `features`
# subset selected earlier — confirm that this is intended.
(
    training_labels,
    testing_labels,
    training_target,
    testing_target,
) = train_test_split(labels, target, random_state=42)


In [12]:


# Hyperparameter search spaces, one per classifier. Each maps a constructor
# argument name to the list of candidate values to try.

# LogisticRegression
param_grid_lr = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 1, 10],
    solver=['liblinear', 'saga'],
)

# SVC
param_grid_svc = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# RandomForestClassifier
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# GradientBoostingClassifier
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 5, 10],
)

# DecisionTreeClassifier
param_grid_dt = dict(
    max_depth=[5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# KNeighborsClassifier
param_grid_knn = dict(
    n_neighbors=[5, 10, 15],
    weights=['uniform', 'distance'],
)

# AdaBoostClassifier
param_grid_ada = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.1, 0.5, 1.0],
)

# Estimators to compare, each paired with its search grid. A grid of None
# means the estimator is fitted once as constructed, with no search.
# Currently switched off (add the tuple back to re-enable):
#   (LogisticRegression(), param_grid_lr)
#   (SVC(), param_grid_svc)
#   (AdaBoostClassifier(random_state=42), param_grid_ada)
#   (LinearDiscriminantAnalysis(), None)
searched_models = [
    (RandomForestClassifier(random_state=42), param_grid_rf),
    (GradientBoostingClassifier(random_state=42), param_grid_gb),
    (DecisionTreeClassifier(random_state=42), param_grid_dt),
    (KNeighborsClassifier(), param_grid_knn),
]
default_models = [
    (GaussianNB(), None),
    (QuadraticDiscriminantAnalysis(), None),
]
classifiers = searched_models + default_models

# Quick comparison of all classifiers on a capped number of rows.
# n_max limits how many rows of each split are used so the searches stay cheap.
n_max = 1000
subset_train_labels = training_labels[:n_max]
subset_train_target = training_target[:n_max]
subset_test_labels = testing_labels[:n_max]
subset_test_target = testing_target[:n_max]

# For each (estimator, grid) pair: run a 3-fold grid search when a grid is
# given, otherwise fit the estimator once as constructed.
for clf, param_grid in classifiers:
    model_name = clf.__class__.__name__
    if param_grid is not None:
        grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, n_jobs=3, verbose=2)
        grid_search.fit(subset_train_labels, subset_train_target)
        print(model_name)
        print("Best parameters:", grid_search.best_params_)
        # best_score_ is the mean cross-validated score of the best candidate,
        # not a score measured on the training rows, so label it as such.
        print("Best CV accuracy:", grid_search.best_score_)
        print("Test accuracy:", grid_search.score(subset_test_labels, subset_test_target))
    else:
        clf.fit(subset_train_labels, subset_train_target)
        print(model_name)
        # Here the score really is computed on the rows the model was fitted on.
        print("Training accuracy:", clf.score(subset_train_labels, subset_train_target))
        print("Test accuracy:", clf.score(subset_test_labels, subset_test_target))
    print("---")
        


Fitting 3 folds for each of 162 candidates, totalling 486 fits


KeyboardInterrupt: 

In [None]:
# Tune LogisticRegression over param_grid_lr with 5-fold cross-validation on
# the full training split, then score the search on the held-out test split.
clf, param_grid = LogisticRegression(), param_grid_lr
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")



LogisticRegression
Best parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Training accuracy: 0.927503015681544
Test accuracy: 0.9276410998552822
---




In [None]:
# Tune SVC over param_grid_svc with 5-fold cross-validation on the full
# training split, then score the search on the held-out test split.
clf, param_grid = SVC(), param_grid_svc
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")

KeyboardInterrupt: 

In [13]:
# Tune RandomForestClassifier over param_grid_rf with 5-fold cross-validation
# on the full training split, then score the search on the held-out test split.
clf, param_grid = RandomForestClassifier(random_state=42), param_grid_rf
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")

RandomForestClassifier
Best parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Training accuracy: 0.9667068757539203
Test accuracy: 0.9670767004341534
---


In [None]:
# Tune GradientBoostingClassifier over param_grid_gb with 5-fold
# cross-validation on the full training split, then score the search on the
# held-out test split.
clf, param_grid = GradientBoostingClassifier(random_state=42), param_grid_gb
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")

In [14]:
# Tune DecisionTreeClassifier over param_grid_dt with 5-fold cross-validation
# on the full training split, then score the search on the held-out test split.
clf, param_grid = DecisionTreeClassifier(random_state=42), param_grid_dt
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")

DecisionTreeClassifier
Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Training accuracy: 0.9483715319662245
Test accuracy: 0.951519536903039
---


In [None]:
# Tune KNeighborsClassifier over param_grid_knn with 5-fold cross-validation
# on the full training split, then score the search on the held-out test split.
clf, param_grid = KNeighborsClassifier(), param_grid_knn
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")

In [None]:
# Tune AdaBoostClassifier over param_grid_ada with 5-fold cross-validation on
# the full training split, then score the search on the held-out test split.
clf, param_grid = AdaBoostClassifier(random_state=42), param_grid_ada
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(training_labels, training_target)
print(clf.__class__.__name__)
print("Best parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate, not a
# score measured on the training data, so label it accordingly.
print("Best CV accuracy:", grid_search.best_score_)
print("Test accuracy:", grid_search.score(testing_labels, testing_target))
print("---")