In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (such as pipeline_func, presumably a project-local module) are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before running each cell.
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pipeline_func import get_test_scores, get_xgb_classifier_test_scores
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [3]:
# Read the phishing URL dataset, discard the two columns that are not used in
# this notebook, and flip the 0/1 label so that phishing is the positive class (1).
phishing_df = (
    pd.read_csv('../data/phishing_urls.csv', header = 0)
    .drop(columns = ['FILENAME', 'URLSimilarityIndex'])
    # The raw file encodes phishing as 0; swap 0 <-> 1. Any value other than
    # 0 or 1 is not in the mapping and therefore becomes NaN.
    .assign(label = lambda df: df['label'].map({0: 1, 1: 0}))
)

In [4]:
# Feature groups used for modelling. The columns were picked in eda.ipynb from the
# top mutual-information (non-linear) and Pearson-correlation (linear) rankings.
label = 'label'

# Groups that currently hold columns.
categorical_numerical_features = ['HasSocialNet']
ss_continuous_features = [
    'LineOfCode',
    'NoOfExternalRef',
    'NoOfImage',
    'NoOfSelfRef',
    'NoOfJS',
    'LargestLineLength',
]
minmax_continuous_features = ['LetterRatioInURL']

# Groups that are currently empty (each one is its own, independent list).
categorical_nonnumerical_features = []
ordinal_features = []
nonnumeric_features = []

# Feature matrix: the label column is dropped first, then only the selected
# columns are kept, ordered as categorical, standard-scaled, min-max-scaled.
X = phishing_df.drop(columns = [label])[
    categorical_numerical_features
    + ss_continuous_features
    + minmax_continuous_features
]
# Target vector: the (already inverted) label column.
y = phishing_df[label]

In [5]:
# Column-wise scaling applied to the selected features before a model is fitted.
preprocessor = ColumnTransformer(transformers = [
    # 'ss': standardise the continuous columns together with the 0/1-coded
    # categorical column.
    (
        'ss',
        StandardScaler(),
        ss_continuous_features + categorical_numerical_features,
    ),
    # 'mm': min-max scale, then standardise, the remaining continuous column.
    # NOTE(review): for a non-constant column, standardising after min-max
    # scaling gives the same values (up to floating point) as standardising
    # alone - confirm whether the two-step pipeline is intentional.
    (
        'mm',
        Pipeline(steps = [
            ('sub_mm', MinMaxScaler()),
            ('sub_ss', StandardScaler()),
        ]),
        minmax_continuous_features,
    ),
])

In [6]:
# Base logistic-regression estimator. The SAGA solver is used because the grid
# searches over both 'l1' and 'l2' penalties; the solver is capped at 1000
# iterations with a stopping tolerance of 0.005, and n_jobs = -1 requests all cores.
log_reg = LogisticRegression(
    solver = 'saga',
    tol = 0.005,
    max_iter = 1000,
    n_jobs = -1,
)

# Hyper-parameter grid for logistic regression: 7 log-spaced values of C
# (1e-3 ... 1e3) crossed with 2 penalties = 14 candidates.
# The 'model__' prefix presumably targets a pipeline step named 'model' that is
# built inside get_test_scores - confirm against pipeline_func.
log_reg_params = dict(
    model__C = np.logspace(start = -3, stop = 3, num = 7),
    # consider elasticnet with hyperparam l1_ratio if time permitting
    model__penalty = ['l1', 'l2'],
)

In [10]:
# Evaluate logistic regression over log_reg_params with 10 CV splits and 5 seeds.
# The six results are unpacked in the order get_test_scores returns them; their
# exact structure is defined in pipeline_func (not shown in this notebook).
(
    lr_test_scores,
    lr_test_estimators,
    lr_unpreprocessed_test_sets,
    lr_preprocessed_test_sets,
    lr_predicted_labels,
    lr_baseline_scores,
) = get_test_scores(
    X,
    y,
    preprocessor,
    log_reg,
    log_reg_params,
    n_splits = 10,
    n_seeds = 5,
)

Processing Seed 1 of 5...
Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV 8/10] END model__C=0.001, model__penalty=l1;, score=0.977 total time=   2.1s
[CV 1/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.6s
[CV 3/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.7s
[CV 7/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.7s
[CV 9/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.5s
[CV 4/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   3.8s
[CV 2/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.9s
[CV 10/10] END model__C=0.001, model__penalty=l1;, score=0.970 total time=   3.7s
[CV 5/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   4.0s
[CV 6/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   3.9s
[CV 1/10] END model__C=0.001, model__penalty=l2;, score=0.970 total time=   2.2s
[CV 8/10] END model

In [11]:
# Persist the logistic-regression results as two pickle files under ../results.

# File 1: estimators, test scores and baseline scores.
with open('../results/logistic_regression_best_estimators.save', 'wb') as f:
    pickle.dump(
        (lr_test_estimators, lr_test_scores, lr_baseline_scores),
        f,
    )

# File 2: raw test sets, preprocessed test sets and predicted labels.
with open('../results/logistic_regression_test_data.save', 'wb') as f:
    pickle.dump(
        (lr_unpreprocessed_test_sets, lr_preprocessed_test_sets, lr_predicted_labels),
        f,
    )

In [7]:
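# NOTE: disabled inspection snippet. `lr_best_test_estimator` is not assigned in any
# cell shown in this notebook; it is presumably meant to be one of the fitted
# pipelines in `lr_test_estimators` - bind it before re-enabling these lines.
# # print best params
# print(lr_best_test_estimator.named_steps['model'].get_params())
# # get coefficients from best model
# print(lr_best_test_estimator.named_steps['model'].coef_)
# # print feature names
# print(lr_best_test_estimator.named_steps['preprocessor'].transformers_[0][2] + lr_best_test_estimator.named_steps['preprocessor'].transformers_[1][2])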

In [7]:
# Base random-forest estimator: 100 trees, with n_jobs = -1 requesting all cores.
rf_classifier = RandomForestClassifier(
    n_jobs = -1,
    n_estimators = 100,
)

# Hyper-parameter grid for the random forest: 3 depths x 4 max_features values
# = 12 candidates. The largest max_features value (8) equals the number of
# selected feature columns (1 categorical + 6 standard-scaled + 1 min-max-scaled).
rf_params = dict(
    model__max_depth = [3, 5, 7],
    model__max_features = [2, 3, 5, 8],
)

In [12]:
# Evaluate the random forest over rf_params with 10 CV splits and 5 seeds.
# The six results are unpacked in the order get_test_scores returns them; their
# exact structure is defined in pipeline_func (not shown in this notebook).
(
    rf_test_scores,
    rf_test_estimators,
    rf_unpreprocessed_test_sets,
    rf_preprocessed_test_sets,
    rf_predicted_labels,
    rf_baseline_scores,
) = get_test_scores(
    X,
    y,
    preprocessor,
    rf_classifier,
    rf_params,
    n_splits = 10,
    n_seeds = 5,
)

Processing Seed 1 of 5...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 2/10] END model__max_depth=3, model__max_features=2;, score=0.987 total time=   5.5s
[CV 5/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   5.5s
[CV 8/10] END model__max_depth=3, model__max_features=2;, score=0.989 total time=   5.6s
[CV 4/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   5.6s
[CV 1/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   5.7s
[CV 7/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   5.7s
[CV 9/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   5.7s
[CV 6/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   5.9s
[CV 10/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   6.1s
[CV 3/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   6.3s
[CV 4/10] END model_



[CV 10/10] END model__max_depth=5, model__max_features=3;, score=0.991 total time=  10.3s
[CV 1/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  10.3s
[CV 2/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  10.1s
[CV 3/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  10.0s
[CV 4/10] END model__max_depth=5, model__max_features=5;, score=0.990 total time=   9.9s
[CV 5/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=   9.8s
[CV 6/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=   9.9s
[CV 9/10] END model__max_depth=5, model__max_features=5;, score=0.989 total time=  11.0s
[CV 7/10] END model__max_depth=5, model__max_features=5;, score=0.989 total time=  12.3s
[CV 1/10] END model__max_depth=5, model__max_features=8;, score=0.989 total time=  17.6s
[CV 2/10] END model__max_depth=5, model__max_features=8;, score=0.989 total time=  18.0s
[CV 3/10] END model_



[CV 7/10] END model__max_depth=5, model__max_features=3;, score=0.991 total time=  10.3s
[CV 8/10] END model__max_depth=5, model__max_features=3;, score=0.990 total time=   9.8s
[CV 9/10] END model__max_depth=5, model__max_features=3;, score=0.991 total time=   9.9s
[CV 10/10] END model__max_depth=5, model__max_features=3;, score=0.990 total time=   9.9s
[CV 3/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  14.3s
[CV 4/10] END model__max_depth=5, model__max_features=5;, score=0.989 total time=  14.2s
[CV 1/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  14.6s
[CV 2/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  14.6s
[CV 5/10] END model__max_depth=5, model__max_features=5;, score=0.989 total time=  14.1s
[CV 6/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  14.7s
[CV 7/10] END model__max_depth=5, model__max_features=5;, score=0.991 total time=  14.5s
[CV 8/10] END model_

In [19]:
# Persist the random-forest results as two pickle files under ../results.

# File 1: estimators, test scores and baseline scores.
with open('../results/random_forest_classifier_best_estimators.save', 'wb') as f:
    pickle.dump(
        (rf_test_estimators, rf_test_scores, rf_baseline_scores),
        f,
    )

# File 2: raw test sets, preprocessed test sets and predicted labels.
with open('../results/random_forest_classifier_test_data.save', 'wb') as f:
    pickle.dump(
        (rf_unpreprocessed_test_sets, rf_preprocessed_test_sets, rf_predicted_labels),
        f,
    )

In [8]:
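# NOTE: disabled inspection snippet. `rf_best_test_estimator` is not assigned in any
# cell shown in this notebook; it is presumably meant to be one of the fitted
# pipelines in `rf_test_estimators` - bind it before re-enabling these lines.
# # print best params
# print(rf_best_test_estimator.named_steps['model'].get_params())
# # print feature names
# print(rf_best_test_estimator.named_steps['preprocessor'].transformers_[0][2] + rf_best_test_estimator.named_steps['preprocessor'].transformers_[1][2])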

In [21]:
# Base support-vector classifier. The solver is capped at 10000 iterations and
# uses a stopping tolerance of 1 - presumably loosened to keep run time
# manageable (individual fits in the logged output take up to ~17 minutes).
sv_classifier = SVC(
    tol = 1,
    max_iter = 10000,
)

# Hyper-parameter grid for the SVC: 3 gamma values x 3 C values = 9 candidates.
svc_params = dict(
    # Commented-out wider grids:
    # 'model__gamma': np.logspace(-3, 3, 7),
    # 'model__C': np.logspace(-2, 2, 5)
    model__gamma = [1e-1, 1e0, 1e1],
    model__C = [1e-1, 1e0, 1e1],
)

In [22]:
# Evaluate the SVC over svc_params with 5 CV splits (fewer than for the other
# sklearn models) and 5 seeds. The six results are unpacked in the order
# get_test_scores returns them; their exact structure is defined in
# pipeline_func (not shown in this notebook).
(
    svc_test_scores,
    svc_test_estimators,
    svc_unpreprocessed_test_sets,
    svc_preprocessed_test_sets,
    svc_predicted_labels,
    svc_baseline_scores,
) = get_test_scores(
    X,
    y,
    preprocessor,
    sv_classifier,
    svc_params,
    n_splits = 5,
    n_seeds = 5,
)

Processing Seed 1 of 5...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 3/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time=  57.1s
[CV 4/5] END ....model__C=0.1, model__gamma=1.0;, score=0.991 total time= 1.0min
[CV 5/5] END ....model__C=0.1, model__gamma=0.1;, score=0.988 total time= 1.7min
[CV 1/5] END ....model__C=0.1, model__gamma=0.1;, score=0.989 total time= 2.0min
[CV 5/5] END ....model__C=0.1, model__gamma=1.0;, score=0.991 total time= 2.0min
[CV 1/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 1.3min
[CV 2/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time= 1.3min
[CV 2/5] END ....model__C=0.1, model__gamma=1.0;, score=0.989 total time= 2.6min
[CV 3/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time= 1.3min
[CV 1/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 3.1min
[CV 5/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 1.1min
[CV 4/5] END ...model__



[CV 5/5] END ....model__C=1.0, model__gamma=1.0;, score=0.994 total time=  35.7s
[CV 3/5] END ...model__C=1.0, model__gamma=10.0;, score=0.992 total time=  36.7s
[CV 4/5] END ...model__C=1.0, model__gamma=10.0;, score=0.993 total time=  36.8s
[CV 1/5] END ...model__C=1.0, model__gamma=10.0;, score=0.992 total time= 1.4min
[CV 4/5] END ....model__C=0.1, model__gamma=0.1;, score=0.990 total time= 5.2min
[CV 1/5] END ...model__C=10.0, model__gamma=0.1;, score=0.993 total time=  35.7s
[CV 2/5] END ...model__C=10.0, model__gamma=0.1;, score=0.993 total time=  33.7s
[CV 3/5] END ...model__C=10.0, model__gamma=0.1;, score=0.994 total time=  34.3s
[CV 5/5] END ...model__C=1.0, model__gamma=10.0;, score=0.993 total time= 1.5min
[CV 2/5] END ...model__C=1.0, model__gamma=10.0;, score=0.993 total time= 1.9min
[CV 1/5] END ...model__C=10.0, model__gamma=1.0;, score=0.993 total time=  31.2s
[CV 2/5] END ....model__C=1.0, model__gamma=0.1;, score=0.992 total time= 4.0min
[CV 2/5] END ...model__C=10.



[CV 2/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time= 1.5min
[CV 1/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time= 1.5min
[CV 2/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 7.6min
[CV 3/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 1.5min
[CV 4/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 7.7min
[CV 4/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 1.3min
[CV 2/5] END ....model__C=1.0, model__gamma=0.1;, score=0.992 total time= 1.2min
[CV 5/5] END ...model__C=0.1, model__gamma=10.0;, score=0.985 total time= 1.4min
[CV 4/5] END ....model__C=1.0, model__gamma=0.1;, score=0.992 total time= 1.4min
[CV 5/5] END ....model__C=0.1, model__gamma=0.1;, score=0.988 total time=11.4min
[CV 1/5] END ....model__C=1.0, model__gamma=1.0;, score=0.993 total time= 2.6min
[CV 2/5] END ....model__C=1.0, model__gamma=1.0;, score=0.993 total time= 2.7min
[CV 3/5] END ....model__C=0.



[CV 4/5] END ....model__C=0.1, model__gamma=1.0;, score=0.989 total time= 7.8min
[CV 4/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 4.4min
[CV 5/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 4.1min
[CV 4/5] END ....model__C=1.0, model__gamma=0.1;, score=0.992 total time= 1.7min
[CV 1/5] END ....model__C=1.0, model__gamma=1.0;, score=0.993 total time= 5.1min
[CV 1/5] END ....model__C=0.1, model__gamma=0.1;, score=0.987 total time=14.2min
[CV 2/5] END ....model__C=1.0, model__gamma=1.0;, score=0.994 total time= 4.6min
[CV 2/5] END ....model__C=0.1, model__gamma=0.1;, score=0.989 total time=14.2min
[CV 3/5] END ....model__C=1.0, model__gamma=1.0;, score=0.993 total time=  39.6s
[CV 2/5] END ....model__C=1.0, model__gamma=0.1;, score=0.992 total time=12.3min
[CV 1/5] END ....model__C=1.0, model__gamma=0.1;, score=0.992 total time=12.8min
[CV 5/5] END ....model__C=0.1, model__gamma=0.1;, score=0.988 total time=17.3min
[CV 4/5] END ....model__C=1.

In [24]:
# Persist the support-vector-classifier results as two pickle files under ../results.

# File 1: estimators, test scores and baseline scores.
with open('../results/support_vector_classifier_best_estimators.save', 'wb') as f:
    pickle.dump(
        (svc_test_estimators, svc_test_scores, svc_baseline_scores),
        f,
    )

# File 2: raw test sets, preprocessed test sets and predicted labels.
with open('../results/support_vector_classifier_test_data.save', 'wb') as f:
    pickle.dump(
        (svc_unpreprocessed_test_sets, svc_preprocessed_test_sets, svc_predicted_labels),
        f,
    )

In [38]:
# Hyper-parameter grid for the XGBoost classifier. Only max_depth, reg_alpha and
# reg_lambda vary (3 x 3 x 3 = 27 combinations); the remaining entries are
# single-valued. Unlike the sklearn grids, the keys carry no 'model__' prefix.
# NOTE(review): n_estimators = 10000 with learning_rate = 0.03 presumably relies on
# early stopping inside get_xgb_classifier_test_scores - confirm in pipeline_func.
xgbc_params = dict(
    max_depth = [3, 5, 8],
    reg_alpha = [0.01, 0.1, 1],
    reg_lambda = [0.01, 0.1, 1],
    colsample_bytree = [0.9],
    subsample = [0.66],
    learning_rate = [0.03],
    n_estimators = [10000],
)

In [39]:
# Evaluate the XGBoost classifier over xgbc_params with 5 CV splits and 5 seeds.
# The dedicated helper takes no estimator argument. The six results are unpacked
# in the order it returns them; their exact structure is defined in
# pipeline_func (not shown in this notebook).
(
    xgbc_test_scores,
    xgbc_test_estimators,
    xgbc_unpreprocessed_test_sets,
    xgbc_preprocessed_test_sets,
    xgbc_predicted_labels,
    xgbc_baseline_scores,
) = get_xgb_classifier_test_scores(
    X,
    y,
    preprocessor,
    xgbc_params,
    n_splits = 5,
    n_seeds = 5,
)

Processing Seed 1 of 5...
Processing Seed 2 of 5...
Processing Seed 3 of 5...
Processing Seed 4 of 5...
Processing Seed 5 of 5...


In [40]:
# Persist the XGBoost-classifier results as two pickle files under ../results.

# File 1: estimators, test scores and baseline scores.
with open('../results/xgb_classifier_best_estimators.save', 'wb') as f:
    pickle.dump(
        (xgbc_test_estimators, xgbc_test_scores, xgbc_baseline_scores),
        f,
    )

# File 2: raw test sets, preprocessed test sets and predicted labels.
with open('../results/xgb_classifier_test_data.save', 'wb') as f:
    pickle.dump(
        (xgbc_unpreprocessed_test_sets, xgbc_preprocessed_test_sets, xgbc_predicted_labels),
        f,
    )