In [93]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [252]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import mutual_info_classif
from pipeline_func import stratified_kfold_cv_pipe, test_pipe, get_test_scores, xgb_classifier_cv, get_xgb_classifier_test_scores
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from tqdm import tqdm
import pickle

In [95]:
# Load in the phishing
phishing_df = pd.read_csv('../data/phishing_urls.csv', header = 0).drop(columns = ['FILENAME', 'URLSimilarityIndex'])

# Invert the label column (positive phishing should be 1, not 0)
phishing_df['label'] = phishing_df['label'].map({0: 1, 1: 0})

In [215]:
# FEATURES SELECTED VIA TOP MUTUAL INFO (NON-LINEAR) & PEARSON CORR (LINEAR) in eda.ipynb
categorical_numerical_features = ['HasSocialNet']
categorical_nonnumerical_features = []
ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength']
minmax_continuous_features = ['LetterRatioInURL']
ordinal_features = []
nonnumeric_features = []
label = 'label'

X, y = phishing_df.drop(columns = [label])[categorical_numerical_features + ss_continuous_features + minmax_continuous_features], phishing_df[label]

In [221]:
preprocessor = ColumnTransformer(
    transformers = [
        ('ss', StandardScaler(), ss_continuous_features + categorical_numerical_features),
        ('mm', Pipeline(steps = [('sub_mm', MinMaxScaler()), ('sub_ss', StandardScaler())]), minmax_continuous_features)

    ]
)

In [222]:
log_reg = LogisticRegression(solver = 'saga', max_iter = 1000, n_jobs = -1, tol = 0.005)

log_reg_params = {
    'model__C': np.logspace(-3, 3, 7),
    'model__penalty': ['l1', 'l2'] # consider elasticnet with hyperparam l1_ratio if time permitting
}

In [223]:
lr_test_scores, lr_test_estimators, lr_best_test_score, lr_best_test_estimator = get_test_scores(X, y, preprocessor, log_reg, log_reg_params, n_splits = 10, n_seeds = 5)

Processing Seed 1 of 5...
Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV 10/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   3.4s
[CV 4/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   4.9s
[CV 2/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   5.0s
[CV 8/10] END model__C=0.001, model__penalty=l1;, score=0.971 total time=   5.3s
[CV 9/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   5.3s
[CV 3/10] END model__C=0.001, model__penalty=l1;, score=0.971 total time=   5.3s
[CV 7/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   5.5s
[CV 1/10] END model__C=0.001, model__penalty=l1;, score=0.970 total time=   5.4s
[CV 5/10] END model__C=0.001, model__penalty=l1;, score=0.971 total time=   5.5s
[CV 6/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   5.9s
[CV 1/10] END model__C=0.001, model__penalty=l2;, score=0.968 total time=   3.3s
[CV 3/10] END model

In [226]:
print(lr_test_scores)
print(lr_best_test_score)
print(lr_best_test_estimator)
print(lr_best_test_estimator.get_params()['model'].get_params()['penalty'])

[0.9773471935093809, 0.9773776816507862, 0.9766248685696484, 0.9780821387808795, 0.9779173675868574]
0.9780821387808795
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ss', StandardScaler(),
                                                  ['LineOfCode',
                                                   'NoOfExternalRef',
                                                   'NoOfImage', 'NoOfSelfRef',
                                                   'NoOfJS',
                                                   'LargestLineLength',
                                                   'HasSocialNet']),
                                                 ('mm',
                                                  Pipeline(steps=[('sub_mm',
                                                                   MinMaxScaler()),
                                                                  ('sub_ss',
                                                               

In [253]:
with open('../results/logistic_regression_best_estimator.save', 'wb') as f:
    pickle.dump((lr_best_test_estimator, lr_best_test_score), f)

In [None]:
# print best params
print(lr_best_test_estimator.named_steps['model'].get_params())
# get coefficients from best model
print(lr_best_test_estimator.named_steps['model'].coef_)
# print feature names 
print(lr_best_test_estimator.named_steps['preprocessor'].transformers_[0][2] + lr_best_test_estimator.named_steps['preprocessor'].transformers_[1][2])

{'C': 100.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': -1, 'penalty': 'l1', 'random_state': None, 'solver': 'saga', 'tol': 0.005, 'verbose': 0, 'warm_start': False}
[[-2.18151428 -2.15117335 -2.11426281 -2.61585988 -3.20045845  1.07084151
  -1.84038715  0.64144293]]
['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'HasSocialNet', 'LetterRatioInURL']


In [151]:
rf_classifier = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42)

rf_params = {
    'model__max_depth': [3, 5, 7],
    'model__max_features': [2, 3, 5, 8] 
}

In [198]:
rf_test_scores, rf_test_estimators, rf_best_test_score, rf_best_test_estimator = get_test_scores(X, y, preprocessor, rf_classifier, rf_params, n_splits = 10, n_seeds = 5)

Processing Seed 1 of 5...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 6/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   5.6s
[CV 4/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   5.6s
[CV 3/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   5.7s
[CV 7/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   5.7s
[CV 1/10] END model__max_depth=3, model__max_features=2;, score=0.984 total time=   5.7s
[CV 5/10] END model__max_depth=3, model__max_features=2;, score=0.984 total time=   5.7s
[CV 8/10] END model__max_depth=3, model__max_features=2;, score=0.984 total time=   5.7s
[CV 9/10] END model__max_depth=3, model__max_features=2;, score=0.987 total time=   5.7s
[CV 2/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   5.7s
[CV 10/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   5.8s
[CV 5/10] END model_



[CV 10/10] END model__max_depth=5, model__max_features=5;, score=0.992 total time=  15.2s
[CV 1/10] END model__max_depth=5, model__max_features=8;, score=0.989 total time=  20.5s
[CV 2/10] END model__max_depth=5, model__max_features=8;, score=0.989 total time=  20.6s
[CV 3/10] END model__max_depth=5, model__max_features=8;, score=0.987 total time=  20.6s
[CV 5/10] END model__max_depth=5, model__max_features=8;, score=0.987 total time=  20.7s
[CV 4/10] END model__max_depth=5, model__max_features=8;, score=0.987 total time=  20.9s
[CV 8/10] END model__max_depth=5, model__max_features=8;, score=0.988 total time=  20.8s
[CV 6/10] END model__max_depth=5, model__max_features=8;, score=0.988 total time=  21.0s
[CV 7/10] END model__max_depth=5, model__max_features=8;, score=0.988 total time=  21.1s
[CV 9/10] END model__max_depth=5, model__max_features=8;, score=0.988 total time=  21.3s
[CV 2/10] END model__max_depth=7, model__max_features=2;, score=0.993 total time=   9.2s
[CV 1/10] END model_

In [211]:
print(rf_test_scores)
print(rf_best_test_score)
print(rf_best_test_estimator)

[0.9930682774669506, 0.9926576090721079, 0.9920505443004183, 0.9932054945162843, 0.9923242368633951]
0.9932054945162843
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ss', StandardScaler(),
                                                  ['LineOfCode',
                                                   'NoOfExternalRef',
                                                   'NoOfImage', 'NoOfSelfRef',
                                                   'NoOfJS',
                                                   'LargestLineLength',
                                                   'HasSocialNet']),
                                                 ('mm',
                                                  Pipeline(steps=[('sub_mm',
                                                                   MinMaxScaler()),
                                                                  ('sub_ss',
                                                               

In [254]:
with open('../results/random_forest_classifier_best_estimator.save', 'wb') as f:
    pickle.dump((rf_best_test_estimator, rf_best_test_score), f)

In [204]:
# print best params
print(rf_best_test_estimator.named_steps['model'].get_params())
# print feature names 
print(rf_best_test_estimator.named_steps['preprocessor'].transformers_[0][2] + rf_best_test_estimator.named_steps['preprocessor'].transformers_[1][2])

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 2, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'HasSocialNet', 'LetterRatioInURL']


In [207]:
sv_classifier = SVC(max_iter = 10000, random_state = 42, tol = 1)

svc_params = {
    # 'model__gamma': np.logspace(-3, 3, 7),
    # 'model__C': np.logspace(-2, 2, 5)
    'model__gamma': [1e-1, 1e0, 1e1],
    'model__C': [1e-1, 1e0, 1e1]
}

In [213]:
svc_test_scores, svc_test_estimators, svc_best_test_score, svc_best_test_estimator = get_test_scores(X, y, preprocessor, sv_classifier, svc_params, n_splits = 5, n_seeds = 5)

Processing Seed 1 of 5...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 1.4min
[CV 5/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 1.4min
[CV 2/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 1.8min
[CV 2/5] END ....model__C=0.1, model__gamma=0.1;, score=0.988 total time= 1.9min
[CV 4/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 1.9min
[CV 1/5] END ....model__C=0.1, model__gamma=0.1;, score=0.987 total time= 2.1min
[CV 5/5] END ....model__C=0.1, model__gamma=0.1;, score=0.988 total time= 2.2min
[CV 1/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time= 1.1min
[CV 2/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time= 1.1min
[CV 3/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time= 2.6min
[CV 3/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time= 1.2min
[CV 4/5] END ...model__

In [214]:
print(svc_test_scores)
print(svc_best_test_score)
print(svc_best_test_estimator)

[0.9938256946093564, 0.9933856679585541, 0.993647374972831, 0.9936498277503509, 0.9939441681847275]
0.9939441681847275
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ss', StandardScaler(),
                                                  ['LineOfCode',
                                                   'NoOfExternalRef',
                                                   'NoOfImage', 'NoOfSelfRef',
                                                   'NoOfJS',
                                                   'LargestLineLength',
                                                   'HasSocialNet']),
                                                 ('mm',
                                                  Pipeline(steps=[('sub_mm',
                                                                   MinMaxScaler()),
                                                                  ('sub_ss',
                                                                

In [255]:
with open('../results/support_vector_classifier_best_estimator.save', 'wb') as f:
    pickle.dump((svc_best_test_estimator, svc_best_test_score), f)

In [246]:
xgb_classifier = XGBClassifier(early_stopping_rounds = 10, n_jobs = -1, random_state = 42)

# Hyperparameter tuning
xgbc_params = {
    'max_depth': [3, 5, 8],
    'reg_alpha': [0.01, 0.1, 1], 
    'reg_lambda': [0.01, 0.1, 1],
    'colsample_bytree': [0.9],              
    'subsample': [0.66],
    'seed': [42],
    'learning_rate': [0.03],
    'n_estimators': [10000]
}

In [248]:
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

xgbc_best_model, fitted_preprocessor, xgbc_best_params, xgbc_best_score = xgb_classifier_cv(X_other, y_other, preprocessor, xgbc_params, n_splits = 5, random_state = 42)

Processing Parameter Combination {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 10000, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'seed': 42, 'subsample': 0.66}...
Processing Parameter Combination {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 10000, 'reg_alpha': 0.01, 'reg_lambda': 0.1, 'seed': 42, 'subsample': 0.66}...
Processing Parameter Combination {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 10000, 'reg_alpha': 0.01, 'reg_lambda': 1, 'seed': 42, 'subsample': 0.66}...
Processing Parameter Combination {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 10000, 'reg_alpha': 0.1, 'reg_lambda': 0.01, 'seed': 42, 'subsample': 0.66}...
Processing Parameter Combination {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 10000, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'seed': 42, 'subsample': 0.66}...
Processing Parameter Combination {'co

In [249]:
print(xgbc_best_model)
print(fitted_preprocessor)
print(xgbc_best_params)
print(xgbc_best_score)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, device=None, early_stopping_rounds=10,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.03, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=8, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=10000, n_jobs=-1,
              num_parallel_tree=None, random_state=None, ...)
ColumnTransformer(transformers=[('ss', StandardScaler(),
                                 ['LineOfCode', 'NoOfExternalRef', 'NoOfImage',
                                  'NoOfSelfRef', 'NoOfJS', 'LargestLineLength',
                                

In [250]:
xgbc_test_scores, xgbc_test_estimators, xgbc_best_score, xgbc_best_estimator = get_xgb_classifier_test_scores(X, y, preprocessor, xgbc_params, n_splits = 5, n_seeds = 5)

Processing Seed 1 of 5...
Processing Seed 2 of 5...
Processing Seed 3 of 5...
Processing Seed 4 of 5...
Processing Seed 5 of 5...


In [251]:
print(xgbc_test_scores)
print(xgbc_best_score)
print(xgbc_best_estimator)

[0.9971580500460455, 0.9974468843082342, 0.9964208736232228, 0.9966609378836239, 0.9967541116230428]
0.9974468843082342
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, device=None, early_stopping_rounds=10,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.03, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=8, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=10000, n_jobs=-1,
              num_parallel_tree=None, random_state=None, ...)


In [256]:
with open('../results/xgb_classifier_best_estimator.save', 'wb') as f:
    pickle.dump((xgbc_best_estimator, xgbc_best_score), f)