In [93]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [316]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pipeline_func import get_test_scores, get_xgb_classifier_test_scores
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [317]:
# Load in the phishing
phishing_df = pd.read_csv('../data/phishing_urls.csv', header = 0).drop(columns = ['FILENAME', 'URLSimilarityIndex'])

# Invert the label column (positive phishing should be 1, not 0)
phishing_df['label'] = phishing_df['label'].map({0: 1, 1: 0})

In [318]:
# FEATURES SELECTED VIA TOP MUTUAL INFO (NON-LINEAR) & PEARSON CORR (LINEAR) in eda.ipynb
categorical_numerical_features = ['HasSocialNet']
categorical_nonnumerical_features = []
ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength']
minmax_continuous_features = ['LetterRatioInURL']
ordinal_features = []
nonnumeric_features = []
label = 'label'

X, y = phishing_df.drop(columns = [label])[categorical_numerical_features + ss_continuous_features + minmax_continuous_features], phishing_df[label]

In [319]:
preprocessor = ColumnTransformer(
    transformers = [
        ('ss', StandardScaler(), ss_continuous_features + categorical_numerical_features),
        ('mm', Pipeline(steps = [('sub_mm', MinMaxScaler()), ('sub_ss', StandardScaler())]), minmax_continuous_features)

    ]
)

In [320]:
log_reg = LogisticRegression(solver = 'saga', max_iter = 1000, n_jobs = -1, tol = 0.005)

log_reg_params = {
    'model__C': np.logspace(-3, 3, 7),
    'model__penalty': ['l1', 'l2'] # consider elasticnet with hyperparam l1_ratio if time permitting
}

In [322]:
lr_test_scores, lr_test_estimators, lr_unpreprocessed_test_sets, lr_preprocessed_test_sets, lr_predicted_labels, lr_baseline_scores = get_test_scores(X, y, preprocessor, log_reg, log_reg_params, n_splits = 10, n_seeds = 5)

Processing Seed 1 of 5...
Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV 8/10] END model__C=0.001, model__penalty=l1;, score=0.977 total time=   2.3s
[CV 5/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   3.9s
[CV 9/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.9s
[CV 3/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   3.9s
[CV 2/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   4.0s
[CV 4/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   4.1s
[CV 7/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   4.1s
[CV 1/10] END model__C=0.001, model__penalty=l1;, score=0.972 total time=   4.2s
[CV 6/10] END model__C=0.001, model__penalty=l1;, score=0.973 total time=   4.2s
[CV 10/10] END model__C=0.001, model__penalty=l1;, score=0.970 total time=   4.2s
[CV 1/10] END model__C=0.001, model__penalty=l2;, score=0.970 total time=   2.4s
[CV 8/10] END model

In [286]:
with open('../results/logistic_regression_best_estimators.save', 'wb') as f:
    pickle.dump((lr_test_estimators, lr_test_scores, lr_baseline_scores), f)
with open('../results/logistic_regression_test_data.save', 'wb') as f:
    pickle.dump((lr_unpreprocessed_test_sets, lr_preprocessed_test_sets, lr_predicted_labels), f)

In [None]:
# # print best params
# print(lr_best_test_estimator.named_steps['model'].get_params())
# # get coefficients from best model
# print(lr_best_test_estimator.named_steps['model'].coef_)
# # print feature names 
# print(lr_best_test_estimator.named_steps['preprocessor'].transformers_[0][2] + lr_best_test_estimator.named_steps['preprocessor'].transformers_[1][2])

{'C': 100.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': -1, 'penalty': 'l1', 'random_state': None, 'solver': 'saga', 'tol': 0.005, 'verbose': 0, 'warm_start': False}
[[-2.18151428 -2.15117335 -2.11426281 -2.61585988 -3.20045845  1.07084151
  -1.84038715  0.64144293]]
['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'HasSocialNet', 'LetterRatioInURL']


In [151]:
rf_classifier = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 42)

rf_params = {
    'model__max_depth': [3, 5, 7],
    'model__max_features': [2, 3, 5, 8] 
}

In [300]:
rf_test_scores, rf_test_estimators, rf_unpreprocessed_test_sets, rf_preprocessed_test_sets, rf_predicted_labels, rf_baseline_scores = get_test_scores(X, y, preprocessor, rf_classifier, rf_params, n_splits = 10, n_seeds = 5)

Processing Seed 1 of 5...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 9/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   7.6s
[CV 2/10] END model__max_depth=3, model__max_features=2;, score=0.987 total time=   7.6s
[CV 4/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   7.7s
[CV 8/10] END model__max_depth=3, model__max_features=2;, score=0.986 total time=   7.7s
[CV 1/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   7.7s
[CV 3/10] END model__max_depth=3, model__max_features=2;, score=0.984 total time=   7.7s
[CV 7/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   7.7s
[CV 6/10] END model__max_depth=3, model__max_features=2;, score=0.985 total time=   7.7s
[CV 5/10] END model__max_depth=3, model__max_features=2;, score=0.984 total time=   7.8s
[CV 10/10] END model__max_depth=3, model__max_features=2;, score=0.984 total time=   7.8s
[CV 1/10] END model_

In [307]:
with open('../results/random_forest_classifier_best_estimators.save', 'wb') as f:
    pickle.dump((rf_test_estimators, rf_test_scores, rf_baseline_scores), f)
with open('../results/random_forest_classifier_test_data.save', 'wb') as f:
    pickle.dump((rf_unpreprocessed_test_sets, rf_preprocessed_test_sets, rf_predicted_labels), f)

In [None]:
# # print best params
# print(rf_best_test_estimator.named_steps['model'].get_params())
# # print feature names 
# print(rf_best_test_estimator.named_steps['preprocessor'].transformers_[0][2] + rf_best_test_estimator.named_steps['preprocessor'].transformers_[1][2])

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 2, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'HasSocialNet', 'LetterRatioInURL']


In [207]:
sv_classifier = SVC(max_iter = 10000, random_state = 42, tol = 1)

svc_params = {
    # 'model__gamma': np.logspace(-3, 3, 7),
    # 'model__C': np.logspace(-2, 2, 5)
    'model__gamma': [1e-1, 1e0, 1e1],
    'model__C': [1e-1, 1e0, 1e1]
}

In [299]:
svc_test_scores, svc_test_estimators, svc_unpreprocessed_test_sets, svc_preprocessed_test_sets, svc_predicted_labels, svc_baseline_scores = get_test_scores(X, y, preprocessor, sv_classifier, svc_params, n_splits = 5, n_seeds = 5)

Processing Seed 1 of 5...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 4/5] END ....model__C=0.1, model__gamma=1.0;, score=0.991 total time=17.6min
[CV 3/5] END ....model__C=0.1, model__gamma=1.0;, score=0.990 total time=17.6min
[CV 1/5] END ....model__C=0.1, model__gamma=0.1;, score=0.989 total time=32.4min
[CV 5/5] END ....model__C=0.1, model__gamma=0.1;, score=0.988 total time=32.4min
[CV 1/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time=15.2min
[CV 2/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time=15.2min
[CV 5/5] END ....model__C=0.1, model__gamma=1.0;, score=0.991 total time=48.8min
[CV 3/5] END ...model__C=0.1, model__gamma=10.0;, score=0.986 total time=16.8min
[CV 4/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time=16.9min
[CV 2/5] END ....model__C=0.1, model__gamma=1.0;, score=0.989 total time=49.4min
[CV 5/5] END ...model__C=0.1, model__gamma=10.0;, score=0.987 total time=16.7min
[CV 1/5] END ....model_

In [315]:
with open('../results/support_vector_classifier_best_estimators.save', 'wb') as f:
    pickle.dump((svc_test_estimators, svc_test_scores, svc_baseline_scores), f)
with open('../results/support_vector_classifier_test_data.save', 'wb') as f:
    pickle.dump((svc_unpreprocessed_test_sets, svc_preprocessed_test_sets, svc_predicted_labels), f)

In [246]:
xgb_classifier = XGBClassifier(early_stopping_rounds = 10, n_jobs = -1, random_state = 42)

# Hyperparameter tuning
xgbc_params = {
    'max_depth': [3, 5, 8],
    'reg_alpha': [0.01, 0.1, 1], 
    'reg_lambda': [0.01, 0.1, 1],
    'colsample_bytree': [0.9],              
    'subsample': [0.66],
    'seed': [42],
    'learning_rate': [0.03],
    'n_estimators': [10000]
}

In [288]:
xgbc_test_scores, xgbc_test_estimators, xgbc_unpreprocessed_test_sets, xgbc_preprocessed_test_sets, xgbc_predicted_labels, xgbc_baseline_scores = get_xgb_classifier_test_scores(X, y, preprocessor, xgbc_params, n_splits = 5, n_seeds = 5)

Processing Seed 1 of 5...
Processing Seed 2 of 5...
Processing Seed 3 of 5...
Processing Seed 4 of 5...
Processing Seed 5 of 5...


In [298]:
with open('../results/xgb_classifier_best_estimators.save', 'wb') as f:
    pickle.dump((xgbc_test_estimators, xgbc_test_scores, xgbc_baseline_scores), f)
with open('../results/xgb_classifier_test_data.save', 'wb') as f:
    pickle.dump((xgbc_unpreprocessed_test_sets, xgbc_preprocessed_test_sets, xgbc_predicted_labels), f)