In [93]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from tqdm import tqdm
from sklearn.feature_selection import mutual_info_classif
from pipeline_func import stratified_kfold_cv_pipe, test_pipe
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

In [95]:
# Load in the phishing
phishing_df = pd.read_csv('../data/phishing_urls.csv', header = 0).drop(columns = ['FILENAME', 'URLSimilarityIndex'])

# Invert the label column (positive phishing should be 1, not 0)
phishing_df['label'] = phishing_df['label'].map({0: 1, 1: 0})

In [126]:
# categorical_numerical_features = ['IsDomainIP', 'HasObfuscation', 'IsHTTPS', 'HasTitle', 'HasFavicon', 'Robots', 'IsResponsive', 'HasDescription', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['URLLength', 'DomainLength', 'TLDLength', 'NoOfSubDomain', 'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDigitsInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'LineOfCode', 'LargestLineLength', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'NoOfiFrame', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']
# minmax_continuous_features = ['CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'ObfuscationRatio', 'LetterRatioInURL', 'DigitRatioInURL', 'SpecialCharRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore']
# ordinal_features = []
# nonnumeric_features = ['URL', 'Domain', 'TLD', 'Title']
# label = 'label'

# categorical_numerical_features = ['HasSocialNet', 'HasCopyrightInfo', 'HasDescription', 'IsHTTPS', 'HasSubmitButton', 'IsResponsive']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'NoOfCSS', 'NoOfOtherSpecialCharsInURL']
# minmax_continuous_features = ['LetterRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'SpecialCharRatioInURL']
# ordinal_features = []
# nonnumeric_features = []
# label = 'label'

# categorical_numerical_features = ['HasSocialNet', 'HasCopyrightInfo']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'NoOfCSS']
# minmax_continuous_features = ['LetterRatioInURL']
# ordinal_features = []
# nonnumeric_features = []
# label = 'label'

categorical_numerical_features = ['HasSocialNet']
categorical_nonnumerical_features = []
ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength']
minmax_continuous_features = ['LetterRatioInURL']
ordinal_features = []
nonnumeric_features = []
label = 'label'

In [128]:
train_perc, _, test_perc = 0.9, 0.05, 0.05

X, y = phishing_df.drop(columns = [label])[categorical_numerical_features + ss_continuous_features + minmax_continuous_features], phishing_df[label]

# Perform stratified split to ensure equal representation of legitimate and phishing URLs
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = test_perc, stratify = y, random_state = 42)

In [None]:
preprocessor = ColumnTransformer(
    transformers = [
        ('ss', StandardScaler(), ss_continuous_features + categorical_numerical_features),
        ('mm', Pipeline(steps = [('sub_mm', MinMaxScaler()), ('sub_ss', StandardScaler())]), minmax_continuous_features)

    ]
)

log_reg = LogisticRegression(solver = 'saga', max_iter = 1000, n_jobs = -1, tol = 0.005)

log_reg_params = {
    'model__C': np.logspace(-3, 3, 7),
    'model__penalty': ['l1', 'l2'] # consider elasticnet with hyperparam l1_ratio if time permitting
}

In [130]:
best_log_reg, best_log_reg_params, best_log_reg_score = stratified_kfold_cv_pipe(X_other, y_other, preprocessor, log_reg, log_reg_params, n_splits = 10, random_state = 42)

Fitting 10 folds for each of 14 candidates, totalling 140 fits
[CV 2/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   3.2s
[CV 4/10] END model__C=0.001, model__penalty=l1;, score=0.975 total time=   3.8s
[CV 7/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   4.0s
[CV 9/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   4.0s
[CV 5/10] END model__C=0.001, model__penalty=l1;, score=0.976 total time=   4.0s
[CV 10/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   4.0s
[CV 6/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   4.1s
[CV 1/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   4.3s
[CV 3/10] END model__C=0.001, model__penalty=l1;, score=0.974 total time=   4.4s
[CV 8/10] END model__C=0.001, model__penalty=l1;, score=0.975 total time=   4.4s
[CV 1/10] END model__C=0.001, model__penalty=l2;, score=0.972 total time=   2.3s
[CV 2/10] END model__C=0.001, model__penalty=

In [None]:
# print best params
print(best_log_reg.named_steps['model'].get_params())
# get coefficients from best model
print(best_log_reg.named_steps['model'].coef_)

# print feature names 
print(best_log_reg.named_steps['preprocessor'].transformers_[0][2] + best_log_reg.named_steps['preprocessor'].transformers_[1][2])

{'C': 100.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': -1, 'penalty': 'l2', 'random_state': None, 'solver': 'saga', 'tol': 0.005, 'verbose': 0, 'warm_start': False}
[[-3.2723551  -3.19682595 -3.17774797 -3.77672452 -3.55079474  2.37563617
  -1.83694816  0.68133152]]
['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'HasSocialNet', 'LetterRatioInURL']
