In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from tqdm import tqdm
from sklearn.feature_selection import mutual_info_classif
from pipeline_func import stratified_kfold_cv_pipe
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

In [4]:
# Load in the phishing
phishing_df = pd.read_csv('../data/phishing_urls.csv', header = 0).drop(columns = ['FILENAME', 'URLSimilarityIndex'])

# Invert the label column (positive phishing should be 1, not 0)
phishing_df['label'] = phishing_df['label'].map({0: 1, 1: 0})

In [75]:
# categorical_numerical_features = ['IsDomainIP', 'HasObfuscation', 'IsHTTPS', 'HasTitle', 'HasFavicon', 'Robots', 'IsResponsive', 'HasDescription', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['URLLength', 'DomainLength', 'TLDLength', 'NoOfSubDomain', 'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDigitsInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'LineOfCode', 'LargestLineLength', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'NoOfiFrame', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']
# minmax_continuous_features = ['CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'ObfuscationRatio', 'LetterRatioInURL', 'DigitRatioInURL', 'SpecialCharRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore']
# ordinal_features = []
# nonnumeric_features = ['URL', 'Domain', 'TLD', 'Title']
# label = 'label'

# categorical_numerical_features = ['HasSocialNet', 'HasCopyrightInfo', 'HasDescription', 'IsHTTPS', 'HasSubmitButton', 'IsResponsive']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'NoOfCSS', 'NoOfOtherSpecialCharsInURL']
# minmax_continuous_features = ['LetterRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'SpecialCharRatioInURL']
# ordinal_features = []
# nonnumeric_features = []
# label = 'label'

# categorical_numerical_features = ['HasSocialNet', 'HasCopyrightInfo']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'NoOfCSS']
# minmax_continuous_features = ['LetterRatioInURL']
# ordinal_features = []
# nonnumeric_features = []
# label = 'label'

categorical_numerical_features = ['HasSocialNet']
categorical_nonnumerical_features = []
ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength']
minmax_continuous_features = ['LetterRatioInURL']
ordinal_features = []
nonnumeric_features = []
label = 'label'

In [76]:
train_perc, _, test_perc = 0.9, 0.05, 0.05

X, y = phishing_df.drop(columns = [label])[categorical_nonnumerical_features + ss_continuous_features + minmax_continuous_features], phishing_df[label]

# Perform stratified split to ensure equal representation of legitimate and phishing URLs
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = test_perc, stratify = y, random_state = 42)

In [77]:
preprocessor = ColumnTransformer(
    transformers = [
        ('ss', StandardScaler(), ss_continuous_features),
        ('mm', MinMaxScaler(), minmax_continuous_features)
        # NOTE: Probably should be scaling the binary features for interpretability and scale-optimizers
    ]
)

# UP MAX_ITER
log_reg = LogisticRegression(solver = 'saga', max_iter = 1000, n_jobs = -1, tol = 0.005)

log_reg_params = {
    'model__C': np.logspace(-3, 3, 7),
    'model__penalty': ['l1', 'l2'] # consider elasticnet with hyperparam l1_ratio if time permitting
}

In [78]:
best_log_reg, best_log_reg_score = stratified_kfold_cv_pipe(X_other, y_other, preprocessor, log_reg, log_reg_params, n_splits = 5, random_state = 42)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 5/5] END .model__C=0.001, model__penalty=l1;, score=0.951 total time=   3.0s
[CV 3/5] END .model__C=0.001, model__penalty=l1;, score=0.953 total time=   3.1s
[CV 3/5] END .model__C=0.001, model__penalty=l2;, score=0.956 total time=   3.2s
[CV 1/5] END .model__C=0.001, model__penalty=l2;, score=0.954 total time=   3.2s
[CV 4/5] END .model__C=0.001, model__penalty=l1;, score=0.953 total time=   3.2s
[CV 2/5] END .model__C=0.001, model__penalty=l2;, score=0.954 total time=   3.3s
[CV 1/5] END .model__C=0.001, model__penalty=l1;, score=0.951 total time=   3.3s
[CV 2/5] END .model__C=0.001, model__penalty=l1;, score=0.950 total time=   3.3s
[CV 5/5] END .model__C=0.001, model__penalty=l2;, score=0.955 total time=   3.5s
[CV 4/5] END .model__C=0.001, model__penalty=l2;, score=0.956 total time=   3.5s
[CV 3/5] END ..model__C=0.01, model__penalty=l2;, score=0.960 total time=   4.3s
[CV 1/5] END ..model__C=0.01, model__penalty=l2;

In [79]:
# print best params
print(best_log_reg.named_steps['model'].get_params())
# get coefficients from best model
print(best_log_reg.named_steps['model'].coef_)

{'C': 10.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': -1, 'penalty': 'l1', 'random_state': None, 'solver': 'saga', 'tol': 0.005, 'verbose': 0, 'warm_start': False}
[[-4.09230306 -3.98947124 -4.4698022  -4.56123204 -4.05095058  2.80654527
   1.15568232]]
