In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm

In [11]:
# Load the phishing URL dataset, dropping the FILENAME and URLSimilarityIndex columns,
# then invert the label column so that phishing is the positive class (1, not 0).
phishing_df = (
    pd.read_csv('data/phishing_urls.csv', header = 0)
    .drop(columns = ['FILENAME', 'URLSimilarityIndex'])
    .assign(label = lambda frame: frame['label'].map({0: 1, 1: 0}))
)

In [12]:
# Feature groups, keyed by the preprocessing each one receives further down.

# Categorical flags that are already numeric; passed through without scaling.
categorical_numerical_features = [
    'IsDomainIP', 'HasObfuscation', 'IsHTTPS', 'HasTitle', 'HasFavicon',
    'Robots', 'IsResponsive', 'HasDescription', 'HasExternalFormSubmit',
    'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField',
    'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo',
]

# No non-numeric categorical features in this dataset.
categorical_nonnumerical_features = []

# Continuous features scaled with StandardScaler.
ss_continuous_features = [
    'URLLength', 'DomainLength', 'TLDLength', 'NoOfSubDomain',
    'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDigitsInURL',
    'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL',
    'NoOfOtherSpecialCharsInURL', 'LineOfCode', 'LargestLineLength',
    'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'NoOfiFrame',
    'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef',
    'NoOfExternalRef',
]

# Continuous features scaled with MinMaxScaler.
minmax_continuous_features = [
    'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb',
    'ObfuscationRatio', 'LetterRatioInURL', 'DigitRatioInURL',
    'SpecialCharRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore',
]

# No ordinal features in this dataset.
ordinal_features = []

# Free-text / string columns.
nonnumeric_features = ['URL', 'Domain', 'TLD', 'Title']

# Name of the target column.
label = 'label'

In [22]:
# Nominal split proportions. Only test_perc is consumed below: the train/validation
# sizes are determined by the 10-fold split of the remaining 90% of the data
# (each fold is roughly 81% train / 9% validation of the full dataset).
train_perc, val_perc, test_perc = 0.8, 0.10, 0.10

X, y = phishing_df.drop(columns = [label]), phishing_df[label]

# Perform stratified split to ensure equal representation of legitimate and phishing URLs
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = test_perc, stratify = y, random_state = 42)

# StratifiedKFold into train and val with 10 folds.
# random_state is fixed so the shuffled folds are identical on every run
# (without it, shuffle = True draws different folds each time the cell executes).
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

# Columns that get rescaled; everything else is passed through unchanged.
scaled_features = minmax_continuous_features + ss_continuous_features

for train_index, val_index in skf.split(X_other, y_other):
    X_train, X_val = X_other.iloc[train_index], X_other.iloc[val_index]
    y_train, y_val = y_other.iloc[train_index], y_other.iloc[val_index]

    # Scalers are fit on the training fold only and then applied to the
    # validation fold, so no validation statistics leak into preprocessing.
    minmax_scaler = MinMaxScaler()

    X_train_mm = pd.DataFrame(minmax_scaler.fit_transform(X_train[minmax_continuous_features]),
                              columns = minmax_continuous_features, index = X_train.index)
    X_val_mm = pd.DataFrame(minmax_scaler.transform(X_val[minmax_continuous_features]),
                            columns = minmax_continuous_features, index = X_val.index)

    ss_scaler = StandardScaler()

    X_train_ss = pd.DataFrame(ss_scaler.fit_transform(X_train[ss_continuous_features]),
                              columns = ss_continuous_features, index = X_train.index)
    X_val_ss = pd.DataFrame(ss_scaler.transform(X_val[ss_continuous_features]),
                            columns = ss_continuous_features, index = X_val.index)

    X_train_untransformed = X_train.drop(columns = scaled_features)
    X_val_untransformed = X_val.drop(columns = scaled_features)

    # Re-assemble on the shared row index: unscaled columns first, then the
    # min-max scaled block, then the standardized block.
    X_train_final = pd.concat([X_train_untransformed, X_train_mm, X_train_ss], axis = 1)
    X_val_final = pd.concat([X_val_untransformed, X_val_mm, X_val_ss], axis = 1)

    # Print shapes to verify correctness
    print(X_train_final.shape, X_val_final.shape, X_test.shape)
    print(y_train.shape, y_val.shape, y_test.shape)

    # TRAIN AND VALIDATE MODELS HERE

(190993, 53) (21222, 53) (23580, 53)
(190993,) (21222,) (23580,)
(190993, 53) (21222, 53) (23580, 53)
(190993,) (21222,) (23580,)
(190993, 53) (21222, 53) (23580, 53)
(190993,) (21222,) (23580,)
(190993, 53) (21222, 53) (23580, 53)
(190993,) (21222,) (23580,)
(190993, 53) (21222, 53) (23580, 53)
(190993,) (21222,) (23580,)
(190994, 53) (21221, 53) (23580, 53)
(190994,) (21221,) (23580,)
(190994, 53) (21221, 53) (23580, 53)
(190994,) (21221,) (23580,)
(190994, 53) (21221, 53) (23580, 53)
(190994,) (21221,) (23580,)
(190994, 53) (21221, 53) (23580, 53)
(190994,) (21221,) (23580,)
(190994, 53) (21221, 53) (23580, 53)
(190994,) (21221,) (23580,)


In [23]:
# Final evaluation: fit the preprocessing on the full training portion and apply it to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = test_perc,
    stratify = y,
    random_state = 42,
)

# Columns that are rescaled; the remaining columns are carried over untouched.
scaled_columns = minmax_continuous_features + ss_continuous_features

# Fit both scalers on the training rows only.
minmax_scaler = MinMaxScaler().fit(X_train[minmax_continuous_features])
ss_scaler = StandardScaler().fit(X_train[ss_continuous_features])

# Min-max scaled block, re-wrapped as DataFrames that keep the original row index.
X_train_mm = pd.DataFrame(
    minmax_scaler.transform(X_train[minmax_continuous_features]),
    index = X_train.index,
    columns = minmax_continuous_features,
)
X_test_mm = pd.DataFrame(
    minmax_scaler.transform(X_test[minmax_continuous_features]),
    index = X_test.index,
    columns = minmax_continuous_features,
)

# Standardized block, handled the same way.
X_train_ss = pd.DataFrame(
    ss_scaler.transform(X_train[ss_continuous_features]),
    index = X_train.index,
    columns = ss_continuous_features,
)
X_test_ss = pd.DataFrame(
    ss_scaler.transform(X_test[ss_continuous_features]),
    index = X_test.index,
    columns = ss_continuous_features,
)

# Pass-through columns.
X_train_untransformed = X_train.drop(columns = scaled_columns)
X_test_untransformed = X_test.drop(columns = scaled_columns)

# Column order: pass-through, then min-max scaled, then standardized.
X_train_final = pd.concat([X_train_untransformed, X_train_mm, X_train_ss], axis = 1)
X_test_final = pd.concat([X_test_untransformed, X_test_mm, X_test_ss], axis = 1)

# Print shapes to verify correctness
print(X_train_final.shape, X_test_final.shape)
print(y_train.shape, y_test.shape)

# TRAIN AND TEST MODELS HERE

(212215, 53) (23580, 53)
(212215,) (23580,)
