In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from tqdm import tqdm
from sklearn.feature_selection import mutual_info_classif

In [2]:
# Load in the phishing
phishing_df = pd.read_csv('../data/phishing_urls.csv', header = 0).drop(columns = ['FILENAME', 'URLSimilarityIndex'])

# Invert the label column (positive phishing should be 1, not 0)
phishing_df['label'] = phishing_df['label'].map({0: 1, 1: 0})

In [None]:
categorical_numerical_features = ['IsDomainIP', 'HasObfuscation', 'IsHTTPS', 'HasTitle', 'HasFavicon', 'Robots', 'IsResponsive', 'HasDescription', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo']
categorical_nonnumerical_features = []
ss_continuous_features = ['URLLength', 'DomainLength', 'TLDLength', 'NoOfSubDomain', 'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDigitsInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'LineOfCode', 'LargestLineLength', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'NoOfiFrame', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef']
minmax_continuous_features = ['CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'ObfuscationRatio', 'LetterRatioInURL', 'DigitRatioInURL', 'SpecialCharRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore']
ordinal_features = []
nonnumeric_features = ['URL', 'Domain', 'TLD', 'Title']
label = 'label'

# categorical_numerical_features = ['HasSocialNet', 'HasCopyrightInfo', 'HasDescription', 'IsHTTPS', 'HasSubmitButton', 'IsResponsive']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'NoOfCSS', 'NoOfOtherSpecialCharsInURL']
# minmax_continuous_features = ['LetterRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'SpecialCharRatioInURL']
# ordinal_features = []
# nonnumeric_features = []
# label = 'label'

# categorical_numerical_features = ['HasSocialNet', 'HasCopyrightInfo']
# categorical_nonnumerical_features = []
# ss_continuous_features = ['LineOfCode', 'NoOfExternalRef', 'NoOfImage', 'NoOfSelfRef', 'NoOfJS', 'LargestLineLength', 'NoOfCSS']
# minmax_continuous_features = ['LetterRatioInURL']
# ordinal_features = []
# nonnumeric_features = []
# label = 'label'

In [None]:
# Feature Engineering
poly = PolynomialFeatures(2)
interaction_features = poly.fit_transform(phishing_df[ss_continuous_features + minmax_continuous_features])

# Create a new dataframe with the interaction features
interaction_df = pd.DataFrame(interaction_features, columns = poly.get_feature_names_out(phishing_df[ss_continuous_features + minmax_continuous_features].columns))

pearson_correlations = interaction_df.corrwith(phishing_df[label], method='pearson')

print(pearson_correlations.sort_values(ascending = False, key = abs).head(10))

# mi = mutual_info_classif(interaction_df, phishing_df[label])   

# print(pd.Series(mi, index = interaction_df.columns).sort_values(ascending = False).head(10))
# print(pd.Series(mi, index = interaction_df.columns).sort_values(ascending = False, key = abs).head(10) )

  c /= stddev[:, None]
  c /= stddev[None, :]


NoOfSubDomain DomainTitleMatchScore          -0.609812
NoOfSubDomain URLTitleMatchScore             -0.603864
LetterRatioInURL SpecialCharRatioInURL        0.602648
URLCharProb DomainTitleMatchScore            -0.596308
CharContinuationRate DomainTitleMatchScore   -0.589343
DomainTitleMatchScore                        -0.584905
DomainTitleMatchScore URLTitleMatchScore     -0.583674
DomainTitleMatchScore^2                      -0.583660
URLLength DomainTitleMatchScore              -0.575883
TLDLength DomainTitleMatchScore              -0.574455
dtype: float64


In [None]:
print(interaction_df.shape)

(235795, 561)
Index(['1', 'URLLength', 'DomainLength', 'TLDLength', 'NoOfSubDomain',
       'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDigitsInURL',
       'NoOfEqualsInURL', 'NoOfQMarkInURL',
       ...
       'DigitRatioInURL^2', 'DigitRatioInURL SpecialCharRatioInURL',
       'DigitRatioInURL DomainTitleMatchScore',
       'DigitRatioInURL URLTitleMatchScore', 'SpecialCharRatioInURL^2',
       'SpecialCharRatioInURL DomainTitleMatchScore',
       'SpecialCharRatioInURL URLTitleMatchScore', 'DomainTitleMatchScore^2',
       'DomainTitleMatchScore URLTitleMatchScore', 'URLTitleMatchScore^2'],
      dtype='object', length=561)


In [11]:
train_perc, _, test_perc = 0.9, 0.05, 0.05

X, y = phishing_df.drop(columns = [label]), phishing_df[label]

# Perform stratified split to ensure equal representation of legitimate and phishing URLs
X_other, X_test, y_other, y_test = train_test_split(X, y, test_size = test_perc, stratify = y, random_state = 42)

skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)

for train_index, val_index in skf.split(X_other, y_other):
    X_train, X_val = X_other.iloc[train_index], X_other.iloc[val_index]
    y_train, y_val = y_other.iloc[train_index], y_other.iloc[val_index]

    minmax_scaler = MinMaxScaler()

    X_train_mm = pd.DataFrame(minmax_scaler.fit_transform(X_train[minmax_continuous_features]),
                              columns = minmax_continuous_features, index = X_train.index)
    X_val_mm = pd.DataFrame(minmax_scaler.transform(X_val[minmax_continuous_features]),
                            columns = minmax_continuous_features, index = X_val.index)

    ss_scaler = StandardScaler()

    X_train_ss = pd.DataFrame(ss_scaler.fit_transform(X_train[ss_continuous_features]),
                              columns = ss_continuous_features, index = X_train.index)
    X_val_ss = pd.DataFrame(ss_scaler.transform(X_val[ss_continuous_features]),
                            columns = ss_continuous_features, index = X_val.index)

    X_train_untransformed = X_train.drop(columns = minmax_continuous_features + ss_continuous_features)
    X_val_untransformed = X_val.drop(columns = minmax_continuous_features + ss_continuous_features)

    X_train_final = pd.concat([X_train_untransformed, X_train_mm, X_train_ss], axis = 1)
    X_val_final = pd.concat([X_val_untransformed, X_val_mm, X_val_ss], axis = 1)

    # Print shapes to verify correctness
    print(X_train_final.shape, X_val_final.shape, X_test.shape)
    print(y_train.shape, y_val.shape, y_test.shape)

    # TRAIN AND VALIDATE MODELS HERE

(201604, 53) (22401, 53) (11790, 53)
(201604,) (22401,) (11790,)
(201604, 53) (22401, 53) (11790, 53)
(201604,) (22401,) (11790,)
(201604, 53) (22401, 53) (11790, 53)
(201604,) (22401,) (11790,)
(201604, 53) (22401, 53) (11790, 53)
(201604,) (22401,) (11790,)
(201604, 53) (22401, 53) (11790, 53)
(201604,) (22401,) (11790,)
(201605, 53) (22400, 53) (11790, 53)
(201605,) (22400,) (11790,)
(201605, 53) (22400, 53) (11790, 53)
(201605,) (22400,) (11790,)
(201605, 53) (22400, 53) (11790, 53)
(201605,) (22400,) (11790,)
(201605, 53) (22400, 53) (11790, 53)
(201605,) (22400,) (11790,)
(201605, 53) (22400, 53) (11790, 53)
(201605,) (22400,) (11790,)


In [None]:
# Train on entire training set and evaluate on test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_perc, stratify = y, random_state = 42)

minmax_scaler = MinMaxScaler()

X_train_mm = pd.DataFrame(minmax_scaler.fit_transform(X_train[minmax_continuous_features]),
                          columns = minmax_continuous_features, index = X_train.index)
X_test_mm = pd.DataFrame(minmax_scaler.transform(X_test[minmax_continuous_features]),
                            columns = minmax_continuous_features, index = X_test.index)

ss_scaler = StandardScaler()

X_train_ss = pd.DataFrame(ss_scaler.fit_transform(X_train[ss_continuous_features]),
                            columns = ss_continuous_features, index = X_train.index)
X_test_ss = pd.DataFrame(ss_scaler.transform(X_test[ss_continuous_features]),
                            columns = ss_continuous_features, index = X_test.index)

X_train_untransformed = X_train.drop(columns = minmax_continuous_features + ss_continuous_features)
X_test_untransformed = X_test.drop(columns = minmax_continuous_features + ss_continuous_features)

X_train_final = pd.concat([X_train_untransformed, X_train_mm, X_train_ss], axis = 1)
X_test_final = pd.concat([X_test_untransformed, X_test_mm, X_test_ss], axis = 1)

# Print shapes to verify correctness
print(X_train_final.shape, X_test_final.shape)
print(y_train.shape, y_test.shape)

# TRAIN AND TEST MODELS HERE

(224005, 53) (11790, 53)
(224005,) (11790,)
