In [5]:
import pandas as pd
import numpy as np

# Model selection libraries
from sklearn.model_selection import train_test_split

# Model Evaluation Libraries
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Import Tensor Flow and keras
import tensorflow as tf
from tensorflow import keras

import tensorflow_hub as hub
import tensorflow_text

In [42]:
# Load data from phishing email csv
email_df = pd.read_csv('./data/phishing_all_data_clean_4.csv')

In [43]:
X = email_df.drop(columns=['phishing'])
y = email_df.loc[:, ['phishing']]

In [44]:
X_remainder, X_test, y_remainder, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1337,
    test_size=0.2,
)
X_remainder.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_remainder.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [45]:
X_train, X_validation, y_train, y_validation = train_test_split(
    X_remainder,
    y_remainder,
    stratify=y_remainder,
    random_state=1337,
    test_size=0.2,
)
X_train.reset_index(drop=True, inplace=True)
X_validation.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_validation.reset_index(drop=True, inplace=True)

In [8]:
loaded_model = keras.models.load_model(
    './models/bert_model_5_relu_sig.h5',
    custom_objects={'KerasLayer':hub.KerasLayer},
    compile=False
)

In [10]:
loaded_model.compile(
    # Optimizer
    optimizer=keras.optimizers.Adam(),
    # Loss function to minimize
    loss=keras.losses.BinaryCrossentropy(),
    # Metric used to evaluate model
    metrics=[keras.metrics.BinaryAccuracy()]
)

In [46]:
y_val_pred = loaded_model.predict(X_validation['content'])



In [63]:
y_val_bin_pred = np.where(y_val_pred >= 0.5, 1, 0)

In [65]:
accuracy_score(y_validation['phishing'].values, y_val_bin_pred.flatten())

0.9551411290322581

In [67]:
conf_matrix = confusion_matrix(y_validation, y_val_bin_pred)
ConfusionMatrixDisplay(conf_matrix)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fee5f11ba90>

In [69]:
X_validation[(y_validation['phishing'].values == 0) & (y_val_bin_pred.flatten() != y_validation['phishing'].values)]

Unnamed: 0,content,unsecure_link_count,secure_link_count,numbers_count,word_count
64,Hi Wearing my other hat today were looking for...,2,0,10,162
455,You are this because you up to receive one of ...,3,0,41,929
467,Howdy all I have a friend with the problem out...,1,0,16,134
503,On at net wrote The setup is as I develop a pi...,1,0,11,534
919,dont like entering when sending If the mail cl...,2,0,0,71
1198,I will be out of the office starting and will ...,1,0,7,110
1599,Shopper Newsletter Electronics Edition Shopper...,0,0,70,193
1671,Hi everyone Just a friendly reminder to come t...,0,0,21,261
