## GPT2 + Cerebros for Phishing email detection

Initialization

In [1]:
%cd drive/MyDrive/Colab\ Notebooks/cerebros-core-algorithm-alpha

/content/drive/MyDrive/Colab Notebooks/cerebros-core-algorithm-alpha


In [2]:
!pip install -r requirements.txt



In [3]:
!pip install -q --upgrade keras-nlp

In [4]:
import tensorflow as tf
import tensorflow_text
from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten
import pandas as pd
import numpy as np
from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\
    import SimpleCerebrosRandomSearch
import pendulum
from cerebros.units.units import DenseUnit
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
    import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval

Using TensorFlow backend


In [5]:
#
# Load the email data
#
df = pd.read_csv("Phishing_Email.csv")
#
# Get the rows where 'Email Text' is a string, remove everything else
#
df = df[df['Email Text'].apply(lambda x: isinstance(x, str))]
#
# Reset the index
#
df.reset_index(drop=True, inplace=True)

In [6]:
#
# Binary label for email type: positive type is "phishing"
#
label_mapping = {"Safe Email": 0, "Phishing Email": 1}
df["Binary Label"] = df["Email Type"].map(label_mapping)
#
# Data and labels ready
#
X = df["Email Text"].to_numpy()
y = df["Binary Label"].to_numpy()
#
# Shuffle the data
#
X, y = shuffle(X, y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Binary Label"] = df["Email Type"].map(label_mapping)


In [7]:
# Train / test split : we give 85% of the data for *testing*
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.85, shuffle=False)

In [8]:
#
# Tensors for training data and labels
#
training_x   = [tf.constant(X_train)]
train_labels = [tf.constant(y_train)]
#
# Input and output shapes
#
INPUT_SHAPES  = [()]
OUTPUT_SHAPES = [1]

### A custom GPT2 encoder layer for text embedding

In [9]:
class GPT2Layer(tf.keras.layers.Layer):

    def __init__(self, max_seq_length, **kwargs):
        #
        super(GPT2Layer, self).__init__(**kwargs)
        #
        # Load the GPT2 tokenizer, preprocessor and model
        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
        self.preprocessor = GPT2Preprocessor(self.tokenizer,
                                             sequence_length=max_seq_length)
        self.encoder   = GPT2Backbone.from_preset("gpt2_base_en")
        #
        # Set whether the GPT2 model's layers are trainable
        #self.encoder.trainable = False
        for layer in self.encoder.layers:
            layer.trainable = False
        #
        self.encoder.layers[-2].trainable = True
        #
        # Set the maximum sequence length for tokenization
        self.max_seq_length = max_seq_length

    def call(self, inputs):
        #
        # Output the GPT2 embedding
        prep = self.preprocessor([inputs])
        embedding  = self.encoder(prep)
        avg_pool = tf.reduce_mean(embedding, axis=1)
        #
        return avg_pool

    def get_config(self):
        #
        config = super(GPT2Layer, self).get_config()
        config.update({'max_seq_length': self.max_seq_length})
        #
        return config

    @classmethod
    def from_config(cls, config):
        #
        return cls(max_seq_length=config['max_seq_length'])

In [10]:
# GPT2 configurables
max_seq_length = 96

# Base model
input_layer = Input(shape=(), dtype=tf.string)
gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
#output = Flatten()(gpt2_layer)
base_model = Model(inputs=input_layer, outputs=gpt2_layer)
base_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 gpt2_layer (GPT2Layer)      (None, 768)               124439808 
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 7087872 (27.04 MB)
Non-trainable params: 117351936 (447.66 MB)
_________________________________________________________________


### Cerebros search for the best model

In [12]:
#
# Cerebros configurables
#
activation = 'gelu'
predecessor_level_connection_affinity_factor_first = 49.9999
predecessor_level_connection_affinity_factor_main = 0.31456
max_consecutive_lateral_connections = 22
p_lateral_connection = 0.39256
num_lateral_connection_tries_per_unit = 10
learning_rate = 0.0000511065
epochs = 6  # [1, 100]
batch_size = 13
maximum_levels = 4  # [3,7]
maximum_units_per_level = 8  # [2,10]
maximum_neurons_per_unit = 5  # [2,20]

In [13]:
#
# Logging
#
TIME = pendulum.now(tz='America/New_York').__str__()[:16]\
    .replace('T', '_')\
    .replace(':', '_')\
    .replace('-', '_')
PROJECT_NAME = f'{TIME}_cerebros_auto_ml_phishing_email_test'

In [14]:
meta_trial_number = 42 # irrelevant unless in distributed training

In [15]:
cerebros_automl = SimpleCerebrosRandomSearch(
    unit_type=DenseUnit,
    input_shapes=INPUT_SHAPES,
    output_shapes=OUTPUT_SHAPES,
    training_data=training_x,
    labels=train_labels,
    validation_split=0.35,
    direction='maximize',
    metric_to_rank_by="val_binary_accuracy",
    minimum_levels=2,
    maximum_levels=maximum_levels,
    minimum_units_per_level=1,
    maximum_units_per_level=maximum_units_per_level,
    minimum_neurons_per_unit=1,
    maximum_neurons_per_unit=maximum_neurons_per_unit,
    activation=activation,
    final_activation='sigmoid',
    number_of_architecture_moities_to_try=2,
    number_of_tries_per_architecture_moity=1,
    minimum_skip_connection_depth=1,
    maximum_skip_connection_depth=7,
    predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
    predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
    predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
    predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
    predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
    seed=8675309,
    max_consecutive_lateral_connections=max_consecutive_lateral_connections,
    gate_after_n_lateral_connections=3,
    gate_activation_function=simple_sigmoid,
    p_lateral_connection=p_lateral_connection,
    p_lateral_connection_decay=zero_95_exp_decay,
    num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
    learning_rate=learning_rate,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(),
             tf.keras.metrics.Precision(),
             tf.keras.metrics.Recall()],
    epochs=epochs,
    project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
    model_graphs='model_graphs',
    batch_size=batch_size,
    meta_trial_number=meta_trial_number,
    base_models=[base_model],
    train_data_dtype=tf.string)

In [16]:
%%time
result = cerebros_automl.run_random_search()

SimpleCerebrosRandomSearch.input_shapes: [()]
nan
>nnf>ceil
k is: 0 value is: [{'1': <class 'cerebros.units.units.InputUnit'>}]
0
k is: 1 value is: [{'3': <class 'cerebros.units.units.DenseUnit'>}, {'2': <class 'cerebros.units.units.DenseUnit'>}, {'4': <class 'cerebros.units.units.DenseUnit'>}, {'4': <class 'cerebros.units.units.DenseUnit'>}, {'3': <class 'cerebros.units.units.DenseUnit'>}, {'1': <class 'cerebros.units.units.DenseUnit'>}, {'3': <class 'cerebros.units.units.DenseUnit'>}]
1
Trying to create level 1
We think level 1's predecessors are: [0]
k is: 2 value is: [{'1': <class 'cerebros.units.units.FinalDenseUnit'>}]
2
Trying to create Final level 2
Trying to create level 2
We think level final level 2's predecessors are: [0, 1]
levels:
[0, 1, 2]
{'0': 'InputUnitModule'}
InputLevel.input_shapes [()]
{'3': <class 'cerebros.units.units.DenseUnit'>}
{'2': <class 'cerebros.units.units.DenseUnit'>}
{'4': <class 'cerebros.units.units.DenseUnit'>}
{'4': <class 'cerebros.units.units.De

                    x, approximate=True
                ),

  fn_config = serialization_lib.serialize_keras_object(activation)


returning trial 0 oracles
       loss  binary_accuracy  precision    recall  val_loss  \
0  0.387789         0.821586   0.722637  0.851906  0.382717   
1  0.224989         0.911894   0.857534  0.917889  0.247685   
2  0.193683         0.930066   0.891396  0.926686  0.171507   
3  0.169455         0.932269   0.890909  0.934018  0.163564   
4  0.146591         0.941079   0.904360  0.942815  0.150024   
5  0.133490         0.950441   0.928986  0.939883  0.171090   

   val_binary_accuracy  val_precision  val_recall  trial_number  \
0             0.849847       0.970370    0.653367             0   
1             0.891726       0.962382    0.765586             0   
2             0.935649       0.933333    0.907731             0   
3             0.945863       0.920290    0.950125             0   
4             0.943820       0.913876    0.952618             0   
5             0.947906       0.933168    0.940150             0   

   subtrial_number                                         mod

                    x, approximate=True
                ),

  fn_config = serialization_lib.serialize_keras_object(activation)


returning trial 1 oracles
       loss  binary_accuracy  precision    recall  val_loss  \
0  0.361933         0.874419   0.807563  0.887350  0.319987   
1  0.184330         0.933921   0.887052  0.944282  0.183077   
2  0.141241         0.947137   0.905817  0.958944  0.154442   
3  0.117919         0.959251   0.936782  0.956012  0.147507   
4  0.101640         0.966960   0.945559  0.967742  0.164993   
5  0.100299         0.962004   0.939742  0.960411  0.146696   

   val_binary_accuracy  val_precision  val_recall  trial_number  \
0             0.921348       0.924084    0.880299             1   
1             0.929520       0.932292    0.892768             1   
2             0.944842       0.918072    0.950125             1   
3             0.948927       0.922892    0.955112             1   
4             0.948927       0.946565    0.927681             1   
5             0.946885       0.918465    0.955112             1   

   subtrial_number                                         mod

In [17]:
print(f'Best accuracy achieved is {result}')
print(f'binary accuracy')

Best accuracy achieved is 0.9489274621009828
binary accuracy


### Testing the best model found

In [18]:
#
# Load the best model (taking into account that it has a custom layer)
#
best_model_found =\
tf.keras.models.load_model(cerebros_automl.best_model_path,\
custom_objects={'GPT2Layer': GPT2Layer(max_seq_length)})

                    x, approximate=True
                ),

  fn_config = serialization_lib.serialize_keras_object(activation)


In [19]:
best_model_found.evaluate(X_test, y_test)



[0.14122413098812103,
 0.9547320008277893,
 0.9259660243988037,
 0.9617916345596313]

### Training the best model on a larger dataset, and testing again

In [27]:
# Train / test split : we give 75% of the data for *testing*,
# now that we have found the best model
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.75, shuffle=False)

In [28]:
optimizer = Adam(learning_rate=learning_rate)
#loss=tf.keras.losses.BinaryCrossentropy()
loss = tf.keras.losses.CategoricalHinge()
metrics=[tf.keras.metrics.BinaryAccuracy(),
         tf.keras.metrics.Precision(),
         tf.keras.metrics.Recall()]
best_model_found.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [29]:
best_model_found.fit(X_train, y_train, validation_split=0.35, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7b133414d2a0>

In [30]:
best_model_found.evaluate(X_test, y_test)



[0.6529397368431091,
 0.9619347453117371,
 0.9453561902046204,
 0.9581429362297058]