## GPT2 + Cerebros for Phishing email detection

### Initialization

In [None]:
# Move into the cerebros checkout so that requirements.txt and the `cerebros`
# package used by the following cells resolve.
# NOTE(review): the path is relative, so this presumably expects to start from
# /content with Drive mounted; re-running the cell in the same session changes
# directory relative to the new location and will fail.
%cd drive/MyDrive/Colab\ Notebooks/cerebros-core-algorithm-alpha

/content/drive/MyDrive/Colab Notebooks/cerebros-core-algorithm-alpha


In [None]:
!pip install -r requirements.txt

Collecting pendulum (from -r requirements.txt (line 4))
  Downloading pendulum-2.1.2.tar.gz (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pyvis (from -r requirements.txt (line 9))
  Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-text (from -r requirements.txt (line 12))
  Downloading tensorflow_text-2.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
Collecting transformers (from -r requirements.txt (line 14))
  Downloading transformers

In [None]:
!pip install -q --upgrade keras-nlp

In [None]:
import tensorflow as tf
import tensorflow_text
from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten
import pandas as pd
import numpy as np
from cerebros.simplecerebrosrandomsearch.simple_cerebros_random_search\
    import SimpleCerebrosRandomSearch
import pendulum
from cerebros.units.units import DenseUnit
from cerebros.denseautomlstructuralcomponent.dense_automl_structural_component\
    import zero_7_exp_decay, zero_95_exp_decay, simple_sigmoid
from ast import literal_eval

Using TensorFlow backend


In [None]:
#
# Load the email data
#
df = pd.read_csv("Phishing_Email.csv")
#
# Keep only the rows whose 'Email Text' cell is an actual string (anything
# else, e.g. a missing value, is dropped) and renumber the survivors from 0
#
df = (
    df.loc[df['Email Text'].map(lambda cell: isinstance(cell, str))]
      .reset_index(drop=True)
)

In [None]:
#
# Binary label for email type: positive type is "phishing"
#
label_mapping = {"Safe Email": 0, "Phishing Email": 1}
df["Binary Label"] = df["Email Type"].map(label_mapping)
#
# Series.map turns every 'Email Type' value that is missing from label_mapping
# into NaN. Fail loudly here instead of passing NaN labels on to training.
#
unmapped = df.loc[df["Binary Label"].isna(), "Email Type"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected 'Email Type' values: {list(unmapped)}")
#
# Data and labels ready
#
X = df["Email Text"].to_numpy()
y = df["Binary Label"].to_numpy()
#
# Shuffle the data once, with a fixed seed. Both train/test splits further
# down use shuffle=False and therefore slice this exact ordering, so an
# unseeded shuffle would give different splits on every kernel restart.
# The seed is the same value that is passed to the Cerebros search.
#
X, y = shuffle(X, y, random_state=8675309)

In [None]:
# Split used for the architecture search. shuffle=False keeps the ordering
# produced by the earlier shuffle: the leading 35% becomes the training set
# and the remaining 65% is held out for *testing*.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.65, shuffle=False
)

In [None]:
#
# Tensors for training data and labels, each wrapped in a one-element list
# (they are passed as `training_data` / `labels` to the Cerebros search below)
#
training_x   = [tf.constant(X_train)]
train_labels = [tf.constant(y_train)]
#
# Input and output shapes, again as one-element lists.
# () matches the scalar-string Input(shape=()) of the base model;
# 1 is presumably the width of the single sigmoid output - TODO confirm
#
INPUT_SHAPES  = [()]
OUTPUT_SHAPES = [1]

### A custom GPT2 encoder layer for text embedding

In [None]:
class GPT2Layer(tf.keras.layers.Layer):
    """Keras layer that turns raw strings into a mean-pooled GPT-2 embedding.

    The layer owns a ``GPT2Tokenizer``, a ``GPT2Preprocessor`` configured with
    ``sequence_length=max_seq_length`` and a ``GPT2Backbone``, all loaded from
    the ``"gpt2_base_en"`` preset. Every backbone layer is frozen except the
    second-to-last one.

    Args:
        max_seq_length: sequence length handed to the preprocessor.
        **kwargs: forwarded unchanged to ``tf.keras.layers.Layer``
            (e.g. ``name``, ``dtype``, ``trainable``).
    """

    def __init__(self, max_seq_length, **kwargs):
        super().__init__(**kwargs)
        #
        # Load the GPT2 tokenizer, preprocessor and model
        self.tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
        self.preprocessor = GPT2Preprocessor(self.tokenizer,
                                             sequence_length=max_seq_length)
        self.encoder   = GPT2Backbone.from_preset("gpt2_base_en")
        #
        # Freeze every backbone layer, then re-enable training for the
        # second-to-last one only
        for layer in self.encoder.layers:
            layer.trainable = False
        #
        self.encoder.layers[-2].trainable = True
        #
        # Kept so that get_config() can serialize it
        self.max_seq_length = max_seq_length

    def call(self, inputs):
        """Embed a batch of strings.

        Args:
            inputs: tensor of strings (the base model feeds a scalar-string
                input per sample).

        Returns:
            The backbone output averaged over axis 1.
        """
        # The inputs are wrapped in a one-element list before preprocessing.
        prep = self.preprocessor([inputs])
        embedding  = self.encoder(prep)
        # NOTE(review): this is a plain mean over axis 1 with no mask applied,
        # so any padded positions presumably contribute to the average too.
        avg_pool = tf.reduce_mean(embedding, axis=1)
        #
        return avg_pool

    def get_config(self):
        """Return the base layer config extended with ``max_seq_length``."""
        config = super().get_config()
        config.update({'max_seq_length': self.max_seq_length})
        #
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from a dict produced by ``get_config``.

        The whole config is forwarded, so the base-layer settings serialized
        by ``get_config`` (name, dtype, trainable) are restored as well,
        instead of only ``max_seq_length``.
        """
        return cls(**config)

In [None]:
# GPT2 configurables: sequence length handed to GPT2Layer's preprocessor
max_seq_length = 96

# Base model: one scalar string per sample in, GPT2Layer output out.
# Note that `gpt2_layer` holds the layer's output tensor, not the layer object.
input_layer = Input(shape=(), dtype=tf.string)
gpt2_layer = GPT2Layer(max_seq_length)(input_layer)
base_model = Model(inputs=input_layer, outputs=gpt2_layer)
base_model.summary()

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/vocab.json
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/merges.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/model.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 gpt2_layer (GPT2Layer)      (None, 768)               124439808 
                                                                 
Total params: 124439808 (474.70 MB)
Trainable params: 7087872 (27.04 MB)
Non-trainable params: 117351936 (447.66 MB)
_________________________________________________________________


### Cerebros search for the best model

In [None]:
#
# Cerebros configurables
# Every name in this cell is forwarded to SimpleCerebrosRandomSearch below.
# The bracketed ranges are the author's notes - presumably suggested bounds
# for tuning; verify against the Cerebros documentation.
#
activation = 'swish'
predecessor_level_connection_affinity_factor_first = 2.0
predecessor_level_connection_affinity_factor_main = 0.97
max_consecutive_lateral_connections = 5
p_lateral_connection = 0.97
num_lateral_connection_tries_per_unit = 2
learning_rate = 0.001
epochs = 6  # [1, 100]
batch_size = 32
maximum_levels = 5  # [3,7]
maximum_units_per_level = 7  # [2,10]
maximum_neurons_per_unit = 6  # [2,20]

In [None]:
#
# Logging
#
# TIME is the first 16 characters of the current New York timestamp's string
# form, with every 'T', ':' and '-' replaced by '_'. It prefixes the project
# name so that each run gets its own name.
TIME = (
    str(pendulum.now(tz='America/New_York'))[:16]
    .translate(str.maketrans('T:-', '___'))
)
PROJECT_NAME = TIME + '_cerebros_auto_ml_phishing_email_test'

In [None]:
# Passed to the Cerebros search and appended to the project name;
# irrelevant unless in distributed training
meta_trial_number = 42

In [None]:
# Configure the Cerebros random search. Candidate networks are built from
# DenseUnit blocks with `base_model` (the GPT-2 embedding model) supplied as
# base model, and are ranked by validation binary accuracy (higher is better).
cerebros_automl = SimpleCerebrosRandomSearch(
    unit_type=DenseUnit,
    # --- data ---
    input_shapes=INPUT_SHAPES,
    output_shapes=OUTPUT_SHAPES,
    training_data=training_x,
    labels=train_labels,
    validation_split=0.35,
    # --- ranking of the trials ---
    direction='maximize',
    metric_to_rank_by="val_binary_accuracy",
    # --- bounds of the architecture search space ---
    minimum_levels=2,
    maximum_levels=maximum_levels,
    minimum_units_per_level=1,
    maximum_units_per_level=maximum_units_per_level,
    minimum_neurons_per_unit=1,
    maximum_neurons_per_unit=maximum_neurons_per_unit,
    activation=activation,
    final_activation='sigmoid',
    # --- number of candidates tried ---
    number_of_architecture_moities_to_try=2,
    number_of_tries_per_architecture_moity=1,
    # --- connectivity settings ---
    # NOTE(review): maximum_skip_connection_depth (7) is larger than
    # maximum_levels (5) - confirm this is intended.
    minimum_skip_connection_depth=1,
    maximum_skip_connection_depth=7,
    predecessor_level_connection_affinity_factor_first=predecessor_level_connection_affinity_factor_first,
    predecessor_level_connection_affinity_factor_first_rounding_rule='ceil',
    predecessor_level_connection_affinity_factor_main=predecessor_level_connection_affinity_factor_main,
    predecessor_level_connection_affinity_factor_main_rounding_rule='ceil',
    predecessor_level_connection_affinity_factor_decay_main=zero_7_exp_decay,
    seed=8675309,
    max_consecutive_lateral_connections=max_consecutive_lateral_connections,
    gate_after_n_lateral_connections=3,
    gate_activation_function=simple_sigmoid,
    p_lateral_connection=p_lateral_connection,
    p_lateral_connection_decay=zero_95_exp_decay,
    num_lateral_connection_tries_per_unit=num_lateral_connection_tries_per_unit,
    # --- training of each candidate ---
    learning_rate=learning_rate,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(),
             tf.keras.metrics.Precision(),
             tf.keras.metrics.Recall()],
    epochs=epochs,
    # --- bookkeeping ---
    project_name=f"{PROJECT_NAME}_meta_{meta_trial_number}",
    model_graphs='model_graphs',
    batch_size=batch_size,
    meta_trial_number=meta_trial_number,
    # --- text front end: string inputs go through the GPT-2 base model ---
    base_models=[base_model],
    train_data_dtype=tf.string)

In [None]:
%%time
# Run the search. `result` is printed afterwards as the best accuracy achieved;
# the path of the best model is read later from cerebros_automl.best_model_path.
result = cerebros_automl.run_random_search()

SimpleCerebrosRandomSearch.input_shapes: [()]
nan
>nnf>ceil
k is: 0 value is: [{'1': <class 'cerebros.units.units.InputUnit'>}]
0
k is: 1 value is: [{'6': <class 'cerebros.units.units.DenseUnit'>}, {'5': <class 'cerebros.units.units.DenseUnit'>}, {'5': <class 'cerebros.units.units.DenseUnit'>}, {'6': <class 'cerebros.units.units.DenseUnit'>}, {'3': <class 'cerebros.units.units.DenseUnit'>}, {'6': <class 'cerebros.units.units.DenseUnit'>}]
1
Trying to create level 1
We think level 1's predecessors are: [0]
k is: 2 value is: [{'1': <class 'cerebros.units.units.FinalDenseUnit'>}]
2
Trying to create Final level 2
Trying to create level 2
We think level final level 2's predecessors are: [0, 1]
levels:
[0, 1, 2]
{'0': 'InputUnitModule'}
InputLevel.input_shapes [()]
{'6': <class 'cerebros.units.units.DenseUnit'>}
{'5': <class 'cerebros.units.units.DenseUnit'>}
{'5': <class 'cerebros.units.units.DenseUnit'>}
{'6': <class 'cerebros.units.units.DenseUnit'>}
{'3': <class 'cerebros.units.units.Den

                    x, approximate=True
                ),

  fn_config = serialization_lib.serialize_keras_object(activation)


returning trial 0 oracles
       loss  binary_accuracy  precision    recall  val_loss  \
0  0.225377         0.914582   0.882835  0.902936  0.179565   
1  0.134204         0.951864   0.935753  0.942481  0.353785   
2  0.100342         0.963662   0.957704  0.949670  0.131919   
3  0.071753         0.972629   0.963582  0.967046  0.098000   
4  0.065943         0.974988   0.969934  0.966447  0.233555   
5  0.061122         0.976168   0.967780  0.971839  0.104695   

   val_binary_accuracy  val_precision  val_recall  trial_number  \
0             0.951380       0.926518    0.953947             0   
1             0.865966       0.748768    1.000000             0   
2             0.951818       0.953620    0.924342             0   
3             0.962330       0.946004    0.960526             0   
4             0.926413       0.954768    0.856360             0   
5             0.956636       0.979929    0.910088             0   

   subtrial_number                                         mod

                    x, approximate=True
                ),

  fn_config = serialization_lib.serialize_keras_object(activation)


returning trial 1 oracles
       loss  binary_accuracy  precision    recall  val_loss  \
0  0.084823         0.963809   0.961796  0.946145  0.113410   
1  0.067244         0.973101   0.965290  0.966447  0.102755   
2  0.050570         0.982539   0.976119  0.979629  0.105537   
3  0.044113         0.981595   0.976062  0.977232  0.266534   
4  0.049816         0.983483   0.977884  0.980228  0.110673   
5  0.037963         0.983719   0.980192  0.978430  0.185889   

   val_binary_accuracy  val_precision  val_recall  trial_number  \
0             0.961454       0.936441    0.969298             1   
1             0.959702       0.940860    0.959430             1   
2             0.963644       0.936776    0.974781             1   
3             0.939115       0.871278    0.994518             1   
4             0.954884       0.977568    0.907895             1   
5             0.937801       0.986111    0.856360             1   

   subtrial_number                                         mod

In [None]:
# Report the value returned by the search, followed by the metric's name
print('Best accuracy achieved is {}'.format(result))
print('binary accuracy')

Best accuracy achieved is 0.9636443257331848
binary accuracy


### Testing the best model found

In [None]:
#
# Load the best model (taking into account that it has a custom layer).
# custom_objects maps the serialized class name to the class itself; Keras
# then rebuilds the layer through GPT2Layer.from_config. Building an instance
# here (GPT2Layer(max_seq_length)) would load a second GPT-2 backbone that is
# thrown away right after the lookup.
#
best_model_found = tf.keras.models.load_model(
    cerebros_automl.best_model_path,
    custom_objects={'GPT2Layer': GPT2Layer},
)


                    x, approximate=True
                ),

  fn_config = serialization_lib.serialize_keras_object(activation)


In [None]:
# Evaluate on the 65% held out by the first split. The returned list is the
# loss followed by the compiled metrics - presumably binary accuracy,
# precision and recall, in the order they were passed to the search.
best_model_found.evaluate(X_test, y_test)



[0.20856627821922302,
 0.9346982836723328,
 0.9902936816215515,
 0.8410484194755554]

### Training the best model on a larger dataset, and testing again

In [None]:
# Second split, now that the best model has been found: the proportions are
# reversed, so the leading 65% of the data is used for training and the
# trailing 35% is held out for testing (shuffle=False keeps the ordering).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, shuffle=False
)

In [None]:
# Re-compile the best model for fine-tuning, with a lower learning rate than
# the one used during the search.
optimizer = Adam(learning_rate=0.0005)
# The labels are 0/1 and the search was configured with a sigmoid final
# activation, so use binary cross-entropy - the same loss as in the search.
# CategoricalHinge is designed for one-hot multi-class targets; with a single
# output it does not match this setup and makes the reported loss
# incomparable with the values from the search.
loss = tf.keras.losses.BinaryCrossentropy()
metrics = [tf.keras.metrics.BinaryAccuracy(),
           tf.keras.metrics.Precision(),
           tf.keras.metrics.Recall()]
best_model_found.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
# Fine-tune on the larger training split for 3 epochs; validation_split=0.35
# sets aside 35% of the training data for validation.
best_model_found.fit(X_train, y_train, validation_split=0.35, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7cae322cba60>

In [None]:
# Evaluate the fine-tuned model on the 35% held out by the second split.
# Returns the loss followed by the metrics passed to compile():
# binary accuracy, precision, recall.
best_model_found.evaluate(X_test, y_test)



[0.6463907361030579,
 0.9658080339431763,
 0.9631262421607971,
 0.9483031034469604]