In [1]:
# ----- Parameters -----

# Colab
IS_COLAB = False

# Mail type
NORMAL = "normal"
PHISHING = "phish"
EMAIL_TYPES = (NORMAL, PHISHING)

# Dataset Size
MIN_TOKEN_FOR_SENTENCES = 10
MAX_TOKEN_FOR_SENTENCES = 512 # BERT limit
MAX_SENTENCES_FOR_TYPE = 69000 # Phish dataset limit

# Training
USE_VAL = False
SIZE_VAL = 0.2
FORCE_TRAIN = False
EPOCHS = 4 # BERT paper
BATCH_SIZE = 8 # Colab Pro Limit
LEARNING_RATE = 5e-5 # BERT paper

In [2]:
!pip install transformers > /dev/null
!pip install prettytable > /dev/null

In [3]:
# ----- Modules -----

from sklearn.model_selection import train_test_split
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import prettytable
import random
import shutil
import tensorflow as tf
import transformers

if IS_COLAB:
  from google.colab import drive


2023-09-24 11:36:51.750467: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-24 11:36:51.827278: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-24 11:36:52.303389: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-24 11:36:52.306222: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ----- Colab GPU/TPU -----

if 'COLAB_TPU_ADDR' in os.environ:
  print(f"Found TPU at: {os.environ['COLAB_TPU_ADDR']}")
elif tf.test.gpu_device_name() == '/device:GPU:0':
  print(f"Found GPU at: {tf.test.gpu_device_name()}")
else:
  if IS_COLAB and FORCE_TRAIN:
    raise Exception("Missing GPU/TPU")
  else:
    print("Only CPU")

Only CPU


In [5]:
# ----- Load Dataset -----

if IS_COLAB:
  drive.mount("/content/gdrive", force_remount=True)

  for file_name in ["email_dataset.csv", "normal_fine_tuned_bert.zip", "phish_fine_tuned_bert.zip", "neopir.json"]:
    if os.path.exists(f"{file_name}"):
      print(f"{file_name} already loaded")
    else:
      if os.path.exists(f"gdrive/MyDrive/DataScience/{file_name}"):
        # Copy from GDrive to Colab VM
        shutil.copyfile(f"gdrive/MyDrive/DataScience/{file_name}", f'{file_name}')
        if ".zip" in file_name:
             shutil.unpack_archive(file_name, f"{file_name.split('.')[0]}/")
        print(f"{file_name} loaded")
      else:
        print(f"[!] {file_name} not present in Gdrive")

dataset = pd.read_csv("email_dataset.csv").set_index("id")

In [6]:
# ----- Load PreTrained BERT Model -----

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = {}
for email_type in EMAIL_TYPES:
  model[email_type] = transformers.TFBertForMaskedLM.from_pretrained('bert-base-uncased')

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [7]:
# ----- Tokenizing the data -----

inputs = {}

for email_type in EMAIL_TYPES:

    # Create a list of sentence
    raw_inputs = []
    for entry in dataset[dataset["type"] == email_type]["message"].to_list():
        try:
            entry_sentences = [x for x in entry.split(". ") if x!="" and len(x.split(" ")) >= MIN_TOKEN_FOR_SENTENCES]
            raw_inputs += entry_sentences
        except:
            pass#print(entry)

    inputs[email_type] = random.sample(raw_inputs, MAX_SENTENCES_FOR_TYPE)
    print(f"{email_type} dataset formed by {len(inputs[email_type])} sentences for {sum([len(x.split(' ')) for x in inputs[email_type]])} tokens (average sentence of {round(sum([len(x.split(' ')) for x in inputs[email_type]])/len([len(x.split(' ')) for x in inputs[email_type]]))} tokens)")

    if USE_VAL:

      # Split Dataset in Training and  Validation
      train_set, val_set = train_test_split(inputs[email_type], test_size=SIZE_VAL)
      inputs[email_type] = {}
      inputs[email_type]["train"] = train_set
      inputs[email_type]["val"] = val_set

      for dataset_type in ["train", "val"]:

        # Tokenizer
        inputs[email_type][dataset_type] = tokenizer(inputs[email_type][dataset_type],max_length=MAX_TOKEN_FOR_SENTENCES,truncation=True,padding='max_length',return_tensors='tf')

        # Masking input tokens
        inp_ids = []
        lbs = []
        idx = 0
        inputs[email_type][dataset_type]["labels"] = inputs[email_type][dataset_type]["input_ids"]
        for inp in inputs[email_type][dataset_type].input_ids.numpy():
            actual_tokens = list(set(range(MAX_TOKEN_FOR_SENTENCES)) -
                                set(np.where((inp == 101) | (inp == 102)
                                    | (inp == 0))[0].tolist()))
            #We need to select 15% random tokens from the given list
            num_of_token_to_mask = int(len(actual_tokens)*0.15)
            token_to_mask = np.random.choice(np.array(actual_tokens),
                                            size=num_of_token_to_mask,
                                            replace=False).tolist()
            #Now we have the indices where we need to mask the tokens
            inp[token_to_mask] = 103
            inp_ids.append(inp)
            idx += 1
        inp_ids = tf.convert_to_tensor(inp_ids)
        inputs[email_type][dataset_type]["input_ids"] = inp_ids

    else:

      # Tokenizer
      inputs[email_type] = tokenizer(inputs[email_type],max_length=MAX_TOKEN_FOR_SENTENCES,truncation=True,padding='max_length',return_tensors='tf')

      # Masking input tokens
      inp_ids = []
      lbs = []
      idx = 0
      inputs[email_type]["labels"] = inputs[email_type]["input_ids"]
      for inp in inputs[email_type].input_ids.numpy():
          actual_tokens = list(set(range(MAX_TOKEN_FOR_SENTENCES)) -
                              set(np.where((inp == 101) | (inp == 102)
                                  | (inp == 0))[0].tolist()))
          #We need to select 15% random tokens from the given list
          num_of_token_to_mask = int(len(actual_tokens)*0.15)
          token_to_mask = np.random.choice(np.array(actual_tokens),
                                          size=num_of_token_to_mask,
                                          replace=False).tolist()
          #Now we have the indices where we need to mask the tokens
          inp[token_to_mask] = 103
          inp_ids.append(inp)
          idx += 1
      inp_ids = tf.convert_to_tensor(inp_ids)
      inputs[email_type]["input_ids"] = inp_ids

normal dataset formed by 69000 sentences for 1177421 tokens (average sentence of 17 tokens)
phish dataset formed by 69000 sentences for 1220863 tokens (average sentence of 18 tokens)


In [8]:
# ----- Print random token -----

for email_type in EMAIL_TYPES:
    if USE_VAL:
      i = random.randint(0, len(inputs[email_type]["train"]["input_ids"]))
      print(email_type,":")
      print(' '.join([x for x in tokenizer.convert_ids_to_tokens(inputs[email_type]["train"]["input_ids"][i]) if x != "[PAD]"]))
      print(' '.join([x for x in tokenizer.convert_ids_to_tokens(inputs[email_type]["train"]["labels"][i]) if x != "[PAD]"]))
    else:
      i = random.randint(0, len(inputs[email_type]["input_ids"]))
      print(email_type,":")
      print(' '.join([x for x in tokenizer.convert_ids_to_tokens(inputs[email_type]["input_ids"][i]) if x != "[PAD]"]))
      print(' '.join([x for x in tokenizer.convert_ids_to_tokens(inputs[email_type]["labels"][i]) if x != "[PAD]"]))
    print()

normal :
[CLS] to access it enter this in your browser address [MASK] [SEP]
[CLS] to access it enter this in your browser address line [SEP]

phish :
[CLS] now that god has called me , i have willed [MASK] given most [SEP]
[CLS] now that god has called me , i have willed and given most [SEP]



In [9]:
# ----- Training the model -----

for email_type in EMAIL_TYPES:

  # Already trained
  if not FORCE_TRAIN and os.path.exists(f"{email_type}_fine_tuned_bert"):

    # Load fine tuned model
    print(f"{email_type} already trained: load fine tuned model")
    model[email_type] = transformers.TFBertForMaskedLM.from_pretrained(f"{email_type}_fine_tuned_bert")

  else:

    # Model Compiling
    model[email_type].compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

    # Model Training
    print(f"Training of {email_type} model...")
    history = None
    if USE_VAL:
      history = model[email_type].fit(
          [inputs[email_type]["train"].input_ids,inputs[email_type]["train"].attention_mask],
          inputs[email_type]["train"].labels,
          validation_data = (
              [inputs[email_type]["train"].input_ids,inputs[email_type]["train"].attention_mask],
              inputs[email_type]["train"].labels
          ),
          verbose=1,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS)
    else:
      history = model[email_type].fit(
        [inputs[email_type].input_ids,inputs[email_type].attention_mask],
        inputs[email_type].labels,
        verbose=1,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS)

    # Plot Train/Val Loss
    if USE_VAL:
      loss = [x for x in history.history['loss']]
      val_loss = [x for x in history.history['val_loss']]

      plt.subplot(2, 1, 2)
      plt.plot(loss, label='Training Loss')
      plt.plot(val_loss, label='Validation Loss')
      plt.legend(loc='upper right')
      plt.ylabel('Loss')
      #plt.ylim([0,1.0])
      plt.title('Training and Validation Loss')
      plt.xlabel('epoch')
      plt.show()

    # Save model
    model[email_type].save_pretrained(f"{email_type}_fine_tuned_bert")
    if IS_COLAB:
      shutil.make_archive(f"{email_type}_fine_tuned_bert", "zip", f"{email_type}_fine_tuned_bert")
      shutil.copyfile(f"{email_type}_fine_tuned_bert.zip",f"gdrive/MyDrive/DataScience/{email_type}_fine_tuned_bert.zip")

normal already trained: load fine tuned model


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at normal_fine_tuned_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


phish already trained: load fine tuned model


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at phish_fine_tuned_bert.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [14]:
# ----- Make Predictions -----

query = "Click on links provided in order to [MASK] your [MASK]";
inp = tokenizer(query,return_tensors='tf')

print(f"{query}:")
for email_type in EMAIL_TYPES:
  out = np.argmax(model[email_type](inp).logits[0].numpy(), axis=1).tolist()
  print(f"{email_type} => {tokenizer.decode(out)}")

Click on links provided in order to [MASK] your [MASK]:
normal => [CLS] click on links provided in order to view your feedback [SEP]
phish => [CLS] click on links provided in order to update your account [SEP]


In [11]:
# ----- Five Factor Model -----

# Utility
RESPONSE_TOKENS = tokenizer.encode("disagree agree")[1:-1]
QUESTIONNAIRE = json.load(open('neopir.json'))["questions"]
TEMPLATE = "I am [MASK] that [QUESTION]";
MAP = [1,5]

# Result Variable
result = {}
for email_type in EMAIL_TYPES:
  result[email_type] =  {"Openness": [], "Conscientiousness": [], "Extraversion": [], "Agreeableness": [], "Neuroticism": []}

# Questionnaire
for question in QUESTIONNAIRE:
  inp = tokenizer(TEMPLATE.replace("[QUESTION]", question[1]), return_tensors='tf')
  for email_type in EMAIL_TYPES:
    token_prob = [float(model[email_type](inp).logits[0][tokenizer.encode(query).index(103)][RESPONSE_TOKENS[x]]) for x in [0,1]]
    token_prob = np.exp(token_prob) / np.sum(np.exp(token_prob), axis=0)
    output = result[email_type][question[0]].append(sum(MAP * token_prob * question[2]))

# Print table
table = prettytable.PrettyTable(["trait", "phish", "normal"])
table.title = "Five Factor Model"
for trait in result[email_type].keys():
    tmp = [trait]
    for email_type in EMAIL_TYPES:
        # Range for each sections is (-20, 20)
        tmp.append(round(5*sum(result[email_type][trait])))
    table.add_row(tmp)
print(table)

+------------------------------------+
|         Five Factor Model          |
+-------------------+-------+--------+
|       trait       | phish | normal |
+-------------------+-------+--------+
|      Openness     |   44  |   7    |
| Conscientiousness |   -6  |   -6   |
|    Extraversion   |   13  |   41   |
|   Agreeableness   |   14  |   32   |
|    Neuroticism    |   -1  |  -23   |
+-------------------+-------+--------+


In [74]:
# ----- Five Factor Model Random Question-----

# Utility
RESPONSE_TOKENS = tokenizer.encode("disagree agree")[1:-1]
QUESTIONNAIRE = json.load(open('neopir.json'))["questions"]
TEMPLATE = "I am [MASK] that [QUESTION]";
MAP = [1,5]

# Test
question = random.choice(QUESTIONNAIRE) 
input_question = TEMPLATE.replace("[QUESTION]", question[1])
inp = tokenizer(input_question, return_tensors='tf')
print(input_question)
for email_type in EMAIL_TYPES:
  token_prob = [float(model[email_type](inp).logits[0][tokenizer.encode(query).index(103)][RESPONSE_TOKENS[x]]) for x in [0,1]]
  token_prob = np.exp(token_prob) / np.sum(np.exp(token_prob), axis=0)
  print(f"{email_type} => disagree: {round(token_prob[0],2)}, agree: {round(token_prob[1],2)}")
  output = result[email_type][question[0]].append(sum(MAP * token_prob * question[2]))


I am [MASK] that I have frequent mood swings.
normal => disagree: 0.63, agree: 0.37
phish => disagree: 0.21, agree: 0.79
