In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/naserabdullahalam/phishing-email-dataset?dataset_version_number=1...


100%|██████████| 77.1M/77.1M [00:04<00:00, 17.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/naserabdullahalam/phishing-email-dataset/versions/1


In [2]:
import os

# list all files and directories in given path
all_files = os.listdir(path)

# filter for CSV files
csv_files = [f for f in all_files if f.endswith('.csv')]

print("CSV files in the dataset directory:")
for csv_file in csv_files:
    print(csv_file)


CSV files in the dataset directory:
Enron.csv
Nigerian_Fraud.csv
Nazario.csv
Ling.csv
phishing_email.csv
SpamAssasin.csv
CEAS_08.csv


In [3]:
import pandas as pd
import os

# List to store individual dataframes
dfs = []

# Iterate through the list of CSV files
for file_name in csv_files:
    file_path = os.path.join(path, file_name)
    try:
        # Read the CSV file, handling potential encoding or parsing errors
        df = pd.read_csv(file_path, on_bad_lines='skip', encoding_errors='replace')

        # Check if the required columns exist
        if 'body' in df.columns and 'label' in df.columns:
            # Extract only the needed columns
            subset = df[['body', 'label']]
            dfs.append(subset)
            print(f"Processed {file_name}: {len(subset)} rows added.")
        else:
            print(f"Skipping {file_name}: Missing 'body' or 'label' columns.")

    except Exception as e:
        print(f"Error reading {file_name}: {e}")

# combine all dataframes
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)

    # save dataset
    output_file = 'combined_email_dataset.csv'
    combined_df.to_csv(output_file, index=False)

    print(f"\nSuccessfully saved combined dataset to '{output_file}'")
    print(f"Total rows: {len(combined_df)}")
else:
    print("No data found to combine.")

Processed Enron.csv: 29767 rows added.
Processed Nigerian_Fraud.csv: 3332 rows added.
Processed Nazario.csv: 1565 rows added.
Processed Ling.csv: 2859 rows added.
Skipping phishing_email.csv: Missing 'body' or 'label' columns.
Processed SpamAssasin.csv: 5809 rows added.
Processed CEAS_08.csv: 39154 rows added.

Successfully saved combined dataset to 'combined_email_dataset.csv'
Total rows: 82486


In [4]:
df = pd.read_csv('combined_email_dataset.csv')
df.head()

Unnamed: 0,body,label
0,( see attached file : hplno 525 . xls )\r\n- h...,0
1,- - - - - - - - - - - - - - - - - - - - - - fo...,0
2,"estimated actuals\r\nmarch 30 , 2001\r\nno flo...",0
3,( see attached file : hplno 530 . xls )\r\n- h...,0
4,( see attached file : hplno 601 . xls )\r\n- h...,0


In [5]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

# preparing data
print(f'Total samples before cleaning: {len(df)}')
df = df.dropna(subset=['body', 'label']).copy()

# Ensure labels are integers (0 or 1)
df['label'] = pd.to_numeric(df['label'], errors='coerce').astype(int)

# Extract sentences and labels as lists
sentences = df.body.values
labels = df.label.values

print(f'Total samples after cleaning: {len(df)}')

Total samples before cleaning: 82486
Total samples after cleaning: 82485


In [6]:
# loading tokenizer and checking max length of tokens from sample
# print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

lengths = [len(tokenizer.tokenize(t)) for t in df.body[:1000]]
print(max(lengths))  # shows that there are email bodies with more tokens than BERT's max of 512


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

3830


In [8]:
# tokenization

input_ids = []
attention_masks = []

for sent in sentences:
  encoded_dict = tokenizer.encode_plus(
      str(sent),
      add_special_tokens=True,
      max_length=128, # will not be able to use the entire email body
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True
  )

  # adding encoded sentence and attention mask to their respective lists
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])


# convert to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)


In [9]:
# training split and dataloaders

dataset = TensorDataset(input_ids, attention_masks, labels)

# 80-10-10 train val split
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

generator = torch.Generator().manual_seed(42)  # to keep seed consistent
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size], generator=generator)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} test samples'.format(test_size))

batch_size = 32

# create dataloaders for both sets
# training samples in random order for variety in training
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset), # batches taken sequentially
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size
        )

print('data setup done')

65,988 training samples
8,248 validation samples
8,249 test samples
data setup done


In [10]:
import time
import datetime

# defining helper functions
def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
  '''
  take time in seconds and return a string hh:mm:ss
  '''

  # first round to nearest second
  elapsed_rounded = int(round((elapsed)))

  return str(datetime.timedelta(seconds=elapsed_rounded))

In [11]:
from transformers import BertForSequenceClassification, BertConfig
import torch
from transformers import get_linear_schedule_with_warmup

# defining model and optimizer

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# run on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.cuda()

optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import random
import time

# training
epochs = 3

total_steps = len(train_dataloader) * epochs  # [number of batches] x [number of epochs]

# learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps
                                            )

# training loop

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

total_t0 = time.time()

for epoch in range(epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    t0 = time.time()  # measure duration of epoch
    train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # update progress every 40 batches
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader
        # `batch` contains three pytorch tensors:
        #   [0]: input_ids
        #   [1]: attention_masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # clear previously calculated gradients before backward pass
        model.zero_grad()

        # forward pass (evaluate the model on this training batch)
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        # accumulate training loss over all batches to
        # calculate average loss at the end
        train_loss += loss.item()

        # perform backward pass to calculate gradients
        loss.backward()

        # clip the norm of the gradients to 1.0.
        # to prevent "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters and take a step using computed gradient
        optimizer.step()

        # update learning rate
        scheduler.step()

    # calculate average loss over all batches
    avg_train_loss = train_loss / len(train_dataloader)

    # measure how long epoch took
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))


    # VALIDATION

    # after each training epoch, measure performance on
    # validation set

    print("")
    print("Running Validation...")

    t0 = time.time()

    # put model in evaluation mode
    model.eval()

    # tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training)
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        # accumulate validation loss
        total_eval_loss += loss.item()

        # move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences and
        # accumulate over all batches
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # report final accuracy for validation run
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # calculate avg loss over all batches
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # duration of validation
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # stats from epoch
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of  2,063.    Elapsed: 0:00:12.
  Batch    80  of  2,063.    Elapsed: 0:00:24.
  Batch   120  of  2,063.    Elapsed: 0:00:35.
  Batch   160  of  2,063.    Elapsed: 0:00:47.
  Batch   200  of  2,063.    Elapsed: 0:00:59.
  Batch   240  of  2,063.    Elapsed: 0:01:10.
  Batch   280  of  2,063.    Elapsed: 0:01:22.
  Batch   320  of  2,063.    Elapsed: 0:01:34.
  Batch   360  of  2,063.    Elapsed: 0:01:46.
  Batch   400  of  2,063.    Elapsed: 0:01:59.
  Batch   440  of  2,063.    Elapsed: 0:02:11.
  Batch   480  of  2,063.    Elapsed: 0:02:23.
  Batch   520  of  2,063.    Elapsed: 0:02:35.
  Batch   560  of  2,063.    Elapsed: 0:02:47.
  Batch   600  of  2,063.    Elapsed: 0:02:59.
  Batch   640  of  2,063.    Elapsed: 0:03:11.
  Batch   680  of  2,063.    Elapsed: 0:03:23.
  Batch   720  of  2,063.    Elapsed: 0:03:35.
  Batch   760  of  2,063.    Elapsed: 0:03:47.
  Batch   800  of  2,063.    Elapsed: 0:03:59.
  Batch   840  of  2,063.    Elapsed: 0:04:12.


In [13]:
# TEST SET EVALUATION

print("Running Evaluation on Test Set...")

t0 = time.time()
model.eval() # put model in evaluation mode

# tracking variables
total_test_accuracy = 0
total_test_loss = 0
predictions , true_labels = [], []

# predict
for batch in test_dataloader:

    # add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # no gradients
    with torch.no_grad():
        # forward pass, calculate logit predictions
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

    logits = result.logits
    loss = result.loss
    total_test_loss += loss.item()

    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # store predictions and true labels
    total_test_accuracy += flat_accuracy(logits, label_ids)
    predictions.extend(np.argmax(logits, axis=1).flatten())
    true_labels.extend(label_ids.flatten())

# final accuracy for test run
avg_test_accuracy = total_test_accuracy / len(test_dataloader)
print("  Test Accuracy: {0:.2f}".format(avg_test_accuracy))
print("  Test Loss: {0:.2f}".format(total_test_loss / len(test_dataloader)))
print("  Evaluation took: {:}".format(format_time(time.time() - t0)))

Running Evaluation on Test Set...
  Test Accuracy: 0.99
  Test Loss: 0.04
  Evaluation took: 0:00:26


In [14]:
from sklearn.metrics import classification_report, accuracy_score

# helper fxn to gather all predictions
def get_all_predictions(model, dataloader):
    model.eval() # model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_preds = []
    all_labels = []

    print("Running inference to calculate metrics...")

    for step, batch in enumerate(dataloader):
        # move batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # get model outputs (logits)
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           return_dict=True)

        logits = result.logits

        # logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # logits to class predictions (0 or 1)
        batch_preds = np.argmax(logits, axis=1)

        # add to lists
        all_preds.extend(batch_preds)
        all_labels.extend(label_ids)

    return all_labels, all_preds


true_labels, predicted_labels = get_all_predictions(model, test_dataloader)

# Print the Standard Metrics Report
print("\n" + "="*30)
print("FINAL EVALUATION REPORT")
print("="*30)

# Calculate simple accuracy
acc = accuracy_score(true_labels, predicted_labels)
print(f"Overall Accuracy: {acc*100:.2f}%")

# Calculate Precision, Recall, and F1-Score
# target_names assumes 0 is Legitimate and 1 is Phishing/Spam
print("\nDetailed Metrics:")
print(classification_report(true_labels, predicted_labels, target_names=['Legitimate', 'Spam']))

Running inference to calculate metrics...

FINAL EVALUATION REPORT
Overall Accuracy: 99.28%

Detailed Metrics:
              precision    recall  f1-score   support

  Legitimate       0.99      0.99      0.99      3947
        Spam       0.99      0.99      0.99      4302

    accuracy                           0.99      8249
   macro avg       0.99      0.99      0.99      8249
weighted avg       0.99      0.99      0.99      8249

