In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import json
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from transformers import get_scheduler
from torch.utils.data import DataLoader, TensorDataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from transformers import BertTokenizer, BertForSequenceClassification


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/phishingemails/Phishing_Email.csv
/kaggle/input/phishing-email-dataset/SpamAssasin.csv
/kaggle/input/phishing-email-dataset/Nazario.csv
/kaggle/input/phishing-email-dataset/Nigerian_Fraud.csv
/kaggle/input/phishing-email-dataset/CEAS_08.csv
/kaggle/input/phishing-email-dataset/Enron.csv
/kaggle/input/phishing-email-dataset/Ling.csv
/kaggle/input/phishing-email-dataset/phishing_email.csv


In [2]:
# Process and modify input datasets so that all have the same column names
# ("Email" and "Label"), and also create a joint dataset of all the data combined.
input_files = ["/kaggle/input/phishingemails/Phishing_Email.csv", "/kaggle/input/phishing-email-dataset/CEAS_08.csv", 
               "/kaggle/input/phishing-email-dataset/Enron.csv", "/kaggle/input/phishing-email-dataset/Ling.csv",
               "/kaggle/input/phishing-email-dataset/Nazario.csv","/kaggle/input/phishing-email-dataset/Nigerian_Fraud.csv", 
               "/kaggle/input/phishing-email-dataset/SpamAssasin.csv"]
new_files = ["/kaggle/working/Phishing_Email.csv", "/kaggle/working/CEAS_08.csv", "/kaggle/working/Enron.csv", "/kaggle/working/Ling.csv", 
             "/kaggle/working/Nazario.csv", "/kaggle/working/Nigerian_Fraud.csv", "/kaggle/working/SpamAssassin.csv"]

# The one input file whose columns are "Email Text" / "Email Type" rather than "body" / "label"
PHISHING_EMAIL_FILE = "/kaggle/input/phishingemails/Phishing_Email.csv"

# Emails with more whitespace-separated words than this are dropped.
# NOTE: this counts words, not BERT word-piece tokens, so kept emails may still
# be truncated by the tokenizer later on.
MAX_WORDS = 512


def _filter_emails(emails, labels, max_words=MAX_WORDS):
    """Drop unusable (email, label) pairs in a single pass.

    A pair is dropped, checking in this order, when:
      * the email is not a string or is the empty string  -> counted as "empty"
      * the label is neither 0 nor 1                      -> counted as "invalid"
      * the email has more than `max_words` words         -> counted as "long"

    Args:
        emails: sequence of raw email bodies (may contain non-string values).
        labels: sequence of labels, parallel to `emails`.
        max_words: maximum number of whitespace-separated words allowed.

    Returns:
        (kept_emails, kept_labels, dropped) where `dropped` is a dict with the
        keys "empty", "invalid" and "long" holding the number of pairs removed
        for each reason.
    """
    kept_emails, kept_labels = [], []
    dropped = {"empty": 0, "invalid": 0, "long": 0}
    for email, label in zip(emails, labels):
        if not isinstance(email, str) or len(email) == 0:
            dropped["empty"] += 1
        elif label not in (0, 1):
            dropped["invalid"] += 1
        elif len(email.split()) > max_words:
            dropped["long"] += 1
        else:
            kept_emails.append(email)
            kept_labels.append(label)
    return kept_emails, kept_labels, dropped


# Combined data to be used for training, evaluation, etc.
combined_data = {"Email": [], "Label": []}

# Get data for each individual dataset
# Also combine the data into full dataset
for input_file, file in zip(input_files, new_files):
    # Load the dataset
    data = pd.read_csv(input_file)

    # Load the emails and the labels
    if input_file == PHISHING_EMAIL_FILE:
        raw_emails = data["Email Text"].tolist()
        # Anything that is not exactly "Safe Email" is treated as phishing (1)
        raw_labels = [0 if label == "Safe Email" else 1 for label in data["Email Type"].tolist()]
    else:
        raw_emails = data["body"].tolist()
        raw_labels = data["label"].tolist()

    # Get rid of empty strings (bad data) and strings that are too long (unusable data).
    # Building new lists keeps this linear; popping from the middle of a list
    # for every rejected row is quadratic on the larger datasets.
    emails, labels, dropped = _filter_emails(raw_emails, raw_labels)

    # Modify dataset as described at the top
    df = pd.DataFrame({"Email": emails, "Label": labels})
    df.to_csv(file, index=False)

    # Get stats for this individual dataset
    print(input_file + " data:")
    total = len(labels)
    phishes = sum(labels)
    print(f"Phishing Emails: {phishes}")
    print(f"Valid Emails: {total - phishes}")
    print(f"Skipped over: {sum(dropped.values())}")
    print(f"Empty Emails: {dropped['empty']}")
    print(f"Invalid Emails: {dropped['invalid']}")
    print(f"Long Emails: {dropped['long']}\n")

    # Add to the combined dataset
    combined_data["Email"] += emails
    combined_data["Label"] += labels

# Write the combined dataset to a csv
df = pd.DataFrame(combined_data)
df.to_csv("/kaggle/working/full_dataset.csv", index=False)
new_files.append("/kaggle/working/full_dataset.csv")

# Get combined data stats
print("Combined data:")
total = len(combined_data["Label"])
phishes = sum(combined_data["Label"])
print(f"Phishing Emails: {phishes}")
print(f"Valid Emails: {total - phishes}")


/kaggle/input/phishingemails/Phishing_Email.csv data:
Phishing Emails: 6279
Valid Emails: 9284
Skipped over: 3087
Empty Emails: 16
Invalid Emails: 0
Long Emails: 3071

/kaggle/input/phishing-email-dataset/CEAS_08.csv data:
Phishing Emails: 21731
Valid Emails: 14566
Skipped over: 2857
Empty Emails: 0
Invalid Emails: 0
Long Emails: 2857

/kaggle/input/phishing-email-dataset/Enron.csv data:
Phishing Emails: 12054
Valid Emails: 13348
Skipped over: 4365
Empty Emails: 0
Invalid Emails: 0
Long Emails: 4365

/kaggle/input/phishing-email-dataset/Ling.csv data:
Phishing Emails: 253
Valid Emails: 1441
Skipped over: 1165
Empty Emails: 0
Invalid Emails: 0
Long Emails: 1165

/kaggle/input/phishing-email-dataset/Nazario.csv data:
Phishing Emails: 1502
Valid Emails: 0
Skipped over: 63
Empty Emails: 0
Invalid Emails: 0
Long Emails: 63

/kaggle/input/phishing-email-dataset/Nigerian_Fraud.csv data:
Phishing Emails: 2272
Valid Emails: 0
Skipped over: 1060
Empty Emails: 0
Invalid Emails: 0
Long Emails: 106

In [3]:
# Specific Dataset format for use with bert
class PhishingDataset(Dataset):
    """Map-style dataset that tokenizes one email per lookup.

    Args:
        emails: indexable collection of email texts.
        labels: indexable collection of labels, parallel to `emails`.
        tokenizer: callable invoked with the email text plus the keyword
            arguments `padding`, `truncation`, `max_length` and
            `return_tensors`; its result must expose "input_ids" and
            "attention_mask" tensors that carry a leading batch dimension.
        max_length: length every email is padded / truncated to.
    """

    def __init__(self, emails, labels, tokenizer, max_length):
        self.emails, self.labels = emails, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # One sample per email
        return len(self.emails)

    def __getitem__(self, idx):
        """Return the tokenized email at `idx` together with its label tensor."""
        text, target = self.emails[idx], self.labels[idx]

        # Pad / truncate to a fixed length so samples can be stacked into batches
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer output has a batch dimension of size 1; drop it
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(target, dtype=torch.long)
        return sample

In [4]:
# Making sure we're using GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Seed PyTorch before anything random happens, so that the newly initialised
# classification head and the shuffled training batch order are repeatable
# from run to run. The same seed is reused for the train/validation split.
SEED = 42
torch.manual_seed(SEED)

# Shared settings for the datasets / loaders built below
MAX_LENGTH = 512   # sequence length passed to the tokenizer
BATCH_SIZE = 32

# Initialize tokenizer and bert model
print("Initializing tokenizer")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print("Initializing model")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)  # Binary classification
print("Initializing optimizer")
optimizer = AdamW(model.parameters(), lr=5e-5)

# Read in CEAS_08 dataset for training
# keep_default_na=False: the cleaned file contains only non-empty strings, so an
# email whose whole body is e.g. "NA" or "null" must stay a string instead of
# being parsed as NaN (a float the tokenizer cannot handle).
print("Reading in dataset")
data = pd.read_csv("/kaggle/working/CEAS_08.csv", keep_default_na=False)
emails = data["Email"].tolist()
labels = data["Label"].tolist()

# Split up into training and validation data
print("Splitting dataset into training and validation data")
X_train, X_val, y_train, y_val = train_test_split(emails, labels, test_size=0.2, random_state=SEED)
train_dataset = PhishingDataset(X_train, y_train, tokenizer, max_length=MAX_LENGTH)
val_dataset = PhishingDataset(X_val, y_val, tokenizer, max_length=MAX_LENGTH)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Done!")

Using device: cuda
Initializing tokenizer


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Initializing model


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initializing optimizer
Reading in dataset
Splitting dataset into training and validation data
Done!


In [5]:
# Fine-tune the model on the training split
epochs = 3

# Learning rate scheduler: linear schedule with no warmup, stepped once per batch
print("Setting up learning rate scheduler")
num_training_steps = epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop
for epoch in range(epochs):
    print(f"Starting Epoch {epoch+1}")
    model.train()
    running_loss = 0
    for batch in train_loader:
        # Move this batch's tensors onto the same device as the model
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backpropagate, update the weights, then advance the LR schedule
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        running_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

# Save the model for future use (weights and tokenizer go to the same directory)
model.save_pretrained("phishing_detector_bert")
tokenizer.save_pretrained("phishing_detector_bert")

Setting up learning rate scheduler
Starting Epoch 1
Epoch 1/3, Loss: 0.0289
Starting Epoch 2
Epoch 2/3, Loss: 0.0048
Starting Epoch 3
Epoch 3/3, Loss: 0.0009


('phishing_detector_bert/tokenizer_config.json',
 'phishing_detector_bert/special_tokens_map.json',
 'phishing_detector_bert/vocab.txt',
 'phishing_detector_bert/added_tokens.json')

In [6]:
# Validation pass: collect the predicted and true class for every validation email
model.eval()
predictions = []
true_labels = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in val_loader:
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "label")
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit
        preds = logits.argmax(dim=-1)

        predictions += list(preds.cpu().numpy())
        true_labels += list(labels.cpu().numpy())

# Fraction of validation emails classified correctly
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [7]:
def _evaluate_accuracy(model, dataloader, device):
    """Return the accuracy of `model` over every batch of `dataloader`.

    Args:
        model: classifier called as `model(input_ids, attention_mask=...)`
            whose result exposes `.logits`.
        dataloader: iterable of batches, each a dict with the tensors
            "input_ids", "attention_mask" and "label".
        device: device the input tensors are moved to before the forward pass.

    Returns:
        The value of `accuracy_score(true_labels, predictions)` over all batches.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they are never sent to `device`
            true_labels.extend(batch["label"].cpu().numpy())
    return accuracy_score(true_labels, predictions)


# Test the model on each individual dataset
# NOTE: the model was trained on 80% of CEAS_08.csv, and full_dataset.csv contains
# every CEAS_08 row as well, so the accuracy reported for those two files includes
# emails seen during training and is optimistic.
for file in new_files:
    # Read in dataset for evaluation
    # keep_default_na=False keeps email bodies such as "NA" or "null" as strings
    # instead of letting pandas turn them into NaN.
    print("Reading in " + file)
    data = pd.read_csv(file, keep_default_na=False)
    emails = data["Email"].tolist()
    labels = data["Label"].tolist()

    # Set up PhishingDataset to run model on
    dataset = PhishingDataset(emails, labels, tokenizer, max_length=512)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

    # Evaluate the model on this dataset
    print("Evaluate the model on " + file)
    accuracy = _evaluate_accuracy(model, dataloader, device)
    print("Accuracy on " + file + f": {accuracy:.2f}")
    

Reading in /kaggle/working/Phishing_Email.csv
Evaluate the model on /kaggle/working/Phishing_Email.csv
Accuracy on /kaggle/working/Phishing_Email.csv: 0.86
Reading in /kaggle/working/CEAS_08.csv
Evaluate the model on /kaggle/working/CEAS_08.csv
Accuracy on /kaggle/working/CEAS_08.csv: 1.00
Reading in /kaggle/working/Enron.csv
Evaluate the model on /kaggle/working/Enron.csv
Accuracy on /kaggle/working/Enron.csv: 0.87
Reading in /kaggle/working/Ling.csv
Evaluate the model on /kaggle/working/Ling.csv
Accuracy on /kaggle/working/Ling.csv: 0.95
Reading in /kaggle/working/Nazario.csv
Evaluate the model on /kaggle/working/Nazario.csv
Accuracy on /kaggle/working/Nazario.csv: 0.25
Reading in /kaggle/working/Nigerian_Fraud.csv
Evaluate the model on /kaggle/working/Nigerian_Fraud.csv
Accuracy on /kaggle/working/Nigerian_Fraud.csv: 0.85
Reading in /kaggle/working/SpamAssassin.csv
Evaluate the model on /kaggle/working/SpamAssassin.csv
Accuracy on /kaggle/working/SpamAssassin.csv: 0.84
Reading in /k