In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import pickle
import os
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

# Download NLTK data files (if not already installed)
nltk.download('stopwords')
nltk.download('punkt')



2024-08-19 21:10:19.784585: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-19 21:10:19.784683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-19 21:10:19.915044: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def clean_text(text, stop_words):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'http\S+|www\S+|https\S+', 'URLfound', text, flags=re.MULTILINE)
    text = text.lower()
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

def stem_text(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stemmed_text)

def ml_preprocess(df):
    stop_words = set(stopwords.words('english'))
    df['subject'].fillna('', inplace=True)
    df['content'] = df['subject'] + ' ' + df['body']
    df['content'] = df['content'].apply(lambda x: clean_text(x, stop_words))
    df['content'] = df['content'].apply(stem_text)
    df = df[['label', 'content']]
    return df

# Load the dataset
df = pd.read_csv("/kaggle/input/phishing-email-dataset-nazario-5-and-trec07/Nazario_5.csv")

# Preprocess the dataset
df = ml_preprocess(df)

# Split the data into training and testing sets
X = df['content']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the TfidfVectorizer
vectorizer = TfidfVectorizer()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['subject'].fillna('', inplace=True)


# Traditional ML Models

In [3]:
# Define the models
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(probability=True, random_state=42)
knn_model = KNeighborsClassifier()

# Create pipelines for each model
rf_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', rf_model)
])

svm_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', svm_model)
])

knn_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', knn_model)
])



# Random Forrest

In [4]:
# Define the hyperparameters for Grid Search
rf_params = {
    'classifier__n_estimators': [200, 300, 400],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Perform Grid Search CV for Random Forest
rf_grid = GridSearchCV(rf_pipeline, rf_params, cv=5, n_jobs=-1, verbose=1)
rf_grid.fit(X_train, y_train)
print("Best parameters for Random Forest:", rf_grid.best_params_)
rf_pred = rf_grid.predict(X_test)
rf_pred_prob = rf_grid.predict_proba(X_test)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred, digits=4))



Fitting 5 folds for each of 36 candidates, totalling 180 fits


  pid = os.fork()
  pid = os.fork()


Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
Random Forest Classification Report:
              precision    recall  f1-score   support

           0     0.9968    0.9840    0.9904       313
           1     0.9836    0.9967    0.9901       300

    accuracy                         0.9902       613
   macro avg     0.9902    0.9903    0.9902       613
weighted avg     0.9903    0.9902    0.9902       613



# SVM

In [5]:
# Define the hyperparameters for SVM
svm_params = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}
# Perform Grid Search CV for SVM
svm_grid = GridSearchCV(svm_pipeline, svm_params, cv=5, n_jobs=-1, verbose=1)
svm_grid.fit(X_train, y_train)
print("Best parameters for SVM:", svm_grid.best_params_)
svm_pred = svm_grid.predict(X_test)
svm_pred_prob = svm_grid.predict_proba(X_test)[:, 1]
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred, digits=4))


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for SVM: {'classifier__C': 1, 'classifier__kernel': 'linear'}
SVM Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9968    0.9984       313
           1     0.9967    1.0000    0.9983       300

    accuracy                         0.9984       613
   macro avg     0.9983    0.9984    0.9984       613
weighted avg     0.9984    0.9984    0.9984       613



# KNN

In [6]:
# Define the hyperparameters for KNN
knn_params = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance']
}
# Perform Grid Search CV for KNN
knn_grid = GridSearchCV(knn_pipeline, knn_params, cv=5, n_jobs=-1, verbose=1)
knn_grid.fit(X_train, y_train)
print("Best parameters for KNN:", knn_grid.best_params_)
knn_pred = knn_grid.predict(X_test)
knn_pred_prob = knn_grid.predict_proba(X_test)[:, 1]
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred, digits=4))



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for KNN: {'classifier__n_neighbors': 5, 'classifier__weights': 'distance'}
KNN Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9553    0.9771       313
           1     0.9554    1.0000    0.9772       300

    accuracy                         0.9772       613
   macro avg     0.9777    0.9776    0.9772       613
weighted avg     0.9782    0.9772    0.9772       613



# Save Traditional ML Models

In [7]:
# Save traditional ML models
if not os.path.exists('models'):
    os.makedirs('models')

with open('models/rf_pipeline.pkl', 'wb') as f:
    pickle.dump(rf_grid.best_estimator_, f)

with open('models/svm_pipeline.pkl', 'wb') as f:
    pickle.dump(svm_grid.best_estimator_, f)

with open('models/knn_pipeline.pkl', 'wb') as f:
    pickle.dump(knn_grid.best_estimator_, f)


# Deep Learning Models

In [3]:

# Tokenize and pad sequences for deep learning models
max_words = 10000
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

def build_cnn_model(hp):
    model = Sequential()
    model.add(Embedding(max_words, 128, input_length=max_len))
    model.add(Conv1D(
        filters=hp.Int('filters', min_value=32, max_value=256, step=32),
        kernel_size=hp.Choice('kernel_size', values=[3, 5, 7]),
        activation='relu'
    ))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(
        units=hp.Int('units', min_value=32, max_value=256, step=32),
        activation='relu'
    ))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

def build_lstm_model(hp):
    model = Sequential()
    model.add(Embedding(max_words, 128, input_length=max_len))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(
        units=hp.Int('lstm_units', min_value=32, max_value=256, step=32),
        dropout=0.2,
        recurrent_dropout=0.2
    ))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


# CNN

In [9]:
# Hyperparameter tuning for CNN
cnn_tuner = kt.Hyperband(
    build_cnn_model,
    objective='val_accuracy',
    max_epochs=10,
    hyperband_iterations=2,
    directory='my_dir',
    project_name='cnn_tuning'
)



# Search for best hyperparameters
cnn_tuner.search(X_train_pad, y_train, epochs=5, validation_split=0.2)

# Retrieve best hyperparameters for CNN
best_cnn_hps = cnn_tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search for CNN is complete. The optimal number of filters in the Conv1D layer is {best_cnn_hps.get('filters')},
the optimal kernel size is {best_cnn_hps.get('kernel_size')}, and the optimal number of units in the Dense layer is {best_cnn_hps.get('units')}.
""")


# Train with best hyperparameters for CNN
cnn_model = cnn_tuner.hypermodel.build(best_cnn_hps)
cnn_model.fit(X_train_pad, y_train, epochs=10, validation_split=0.2)
cnn_model.save('models/best_cnn_model.h5')

cnn_pred_prob = cnn_model.predict(X_test_pad)
cnn_pred = (cnn_pred_prob > 0.5).astype("int32")
print("CNN Classification Report:")
print(classification_report(y_test, cnn_pred, digits=4))



Trial 60 Complete [00h 00m 08s]
val_accuracy: 0.9918533563613892

Best val_accuracy So Far: 0.9938900470733643
Total elapsed time: 00h 06m 22s

The hyperparameter search for CNN is complete. The optimal number of filters in the Conv1D layer is 256,
the optimal kernel size is 3, and the optimal number of units in the Dense layer is 96.

Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.7364 - loss: 0.5566 - val_accuracy: 0.9796 - val_loss: 0.0684
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9858 - loss: 0.0465 - val_accuracy: 0.9817 - val_loss: 0.0418
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9994 - loss: 0.0047 - val_accuracy: 0.9857 - val_loss: 0.0401
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0021 - val_accuracy: 0.9857 - val_loss: 0.0355
Epoch 5/10
[1m62/62

# LSTM

In [10]:
# Hyperparameter tuning for LSTM
lstm_tuner = kt.Hyperband(
    build_lstm_model,
    objective='val_accuracy',
    max_epochs=10,
    hyperband_iterations=2,
    directory='my_dir',
    project_name='lstm_tuning'
)

# Search for best hyperparameters
lstm_tuner.search(X_train_pad, y_train, epochs=5, validation_split=0.2)

# Retrieve best hyperparameters for LSTM
best_lstm_hps = lstm_tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search for LSTM is complete. The optimal number of units in the LSTM layer is {best_lstm_hps.get('lstm_units')}.
""")

# Train with best hyperparameters for LSTM
lstm_model = lstm_tuner.hypermodel.build(best_lstm_hps)
lstm_model.fit(X_train_pad, y_train, epochs=10, validation_split=0.2)
lstm_model.save('models/best_lstm_model.h5')

lstm_pred_prob = lstm_model.predict(X_test_pad)
lstm_pred = (lstm_pred_prob > 0.5).astype("int32")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, digits=4))


Trial 8 Complete [00h 01m 37s]
val_accuracy: 0.9877800345420837

Best val_accuracy So Far: 0.9898167252540588
Total elapsed time: 00h 14m 08s

The hyperparameter search for LSTM is complete. The optimal number of units in the LSTM layer is 128.

Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 716ms/step - accuracy: 0.7784 - loss: 0.5090 - val_accuracy: 0.9776 - val_loss: 0.0797
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 711ms/step - accuracy: 0.9819 - loss: 0.0600 - val_accuracy: 0.9613 - val_loss: 0.1293
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 705ms/step - accuracy: 0.9892 - loss: 0.0500 - val_accuracy: 0.9837 - val_loss: 0.0758
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 716ms/step - accuracy: 0.9978 - loss: 0.0107 - val_accuracy: 0.9878 - val_loss: 0.0460
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 712ms/step - accuracy: 1.0

# LMMs

In [6]:
# Define Hugging Face model training function with progress bar
def train_hf_model(model_name, tokenizer_class, model_class, X_train, y_train, X_test, y_test, save_path):
    os.environ['WANDB_DISABLED'] = 'true'
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name, num_labels=2)

    train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)

    train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': y_train.tolist()})
    test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': y_test.tolist()})

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    print(f"Training {model_name} model...")
    trainer.train()

    # Evaluate the model
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, preds, digits=4))

    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)


# BERT

In [7]:
# Train and save BERT model
train_hf_model(
    model_name='bert-base-uncased',
    tokenizer_class=BertTokenizer,
    model_class=BertForSequenceClassification,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    save_path='models/bert_model'
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training bert-base-uncased model...


Step,Training Loss
10,0.6744
20,0.6778
30,0.6545
40,0.6485
50,0.5883
60,0.58
70,0.4908
80,0.4339
90,0.3846
100,0.2787


bert-base-uncased Classification Report:
              precision    recall  f1-score   support

           0     0.9936    0.9936    0.9936       313
           1     0.9933    0.9933    0.9933       300

    accuracy                         0.9935       613
   macro avg     0.9935    0.9935    0.9935       613
weighted avg     0.9935    0.9935    0.9935       613



# DistilBERT

In [8]:
# Train and save DistilBERT model
train_hf_model(
    model_name='distilbert-base-uncased',
    tokenizer_class=DistilBertTokenizer,
    model_class=DistilBertForSequenceClassification,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    save_path='models/distilbert_model'
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training distilbert-base-uncased model...


Step,Training Loss
10,0.6869
20,0.6966
30,0.6873
40,0.6784
50,0.677
60,0.668
70,0.6354
80,0.5834
90,0.5133
100,0.398


distilbert-base-uncased Classification Report:
              precision    recall  f1-score   support

           0     0.9968    0.9904    0.9936       313
           1     0.9901    0.9967    0.9934       300

    accuracy                         0.9935       613
   macro avg     0.9934    0.9935    0.9935       613
weighted avg     0.9935    0.9935    0.9935       613



# RoBERTa

In [11]:
# Train and save RoBERTa model
train_hf_model(
    model_name='roberta-base',
    tokenizer_class=RobertaTokenizer,
    model_class=RobertaForSequenceClassification,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    save_path='models/roberta_model'
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training roberta-base model...


Step,Training Loss
10,0.7096
20,0.6941
30,0.6802
40,0.6829
50,0.6563
60,0.6256
70,0.4671
80,0.3048
90,0.2264
100,0.2279


roberta-base Classification Report:
              precision    recall  f1-score   support

           0     0.9968    0.9872    0.9920       313
           1     0.9868    0.9967    0.9917       300

    accuracy                         0.9918       613
   macro avg     0.9918    0.9919    0.9918       613
weighted avg     0.9919    0.9918    0.9918       613



# BERT Large

In [12]:
# Train and save BERT Large model
train_hf_model(
    model_name='bert-large-uncased',
    tokenizer_class=BertTokenizer,
    model_class=BertForSequenceClassification,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    save_path='models/bert_large_model'
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training bert-large-uncased model...


Step,Training Loss
10,0.7113
20,0.6774
30,0.6351
40,0.6537
50,0.5849
60,0.5512
70,0.4601
80,0.4038
90,0.3967
100,0.2786


bert-large-uncased Classification Report:
              precision    recall  f1-score   support

           0     0.9873    0.9904    0.9888       313
           1     0.9900    0.9867    0.9883       300

    accuracy                         0.9886       613
   macro avg     0.9886    0.9885    0.9886       613
weighted avg     0.9886    0.9886    0.9886       613



In [None]:
# Plot ROC curves
def plot_roc_curve(y_true, y_pred_prob, model_name):
    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    plt.show()
    print(f'{model_name} AUROC: {roc_auc:.2f}')

In [3]:

# Download NLTK data files (if not already installed)
nltk.download('stopwords')
nltk.download('punkt')

class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def clean_text(text, stop_words):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'http\S+|www\S+|https\S+', 'URLfound', text, flags=re.MULTILINE)
    text = text.lower()
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

def stem_text(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stemmed_text = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stemmed_text)

def ml_preprocess(df):
    stop_words = set(stopwords.words('english'))
    df['text'] = df['text'].apply(lambda x: clean_text(x, stop_words))
    df['text'] = df['text'].apply(stem_text)
    return df

# Load the dataset
df = pd.read_csv("/kaggle/input/phishing-email-dataset-nazario-5-and-trec07/email_text.csv")  # Update with the actual path

# Preprocess the dataset
df = ml_preprocess(df)

# Split the dataset into samples where the label is 1 (spam emails)
df_label_1 = df[df['label'] == 1]

# Change the label of these samples to 0
df_label_1['label'] = 0

# Define the data for testing
X_test = df_label_1['text']
y_test = df_label_1['label']

# Tokenize and pad sequences for deep learning models
max_words = 10000
max_len = 500
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_test)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)




[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label_1['label'] = 0


In [4]:
# Load traditional ML models
with open('/kaggle/working/models/rf_pipeline.pkl', 'rb') as f:
    rf_model = pickle.load(f)

with open('/kaggle/working/models/svm_pipeline.pkl', 'rb') as f:
    svm_model = pickle.load(f)

with open('/kaggle/working/models/knn_pipeline.pkl', 'rb') as f:
    knn_model = pickle.load(f)
    
# Predict and print classification report and ROC curve for Random Forest
rf_pred = rf_model.predict(X_test)
rf_pred_prob = rf_model.predict_proba(X_test)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred, digits=4))

# Predict and print classification report and ROC curve for SVM
svm_pred = svm_model.predict(X_test)
svm_pred_prob = svm_model.predict_proba(X_test)[:, 1]
print("SVM Classification Report:")
print(classification_report(y_test, svm_pred, digits=4))

# Predict and print classification report and ROC curve for KNN
knn_pred = knn_model.predict(X_test)
knn_pred_prob = knn_model.predict_proba(X_test)[:, 1]
print("KNN Classification Report:")
print(classification_report(y_test, knn_pred, digits=4))







# # Plot ROC curves for all models
# plot_roc_curve(y_test, rf_pred_prob, 'Random Forest')
# plot_roc_curve(y_test, svm_pred_prob, 'SVM')
# plot_roc_curve(y_test, knn_pred_prob, 'KNN')
# plot_roc_curve(y_test, cnn_pred_prob, 'CNN')
# plot_roc_curve(y_test, lstm_pred_prob, 'LSTM')
# plot_roc_curve(y_test, bert_pred_prob, 'BERT')
# plot_roc_curve(y_test, distilbert_pred_prob, 'DistilBERT')
# plot_roc_curve(y_test, roberta_pred_prob, 'RoBERTa')

Random Forest Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.7081    0.8291     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.7081     29923
   macro avg     0.5000    0.3540    0.4145     29923
weighted avg     1.0000    0.7081    0.8291     29923



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9059    0.9506     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.9059     29923
   macro avg     0.5000    0.4530    0.4753     29923
weighted avg     1.0000    0.9059    0.9506     29923



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KNN Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.6292    0.7724     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.6292     29923
   macro avg     0.5000    0.3146    0.3862     29923
weighted avg     1.0000    0.6292    0.7724     29923



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Load deep learning models
cnn_model = load_model('/kaggle/working/models/best_cnn_model.h5')
lstm_model = load_model('/kaggle/working/models/best_lstm_model.h5')

# Predict and print classification report and ROC curve for CNN
cnn_pred_prob = cnn_model.predict(X_test_pad)
cnn_pred = (cnn_pred_prob > 0.5).astype("int32")
print("CNN Classification Report:")
print(classification_report(y_test, cnn_pred, digits=4))

# Predict and print classification report and ROC curve for LSTM
lstm_pred_prob = lstm_model.predict(X_test_pad)
lstm_pred = (lstm_pred_prob > 0.5).astype("int32")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_pred, digits=4))

[1m936/936[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
CNN Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.6972    0.8216     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.6972     29923
   macro avg     0.5000    0.3486    0.4108     29923
weighted avg     1.0000    0.6972    0.8216     29923



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[1m936/936[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 174ms/step
LSTM Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.8786    0.9354     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.8786     29923
   macro avg     0.5000    0.4393    0.4677     29923
weighted avg     1.0000    0.8786    0.9354     29923



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
def predict_hf_model(model, tokenizer, X_test):
    encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
    outputs = model(**encodings)
    predictions = torch.argmax(outputs.logits, dim=1).detach().numpy()
    pred_prob = torch.softmax(outputs.logits, dim=1).detach().numpy()[:, 1]
    return predictions, pred_prob

def predict_hf_model_batch(model, tokenizer, dataloader, device):
    model = model.to(device)
    model.eval()
    all_preds = []
    all_probs = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting", unit="batch"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            all_preds.extend(preds)
            all_probs.extend(probs)
    return np.array(all_preds), np.array(all_probs)
# Parameters
BATCH_SIZE = 64
MAX_LENGTH = 512

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# Load Hugging Face models using safetensors
bert_tokenizer = BertTokenizer.from_pretrained('/kaggle/working/models/bert_model')
bert_model = BertForSequenceClassification.from_pretrained('/kaggle/working/models/bert_model', use_safetensors=True)

# Create DataLoader
test_dataset = CustomDataset(X_test.tolist(), y_test.tolist(), bert_tokenizer, MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


# Predict and print classification report and ROC curve for BERT
bert_pred, bert_pred_prob = predict_hf_model_batch(bert_model, bert_tokenizer, test_dataloader, device)
print("BERT Classification Report:")
print(classification_report(y_test, bert_pred, digits=4))

Predicting: 100%|██████████| 468/468 [11:29<00:00,  1.47s/batch]

BERT Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.8589    0.9241     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.8589     29923
   macro avg     0.5000    0.4294    0.4620     29923
weighted avg     1.0000    0.8589    0.9241     29923




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('/kaggle/working/models/distilbert_model')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('/kaggle/working/models/distilbert_model', use_safetensors=True)

# Predict and print classification report and ROC curve for DistilBERT
distilbert_test_dataset = CustomDataset(X_test.tolist(), y_test.tolist(), distilbert_tokenizer, MAX_LENGTH)
distilbert_test_dataloader = DataLoader(distilbert_test_dataset, batch_size=BATCH_SIZE)

distilbert_pred, distilbert_pred_prob = predict_hf_model_batch(distilbert_model, distilbert_tokenizer, distilbert_test_dataloader, device)
print("DistilBERT Classification Report:")
print(classification_report(y_test, distilbert_pred, digits=4))

Predicting: 100%|██████████| 468/468 [07:39<00:00,  1.02batch/s]

DistilBERT Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.7095    0.8301     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.7095     29923
   macro avg     0.5000    0.3547    0.4150     29923
weighted avg     1.0000    0.7095    0.8301     29923




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
roberta_tokenizer = RobertaTokenizer.from_pretrained('/kaggle/working/models/roberta_model')
roberta_model = RobertaForSequenceClassification.from_pretrained('models/roberta_model', use_safetensors=True)

# Predict and print classification report and ROC curve for RoBERTa
roberta_test_dataset = CustomDataset(X_test.tolist(), y_test.tolist(), roberta_tokenizer, MAX_LENGTH)
roberta_test_dataloader = DataLoader(roberta_test_dataset, batch_size=BATCH_SIZE)

roberta_pred, roberta_pred_prob = predict_hf_model_batch(roberta_model, roberta_tokenizer, roberta_test_dataloader, device)
print("RoBERTa Classification Report:")
print(classification_report(y_test, roberta_pred, digits=4))

Predicting: 100%|██████████| 468/468 [09:52<00:00,  1.27s/batch]

RoBERTa Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.7365    0.8482     29923
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.7365     29923
   macro avg     0.5000    0.3682    0.4241     29923
weighted avg     1.0000    0.7365    0.8482     29923




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
