## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from  sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

## Download and Process Dataset

In [2]:
# Download Dataset
path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")

In [3]:
# Load the dataset
df1 = pd.read_csv(path + "/CEAS_08.csv")
df2 = pd.read_csv(path + "/Nazario.csv")
df3 = pd.read_csv(path + "/Nigerian_Fraud.csv")
df4 = pd.read_csv(path + "/SpamAssasin.csv")
df5 = pd.read_csv(path + "/Enron.csv")
df6 = pd.read_csv(path + "/Ling.csv")
df7 = pd.read_csv(path + "/phishing_email.csv")
print("Data loaded successfully.")

Data loaded successfully.


In [4]:
# Combine dataset and drop all cols except combined text and label
df7.rename(columns={"text_combined": "text"}, inplace=True)

processedframes = []
for df in [df1, df2, df3, df4, df5, df6]:
  df["text"] = df["subject"].fillna("") + " " + df["body"].fillna(" ")
  df = df[["text", "label"]]
  processedframes.append(df)
processedframes.append(df7)
df = pd.concat(processedframes, ignore_index=True)

# Process Data
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

print(df.shape)
df.head()

(164552, 2)


Unnamed: 0,text,label
0,"Never agree to be a loser Buck up, your troubl...",1
1,Befriend Jenna Jameson \nUpgrade your sex and ...,1
2,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...,1
3,Re: svn commit: r619753 - in /spamassassin/tru...,0
4,SpecialPricesPharmMoreinfo \nWelcomeFastShippi...,1


In [5]:
# Process Text
def process_text(text):
  text = text.lower()
  text = " ".join([word for word in text.split() if word not in ENGLISH_STOP_WORDS]) # Should remove both doubled up whitespace AND stop words
  return text
df["text"] = df["text"].apply(process_text)

# Vectorisation and Train Test Splitting

In [6]:
vectoriser = TfidfVectorizer(max_features = 5000) # Feature number limited to 5000 for performance reasons
X_vect = vectoriser.fit_transform(df["text"])
y_vect = df["label"].values
X_train_vect, X_test_vect, y_train_vect, y_test_vect = train_test_split(X_vect, y_vect, test_size = 0.2, random_state = 42, stratify = y_vect)

# Model Training (Vectorisation)

In [7]:
svc_model_vect = SVC(kernel = "linear")
svc_model_vect.fit(X_train_vect, y_train_vect)
svc_predictions_vect = svc_model_vect.predict(X_test_vect)

print("SVC with Vectorisation results")
print(classification_report(y_test_vect, svc_predictions_vect))
print(confusion_matrix(y_test_vect, svc_predictions_vect))
svc_scores_vect = cross_val_score(svc_model_vect, X_train_vect, y_train_vect, cv=5, scoring="f1")
print(f"Cross-validation scores: {svc_scores_vect}")
print(f"Mean cross-validation score: {svc_scores_vect.mean()}")

SVC with Vectorisation results
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     15765
           1       0.99      0.99      0.99     17146

    accuracy                           0.99     32911
   macro avg       0.99      0.99      0.99     32911
weighted avg       0.99      0.99      0.99     32911

[[15544   221]
 [  183 16963]]
Cross-validation scores: [0.98778892 0.98675304 0.98634316 0.98680815 0.98816417]
Mean cross-validation score: 0.9871714905836338


# Bag of Words and Train test Splitting

In [8]:
bag_of_words = CountVectorizer(max_features = 5000)
X_vect_bow = bag_of_words.fit_transform(df["text"])
y_vect_bow = df["label"].values
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_vect_bow, y_vect_bow, test_size = 0.2, random_state = 42, stratify = y_vect_bow)

# Model Training (Bag of Words)

In [9]:
svc_model_bow = SVC(kernel = "linear")
svc_model_bow.fit(X_train_bow, y_train_bow)
svc_predictions_bow = svc_model_bow.predict(X_test_bow)

print("SVC with Bag of Words results")
print(classification_report(y_test_bow, svc_predictions_bow))
print(confusion_matrix(y_test_bow, svc_predictions_bow))
svc_scores_bow = cross_val_score(svc_model_bow, X_train_bow, y_train_bow, cv=5, scoring="f1")
print(f"Cross-validation scores: {svc_scores_bow}")
print(f"Mean cross-validation score: {svc_scores_bow.mean()}")

SVC with Bag of Words results
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     15765
           1       0.99      0.99      0.99     17146

    accuracy                           0.99     32911
   macro avg       0.99      0.99      0.99     32911
weighted avg       0.99      0.99      0.99     32911

[[15563   202]
 [  114 17032]]
Cross-validation scores: [0.98800436 0.98844729 0.98653406 0.9868732  0.9885492 ]
Mean cross-validation score: 0.9876816238083336


# Save Models

In [10]:
from pickle import dump
with open("svm_vect.pkl", "wb") as f:
  dump(svc_model_vect, f, protocol = 5)

with open("svm_bow.pkl", "wb") as f2:
  dump(svc_model_bow, f2, protocol = 5)