In [1]:
import kagglehub
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Final Dataset source from: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset?select=phishing_email.csv
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/ECE592B/archive/phishing_email.csv')

Mounted at /content/drive


In [3]:
# Data processing
df['text'] = df['text_combined']
df = df.drop(columns=['text_combined'])
df['text'] = df['text'].str.lower()

# Clean URLs special characters
def clean_text(text):
    text = re.sub(r'\S+@\S+', ' ', text)  # remove email addresses
    text = re.sub(r'http\S+|www\S+', ' ', text)  # remove URLs
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    text = re.sub(r'subject', '', text).strip()# remove subject
    return text

df['text'] = df['text'].apply(clean_text)

# Stop words + Lemmatization = Tokenize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['text'] = df['text'].apply(tokenize_and_lemmatize)

df['label_num'] = df['label']
df.drop(columns=['label'], inplace=True)

In [4]:
X = df['text']
y = df['label_num']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=y  # Keeps class distribution balanced
)

In [5]:
def pruned_vocab(input_data, percentile):
  # Build vocabulary set
  V = set()
  idf = {}
  tf = {}

  for i in range(len(input_data)):
    terms = input_data.iloc[i].split()
    V.update(terms)
    tf[i] = {}
    for term in terms:
        if term not in idf:
            idf[term] = 0
        idf[term] += 1

        if term not in tf[i]:
            tf[i][term] = 0
        tf[i][term] += 1

  # Generate IDF for each term by dividing by total number of documents
  N = len(input_data)
  for term in idf:
      idf[term] = np.log(N / idf[term])

  # Find TF-IDF threshold (Xth percentile)
  #print(len(input_data))
  tf_idfs = []
  for i in range(len(input_data)):
      for term in tf[i]:
          tf_idfs.append(tf[i][term] * idf[term])

  threshold = np.percentile(tf_idfs, percentile)

  # prune by tf-idf
  V_pruned = set()
  for i in range(len(input_data)):
      for term in tf[i]:
          tf_idf = tf[i][term] * idf[term]
          if tf_idf >= threshold:
              V_pruned.add(term)

  return V_pruned

In [6]:
# Get SPAM pruned Vocab
train_spam = X_train[y_train == 1]
V_spam = pruned_vocab(train_spam, 99)

# Get Ham pruned Vocab
train_ham = X_train[y_train == 0]
V_ham = pruned_vocab(train_ham, 99)

# Combine Spam and Ham Vocab
V = V_spam.union(V_ham)

In [8]:

from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# 1. Split 10% final test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# 2. TF-IDF Vectorizer on train_val data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_val_tfidf = vectorizer.fit_transform(X_train_val)
X_test_tfidf = vectorizer.transform(X_test)

# 3. K-Fold Cross Validation on training set (5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val_tfidf)):
    X_train, X_val = X_train_val_tfidf[train_idx], X_train_val_tfidf[val_idx]
    y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]

    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)
    fold_accuracies.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")

print(f"\nAverage Cross-Validation Accuracy: {np.mean(fold_accuracies):.4f}")

# 4. Train on full training set and test on final 10%
final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train_val_tfidf, y_train_val)
y_test_pred = final_model.predict(X_test_tfidf)

print("\nFinal Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Fold 1 Accuracy: 0.9824
Fold 2 Accuracy: 0.9810
Fold 3 Accuracy: 0.9832
Fold 4 Accuracy: 0.9832
Fold 5 Accuracy: 0.9817

Average Cross-Validation Accuracy: 0.9823

Final Test Accuracy: 0.9854527821554128

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98      3960
           1       0.99      0.98      0.99      4289

    accuracy                           0.99      8249
   macro avg       0.99      0.99      0.99      8249
weighted avg       0.99      0.99      0.99      8249

