<a href="https://colab.research.google.com/github/samhzc/Summer-2025-ECE-597-Group10/blob/main/SPAM_HAM_Data_Proccessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SPAM/HAM Data Proccessing
### Overview
This notebook performs data preproccesing, partitioning and feature extraction for SPAM/HAM classification. It then performs a basic POC naives bayes classification.

### Link to Output Data
https://drive.google.com/drive/folders/1-brvtaq8TE4hl7w5rx1o7Xcq3x59Jn3A?usp=sharing

### Preproccesing
The data proccessing removes special characters, addresses and URLs, lowercases every character, removes stopwords and performs lemmatization.

### Feature Extraction
The vocabulary set is pruned to include the 99th percentile of vocabulary based on per class tf-idf score.

In [1]:
import kagglehub
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Final Dataset source from: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset?select=phishing_email.csv
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/Collab_Files/SPAM/SPAM_Final_Dataset/archive/phishing_email.csv')

Mounted at /content/drive


In [4]:
df['text'] = df['text_combined']
df = df.drop(columns=['text_combined'])
df['text'] = df['text'].str.lower()

# Clean URLs special characters
def clean_text(text):
    text = re.sub(r'\S+@\S+', ' ', text)  # remove email addresses
    text = re.sub(r'http\S+|www\S+', ' ', text)  # remove URLs
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra whitespace
    text = re.sub(r'subject', '', text).strip()# remove subject
    return text

df['text'] = df['text'].apply(clean_text)

# Stop words + Lemmatization = Tokenize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['text'] = df['text'].apply(tokenize_and_lemmatize)
df['label_num'] = df['label']
df.drop(columns=['label'], inplace=True)

### Train Test Split


In [5]:
X = df['text']
y = df['label_num']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
    stratify=y  # Keeps class distribution balanced
)

### Generate Class wise TF-IDF Rankings using Train Set to Prune Vocab

In [6]:
def pruned_vocab(input_data, percentile):
  # Build vocabulary set
  V = set()
  idf = {}
  tf = {}

  for i in range(len(input_data)):
    terms = input_data.iloc[i].split()
    V.update(terms)
    tf[i] = {}
    for term in terms:
        if term not in idf:
            idf[term] = 0
        idf[term] += 1

        if term not in tf[i]:
            tf[i][term] = 0
        tf[i][term] += 1

  # Generate IDF for each term by dividing by total number of documents
  N = len(input_data)
  for term in idf:
      idf[term] = np.log(N / idf[term])

  # Find TF-IDF threshold (Xth percentile)
  #print(len(input_data))
  tf_idfs = []
  for i in range(len(input_data)):
      for term in tf[i]:
          tf_idfs.append(tf[i][term] * idf[term])

  threshold = np.percentile(tf_idfs, percentile)

  # prune by tf-idf
  V_pruned = set()
  for i in range(len(input_data)):
      for term in tf[i]:
          tf_idf = tf[i][term] * idf[term]
          if tf_idf >= threshold:
              V_pruned.add(term)
  print("Original Vocab Size:", len(V))
  print("Pruned Vocab Size:", len(V_pruned))
  return V_pruned

In [7]:
# Get SPAM pruned Vocab
train_spam = X_train[y_train == 1]
V_spam = pruned_vocab(train_spam, 99)

# Get Ham pruned Vocab
train_ham = X_train[y_train == 0]
V_ham = pruned_vocab(train_ham, 99)

# Combine Spam and Ham Vocab
V = V_spam.union(V_ham)

Original Vocab Size: 518326
Pruned Vocab Size: 21818
Original Vocab Size: 195216
Pruned Vocab Size: 14278


### Export Proccessed and Pruned Data set

In [8]:
# Clear Nans
X_train = X_train.fillna('')
X_test = X_test.fillna('')

# Loop through train data
for i in range(len(X_train)):
  terms = X_train.iloc[i].split()
  # Remove all terms not in pruned vocabulary
  X_train.iloc[i] = ' '.join([term for term in terms if term in V])

# append labels into df
train_df = pd.concat([X_train, y_train], axis=1)
# Export as csv
pd.DataFrame(train_df).to_csv('/content/drive/MyDrive/Collab_Files/SPAM/SPAM_Final_Dataset/train_df.csv', index=False)

# Loop through test data
for i in range(len(X_test)):
  terms = X_test.iloc[i].split()
  # Remove all terms not in pruned vocabulary
  X_test.iloc[i] = ' '.join([term for term in terms if term in V])

# append labels into df
test_df = pd.concat([X_test, y_test], axis=1)
# Export as csv
pd.DataFrame(test_df).to_csv('/content/drive/MyDrive/Collab_Files/SPAM/SPAM_Final_Dataset/test_df.csv', index=False)