# Spam Email Classification

## Imports
All required libraries are imported here for clarity and maintainability.

In [1]:
import math
import re
from collections import Counter
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, log_loss, roc_curve, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

## Data Loading and Preprocessing
Load the spam email dataset and preprocess it by mapping 'ham' to 0 and 'spam' to 1.

In [2]:
# Kaggle page the CSV was downloaded from (kept for reference only)
datalink = 'https://www.kaggle.com/datasets/shantanudhakadd/email-spam-detection-dataset-classification/data'
df = pd.read_csv('mail_data.csv')
print(df.head())

# Encode the text labels numerically: 'ham' -> 0, 'spam' -> 1
label_codes = {'ham': 0, 'spam': 1}
df['Category'] = df['Category'].map(label_codes)
df.head()

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## TF-IDF Implementation
Implement TF-IDF both from scratch and with scikit-learn to convert email messages into numerical vectors.

### Custom TF-IDF Implementation
A custom function to compute TF-IDF vectors for the email messages.

In [3]:
def compute_tfidf(documents):
    """Compute a TF-IDF matrix for a collection of documents.

    Each document is lowercased, stripped of punctuation and split on
    whitespace. Term frequency is the token count divided by the document
    length; inverse document frequency is ``log(N / (1 + df))``, where ``N``
    is the number of documents and ``df`` the number of documents that
    contain the term.

    Args:
        documents: Iterable of text documents (each item must be a str).
    Returns:
        numpy.ndarray: TF-IDF matrix where rows are documents (in input
        order) and columns are words (in sorted vocabulary order). Scores are
        rounded to 4 decimals; non-positive scores, which occur for words
        found in all or all but one of the documents, are stored as 0.
    """
    non_word = re.compile(r'[^\w\s]')

    def tokenize(doc):
        # str.split() without arguments already splits on any run of
        # whitespace (tabs and newlines included) and drops empty strings,
        # so no separate whitespace normalisation is needed.
        return non_word.sub('', doc.lower()).split()

    tokenized_docs = [tokenize(doc) for doc in documents]
    n_docs = len(tokenized_docs)

    # Sorted vocabulary fixes the column order of the matrix.
    vocab = sorted({word for tokens in tokenized_docs for word in tokens})
    vocab_index = {word: idx for idx, word in enumerate(vocab)}

    # Document frequency: each word counts once per document it appears in.
    doc_freq = Counter()
    for tokens in tokenized_docs:
        doc_freq.update(set(tokens))

    idf = {
        word: math.log(n_docs / (1 + count))
        for word, count in doc_freq.items()
    }

    matrix = np.zeros((n_docs, len(vocab)))
    for row, tokens in enumerate(tokenized_docs):
        total_terms = len(tokens)
        # Only the words actually present in the document are visited; an
        # empty document yields an empty Counter and keeps its all-zero row.
        for word, count in Counter(tokens).items():
            score = (count / total_terms) * idf[word]
            if score > 0:
                matrix[row, vocab_index[word]] = round(score, 4)

    return matrix

# Compute TF-IDF for the dataset
# Builds the custom TF-IDF feature matrix from the 'Message' column
# (one row per message, one column per vocabulary word).
dataset = compute_tfidf(df['Message'])

### Scikit-learn TF-IDF Implementation
Use scikit-learn's TfidfVectorizer for comparison.

In [4]:
# Clean the messages in one pass: drop missing rows, force str, lowercase,
# then strip every character that is neither a word character nor whitespace
scikitlearn_data = (
    df['Message']
    .dropna()
    .astype(str)
    .str.lower()
    .replace(r'[^\w\s]', '', regex=True)
)

# Fit scikit-learn's vectorizer (English stop words removed) on the cleaned text
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(scikitlearn_data)

## Logistic Regression Implementation
Implement logistic regression from scratch to classify emails as spam or ham.

In [5]:
def sigmoid(x):
    """Apply the sigmoid function to the input without overflowing.

    Args:
        x: Input scalar or array.
    Returns:
        numpy.ndarray: Sigmoid of input (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    # exp() of a non-positive number cannot overflow, so only exp(-|x|) is
    # ever evaluated; the naive 1 / (1 + exp(-x)) overflows for very
    # negative x and emits a RuntimeWarning.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where returns a 0-d array for scalar input; unwrap it to a scalar.
    return result[()] if result.ndim == 0 else result

def compute_loss(x_train, y_train, w, b):
    """Compute the mean binary cross-entropy loss.

    Args:
        x_train: Training features.
        y_train: Training labels.
        w: Weight vector.
        b: Bias term.
    Returns:
        float: Loss value.
    """
    # Predicted probabilities for every training example.
    h = sigmoid(np.dot(x_train, w) + b)
    # Small offset keeps log() finite when a prediction reaches exactly 0 or 1.
    epsilon = 1e-15
    loss = -np.mean(y_train * np.log(h + epsilon) + (1 - y_train) * np.log(1 - h + epsilon))
    return loss

def compute_gradient(x_train, y_train, w, b):
    """Return the gradients of the mean cross-entropy loss.

    Args:
        x_train: Training features.
        y_train: Training labels.
        w: Weight vector.
        b: Bias term.
    Returns:
        tuple: Gradient with respect to the weights, then with respect to
        the bias.
    """
    n_samples = x_train.shape[0]
    # Difference between the predicted probabilities and the labels.
    residual = sigmoid(np.dot(x_train, w) + b) - y_train
    grad_w = np.dot(x_train.T, residual) / n_samples
    grad_b = np.sum(residual) / n_samples
    return grad_w, grad_b

def gradient_descent(x_train, y_train, w, b, iters, lr):
    """Perform gradient descent to optimize logistic regression parameters.

    Each iteration steps the weights and bias against the gradient of the
    binary cross-entropy loss, records the loss after the update, and prints
    it every ``ceil(iters / 10)`` iterations.

    Args:
        x_train: Training features.
        y_train: Training labels.
        w: Initial weight vector.
        b: Initial bias term.
        iters: Number of iterations.
        lr: Learning rate.
    Returns:
        tuple: Optimized weights, bias, and loss history (one entry per
        iteration).
    """
    losses = []
    # Loop-invariant reporting interval; never used when iters <= 0 because
    # the loop body does not run.
    report_every = math.ceil(iters / 10)
    for i in range(iters):
        dw_dj, db_dj = compute_gradient(x_train, y_train, w, b)
        w = w - lr * dw_dj
        b = b - lr * db_dj
        loss = compute_loss(x_train, y_train, w, b)
        losses.append(loss)
        if i % report_every == 0:
            # Reuse the loss just computed rather than evaluating it again.
            print(f"Iteration: {i}, Cost value: {loss}")

    print("Complete gradient descent!")
    print(f"Weight: {w}, Bias: {b}\n")

    return w, b, losses

def check_threshold(x, threshold):
    """Turn a probability into a hard 0/1 label.

    Args:
        x: Input probability.
        threshold: Classification threshold.
    Returns:
        int: 0 when x is strictly below threshold, otherwise 1.
    """
    if x < threshold:
        return 0
    return 1

## Data Splitting
Split the dataset into training, validation, and test sets.

In [6]:
# Features come from the custom TF-IDF matrix; labels from the encoded column
X = dataset
y = np.array(df['Category'])

# First hold out 30% of the rows, then halve that hold-out:
# train (70%), validation (15%), test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

## Model Training
Train logistic regression models with different learning rates.

In [7]:
# Model 1: Learning rate = 0.01
iters = 10000
lr_1 = 0.01
m, n = X_train.shape
# Training starts from all-zero weights and a zero bias
w_1, b_1, loss_1 = gradient_descent(
    X_train, y_train, np.zeros(n,), 0, iters, lr_1
)

Iteration: 0, Cost value: 0.6917698229501618
Iteration: 1000, Cost value: 0.3973526189635055
Iteration: 2000, Cost value: 0.38445490294125917
Iteration: 3000, Cost value: 0.3794840859741482
Iteration: 4000, Cost value: 0.3751333234050555
Iteration: 5000, Cost value: 0.3708971221503404
Iteration: 6000, Cost value: 0.36673145368222404
Iteration: 7000, Cost value: 0.36263166572609196
Iteration: 8000, Cost value: 0.3585966960138447
Iteration: 9000, Cost value: 0.354625832969232
Complete gradient descent!
Weight: [ 0.01478585  0.00952728  0.0052442  ... -0.00446129 -0.15805026
 -0.00854035], Bias: -1.7624358685777506



In [8]:
# Model 2: Learning rate = 0.05 (reuses `iters` from the Model 1 cell)
lr_2 = 0.05
m, n = X_train.shape
# Training starts from all-zero weights and a zero bias
w_2, b_2, loss_2 = gradient_descent(
    X_train, y_train, np.zeros(n,), 0, iters, lr_2
)

Iteration: 0, Cost value: 0.6862955321367522
Iteration: 1000, Cost value: 0.3708798228917829
Iteration: 2000, Cost value: 0.3507021697816155
Iteration: 3000, Cost value: 0.3320952718367653
Iteration: 4000, Cost value: 0.31497723712029835
Iteration: 5000, Cost value: 0.29926147523819974
Iteration: 6000, Cost value: 0.2848543210341894
Iteration: 7000, Cost value: 0.2716572491979031
Iteration: 8000, Cost value: 0.25957017455653236
Iteration: 9000, Cost value: 0.2484945905929313
Complete gradient descent!
Weight: [ 0.06928406  0.04515783  0.0237665  ... -0.02193612 -0.56465445
 -0.03202776], Bias: -1.6705334268295895



## Training Loss Visualization
Plot the training loss curve. Note: only Model 1 (lr = 0.01) is plotted below; the plot for Model 2 has not been added yet.

In [9]:
# Loss history of Model 1, drawn through the Axes interface
fig, ax = plt.subplots()
ax.plot(loss_1)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss value')
ax.set_title('Training loss curve with lr = 0.01')
ax.grid(True)
plt.show()

# TODO: Add plotting for Model 2 if needed

## Regularization
Implement L2 regularization for logistic regression to prevent overfitting.

In [10]:
def compute_loss_regularization(x_train, y_train, w, b, lambda_):
    """Return the binary cross-entropy loss plus an L2 weight penalty.

    Args:
        x_train: Training features.
        y_train: Training labels.
        w: Weight vector.
        b: Bias term.
        lambda_: Regularization parameter.
    Returns:
        float: Cross-entropy loss plus ``lambda_ / (2 * m) * sum(w ** 2)``,
        where ``m`` is the number of rows in x_train. The bias is not
        penalized.
    """
    m = x_train.shape[0]
    # The data term is exactly the unregularized loss.
    data_loss = compute_loss(x_train, y_train, w, b)
    l2_term = (lambda_ / (2 * m)) * np.sum(w**2)
    return data_loss + l2_term

def compute_gradient_regularization(x_train, y_train, w, b, lambda_):
    """Return the gradients of the L2-regularized cross-entropy loss.

    Args:
        x_train: Training features.
        y_train: Training labels.
        w: Weight vector.
        b: Bias term.
        lambda_: Regularization parameter.
    Returns:
        tuple: Gradient with respect to the weights (including the
        ``lambda_ * w`` penalty term), then with respect to the bias (no
        penalty term).
    """
    n_samples = x_train.shape[0]
    # Difference between the predicted probabilities and the labels.
    residual = sigmoid(np.dot(x_train, w) + b) - y_train
    grad_w = (np.dot(x_train.T, residual) + lambda_ * w) / n_samples
    grad_b = np.sum(residual) / n_samples
    return grad_w, grad_b

def gradient_descent_regularization(x_train, y_train, w, b, iters, lr, lambda_):
    """Run gradient descent on the L2-regularized logistic loss.

    Args:
        x_train: Training features.
        y_train: Training labels.
        w: Initial weight vector.
        b: Initial bias term.
        iters: Number of iterations.
        lr: Learning rate.
        lambda_: Regularization parameter.
    Returns:
        tuple: Optimized weights, bias, and the regularized loss recorded
        after every iteration.
    """
    history = []
    # Progress is printed every ceil(iters / 2) iterations.
    log_interval = math.ceil(iters / 2)
    for i in range(iters):
        grad_w, grad_b = compute_gradient_regularization(x_train, y_train, w, b, lambda_)
        w, b = w - lr * grad_w, b - lr * grad_b
        history.append(compute_loss_regularization(x_train, y_train, w, b, lambda_))
        if i % log_interval == 0:
            # NOTE(review): the printed cost is the unregularized loss, while
            # the recorded history holds the regularized one; confirm this
            # difference is intended.
            print(f"Iteration: {i}, Cost value: {compute_loss(x_train, y_train, w, b)}")

    print("Complete gradient descent!")
    print(f"Weight: {w}, Bias: {b}\n")

    return w, b, history

## Model Evaluation with Regularization
Train models with different regularization parameters and evaluate performance.

**Note**: The original code included an evaluation loop with multiple `lambda_` values and thresholds, but it was incomplete (missing predictions). Below is a placeholder for the evaluation code, which needs to be completed with prediction generation.

In [11]:
# Placeholder for regularization evaluation
# TODO: Complete this section by generating predictions and evaluating metrics
iters = 10000
lr = 0.1
lambda_values = [0.001, 0.01, 0.1, 1, 10]
# Candidate decision thresholds (np.arange grid, step 0.05); not used yet
thresholds = np.arange(0.1, 0.9, 0.05)

# Example structure for evaluation (to be completed)
# The training-set shape does not change between fits, so read it once
m, n = X_train.shape
for lambda_ in lambda_values:
    # Every fit restarts from all-zero weights and a zero bias
    w_init = np.zeros(n)
    b_init = 0
    w, b, loss = gradient_descent_regularization(
        X_train, y_train, w_init, b_init, iters, lr, lambda_
    )
    # Add prediction and evaluation code here

## Notes
- The dataset (`mail_data.csv`) must be available in the working directory.
- The evaluation section is incomplete and requires prediction generation and metric computation (e.g., accuracy, precision, recall, F1 score, ROC curve).
- Consider adding scikit-learn's LogisticRegression for comparison with the custom implementation.