In [30]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import skorch
import kagglehub
import string
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from skorch import NeuralNetBinaryClassifier
from skorch.dataset import ValidSplit
from models import MLPNet
from sklearn.base import clone
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
# Download latest version
path = kagglehub.dataset_download("meruvulikith/190k-spam-ham-email-dataset-for-classification")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\seanp\.cache\kagglehub\datasets\meruvulikith\190k-spam-ham-email-dataset-for-classification\versions\1


In [2]:
# Checking for GPU availability
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))  

True
NVIDIA GeForce RTX 4070 Ti SUPER


In [3]:
# Loading dataset
df = pd.read_csv("spam_Emails_data.csv")

# Dropping rows where label or text is missing
df.dropna(subset=['label', 'text'], inplace=True)

# Printing size of dataset
print(f"Shape of dataset: {df.shape}")

Shape of dataset: (193850, 2)


In [4]:
# Filtering out rows that aren't labeled correctly
df['label'] = df['label'].str.strip().str.lower()
df = df[df['label'].isin(['spam', 'ham'])]

# Encoding labels as binary
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Printing size of dataset
print(f"Shape of dataset: {df.shape}")

Shape of dataset: (193850, 2)


In [5]:
# Creating some features for the dataset

# Email length
df['email_length'] = df['text'].apply(len)

# Number of exclamation points
df['exclamation_points'] = df['text'].str.count('!')

# Number of capital letters
df['capital_letters'] = df['text'].apply(lambda x: sum(1 for c in x.split() if c.isupper()))

# Adding more features here to try and improve performance

# Word count
df['word_count'] = df['text'].str.split().apply(len)

# Puncuation count
df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in string.punctuation))

# Digit ratio
df['digit_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isdigit()) / len(x) if len(x) > 0 else 0)

# HTML presence
df['has_html'] = df['text'].str.contains(r'<[^>]+>', regex=True).astype(int)

# URL presence
df['has_url'] = df['text'].str.contains(r'http[s]?://', regex=True).astype(int)

# URL count
df['num_urls'] = df['text'].str.count(r'http[s]?://')

# Email count
df['num_emails'] = df['text'].str.count(r'\b[\w.-]+?@\w+?\.\w+?\b')

In [6]:
# Loading pretrained sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2') # experimenting with a few different options here
embedder = embedder.to('cuda')

# Generating embeddings
embeddings = embedder.encode(
    df['text'].tolist(),
    batch_size=64,
    show_progress_bar=True
)

# Normalizing and reshaping engineered features
engineered_features = df[['email_length', 'exclamation_points', 'capital_letters', 'word_count', 'punctuation_count', 'digit_ratio', 'has_html', 'has_url', 'num_urls', 'num_emails']]
scaler = StandardScaler()
engineered_scaled = scaler.fit_transform(engineered_features)

# Combining embeddings with engineered features
x = np.hstack((embeddings, engineered_scaled))
y = df['label'].values

Batches:   0%|          | 0/3029 [00:00<?, ?it/s]

In [7]:
# Splitting the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [8]:
# Training several different models and comparing their performance
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Utilizing RandomSearchCV to find optimal parameters for each model
param_random = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['liblinear']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 150],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'max_features': ['sqrt', 'log2']
    }
}

In [9]:
best_models = {}

for name, model in models.items():
    
    print(f"Running RandomSearchCV for {name}")
    random = RandomizedSearchCV(
        model, 
        param_random[name], 
        cv=3, 
        n_iter=10, 
        scoring='f1', 
        verbose=1, 
        n_jobs=-1
    )
    random.fit(x_train, y_train)
    
    best_model = random.best_estimator_
    best_models[name] = best_model
    
    print(f"Best parameters for {name}: {random.best_params_}")
    
    y_pred = best_model.predict(x_test)
    print(f"\n{name} Performance after tuning:\n")
    print(classification_report(y_test, y_pred, digits=4))

Running RandomSearchCV for Logistic Regression
Fitting 3 folds for each of 4 candidates, totalling 12 fits




Best parameters for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l2', 'C': 10}

Logistic Regression Performance after tuning:

              precision    recall  f1-score   support

           0     0.9438    0.9435    0.9437     20530
           1     0.9364    0.9368    0.9366     18240

    accuracy                         0.9403     38770
   macro avg     0.9401    0.9401    0.9401     38770
weighted avg     0.9403    0.9403    0.9403     38770

Running RandomSearchCV for Random Forest
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Random Forest: {'n_estimators': 150, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': 20}

Random Forest Performance after tuning:

              precision    recall  f1-score   support

           0     0.9481    0.9684    0.9581     20530
           1     0.9635    0.9404    0.9518     18240

    accuracy                         0.9552     38770
   macro avg     0.9558    0.9544    0.9550     38

In [21]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Handling MLP separately with GPU

# Wrapping in skorch for use with PyTorch
net = NeuralNetBinaryClassifier(
    module=MLPNet,
    module__input_dim=x_train.shape[1],
    max_epochs=20,
    train_split=ValidSplit(0.2),
    callbacks=[skorch.callbacks.EarlyStopping(patience=5)],
    lr=0.001,
    batch_size=128,
    optimizer=torch.optim.Adam,
    criterion=nn.BCEWithLogitsLoss,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    verbose=0
)

# Defining randomized search parameters
mlp_param_random = {
    'module__hidden_layer_sizes': [(64,), (128,), (128, 64)],
    'module__dropout': [0.3, 0.5],
    'lr': [0.001, 0.01],
    'max_epochs': [10, 20],
    'batch_size': [32, 64, 128]
}

# Running RandomizedSearchCV on GPU-backed model to search wider range quicker
random = RandomizedSearchCV(net, mlp_param_random, cv=3, n_iter=10, scoring='f1', verbose=1, n_jobs=1)
random.fit(x_train.astype('float32'), y_train.astype('float32'))

best_model = random.best_estimator_
best_models['MLP Neural Net'] = best_model

y_pred = best_model.predict(x_test.astype('float32'))

print(f"Best parameters for MLP Neural Net: {random.best_params_}")
print("\nMLP Neural Net Performance after tuning:\n")
print(classification_report(y_test, y_pred, digits=4))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for MLP Neural Net: {'module__hidden_layer_sizes': (128, 64), 'module__dropout': 0.3, 'max_epochs': 20, 'lr': 0.001, 'batch_size': 128}

MLP Neural Net Performance after tuning:

              precision    recall  f1-score   support

           0     0.9813    0.9779    0.9796     20530
           1     0.9752    0.9790    0.9771     18240

    accuracy                         0.9784     38770
   macro avg     0.9782    0.9784    0.9783     38770
weighted avg     0.9784    0.9784    0.9784     38770



In [33]:
# Wrapper for skorch model
class SkorchFloat32Wrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, skorch_model):
        self.skorch_model = skorch_model
        self.label_encoder = LabelEncoder()
        self.classes_ = None

    def fit(self, X, y):
        y = self.label_encoder.fit_transform(y)
        self.classes_ = self.label_encoder.classes_
        y = y.astype(np.float32)
        self.skorch_model.fit(X, y)
        return self

    def predict(self, X):
        preds = self.skorch_model.predict(X)
        return self.label_encoder.inverse_transform(preds.astype(int))

    def predict_proba(self, X):
        return self.skorch_model.predict_proba(X)

    def get_params(self, deep=True):
        return {'skorch_model': self.skorch_model}

    def set_params(self, **params):
        self.skorch_model = params['skorch_model']
        return self

# Testing model stacking with the optimal models from above
base_learners = [
    ('lr', best_models['Logistic Regression']),
    ('rf', best_models['Random Forest']),
    ('mlp', SkorchFloat32Wrapper(best_models['MLP Neural Net']))
]

# Using Logistic Regression model as the meta-model
meta_learner = LogisticRegression()

stacked_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True
)

x_train = x_train.astype(np.float32)
x_test = x_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)
stacked_model.fit(x_train, y_train)

y_pred = stacked_model.predict(x_test)
print("Stacked Model Performance:\n")
print(classification_report(y_test, y_pred, digits=4))

  cuda_attrs = torch.load(f, **load_kwargs)


Stacked Model Performance:

              precision    recall  f1-score   support

         0.0     0.9826    0.9811    0.9819     20530
         1.0     0.9788    0.9805    0.9796     18240

    accuracy                         0.9808     38770
   macro avg     0.9807    0.9808    0.9807     38770
weighted avg     0.9808    0.9808    0.9808     38770

