In [1]:
from src import util as util
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import collections
from pathlib import Path
import csv

In [2]:
def load_spam_dataset_csv(csv_path):

    messages = []
    labels = []

    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        for message, label in reader:
            messages.append(message)
            labels.append(1 if label == '1' else 0)

    return messages, np.array(labels)

In [3]:
train_messages, train_labels = util.load_spam_dataset('data/train.tsv')
test_messages, test_labels = util.load_spam_dataset('data/test.tsv')
train2_messages, train2_labels = load_spam_dataset_csv('data/emails_new_train.csv')
train3_messages, train3_labels = load_spam_dataset_csv('data/emails_new_train.csv')

In [30]:
from sklearn.utils import resample
# Convert to numpy arrays for easier manipulation
messages = np.array(train2_messages)
labels = np.array(train2_labels)

# Separate majority (ham) and minority (spam) classes
ham_messages = messages[labels == 0]
ham_labels = labels[labels == 0]
spam_messages = messages[labels == 1]
spam_labels = labels[labels == 1]

# Undersample the majority class
ham_messages_downsampled, ham_labels_downsampled = resample(
    ham_messages,
    ham_labels,
    replace=False,                # sample without replacement
    n_samples=len(spam_labels),    # match number of spam samples
    random_state=42                # for reproducibility
)

# Combine the downsampled majority class with minority class
balanced_messages = np.concatenate([ham_messages_downsampled, spam_messages])
balanced_labels = np.concatenate([ham_labels_downsampled, spam_labels])

# Shuffle the dataset to mix ham and spam examples
shuffle_idx = np.random.permutation(len(balanced_labels))
balanced_messages = balanced_messages[shuffle_idx]
balanced_labels = balanced_labels[shuffle_idx]

In [32]:
import numpy as np

# Assuming train_labels is your label array
unique, counts = np.unique(balanced_labels, return_counts=True)
count_dict = dict(zip(unique, counts))

print(f"Ham (0) count: {count_dict.get(0, 0)}")
print(f"Spam (1) count: {count_dict.get(1, 0)}")

Ham (0) count: 2872
Spam (1) count: 2872


In [33]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

# Initialize BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Disable dropout for consistent embeddings

def get_embeddings_Pooling(email_bodies, batch_size=32):
    """Equivalent to sentence-transformers' encode() but with mean pooling"""
    embeddings = []
    
    for i in tqdm(range(0, len(email_bodies), batch_size), 
                 desc="Generating embeddings"):
        batch = email_bodies[i:i + batch_size]
        
        # Tokenize with BERT's conventions
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
            add_special_tokens=True  # Adds [CLS] and [SEP]
        )
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Mean pooling (exclude special tokens)
        last_hidden = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        
        # Expand mask to match embedding dim
        mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        
        # Sum embeddings (ignoring padding)
        sum_embeddings = torch.sum(last_hidden * mask, dim=1)
        
        # Count non-padding tokens
        sum_mask = torch.sum(mask, dim=1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)  # Avoid division by zero
        
        # Mean pooling
        batch_embeddings = (sum_embeddings / sum_mask).numpy()
        embeddings.extend(batch_embeddings)
    
    return np.array(embeddings)

In [34]:
messages_list = balanced_messages.tolist()

In [36]:
matrix1 = get_embeddings_Pooling(messages_list)

Generating embeddings: 100%|█████████████████████████████████████████████████████████| 180/180 [24:20<00:00,  8.11s/it]


In [37]:
label_list = balanced_labels.tolist()

In [24]:
matrix2 = get_embeddings_Pooling(test_messages)

Generating embeddings: 100%|███████████████████████████████████████████████████████████| 18/18 [01:00<00:00,  3.37s/it]


In [38]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV
# 2. Optimized SVM Pipeline
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Critical for SVM
    ('svm', SVC(
        kernel='rbf',              # Better for complex patterns
        class_weight='balanced',
        probability=True,          # Enable predict_proba
        cache_size=1000,           # For large datasets
        random_state=42
    ))
])

# 3. Hyperparameter Tuning (Reduced search space for efficiency)
param_dist = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.001, 0.01]
}

search = RandomizedSearchCV(
    svm_pipeline,
    param_dist,
    n_iter=10,                    # Reduced for faster tuning
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1
)

In [39]:
search.fit(matrix1,label_list)
simple_svm_3 = search.predict(matrix2)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(test_labels, simple_svm_3))

Accuracy: 0.25985663082437277


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            classification_report)
import pandas as pd

# Example with synthetic data (replace with your actual data)
# X should be your feature matrix (n_samples × n_features)
# y should be your binary target (0 and 1)
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)

# Initialize and train model
logreg = LogisticRegression(
    penalty='l2',          # Regularization type
    C=1.0,                # Inverse regularization strength
    solver='lbfgs',       # Optimization algorithm
    max_iter=1000,        # Maximum iterations
    random_state=42
)
logreg.fit(matrix1,label_list )

# Predictions
y_pred = logreg.predict(matrix2)
y_proba = logreg.predict_proba(matrix2)[:, 1]  # Probability estimates for class 1

# Evaluation metrics
print(f"Accuracy: {accuracy_score(test_labels, y_pred):.4f}")
print(f"Precision: {precision_score(test_labels, y_pred):.4f}")
print(f"Recall: {recall_score(test_labels, y_pred):.4f}")
print(f"F1 Score: {f1_score(test_labels, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(test_labels, y_proba):.4f}")

"""
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
"""

Accuracy: 0.4068
Precision: 0.1633
Recall: 0.9552
F1 Score: 0.2789
ROC AUC: 0.7829


'\nprint("\nConfusion Matrix:")\nprint(confusion_matrix(y_test, y_pred))\n\nprint("\nClassification Report:")\nprint(classification_report(y_test, y_pred))\n'

In [27]:
test_messages_simple = [
    # Non-Spam (Ham) - 15 examples
    "Hi John, just checking in to see if you're still on for lunch tomorrow at 12:30 PM.",
    "The quarterly financial report has been uploaded to the shared drive for your review.",
    "Team meeting reminder: Wednesday at 3 PM in Conference Room A. Agenda attached.",
    "Your Amazon order #12345 has shipped and will arrive on Friday.",
    "Thanks for your application! We'll review your resume and get back to you next week.",
    "The software update has been completed successfully on all servers.",
    "Mom: Don't forget we're having family dinner this Sunday at 6 PM.",
    "Your monthly bank statement is now available in your online banking portal.",
    "The project deadline has been extended to March 15th per client request.",
    "Password reset confirmation: Your password was changed successfully.",
    "Doctor's appointment reminder: You have a checkup scheduled for May 3rd at 10 AM.",
    "Your subscription to Tech Magazine has been renewed automatically.",
    "The attached document contains the meeting minutes from yesterday's call.",
    "Your flight LAX to JFK is confirmed for departure at 8:45 AM tomorrow.",
    "HR Notification: Please complete your benefits enrollment by Friday.",

    # Spam - 5 examples
    "URGENT: Your account will be suspended unless you verify your details now!",
    "CONGRATULATIONS! You've won a free iPhone - click here to claim your prize!",
    "Make $10,000 a week from home with this simple trick! No experience needed!",
    "Your package couldn't be delivered - click this link to reschedule immediately!",
    "Limited time offer! Act now to get 90% off - this deal expires in 1 hour!"
]

matrix_0 = get_embeddings_Pooling(test_messages_simple)

Generating embeddings: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.46it/s]


In [29]:
simple_svm_3 = search.predict(matrix_0)
simple_svm_3

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1])