In [10]:
from src import util as util
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import collections
from pathlib import Path
import csv

In [11]:
def load_spam_dataset_csv(csv_path):
    """Load the spam dataset from a comma-separated (CSV) file.

    Every non-empty row must hold exactly two fields, ``message,label``.
    Completely empty rows (for example a trailing blank line) are skipped.
    A header row, if present, is not treated specially.

    Args:
         csv_path: Path to the CSV file containing the dataset.

    Returns:
        messages: A list of string values containing the text of each message.
        labels: A NumPy integer array of binary labels (0 or 1), one per
            message. A 1 indicates spam; any label text other than '1' is
            stored as 0.

    Raises:
        ValueError: If a non-empty row does not have exactly two fields.
    """

    messages = []
    labels = []

    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed correctly.
    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            if len(row) != 2:
                raise ValueError(
                    'line {} of {}: expected 2 fields (message, label), got {}'
                    .format(reader.line_num, csv_path, len(row)))

            message, label = row
            messages.append(message)
            labels.append(1 if label == '1' else 0)

    # An explicit dtype keeps the result an integer array even when the file
    # has no rows (np.array([]) would otherwise default to float64).
    return messages, np.array(labels, dtype=int)

In [66]:
# Load the train and test splits through the project helper, plus an extra
# training set through the CSV loader defined in this notebook.
# Presumably util.load_spam_dataset returns (messages, labels) like the CSV
# loader does - TODO confirm against src/util.
train_messages, train_labels = util.load_spam_dataset('data/train.tsv')
test_messages, test_labels = util.load_spam_dataset('data/test.tsv')
# NOTE(review): train2_messages / train2_labels are not read by any later
# cell visible in this notebook.
train2_messages, train2_labels = load_spam_dataset_csv('data/emails_new_train.csv')

In [13]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): the name `model` is rebound to a simpletransformers
# RepresentationModel in a later cell, so this BertModel instance is only
# reachable until that cell runs.
model = BertModel.from_pretrained('bert-base-uncased')

# Set model to evaluation mode
# As the last expression of the cell, this also displays the module tree.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

Step-by-step feature extraction process
1. Tokenization: breaking down text

Generating embeddings
Once the input is tokenized, contextual embeddings can be extracted from different layers of the model. The cell below performs the tokenization step:

In [40]:
# Tokenize all training messages in one batch: pad to a common length,
# truncate over-long messages, and return PyTorch tensors.
# NOTE(review): `encoded_inputs` is not read by any later cell visible here.
encoded_inputs = tokenizer(train_messages, padding=True, truncation=True, return_tensors="pt")

In [54]:
from simpletransformers.language_representation import RepresentationModel

# Example list of messages (can be any length)
# NOTE(review): `sentences` is not used below; the encode call runs on
# train_messages instead.
sentences = [
    "BERT feature extraction transforms NLP",
    "This is another example sentence",
    "Spam classification using BERT embeddings"
]

# Initialize the model (only once)
# This rebinds `model`, replacing the BertModel loaded in an earlier cell.
model = RepresentationModel(
    model_type="bert",
    model_name="bert-base-uncased",
    use_cuda=False  # Set to True if you have a GPU
)

# Get embeddings for all training messages in one call; "mean" combines the
# token vectors of each message into a single vector.
sentence_vectors = model.encode_sentences(train_messages, combine_strategy="mean")

# Output: a single 2-D array with one 768-dimensional row per message
# (the recorded run printed 4457 and (4457, 768)).
print(len(sentence_vectors))  # Number of messages
print(sentence_vectors.shape)  # (n_messages, 768)

4457
(4457, 768)


In [56]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Linear-kernel SVM on the mean-pooled BERT embeddings.
# class_weight='balanced' reweights classes inversely to their frequency.
svm = SVC(kernel='linear', class_weight='balanced')  

# Train
svm.fit(sentence_vectors, train_labels)

In [57]:

# Training accuracy: the SVM is scored on the same vectors it was fitted on,
# so this number does not measure generalization.
y_pred = svm.predict(sentence_vectors)
print("Accuracy:", accuracy_score(train_labels, y_pred))

Accuracy: 1.0


In [58]:
# Embed the held-out test messages with the same model and pooling strategy
# that produced the training vectors.
sentence_vectors_test = model.encode_sentences(test_messages, combine_strategy="mean")

In [59]:

# Held-out accuracy of the SVM on the test-set embeddings.
y_pred_test = svm.predict(sentence_vectors_test)
print("Accuracy:", accuracy_score(test_labels, y_pred_test))

Accuracy: 0.989247311827957


In [60]:
# Hand-written sanity-check set: the first 15 entries are intended as ham
# (label 0) and the last 5 as spam (label 1). The order matters because the
# predictions are read back by position.
test_messages_simple = [
    # Non-Spam (Ham) - 15 examples
    "Hi John, just checking in to see if you're still on for lunch tomorrow at 12:30 PM.",
    "The quarterly financial report has been uploaded to the shared drive for your review.",
    "Team meeting reminder: Wednesday at 3 PM in Conference Room A. Agenda attached.",
    "Your Amazon order #12345 has shipped and will arrive on Friday.",
    "Thanks for your application! We'll review your resume and get back to you next week.",
    "The software update has been completed successfully on all servers.",
    "Mom: Don't forget we're having family dinner this Sunday at 6 PM.",
    "Your monthly bank statement is now available in your online banking portal.",
    "The project deadline has been extended to March 15th per client request.",
    "Password reset confirmation: Your password was changed successfully.",
    "Doctor's appointment reminder: You have a checkup scheduled for May 3rd at 10 AM.",
    "Your subscription to Tech Magazine has been renewed automatically.",
    "The attached document contains the meeting minutes from yesterday's call.",
    "Your flight LAX to JFK is confirmed for departure at 8:45 AM tomorrow.",
    "HR Notification: Please complete your benefits enrollment by Friday.",

    # Spam - 5 examples
    "URGENT: Your account will be suspended unless you verify your details now!",
    "CONGRATULATIONS! You've won a free iPhone - click here to claim your prize!",
    "Make $10,000 a week from home with this simple trick! No experience needed!",
    "Your package couldn't be delivered - click this link to reschedule immediately!",
    "Limited time offer! Act now to get 90% off - this deal expires in 1 hour!"
]

# Embed the 20 messages with the same model and pooling used for training.
matrix_15_5 = model.encode_sentences(test_messages_simple, combine_strategy="mean")

In [65]:

# Predict on the hand-written set; positions 0-14 are intended ham and 15-19
# intended spam. In the recorded run, positions 3 and 7 (ham) were flagged as
# spam and only positions 16 and 17 of the five spam messages were caught.
# NOTE(review): this overwrites the `y_pred` holding the training predictions.
y_pred = svm.predict(matrix_15_5)
y_pred

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0])

In [None]:
# `get_bert_embeddings` is not defined anywhere in this notebook, so the
# original call raised NameError on a fresh kernel. The test-set embeddings
# were already computed above with model.encode_sentences(test_messages,
# combine_strategy="mean"); reuse them instead of encoding a second time.
matrix_test_embeddings = sentence_vectors_test

In [None]:
# Logistic-regression predictions on the test-set embeddings.
# NOTE(review): `log_reg` is not defined in any cell visible in this
# notebook; this cell fails with NameError unless a fitted model is created
# earlier - confirm where it is supposed to come from.
LR_predictions_te = log_reg.predict(matrix_test_embeddings)

In [None]:

# SVM accuracy on matrix_test_embeddings, scored against test_labels.
# NOTE(review): this rebinds `y_pred`, which earlier cells used for other
# prediction sets.
y_pred = svm.predict(matrix_test_embeddings)
print("Accuracy:", accuracy_score(test_labels, y_pred))

In [None]:
# Accuracy as the fraction of test messages whose predicted label equals the
# true label (mean of the element-wise equality).
LR_accuracy_large = np.mean(LR_predictions_te == test_labels)

In [None]:
# Report the logistic-regression test accuracy computed in the previous cell.
print('SimpLe Logistic Regression had an accuracy of {} on the testing set'.format(LR_accuracy_large))