In [1]:
# Ask TensorFlow which GPU device it can see and stop early when it is not the expected one

import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  # Any other value (including an empty name) is treated as "no usable GPU"
  raise SystemError('GPU device not found')

SystemError: GPU device not found

In [None]:
# Select the compute device: CUDA when it is available, otherwise the CPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when there is no CUDA device, so only query it when one exists;
# on a CPU-only machine the cell displays "cpu" instead of failing
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

'Tesla P100-PCIE-16GB'

In [None]:
# Target labels, taken straight from the "Impact" column
labels = df_new["Impact"].values

# Wrap every text from the "Text" column in the special tokens BERT expects:
# [CLS] at the start and [SEP] at the end
sentences = ["[CLS] " + text + " [SEP]" for text in df_new["Text"].values]

In [None]:
# Import BERT tokenizer
# (the imported name was spelled `BertConfi`, which does not exist in the package and
# made this cell fail with an ImportError)
from pytorch_pretrained_bert import BertTokenizer, BertConfig
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Split every sentence into BERT word-piece tokens
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'f', '##wc', '##r', '/', 'proxy', 'change', 'request', '[SEP]']

In [None]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad (or truncate) every id sequence at the end so they all have length MAX_LEN.
# This is done once: the conversion + padding used to be computed twice, with the first
# result immediately overwritten by an identical one.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Build one attention mask per sequence: 1.0 where the token id is positive, 0.0 where it is
# not (the padding positions)
attention_masks = [[float(token_id > 0) for token_id in token_ids] for token_ids in input_ids]

# Hold out 10% of the examples for validation. Both calls use the same random_state and
# test_size on inputs of the same length, so the masks are split the same way as the inputs.

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, test_size=0.1, random_state=2018)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, test_size=0.1, random_state=2018)

In [None]:
# Turn every array produced by the split into a torch tensor, the datatype the model consumes

train_inputs, train_masks, train_labels = (
    torch.tensor(train_inputs),
    torch.tensor(train_masks),
    torch.tensor(train_labels),
)
validation_inputs, validation_masks, validation_labels = (
    torch.tensor(validation_inputs),
    torch.tensor(validation_masks),
    torch.tensor(validation_labels),
)

# Number of examples per batch. For fine-tuning BERT on a specific task, the authors recommend 16 or 32
batch_size = 32

# Wrap (inputs, masks, labels) in a dataset and iterate over it in batches with a DataLoader.

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches are read sequentially
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_hidden_states=True)
# Move the model to the `device` selected earlier (CUDA when available, else CPU) rather than
# calling .cuda() unconditionally, which raises on a machine without a CUDA GPU
model.to(device)

In [3]:
from src import util as util
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import collections
from pathlib import Path
import csv

In [15]:
def load_spam_dataset_csv(csv_path):
    """Load a spam dataset stored as a two-column, comma-separated CSV file.

    Every non-blank row must be ``message,label``. A label equal to ``'1'``
    (surrounding whitespace ignored) becomes 1; every other label becomes 0.
    Blank lines are skipped instead of aborting the load.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A ``(messages, labels)`` tuple where ``messages`` is a list of str and
        ``labels`` is a 1-D integer ``np.ndarray`` of the same length.

    Raises:
        ValueError: If a non-blank row does not contain exactly two columns.
    """
    messages = []
    labels = []

    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        for row in reader:
            # csv.reader yields an empty list for a blank line (e.g. a trailing newline)
            if not row:
                continue
            if len(row) != 2:
                raise ValueError(
                    'Expected 2 columns (message, label) but found {} on line {} of {}'.format(
                        len(row), reader.line_num, csv_path))

            message, label = row
            messages.append(message)
            labels.append(1 if label.strip() == '1' else 0)

    # An explicit dtype keeps the result an integer array even when the file has no rows
    return messages, np.array(labels, dtype=int)

In [16]:
# Load the TSV train/test splits through the project helper in `util`, and the additional
# CSV training set through load_spam_dataset_csv defined in this notebook
train_messages, train_labels = util.load_spam_dataset('data/train.tsv')
test_messages, test_labels = util.load_spam_dataset('data/test.tsv')
train2_messages, train2_labels = load_spam_dataset_csv('data/emails_new_train.csv')

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

# The encoder gets its own name: a later cell rebinds the global `model` to a plain BertModel,
# which has no encode() method, so get_embeddings must not look the encoder up under `model`
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight BERT variant
def get_embeddings(email_bodies):
    """Encode texts with the sentence-transformers model.

    Args:
        email_bodies: The texts to embed, passed straight to ``encode``.

    Returns:
        The embeddings as a NumPy array (``convert_to_numpy=True``), one row per text.
    """
    return sentence_model.encode(email_bodies, convert_to_numpy=True)

  from .autonotebook import tqdm as notebook_tqdm





In [25]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

# Initialize BERT
# Load the pretrained 'bert-base-uncased' tokenizer and encoder by name
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Switch the module to evaluation mode before it is used for feature extraction
model.eval()  # Disable dropout for consistent embeddings

def get_embeddings_Pooling(email_bodies, batch_size=32):
    """Embed texts with BERT by mean-pooling the last hidden layer.

    Texts are tokenized in batches (padded to the longest item of the batch and
    truncated to 128 tokens), run through the module-level ``model`` with
    gradients disabled, and averaged over every position whose attention mask
    is 1. Only padding is masked out: the [CLS] and [SEP] positions added by the
    tokenizer are part of the average.

    Args:
        email_bodies: A sequence of str (list, tuple, array, ...) or a single str.
        batch_size: Number of texts sent through the model at once.

    Returns:
        A 2-D ``np.ndarray`` with one row per input text. For empty input the
        shape is ``(0, hidden_size)`` so callers can always rely on two dimensions.
    """
    # The tokenizer wants a plain list of str; a lone string is treated as one text
    if isinstance(email_bodies, str):
        email_bodies = [email_bodies]
    else:
        email_bodies = list(email_bodies)

    if not email_bodies:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []

    for i in tqdm(range(0, len(email_bodies), batch_size),
                 desc="Generating embeddings"):
        batch = email_bodies[i:i + batch_size]

        # Tokenize with BERT's conventions
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
            add_special_tokens=True  # Adds [CLS] and [SEP]
        )

        with torch.no_grad():
            outputs = model(**inputs)

        last_hidden = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']

        # Expand the mask along the embedding dimension so it can zero out padding positions
        mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()

        # Sum the token embeddings of each text, ignoring padding
        sum_embeddings = torch.sum(last_hidden * mask, dim=1)

        # Number of non-padding tokens per text; clamped to avoid division by zero
        sum_mask = torch.clamp(torch.sum(mask, dim=1), min=1e-9)

        # Mean pooling: one vector per text in the batch
        embeddings.append((sum_embeddings / sum_mask).numpy())

    return np.concatenate(embeddings, axis=0)

In [26]:
# Mean-pooled BERT embeddings of the training messages
matrix1 = get_embeddings_Pooling(train_messages)

Generating embeddings: 100%|█████████████████████████████████████████████████████████| 140/140 [14:32<00:00,  6.23s/it]


In [27]:
# Mean-pooled BERT embeddings of the test messages
matrix2 = get_embeddings_Pooling(test_messages)

Generating embeddings: 100%|███████████████████████████████████████████████████████████| 18/18 [00:53<00:00,  2.95s/it]


In [6]:
# Sentence-transformer embeddings of the training messages
matrix_training = get_embeddings(train_messages)

In [8]:
# Sentence-transformer embeddings of the test messages
matrix_test = get_embeddings(test_messages)

In [7]:
# Inspect the dimensions of the training embedding matrix
matrix_training.shape

(4457, 384)

In [10]:
from sklearn.svm import SVC
# Fit a linear-kernel SVM on the sentence-transformer embeddings of the training messages
# (fit() returns the estimator itself, so construction and fitting are chained)
svm = SVC(
    kernel='linear',
    C=1.0,
    class_weight='balanced',
    random_state=42,
).fit(matrix_training, train_labels)

# Classify the test-set embeddings with the fitted model
predictions = svm.predict(matrix_test)

In [12]:

from sklearn.metrics import accuracy_score
# Fraction of test messages whose predicted label matches the true label
print("Accuracy:", accuracy_score(test_labels, predictions))

Accuracy: 0.9910394265232975


In [13]:
# Small hand-written sanity-check set: 15 legitimate messages followed by 5 spam messages,
# so the intended labels are fifteen 0s followed by five 1s
test_messages_simple = [
    # Non-Spam (Ham) - 15 examples
    "Hi John, just checking in to see if you're still on for lunch tomorrow at 12:30 PM.",
    "The quarterly financial report has been uploaded to the shared drive for your review.",
    "Team meeting reminder: Wednesday at 3 PM in Conference Room A. Agenda attached.",
    "Your Amazon order #12345 has shipped and will arrive on Friday.",
    "Thanks for your application! We'll review your resume and get back to you next week.",
    "The software update has been completed successfully on all servers.",
    "Mom: Don't forget we're having family dinner this Sunday at 6 PM.",
    "Your monthly bank statement is now available in your online banking portal.",
    "The project deadline has been extended to March 15th per client request.",
    "Password reset confirmation: Your password was changed successfully.",
    "Doctor's appointment reminder: You have a checkup scheduled for May 3rd at 10 AM.",
    "Your subscription to Tech Magazine has been renewed automatically.",
    "The attached document contains the meeting minutes from yesterday's call.",
    "Your flight LAX to JFK is confirmed for departure at 8:45 AM tomorrow.",
    "HR Notification: Please complete your benefits enrollment by Friday.",

    # Spam - 5 examples
    "URGENT: Your account will be suspended unless you verify your details now!",
    "CONGRATULATIONS! You've won a free iPhone - click here to claim your prize!",
    "Make $10,000 a week from home with this simple trick! No experience needed!",
    "Your package couldn't be delivered - click this link to reschedule immediately!",
    "Limited time offer! Act now to get 90% off - this deal expires in 1 hour!"
]

# Embed the hand-written examples with the sentence-transformer encoder
matrix_15_5 = get_embeddings(test_messages_simple)

In [14]:

# Classify the hand-written examples with the linear SVM and display the predicted labels
predictions_finale = svm.predict(matrix_15_5)
predictions_finale

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1])

## Larger dataset (10,000 emails)

In [17]:

# Sentence-transformer embeddings of the second (CSV) training set
matrix_100 = get_embeddings(train2_messages)

In [28]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV
# SVM pipeline: standardize every embedding dimension, then fit an RBF-kernel SVM
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Puts all features on a comparable scale before the kernel
    ('svm', SVC(
        kernel='rbf',
        class_weight='balanced',
        probability=True,          # Enable predict_proba
        cache_size=1000,           # Kernel cache size, in MB
        random_state=42
    ))
])

# Hyperparameter candidates: 4 values of C x 4 values of gamma = 16 combinations
param_dist = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.001, 0.01]
}

# Randomized search evaluates only n_iter of the 16 combinations. Without a random_state the
# sampled subset (and therefore the selected model and reported scores) changed on every run,
# so the seed is fixed to make the notebook reproducible.
search = RandomizedSearchCV(
    svm_pipeline,
    param_dist,
    n_iter=10,                    # Number of sampled parameter settings
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
    random_state=42
)

In [23]:
# Run the hyperparameter search on the sentence-transformer embeddings of the second training
# set, then predict labels for the test-set embeddings produced by the same encoder
search.fit(matrix_100,train2_labels)
# NOTE: despite its name, simple_svm_3 holds the predicted labels, not a model
simple_svm_3 = search.predict(matrix_test)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(test_labels, simple_svm_3))

Accuracy: 0.46953405017921146


In [29]:
# Re-run the search on the mean-pooled BERT embeddings of the original training set; calling
# fit() again on the same `search` object replaces whatever it was fitted on before
search.fit(matrix1,train_labels)
# simple_svm_4 holds the predicted labels for the mean-pooled test embeddings
simple_svm_4 = search.predict(matrix2)
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(test_labels, simple_svm_4))

Accuracy: 0.996415770609319


In [30]:

# Embed the hand-written examples with the mean-pooling BERT encoder and classify them with
# the currently fitted search; the last line displays the predicted labels
matrix4 = get_embeddings_Pooling(test_messages_simple)
simple_svm_5 = search.predict(matrix4)
simple_svm_5

Generating embeddings: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]


array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1])