<h1>1. Data Loading</h1>

In [None]:
import kagglehub
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Fix every random number generator used in this notebook so runs are repeatable
seed = 42
for seed_rng in (np.random.seed, random.seed, torch.manual_seed):
  seed_rng(seed)

# Download (or reuse the cached copy of) the latest dataset version
path = kagglehub.dataset_download("venky73/spam-mails-dataset")
print("Path to dataset files:", path)

# Load the single CSV shipped with the dataset
df = pd.read_csv(f"{path}/spam_ham_dataset.csv")
df

Path to dataset files: /root/.cache/kagglehub/datasets/venky73/spam-mails-dataset/versions/1


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [None]:
# Show the full text and the label of the first five emails
for row_idx in range(5):
  email = df.loc[row_idx]
  print(email['text'])
  print(f"Label: {email['label']}")
  print('\n')

Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .
Label: ham


Subject: hpl nom for january 9 , 2001
( see attached file : hplnol 09 . xls )
- hplnol 09 . xls
Label: ham


Subject: neon retreat
ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time !
i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute .
on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers confe

<h1>2. Preprocessing Data</h1>

In [None]:
# Class balance: number of emails per label, as a two-column table
label_counts = df['label'].value_counts()
count_df = label_counts.rename_axis('label').reset_index(name='count')
count_df

Unnamed: 0,label,count
0,ham,3672
1,spam,1499


In [None]:
# Drop the leftover index column from the CSV; errors='ignore' keeps this cell re-runnable
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Count the missing entries in every column and show them as a table
null_counts = df.isna().sum()
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': null_counts.to_numpy(),
})
missing_data

Unnamed: 0,Column,Missing Values
0,label,0
1,text,0
2,label_num,0


<h2>2.1 Clean text and Tokenize</h2>

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK corpora needed below (if not already downloaded)
for corpus_name in ('stopwords', 'wordnet'):
  nltk.download(corpus_name, quiet=True)

# English stopwords, kept in a set for fast membership checks
stop_words = set(stopwords.words('english'))
# Lemmatizer shared by the tokenizer
lemmatizer = WordNetLemmatizer()


# Normalize an email's text and turn it into a list of lemmatized tokens
def clean_and_tokenize(text):
  """Clean a raw text string and split it into tokens.

  Steps: lowercase the text, delete every character in ``string.punctuation``,
  collapse whitespace runs into single spaces, split on whitespace, drop the
  tokens found in ``stop_words`` and lemmatize the remaining ones as verbs.

  Args:
    text: The raw text to process (a ``str``).

  Returns:
    A list of lemmatized tokens, in their original order.
  """
  # convert to lower case
  text = text.lower()
  # remove punctuation (characters are deleted, not replaced by a space)
  text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
  # collapse extra whitespace; raw string so that \s is a regex class and not
  # an invalid Python string escape
  text = re.sub(r"\s+", " ", text).strip()

  # Remove stopwords and lemmatize each remaining token as a verb
  tokens = text.split()
  tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens if word not in stop_words]
  return tokens


# Tokenize every email with clean_and_tokenize and preview the first result
df['tokens'] = df['text'].map(clean_and_tokenize)
first_email_tokens = df.loc[0, 'tokens']
print(first_email_tokens)

['subject', 'enron', 'methanol', 'meter', '988291', 'follow', 'note', 'give', 'monday', '4', '3', '00', 'preliminary', 'flow', 'data', 'provide', 'daren', 'please', 'override', 'pop', 'daily', 'volume', 'presently', 'zero', 'reflect', 'daily', 'activity', 'obtain', 'gas', 'control', 'change', 'need', 'asap', 'economics', 'purpose']


<h2>2.2 Encoding</h2>

In [None]:
# Build vocabulary from all tokens.
# Ids start at 1: id 0 is the value pad_sequence pads with, the padding_idx of
# the embedding layer, and the fallback returned by encoding() for unknown
# tokens, so no real token may use it.
# sorted() makes the token -> id mapping identical on every run; the iteration
# order of a set of strings changes between Python processes.
all_tokens = [token for email in df['tokens'] for token in email]
vocab = {token: idx for idx, token in enumerate(sorted(set(all_tokens)), start=1)}

# Encoding tokens
def encoding(tokens):
  """Map a list of tokens to their integer ids in ``vocab``.

  Args:
    tokens: Iterable of token strings.

  Returns:
    A list of ids, one per token; tokens missing from ``vocab`` become 0.
  """
  return [vocab.get(token, 0) for token in tokens]

# Apply encoding
df['encoded'] = df['tokens'].apply(encoding) # encoding is called for each row
print(df.loc[0, 'tokens'])
print(df.loc[0, 'encoded'])

['subject', 'enron', 'methanol', 'meter', '988291', 'follow', 'note', 'give', 'monday', '4', '3', '00', 'preliminary', 'flow', 'data', 'provide', 'daren', 'please', 'override', 'pop', 'daily', 'volume', 'presently', 'zero', 'reflect', 'daily', 'activity', 'obtain', 'gas', 'control', 'change', 'need', 'asap', 'economics', 'purpose']
[5778, 27040, 15033, 37716, 39683, 257, 12898, 8923, 22646, 38036, 19111, 6395, 25823, 21912, 5016, 4124, 44137, 11724, 15521, 40646, 21757, 41123, 2848, 14535, 15331, 21757, 10051, 16456, 44046, 20161, 6812, 15896, 26677, 26176, 37619]


<h2>2.3 Padding Sequences</h2>

In [None]:
# Fixed number of token ids kept per email
max_len = 50

def pad_sequence(seq, max_len):
  """Return `seq` with exactly `max_len` items.

  Shorter lists are right-padded with zeros; lists that are already
  `max_len` long or longer are cut down to their first `max_len` items.
  """
  shortfall = max_len - len(seq)
  if shortfall <= 0:
    # nothing to pad: keep only the leading max_len entries
    return seq[:max_len]
  # append as many zeros as are missing
  return seq + [0] * shortfall

# Pad or truncate every encoded email to max_len ids
df['padded'] = df['encoded'].map(lambda ids: pad_sequence(ids, max_len))
df

Unnamed: 0,label,text,label_num,tokens,encoded,padded
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,"[subject, enron, methanol, meter, 988291, foll...","[5778, 27040, 15033, 37716, 39683, 257, 12898,...","[5778, 27040, 15033, 37716, 39683, 257, 12898,..."
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,"[subject, hpl, nom, january, 9, 2001, see, att...","[5778, 46270, 41377, 30850, 41088, 11442, 1030...","[5778, 46270, 41377, 30850, 41088, 11442, 1030..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,"[subject, neon, retreat, ho, ho, ho, around, w...","[5778, 17388, 5512, 6683, 6683, 6683, 9122, 44...","[5778, 17388, 5512, 6683, 6683, 6683, 9122, 44..."
3,spam,"Subject: photoshop , windows , office . cheap ...",1,"[subject, photoshop, windows, office, cheap, m...","[5778, 11886, 34376, 22662, 3321, 34830, 12134...","[5778, 11886, 34376, 22662, 3321, 34830, 12134..."
4,ham,Subject: re : indian springs\r\nthis deal is t...,0,"[subject, indian, spring, deal, book, teco, pv...","[5778, 18732, 10345, 6390, 12089, 20210, 38260...","[5778, 18732, 10345, 6390, 12089, 20210, 38260..."
...,...,...,...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0,"[subject, put, 10, ft, transport, volumes, dec...","[5778, 8458, 34968, 32859, 37635, 43994, 45797...","[5778, 8458, 34968, 32859, 37635, 43994, 45797..."
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,"[subject, 3, 4, 2000, follow, noms, hpl, take,...","[5778, 19111, 38036, 16553, 257, 15742, 46270,...","[5778, 19111, 38036, 16553, 257, 15742, 46270,..."
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0,"[subject, calpine, daily, gas, nomination, jul...","[5778, 15279, 21757, 44046, 44430, 2098, 31541...","[5778, 15279, 21757, 44046, 44430, 2098, 31541..."
5169,ham,Subject: industrial worksheets for august 2000...,0,"[subject, industrial, worksheets, august, 2000...","[5778, 15229, 7775, 17916, 16553, 10051, 24782...","[5778, 15229, 7775, 17916, 16553, 10051, 24782..."


In [None]:
# Compare the first email before and after padding
for column in ('encoded', 'padded'):
  print(df.loc[0, column])

[5778, 27040, 15033, 37716, 39683, 257, 12898, 8923, 22646, 38036, 19111, 6395, 25823, 21912, 5016, 4124, 44137, 11724, 15521, 40646, 21757, 41123, 2848, 14535, 15331, 21757, 10051, 16456, 44046, 20161, 6812, 15896, 26677, 26176, 37619]
[5778, 27040, 15033, 37716, 39683, 257, 12898, 8923, 22646, 38036, 19111, 6395, 25823, 21912, 5016, 4124, 44137, 11724, 15521, 40646, 21757, 41123, 2848, 14535, 15331, 21757, 10051, 16456, 44046, 20161, 6812, 15896, 26677, 26176, 37619, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


<h1>3. Processing Data</h1>

<h2>3.1 Splitting Data</h2>

In [None]:
from sklearn.model_selection import train_test_split

# Features: one row of padded token ids per email; targets: the numeric label column
X = np.asarray(df['padded'].tolist(), dtype=np.int32)
y = df['label_num'].to_numpy(dtype=np.int32)

# 80/20 split, stratified on the label so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

<h2>3.2 Custom PyTorch Dataset and DataLoader</h2>

In [None]:
from torch.utils.data import Dataset, DataLoader

class SpamDataset(Dataset):
  """Dataset pairing token-id sequences with their labels.

  Args:
    texts: Indexable collection of token-id sequences (one per sample).
    labels: Indexable collection of labels, aligned with ``texts``.

  Raises:
    ValueError: If ``texts`` and ``labels`` do not have the same length.
  """

  def __init__(self, texts, labels):
    # Fail early instead of hitting an IndexError (or silently ignoring
    # surplus labels) in the middle of an epoch
    if len(texts) != len(labels):
      raise ValueError(
        f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
      )
    self.texts = texts
    self.labels = labels

  def __len__(self):
    """Return the number of samples."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return sample ``idx`` as ``(text_tensor, label_tensor)``.

    ``text_tensor`` has dtype ``torch.long`` and ``label_tensor`` has dtype
    ``torch.float``.
    """
    # torch.tensor copies the row directly; no per-token Python loop is needed
    text_tensor = torch.tensor(self.texts[idx], dtype=torch.long)
    label_tensor = torch.tensor(self.labels[idx], dtype=torch.float)
    return text_tensor, label_tensor

# Datasets for the two splits
train_dataset = SpamDataset(texts=X_train, labels=y_train)
test_dataset = SpamDataset(texts=X_test, labels=y_test)

# Mini-batch loaders: shuffle the training samples, keep the test order fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

<h2>3.3 Defining the LSTM</h2>

In [None]:
# Embedding Layer: converts word indices to dense vectors
# LSTM Layer: processes sequences and captures context
# Fully Connected Layer: Maps the final state to a single output (with sigmoid activation)
class LSTMClassifier(nn.Module):
  """Single-layer, unidirectional LSTM classifier over token-id sequences.

  Args:
    vocab_size: Number of distinct tokens; the embedding table gets one extra
      row because index 0 is reserved for padding.
    embedding_dim: Size of each token embedding.
    hidden_dim: Size of the LSTM hidden state.
    output_dim: Number of output units.
    dropout_rate: Dropout probability applied to the final hidden state.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_rate=0.5):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
    self.dropout = nn.Dropout(dropout_rate)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return sigmoid probabilities of shape ``(batch, output_dim)``.

    Args:
      x: Long tensor of token ids with shape ``(batch, seq_len)``, where
        trailing positions equal to the padding index are padding.
    """
    embedded = self.embedding(x)

    # True length of each row = position of its last non-padding token.
    # Rows made only of padding are given length 1 because packing rejects
    # zero-length sequences.
    non_pad = x != self.embedding.padding_idx
    positions = torch.arange(1, x.size(1) + 1, device=x.device)
    lengths = (non_pad * positions).max(dim=1).values.clamp(min=1)

    # Pack so the LSTM stops at each row's last real token; otherwise the
    # returned hidden state would be the one reached after stepping through
    # all the trailing padding. Lengths must live on the CPU.
    packed = nn.utils.rnn.pack_padded_sequence(
      embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
    )
    _, (hidden, _) = self.lstm(packed)

    # Final hidden state of the last (here: only) LSTM layer
    hidden = hidden[-1]
    hidden = self.dropout(hidden)
    out = self.fc(hidden)
    return self.sigmoid(out)


# Instantiate the classifier; the embedding table is sized from the vocabulary
model = LSTMClassifier(
    vocab_size=len(vocab),
    embedding_dim=50,
    hidden_dim=64,
    output_dim=1,
    dropout_rate=0.5,
)

<h1>4. Training</h1>

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimized with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 20


for epoch in range(num_epochs):
  model.train()                         # Enable dropout
  epoch_loss = 0.0
  for texts, labels in train_loader:
    optimizer.zero_grad()               # Reset gradients
    outputs = model(texts)              # Forward pass
    # Drop only the trailing output dimension. A bare squeeze() would also
    # remove the batch dimension of a batch of size 1, and the result would
    # no longer match the shape of `labels`.
    outputs = outputs.squeeze(-1)
    loss = criterion(outputs, labels)   # Compute loss
    loss.backward()                     # Backpropagation
    optimizer.step()                    # Update model weights
    epoch_loss += loss.item()
  # Mean of the per-batch losses for this epoch
  print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")

Epoch 1/20, Loss: 0.5572
Epoch 2/20, Loss: 0.3640
Epoch 3/20, Loss: 0.2767
Epoch 4/20, Loss: 0.1900
Epoch 5/20, Loss: 0.1496
Epoch 6/20, Loss: 0.1203
Epoch 7/20, Loss: 0.0903
Epoch 8/20, Loss: 0.0807
Epoch 9/20, Loss: 0.0877
Epoch 10/20, Loss: 0.0700
Epoch 11/20, Loss: 0.0677
Epoch 12/20, Loss: 0.0640
Epoch 13/20, Loss: 0.0699
Epoch 14/20, Loss: 0.0632
Epoch 15/20, Loss: 0.0626
Epoch 16/20, Loss: 0.1142
Epoch 17/20, Loss: 0.0546
Epoch 18/20, Loss: 0.0521
Epoch 19/20, Loss: 0.0546
Epoch 20/20, Loss: 0.0569


<h1>5. Evaluation</h1>

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support

# Switch to inference mode (disables dropout) and collect test-set predictions
model.eval()
all_preds = []
all_labels = []
all_outputs = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        # Drop only the trailing output dimension so that a batch of size 1
        # still yields a 1-D tensor (and tolist() a list, not a scalar)
        outputs_squeeze = outputs.squeeze(-1)
        preds = (outputs_squeeze > 0.5).long() # Threshold the sigmoid output
        all_preds.extend(preds.tolist())
        # Store labels as ints so they have the same type as the predictions
        all_labels.extend(labels.long().tolist())
        all_outputs.extend(outputs_squeeze.tolist())

cm = confusion_matrix(all_labels, all_preds)
print("\nStandard Threshold (0.5) Results:")
print("Confusion Matrix:", cm)
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['Ham', 'Spam']))

# Threshold optimization for spam detection
# NOTE(review): the threshold is selected on the same test set it is then
# evaluated on, so the "optimized" metrics below are optimistic; a separate
# validation split would give an unbiased estimate.
import numpy as np
thresholds = np.arange(0.1, 1.0, 0.05)
results = []

for threshold in thresholds:
    preds = [1 if output > threshold else 0 for output in all_outputs]
    # average=None returns one value per class instead of a single average
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        preds,
        average=None,
        zero_division=0
    )

    # Get metrics for spam class (class 1)
    if len(precision) > 1 and len(recall) > 1 and len(f1) > 1:
        spam_precision = precision[1]
        spam_recall = recall[1]
        spam_f1 = f1[1]

        results.append({
            'threshold': threshold,
            'spam_precision': spam_precision,
            'spam_recall': spam_recall,
            'spam_f1': spam_f1
        })

# Find the threshold with the best F1 score for spam
if results:
    # best_threshold is the whole result record (a dict), not just the number
    best_threshold = max(results, key=lambda x: x['spam_f1'])
    print(f"\nBest threshold for spam detection: {best_threshold['threshold']:.2f}")
    print(f"With this threshold, spam precision: {best_threshold['spam_precision']:.4f}")
    print(f"With this threshold, spam recall: {best_threshold['spam_recall']:.4f}")
    print(f"With this threshold, spam F1: {best_threshold['spam_f1']:.4f}")

    # Final evaluation with optimized threshold
    final_preds = [1 if output > best_threshold['threshold'] else 0 for output in all_outputs]
    print("\nFinal Classification Report with Optimized Threshold:")
    print(classification_report(all_labels, final_preds, target_names=['Ham', 'Spam']))

    # Print confusion matrix for optimized threshold
    final_cm = confusion_matrix(all_labels, final_preds)
    print("\nConfusion Matrix with Optimized Threshold:", final_cm)
else:
    print("Could not determine best threshold - check if both classes are represented in test data")


Standard Threshold (0.5) Results:
Confusion Matrix: [[711  24]
 [ 10 290]]

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.97      0.98       735
        Spam       0.92      0.97      0.94       300

    accuracy                           0.97      1035
   macro avg       0.95      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035


Best threshold for spam detection: 0.40
With this threshold, spam precision: 0.9238
With this threshold, spam recall: 0.9700
With this threshold, spam F1: 0.9463

Final Classification Report with Optimized Threshold:
              precision    recall  f1-score   support

         Ham       0.99      0.97      0.98       735
        Spam       0.92      0.97      0.95       300

    accuracy                           0.97      1035
   macro avg       0.96      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035


Confusion Matrix with Optim