# AT&T Spam Detector

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchinfo import summary

# Transformers
from transformers import pipeline

# ML
from sklearn.metrics import classification_report, confusion_matrix

# NLP
import tiktoken

# Data visualization
from plotly import graph_objects as go
import plotly.express as px 

# OS
import os

# Device setting
device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


## EDA

Importing data in a `Pandas` `DataFrame` using `ISO-8859-1` encoding.

In [None]:
# Load the raw SMS file into a DataFrame (the file is read as ISO-8859-1)
dataset = pd.read_csv(
    "../data/spam.csv",
    encoding='ISO-8859-1',
)

# Preview the first rows
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Removing useless columns and giving a more relevant name to the ones left.

In [3]:
# Rename the two informative columns, then keep only those two.
# Renaming first keeps this cell re-runnable: on a second run "v1"/"v2" no longer
# exist, so the rename is a no-op and the selection below still succeeds
# (selecting "v1"/"v2" directly would raise a KeyError on re-run).
dataset = (
    dataset
    .rename(columns={"v1": "label", "v2": "text"})
    .loc[:, ["label", "text"]]
)

# Print first lines of the dataset to check updates
dataset.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Checking the distribution of the labels.

In [4]:
# Count the messages per label once and reuse the result for both axes
label_counts = dataset["label"].value_counts()

# One bar per label, bar height = number of messages
fig = go.Figure(data=[go.Bar(x=label_counts.index, y=label_counts.values)])

fig.update_layout(
    title='Distribution of label',
    xaxis_title='Label',
    yaxis_title='Count'
)

fig.show()

The data are imbalanced: approximately 13% of the SMS messages are labeled as spam.

Transcoding labels into numbers.

In [5]:
# Lookup table from the textual label to its numeric class id
label_id = {"ham": 0, "spam": 1}

# Translate every textual label into its numeric id (an unknown label raises KeyError)
dataset["label_id"] = [label_id[name] for name in dataset["label"]]
dataset.head()

Unnamed: 0,label,text,label_id
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


Creating a new column with tokenized text using `cl100k_base` from `tiktoken`.

In [6]:
# Instantiate the `cl100k_base` encoding from tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")

# Encode every message into its list of token ids and store it in a new column
dataset["text_tokenized"] = dataset["text"].map(tokenizer.encode)

# Print first lines of the dataset to check the new column
dataset.head()

Unnamed: 0,label,text,label_id,text_tokenized
0,ham,"Go until jurong point, crazy.. Available only ...",0,"[11087, 3156, 16422, 647, 1486, 11, 14599, 497..."
1,ham,Ok lar... Joking wif u oni...,0,"[11839, 45555, 1131, 622, 10979, 289, 333, 577..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[11180, 4441, 304, 220, 17, 264, 74860, 398, 1..."
3,ham,U dun say so early hor... U c already then say...,0,"[52, 50116, 2019, 779, 4216, 4917, 1131, 549, ..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"[45, 1494, 358, 1541, 956, 1781, 568, 5900, 31..."


Checking the mean length of sequences in order to build the data loader, since all sequences need to be of the same length.

In [7]:
# Number of tokens in each encoded message
seq_lens = list(map(len, dataset["text_tokenized"]))

# Average sequence length over all messages
np.mean(seq_lens)

np.float64(22.49551328068916)

Truncating or padding every sequence to a length of 25 tokens, which is approximately the average length.

In [8]:
# Fixed sequence length (in tokens) and the id used to fill short sequences
MAX_SEQ_LEN = 25
PAD_TOKEN_ID = 0
# NOTE(review): the tokenizer ids are not shifted before padding, so PAD_TOKEN_ID
# may coincide with a regular token id of the `cl100k_base` vocabulary (id 0 is
# believed to be "!"). Such a token would be indistinguishable from padding —
# confirm and, if needed, reserve a dedicated padding id.

# Truncate sequences longer than MAX_SEQ_LEN tokens; right-pad shorter ones with PAD_TOKEN_ID
dataset["text_tokenized"] = [
    seq[:MAX_SEQ_LEN] + [PAD_TOKEN_ID] * (MAX_SEQ_LEN - len(seq))
    for seq in dataset["text_tokenized"]
]

Building a torch dataset object from the token sequences and labels, then splitting the data into a training set and a validation set.

In [9]:
# This class is used to convert tokenized text and their corresponding labels into a PyTorch Dataset object
class CustomDataset(Dataset):
    """Map-style dataset pairing token-id sequences with their labels.

    Args:
        texts: Token-id sequences, all of the same length (list of lists,
            pandas Series of lists, NumPy array or tensor).
        labels: One numeric label per sequence (list, pandas Series, NumPy
            array or tensor).

    Indexing returns a `(text, label)` tuple where `text` is a long tensor and
    `label` is a float32 scalar tensor.
    """

    # Initializes the dataset by storing texts and labels as PyTorch tensors.
    def __init__(self, texts, labels):

        # Go through plain Python lists first: handing a pandas Series straight to
        # torch.tensor makes torch probe it with series[0], which is a *label*
        # lookup and fails whenever the index is not the default 0..n-1 range
        # (e.g. after filtering or shuffling the DataFrame).
        texts = texts.tolist() if hasattr(texts, "tolist") else texts
        labels = labels.tolist() if hasattr(labels, "tolist") else labels

        # Convert text sequences to a PyTorch tensor (long type since they are indices)
        self.texts = torch.tensor(texts, dtype=torch.long)

        # Convert labels to a PyTorch tensor (float32 for compatibility with loss functions)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    # Returns the total number of samples in the dataset
    def __len__(self):

        return len(self.texts)

    # Retrieves a single data point (text and label) from the dataset based on an index
    def __getitem__(self, idx):

        return self.texts[idx], self.labels[idx]

# Instantiate a torch-formatted dataset using our CustomDataset class
torch_formated_dataset = CustomDataset(dataset["text_tokenized"], dataset["label_id"])

# Split dataset into training (80%) and validation (20%).
# A seeded generator makes the split identical on every "Restart & Run All",
# so the reported validation metrics refer to the same held-out messages each time.
train_size = int(0.8 * len(torch_formated_dataset))
val_size = len(torch_formated_dataset) - train_size
train_dataset, val_dataset = random_split(
    torch_formated_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoaders (only the training batches are reshuffled at each epoch)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Grab one training batch; `text` is reused as sample input for the model summary
text, label = next(iter(train_loader))

Building the model. It is a classifier, since we have to predict classes.

In [10]:
# This class is used to predict a classification (e.g. spam/ham, negative/positive reviews, etc.) by taking tokenized text as input 
class TextClassifier(nn.Module):
    """Binary text classifier: embed the tokens, average them, project to one probability.

    Args:
        vocab_size: Number of rows of the embedding table (one per token id).
        embed_dim: Size of each token embedding.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()

        # Token id -> dense vector; index 0 is declared as the padding index
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Averages over the sequence axis, leaving one vector per message
        self.pooling = nn.AdaptiveAvgPool1d(1)

        # Projects the averaged embedding onto a single output value
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        """Return one probability per message, shape (batch, 1).

        `text` is a (batch, sequence_length) tensor of token ids.
        """
        # (batch, sequence_length, embed_dim)
        token_vectors = self.embedding(text)

        # The pooling layer expects (batch, channels, sequence_length)
        channels_first = token_vectors.permute(0, 2, 1)

        # Average over the sequence, then drop the pooled axis of size 1
        message_vector = self.pooling(channels_first).squeeze(2)

        # Linear projection followed by a sigmoid to obtain a value in (0, 1)
        score = self.fc(message_vector)
        return torch.sigmoid(score)

# Build the classifier: one embedding row per tokenizer id, 32-dimensional embeddings
model = TextClassifier(tokenizer.n_vocab, 32)

# Print the layer-by-layer summary, using the sample batch `text` as input
summary(model, input_data=text)

Layer (type:depth-idx)                   Output Shape              Param #
TextClassifier                           [32, 1]                   --
├─Embedding: 1-1                         [32, 25, 32]              3,208,864
├─AdaptiveAvgPool1d: 1-2                 [32, 32, 1]               --
├─Linear: 1-3                            [32, 1]                   33
Total params: 3,208,897
Trainable params: 3,208,897
Non-trainable params: 0
Total mult-adds (M): 102.68
Input size (MB): 0.01
Forward/backward pass size (MB): 0.21
Params size (MB): 12.84
Estimated Total Size (MB): 13.05

Defining the train function, which takes a model, a training and a validation DataLoader, a criterion and an optimizer. Since it's a classification problem with only two classes, we'll use `BCELoss` (Binary Cross Entropy) as the criterion. We'll use the `Adam` optimizer because it is an adaptive algorithm that adjusts learning rates during training.

In [11]:
# Binary cross-entropy computed on probabilities (the model already applies a sigmoid)
criterion = nn.BCELoss()

# Adam optimizer over all the model parameters
optimizer = optim.Adam(params=model.parameters(), lr=5e-4)

# This function is used to train a PyTorch model with training and validation datasets
def train(model, train_loader, val_loader, criterion, optimizer, epochs=100):
    """Train `model` and evaluate it on `val_loader` after every epoch.

    Args:
        model: Module called as `model(inputs)`; its output is flattened to one
            value per sample and compared with a 0.5 threshold for accuracy.
        train_loader: DataLoader yielding `(inputs, labels)` training batches.
        val_loader: DataLoader yielding `(inputs, labels)` validation batches.
        criterion: Loss called as `criterion(outputs, labels)`.
        optimizer: Optimizer that updates the parameters of `model`.
        epochs: Number of passes over `train_loader`.

    Returns:
        dict with keys 'loss', 'val_loss', 'accuracy' and 'val_accuracy', each
        holding one value per epoch. Losses are the mean of the per-batch
        losses; accuracies are correct predictions / number of samples.
    """

    # Dictionary to store training & validation loss and accuracy over epochs
    history = {'loss': [], 'val_loss': [], 'accuracy': [], 'val_accuracy': []}

    for epoch in range(epochs):  # Loop over the number of epochs
        model.train()  # Set model to training mode
        total_loss, correct = 0, 0  # Initialize total loss and correct predictions

        # Training loop
        for inputs, labels in train_loader:
            optimizer.zero_grad()  # Reset gradients before each batch

            # Flatten to one value per sample. A bare squeeze() would also remove
            # the batch axis when a batch holds a single sample, producing a 0-d
            # tensor whose shape no longer matches `labels`.
            outputs = model(inputs).reshape(-1)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation (compute gradients)
            optimizer.step()  # Update model parameters

            total_loss += loss.item()  # Accumulate batch loss
            correct += ((outputs > 0.5) == labels).sum().item()  # Count correct predictions

        # Compute average loss and accuracy for training
        train_loss = total_loss / len(train_loader)
        train_acc = correct / len(train_loader.dataset)

        # Validation phase (without gradient computation)
        model.eval()  # Set model to evaluation mode
        val_loss, val_correct = 0, 0
        with torch.no_grad():  # No need to compute gradients during validation
            for inputs, labels in val_loader:
                outputs = model(inputs).reshape(-1)  # Forward pass (same flattening as in training)
                loss = criterion(outputs, labels)  # Compute loss
                val_loss += loss.item()  # Accumulate validation loss
                val_correct += ((outputs > 0.5) == labels).sum().item()  # Count correct predictions

        # Compute average loss and accuracy for validation
        val_loss /= len(val_loader)
        val_acc = val_correct / len(val_loader.dataset)

        # Store metrics in history dictionary
        history['loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['accuracy'].append(train_acc)
        history['val_accuracy'].append(val_acc)

        # Print training progress
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return history  # Return training history

# Run the training loop for 20 epochs and keep the per-epoch metrics
history = train(model, train_loader, val_loader, criterion, optimizer, epochs=20)

Epoch [1/20], Loss: 0.6974, Acc: 0.4501, Val Loss: 0.6578, Val Acc: 0.7399
Epoch [2/20], Loss: 0.6165, Acc: 0.8299, Val Loss: 0.5736, Val Acc: 0.8807
Epoch [3/20], Loss: 0.5297, Acc: 0.8914, Val Loss: 0.4875, Val Acc: 0.8951
Epoch [4/20], Loss: 0.4472, Acc: 0.9040, Val Loss: 0.4121, Val Acc: 0.9013
Epoch [5/20], Loss: 0.3785, Acc: 0.9154, Val Loss: 0.3520, Val Acc: 0.9121
Epoch [6/20], Loss: 0.3227, Acc: 0.9251, Val Loss: 0.3053, Val Acc: 0.9193
Epoch [7/20], Loss: 0.2793, Acc: 0.9376, Val Loss: 0.2685, Val Acc: 0.9283
Epoch [8/20], Loss: 0.2436, Acc: 0.9479, Val Loss: 0.2385, Val Acc: 0.9336
Epoch [9/20], Loss: 0.2150, Acc: 0.9554, Val Loss: 0.2141, Val Acc: 0.9435
Epoch [10/20], Loss: 0.1914, Acc: 0.9610, Val Loss: 0.1936, Val Acc: 0.9480
Epoch [11/20], Loss: 0.1715, Acc: 0.9663, Val Loss: 0.1765, Val Acc: 0.9534
Epoch [12/20], Loss: 0.1562, Acc: 0.9704, Val Loss: 0.1622, Val Acc: 0.9578
Epoch [13/20], Loss: 0.1409, Acc: 0.9749, Val Loss: 0.1500, Val Acc: 0.9623
Epoch [14/20], Loss: 

Displaying the evolution of loss and accuracy for both training and validation using `plotly` `graph_objects`.

In [12]:
# Start from an empty figure and add one line per loss series
fig = go.Figure()
fig.add_trace(go.Scatter(y=history["loss"], name="Training loss", mode="lines"))
fig.add_trace(go.Scatter(y=history["val_loss"], name="Validation loss", mode="lines"))

# Set the figure title and the axis titles
fig.update_layout(
    title='Training and validation loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy'
)

# Render the figure
fig.show()

# Create one graph with two traces: training and validation accuracy per epoch
fig = go.Figure(
    data=[
        go.Scatter(
            y=history["accuracy"],
            name="Training accuracy",
            mode="lines"),
        go.Scatter(
            y=history["val_accuracy"],
            name="Validation accuracy",
            mode="lines")
])

# Update the global and axes title
# (this figure plots accuracy, so the y axis must not be labelled as a loss)
fig.update_layout(
    title='Training and validation accuracy across epochs',
    xaxis_title='epochs',
    yaxis_title='Accuracy'
)

# Display the graph
fig.show()

Evaluating the model through a function and saving predictions.

In [20]:
# Function to evaluate the model and get predictions
def evaluate_model_predictions(model, dataloader):
    """Run `model` over `dataloader` and collect true and predicted labels.

    Args:
        model: Module called as `model(inputs)`; its output is flattened to one
            value per sample and rounded to the nearest integer (0.0 or 1.0 for
            probabilities).
        dataloader: Iterable yielding `(inputs, labels)` batches.

    Returns:
        pandas DataFrame with one row per sample and two columns:
        "True_Label" and "Predicted".
    """
    model.eval()  # Set model to evaluation mode
    predictions = []
    labels = []

    with torch.no_grad():  # No gradients needed during evaluation
        for inputs, batch_labels in dataloader:

            # Flatten to one value per sample. A bare squeeze() would turn a
            # single-sample batch into a 0-d tensor, which list.extend() cannot
            # iterate over.
            outputs = torch.round(model(inputs).reshape(-1))

            # Save results (.cpu() is a no-op for CPU tensors and is required
            # before .numpy() for tensors living on an accelerator)
            labels.extend(batch_labels.cpu().numpy())
            predictions.extend(outputs.cpu().numpy())

    # Convert to DataFrame for analysis
    df_results = pd.DataFrame({
        "True_Label": labels,
        "Predicted": predictions
    })

    return df_results

# Collect true labels and predictions on both splits (training first, then validation)
model_predictions_train, model_predictions_val = (
    evaluate_model_predictions(model, loader)
    for loader in (train_loader, val_loader)
)

Displaying model results with a classification report (using `classification_report` from `sklearn`).

In [14]:
# Print both reports in one call; the empty string yields the blank separator line
print(
    "Training classification report",
    classification_report(model_predictions_train["True_Label"], model_predictions_train["Predicted"]),
    "",
    "Validation classification report",
    classification_report(model_predictions_val["True_Label"], model_predictions_val["Predicted"]),
    sep="\n",
)

Training classification report
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      3864
         1.0       1.00      0.90      0.95       593

    accuracy                           0.99      4457
   macro avg       0.99      0.95      0.97      4457
weighted avg       0.99      0.99      0.99      4457


Validation classification report
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       961
         1.0       0.98      0.87      0.92       154

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



Displaying detailed model results through a confusion matrix  (using `confusion_matrix` from `sklearn`).

In [15]:
# Function to easily display a confusion matrix with a specific name
def display_confusion_matrix(true_labels, predicted_labels, title):
    """Show the confusion matrix of the given labels as an annotated heatmap.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by the model.
        title: Title displayed above the figure.
    """
    class_names = ['Ham', 'Spam']

    # Rows are true labels, columns are predicted labels
    matrix = confusion_matrix(true_labels, predicted_labels)

    heatmap = px.imshow(
        matrix,
        labels={"x": "Predicted labels", "y": "True labels"},
        x=class_names,
        y=class_names,
        title=title,
    )

    # Write each cell's count on top of its tile
    heatmap.update_traces(text=matrix, texttemplate="%{text}")

    heatmap.show()

# Confusion matrix of the custom model on the training split
display_confusion_matrix(
    true_labels=model_predictions_train["True_Label"],
    predicted_labels=model_predictions_train["Predicted"],
    title="Confusion Matrix on train set",
)

# Confusion matrix of the custom model on the validation split
display_confusion_matrix(
    true_labels=model_predictions_val["True_Label"],
    predicted_labels=model_predictions_val["Predicted"],
    title="Confusion Matrix on validation set",
)

We already have a pretty accurate model which performs well on both sets. Training for 20 epochs takes only about 10 seconds, which is a fair time considering the results. We'll see if we can get better results with a `transformer`.

## Transformer

Using `roberta-base-finetuned-sms-spam-detection` from `HuggingFace`, since it perfectly suits our needs (i.e. spam detection within SMS).

In [16]:
# Load the fine-tuned RoBERTa checkpoint as a text-classification pipeline
classifier = pipeline(
    task="text-classification",
    model="mariagrandury/roberta-base-finetuned-sms-spam-detection",
)

# Classify every message of the dataset
predictions = classifier(dataset["text"].tolist())

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use mps:0


Saving the transformer model's results in a `DataFrame`, reworking labels to match the existing pattern.

In [17]:
# Build a DataFrame from the transformer's predictions
transformer_predictions = pd.DataFrame(predictions)

# Keep the piece after the first underscore of each label as an integer class id
# (presumably the raw labels look like "LABEL_0" / "LABEL_1" — confirm against the model card)
transformer_predictions["label"] = [
    int(raw_label.split("_")[1]) for raw_label in transformer_predictions["label"]
]
transformer_predictions.head()

Unnamed: 0,label,score
0,0,0.999803
1,0,0.999831
2,1,0.999383
3,0,0.999819
4,0,0.999817


Displaying model results with a classification report.

In [18]:
# Print the heading and the transformer's report (true ids vs. predicted ids) in one call
print(
    "Classification report",
    classification_report(y_true=dataset["label_id"], y_pred=transformer_predictions["label"]),
    sep="\n",
)

Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4825
           1       0.99      0.98      0.98       747

    accuracy                           1.00      5572
   macro avg       0.99      0.99      0.99      5572
weighted avg       1.00      1.00      1.00      5572



Displaying detailed model results through a confusion matrix.

In [19]:
# Confusion matrix of the transformer's predictions over the whole dataset
display_confusion_matrix(
    true_labels=dataset["label_id"],
    predicted_labels=transformer_predictions["label"],
    title="Confusion Matrix on all set",
)

The pre-trained RoBERTa performed better than our model. As a more sophisticated model with much more training, it proved to be the most effective solution for spam filtering. Implementing it using the Transformers library was very easy and efficient.

Our custom, lightweight embedding model performed very well, although it's hard to be certain how well it would generalize to a much larger and more diverse set of messages, given the limited data we currently have.