### 1. Download and Import the spam classification dataset

In [None]:
import pandas as pd

In [None]:
# Raw GitHub URL of the CSV; the literal is split in two only to keep the line short
url = (
    "https://raw.githubusercontent.com/rachnadevraj/Spam-Classification/main/"
    "Dataset/SPAM%20text%20message%2020170820%20-%20Data.csv"
)

# Read the remote CSV straight into a DataFrame
df = pd.read_csv(url)

# Show the first five rows as a quick sanity check
print(df.head(5))

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


### 2. Data Profiling, Converting to Sequences of Tokens, and Splitting

In [None]:
!pip install ydata_profiling

Collecting ydata_profiling
  Downloading ydata_profiling-4.12.2-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.8.0,>=0.7.5 (from visions[type_image_path]<0.8.0,>=0.7.5->ydata_profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata_profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata_profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata_profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata_profiling)
  Downloading dacite-1.9.2-py3-none-any.whl.metadata (17 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata_profiling)
  Downloading pywavelets-1.

In [None]:
# Import Profile Report
from ydata_profiling import ProfileReport

# Build an automated profiling report over the freshly loaded DataFrame
profile = ProfileReport(
    df,
    title="Spam Classification Report",
)

# Embed the rendered report in the cell output
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode labels ('spam' → 1, 'ham' → 0)
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")  # Limit vocabulary size, handle out-of-vocab words
tokenizer.fit_on_texts(df['Message'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df['Message'])

# Measure the longest message on the tokenizer's own output. The tokenizer
# applies its own splitting rules, so a whitespace-based count need not match
# the real sequence lengths and could make pad_sequences truncate or over-pad.
max_length = max(len(seq) for seq in sequences)
print(f"Max message length: {max_length}")

# Padding sequences to a fixed length (zeros are appended after the tokens)
X_padded = pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

# Define target variable
y = df['Category'].values

# Split into train and test sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Print shape of processed data
print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

Max message length: 171
Training data shape: (4457, 171), Test data shape: (1115, 171)


In [None]:
pip install torch torchvision torchaudio pandas scikit-learn nltk

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from collections import Counter

# Fetch the NLTK 'punkt' tokenizer package (presumably required by word_tokenize, used later — confirm)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the NLTK 'punkt_tab' tokenizer package as well (presumably needed by newer NLTK releases — confirm)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Normalise the column names of the DataFrame loaded in an earlier cell.
# NOTE: this cell overwrites df['Message'] in place (text -> tokens -> ids),
# so it can only be run once per fresh load of df.
df.columns = ['Category', 'Message']

# Encode labels (Spam = 1, Ham = 0)
label_encoder = LabelEncoder()
df['Category'] = label_encoder.fit_transform(df['Category'])

# Tokenization: lower-case every message, then split it into word tokens
df['Message'] = df['Message'].apply(lambda text: word_tokenize(text.lower()))

# Build vocabulary: count every token, then number the words by descending
# frequency starting at 1 (index 0 stays reserved for padding)
word_counts = Counter()
for tokens in df['Message']:
    word_counts.update(tokens)
vocab = {
    word: rank
    for rank, (word, _) in enumerate(word_counts.most_common(), start=1)
}

# Convert text to sequences of vocabulary indices
df['Message'] = df['Message'].apply(
    lambda tokens: [vocab[token] for token in tokens if token in vocab]
)

# Define max length based on 95th percentile of the sequence lengths
sequence_lengths = df['Message'].apply(len)
max_length = int(np.percentile(sequence_lengths, 95))
print(f"Max Sequence Length: {max_length}")

# Pad sequences
def pad_message(msg, max_len):
    """Return `msg` cut to at most `max_len` items and right-padded with zeros.

    Args:
        msg: list of token indices.
        max_len: target length of the returned list.

    Returns:
        A new list: the first `max_len` items of `msg`, followed by as many
        0 entries as are needed to reach `max_len` when `msg` is shorter.
    """
    shortfall = max_len - len(msg)
    if shortfall > 0:
        # Too short: keep every item and append the missing zeros
        return msg[:max_len] + [0] * shortfall
    # Long enough already: just truncate
    return msg[:max_len]

# Bring every encoded message to exactly max_length entries (truncate or zero-pad)
df['Message'] = df['Message'].apply(lambda x: pad_message(x, max_length))


Max Sequence Length: 39


### 3. RNN (LSTM) model using PyTorch

In [None]:
# Create Dataset and Data loader
class SpamDataset(Dataset):
    """Dataset that pairs each encoded message with its label, both held as tensors."""

    def __init__(self, messages, labels):
        """Convert the inputs to tensors once, up front.

        Args:
            messages: sequences of token indices; stored as an int64 tensor.
            labels: one label per message; stored as a float32 tensor.
        """
        self.messages = torch.tensor(messages, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Number of messages held by the dataset."""
        return len(self.messages)

    def __getitem__(self, idx):
        """Return the `(message, label)` pair stored at position `idx`."""
        sample, target = self.messages[idx], self.labels[idx]
        return sample, target

# Split dataset: 80/20 hold-out with a fixed seed so the split is repeatable
sequence_list = df['Message'].tolist()
target_list = df['Category'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    sequence_list,
    target_list,
    test_size=0.2,
    random_state=42,
)

# Convert to dataset
train_dataset, test_dataset = SpamDataset(X_train, y_train), SpamDataset(X_test, y_test)

# Create DataLoaders: training batches are reshuffled, test batches keep their order
batch_size = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)


In [None]:
# Build the RNN Model
class SpamRNN(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid classifier for padded token sequences.

    Args:
        vocab_size: number of rows in the embedding table (largest index + 1).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output units (1 for binary classification).
        padding_idx: index used for padding (default 0). Its embedding is kept
            at zero and trailing padding is skipped by the LSTM. Pass ``None``
            to feed the raw padded sequence to the LSTM instead.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        # padding_idx keeps the pad row at zero and excludes it from gradient updates
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return probabilities of shape (batch, output_dim) for index tensor `x` of shape (batch, seq)."""
        embedded = self.embedding(x)
        if self.padding_idx is None:
            _, (hidden, _) = self.rnn(embedded)
        else:
            # Count the real tokens per row (assumes padding only trails the
            # message). Rows that are all padding are treated as length 1,
            # because packing does not accept zero-length sequences.
            lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            # With packed input the final hidden state is taken at each row's
            # last real token rather than after the trailing pad positions.
            _, (hidden, _) = self.rnn(packed)
        out = self.fc(hidden[-1])
        return self.sigmoid(out)

# Model parameters
vocab_size = len(vocab) + 1  # Plus 1 for padding index
embedding_dim = 50
hidden_dim = 64
output_dim = 1  # Binary classification (Spam or Ham)

# Seed PyTorch's global RNG so the random weight initialisation is repeatable
# across kernel restarts (same seed value as the train/test split).
torch.manual_seed(42)

# Initialize model
model = SpamRNN(vocab_size, embedding_dim, hidden_dim, output_dim)


### 4. Train and Evaluate the model

In [None]:
# Train the model

# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy on the model's sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for messages, labels in train_loader:
        messages, labels = messages.to(device), labels.to(device)

        optimizer.zero_grad()
        # Drop only the trailing output dimension: a bare .squeeze() would also
        # collapse the batch dimension when a batch holds a single sample, and
        # the loss would then reject the prediction/label shape mismatch.
        predictions = model(messages).squeeze(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/5, Loss: 0.4090
Epoch 2/5, Loss: 0.1876
Epoch 3/5, Loss: 0.0965
Epoch 4/5, Loss: 0.0604
Epoch 5/5, Loss: 0.0395


In [None]:
# Evaluate the Model
from sklearn.metrics import accuracy_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for messages, labels in test_loader:
        messages = messages.to(device)
        # Squeeze only the trailing output dimension so a single-sample batch
        # still yields a 1-D array that list.extend can iterate over.
        outputs = model(messages).squeeze(-1)
        # Threshold the probabilities at 0.5 to get hard 0/1 predictions
        preds = (outputs > 0.5).float().cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.9767


### 5. Experimenting with a different set of hyperparameters

In [None]:
# Model parameters (only embedding_dim differs from the first run: 60 instead of 50)
vocab_size = len(vocab) + 1  # Plus 1 for padding index
embedding_dim = 60
hidden_dim = 64
output_dim = 1  # Binary classification (Spam or Ham)

# Seed PyTorch's global RNG so this experiment starts from a repeatable state
# and its result does not depend on how many random draws earlier cells made.
torch.manual_seed(42)

# Initialize model
model = SpamRNN(vocab_size, embedding_dim, hidden_dim, output_dim)

# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy on the model's sigmoid outputs
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for messages, labels in train_loader:
        messages, labels = messages.to(device), labels.to(device)

        optimizer.zero_grad()
        # Drop only the trailing output dimension: a bare .squeeze() would also
        # collapse the batch dimension when a batch holds a single sample, and
        # the loss would then reject the prediction/label shape mismatch.
        predictions = model(messages).squeeze(-1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/5, Loss: 0.3995
Epoch 2/5, Loss: 0.1745
Epoch 3/5, Loss: 0.0941
Epoch 4/5, Loss: 0.0678
Epoch 5/5, Loss: 0.0444


In [None]:
# Evaluating with the new set of hyperparameters
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for messages, labels in test_loader:
        messages = messages.to(device)
        # Squeeze only the trailing output dimension so a single-sample batch
        # still yields a 1-D array that list.extend can iterate over.
        outputs = model(messages).squeeze(-1)
        # Threshold the probabilities at 0.5 to get hard 0/1 predictions
        preds = (outputs > 0.5).float().cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9830
