The goal is to classify SMS messages as spam or ham (not spam) using different text-processing techniques and machine learning models.

In [None]:
import os
import re
import string

import kagglehub
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Download dataset
# kagglehub returns the local directory the dataset was extracted into.
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

# Build the file path portably instead of concatenating with a hard-coded "/".
csv_path = os.path.join(path, "spam.csv")

# Load only the two relevant columns and give them descriptive names.
df = pd.read_csv(csv_path, encoding="latin-1", usecols=["v1", "v2"]).rename(
    columns={"v1": "label", "v2": "text"}
)



Downloading from https://www.kaggle.com/api/v1/datasets/download/uciml/sms-spam-collection-dataset?dataset_version_number=1...


100%|██████████| 211k/211k [00:00<00:00, 66.9MB/s]

Extracting files...





In [None]:
# Convert labels to numeric
label_codes = {"ham": 0, "spam": 1}
encoded_labels = df["label"].map(label_codes)

# Series.map yields NaN for any value missing from the mapping (for example
# when this cell is run a second time and the labels are already 0/1).
# Fail loudly here instead of passing NaN targets on to the models.
if encoded_labels.isna().any():
    unexpected = sorted(df.loc[encoded_labels.isna(), "label"].astype(str).unique())
    raise ValueError(f"Unexpected label values (was this cell re-run?): {unexpected}")

df["label"] = encoded_labels

# Text Cleaning Function
# The patterns and the translation table are built once here rather than on
# every call, because clean_text is applied to every message in the dataset.
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text: str) -> str:
    """Return a normalized copy of *text* for feature extraction.

    Steps, in order: lowercase, drop digit runs, drop ASCII punctuation
    (``string.punctuation``), collapse whitespace runs to a single space
    and strip leading/trailing whitespace.

    Punctuation is deleted rather than replaced by a space, so tokens joined
    only by punctuation are merged (``"a-b"`` becomes ``"ab"``).
    """
    text = text.lower()  # Lowercase
    text = _DIGITS_RE.sub('', text)  # Remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    return _WHITESPACE_RE.sub(' ', text).strip()  # Remove extra spaces

# Add a cleaned copy of every message next to the raw text.
df["clean_text"] = df["text"].map(clean_text)



In [None]:
# Train-Test Split
# Raw text, cleaned text and labels are split in a single call so that all
# three share the same row partition by construction, instead of relying on
# two separate calls happening to use the same random_state.
X_train, X_test, X_train_clean, X_test_clean, y_train, y_test = train_test_split(
    df["text"], df["clean_text"], df["label"], test_size=0.2, random_state=42
)

# Feature Extraction - BOW & TF-IDF
# Keys are the display names printed in the evaluation loop.
vectorizers = {
    "BOW": CountVectorizer(),
    "TF-IDF": TfidfVectorizer(),
}

# Classifiers to compare; keys are the display names printed in the results.
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is no longer a recognised XGBoost parameter; passing
    # it only triggers a "Parameters: { "use_label_encoder" } are not used."
    # warning on every fit, so it is omitted.
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss"),
}



The main idea of Bag of Words (BoW) is to build a vocabulary from all documents and represent each document as a vector of word occurrences over that vocabulary.
Variants include binary BoW (presence/absence of each word) and frequency-based BoW (number of occurrences of each word).

In [None]:
# Training and Evaluation
# For every vectorizer/model combination, print the test accuracy obtained
# with raw text next to the accuracy obtained with cleaned text.
#
# NOTE: `vectorizer` and `model` stay bound after the loops finish (the last
# vectorizer, last fitted on the cleaned text, and the last model, last fitted
# on the cleaned features). Avoid relying on them outside this loop.
for vec_name, vectorizer in vectorizers.items():
    # Features from the raw text; the test split is transformed with the
    # vocabulary fitted on the training split.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # The same vectorizer object is fitted again, now on the cleaned text.
    # The raw matrices above were already computed; from here on the
    # vectorizer itself holds the cleaned-text fit.
    X_train_clean_vec = vectorizer.fit_transform(X_train_clean)
    X_test_clean_vec = vectorizer.transform(X_test_clean)

    print(f"\n--- {vec_name} Features ---")
    for model_name, model in models.items():
        # Train on raw text
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        acc_raw = accuracy_score(y_test, y_pred)

        # Train on cleaned text
        # (the same estimator object is fitted a second time; y_train is
        # shared between the raw and cleaned variants)
        model.fit(X_train_clean_vec, y_train)
        y_pred_clean = model.predict(X_test_clean_vec)
        acc_clean = accuracy_score(y_test, y_pred_clean)

        print(f"{model_name}: Raw Accuracy: {acc_raw:.4f} | Cleaned Accuracy: {acc_clean:.4f}")




--- BOW Features ---
Naive Bayes: Raw Accuracy: 0.9839 | Cleaned Accuracy: 0.9794
Random Forest: Raw Accuracy: 0.9758 | Cleaned Accuracy: 0.9722


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost: Raw Accuracy: 0.9776 | Cleaned Accuracy: 0.9776

--- TF-IDF Features ---
Naive Bayes: Raw Accuracy: 0.9623 | Cleaned Accuracy: 0.9516
Random Forest: Raw Accuracy: 0.9749 | Cleaned Accuracy: 0.9731


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost: Raw Accuracy: 0.9767 | Cleaned Accuracy: 0.9812


In [None]:
from sklearn.ensemble import VotingClassifier

# Create a voting classifier
# Fresh estimator instances are used so the ensemble does not share state
# with the models fitted in the comparison loop.
ensemble_model = VotingClassifier(
    estimators=[
        ('nb', MultinomialNB()),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        # `use_label_encoder` is omitted: current XGBoost does not use it and
        # only warns "Parameters: { "use_label_encoder" } are not used.".
        ('xgb', xgb.XGBClassifier(eval_metric="logloss")),
    ],
    voting='hard',  # 'hard' for majority vote, 'soft' for probability averaging
)

# Train on TF-IDF features
# A dedicated vectorizer fitted on the raw training text; the test split is
# only transformed, never fitted on.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

ensemble_model.fit(X_train_tfidf, y_train)
y_pred_ensemble = ensemble_model.predict(X_test_tfidf)

# Accuracy
acc_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"\nEnsemble Model Accuracy (TF-IDF): {acc_ensemble:.4f}")


Parameters: { "use_label_encoder" } are not used.




Ensemble Model Accuracy (TF-IDF): 0.9749


In text tasks, it's used for sentiment intensity, review score predictions, or text-based regression problems.

In [None]:
# Sample messages
test_messages = [
    "Congratulations! You've won a free lottery ticket! Click here to claim now!",  # Likely spam
    "Hey, are we still meeting for coffee at 5?",  # Likely ham
    "URGENT! Your bank account is compromised. Call this number immediately!",  # Likely spam
    "See you at the gym later. Bring your workout shoes!",  # Likely ham
]

# Convert to TF-IDF format
# Use the vectorizer and ensemble that were fitted together on raw text.
# The leftover loop variables `vectorizer` / `model` from the comparison loop
# were last fitted on *cleaned* text, which does not match these uncleaned
# messages, and they depend on dict iteration order.
test_vectors = tfidf_vectorizer.transform(test_messages)

# Make predictions
predictions = ensemble_model.predict(test_vectors)

# Display results (label 1 is spam, 0 is ham)
for msg, pred in zip(test_messages, predictions):
    print(f"Message: {msg} --> Prediction: {'Spam' if pred == 1 else 'Ham'}")


Message: Congratulations! You've won a free lottery ticket! Click here to claim now! --> Prediction: Spam
Message: Hey, are we still meeting for coffee at 5? --> Prediction: Ham
Message: URGENT! Your bank account is compromised. Call this number immediately! --> Prediction: Spam
Message: See you at the gym later. Bring your workout shoes! --> Prediction: Ham


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate an explicitly chosen estimator rather than whatever `model`
# was left bound by the comparison loop. The TF-IDF step sits inside the
# pipeline so each fold fits its vocabulary and IDF weights on that fold's
# training part only; a matrix vectorized on the full training set would leak
# validation-fold statistics into training.
cv_pipeline = make_pipeline(TfidfVectorizer(), xgb.XGBClassifier(eval_metric="logloss"))

scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean CV Score:", scores.mean())


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation scores: [0.97869955 0.97982063 0.97979798 0.97530864 0.98316498]
Mean CV Score: 0.9793583568620938
