In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [3]:

# Load dataset
df = pd.read_csv("spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

In [4]:
# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

# Apply text cleaning
df["cleaned_text"] = df["text"].apply(clean_text)

# Encode labels (ham -> 0, spam -> 1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["label"])

In [5]:

# Feature extraction (BoW & TF-IDF)
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

X_bow_raw = bow_vectorizer.fit_transform(df["text"])
X_bow_cleaned = bow_vectorizer.fit_transform(df["cleaned_text"])
X_tfidf_raw = tfidf_vectorizer.fit_transform(df["text"])
X_tfidf_cleaned = tfidf_vectorizer.fit_transform(df["cleaned_text"])

In [6]:
# Split data for training & testing
X_train_bow_raw, X_test_bow_raw, y_train, y_test = train_test_split(X_bow_raw, y, test_size=0.2, random_state=42)
X_train_bow_cleaned, X_test_bow_cleaned, _, _ = train_test_split(X_bow_cleaned, y, test_size=0.2, random_state=42)
X_train_tfidf_raw, X_test_tfidf_raw, _, _ = train_test_split(X_tfidf_raw, y, test_size=0.2, random_state=42)
X_train_tfidf_cleaned, X_test_tfidf_cleaned, _, _ = train_test_split(X_tfidf_cleaned, y, test_size=0.2, random_state=42)

In [7]:
# Train and evaluate models
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Initialize models
nb_model = MultinomialNB()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [8]:
# Store results
results = {
    "Naive Bayes (BoW, raw)": train_and_evaluate(nb_model, X_train_bow_raw, X_test_bow_raw, y_train, y_test),
    "Naive Bayes (BoW, cleaned)": train_and_evaluate(nb_model, X_train_bow_cleaned, X_test_bow_cleaned, y_train, y_test),
    "Naive Bayes (TF-IDF, raw)": train_and_evaluate(nb_model, X_train_tfidf_raw, X_test_tfidf_raw, y_train, y_test),
    "Naive Bayes (TF-IDF, cleaned)": train_and_evaluate(nb_model, X_train_tfidf_cleaned, X_test_tfidf_cleaned, y_train, y_test),
    "Random Forest (BoW, raw)": train_and_evaluate(rf_model, X_train_bow_raw, X_test_bow_raw, y_train, y_test),
    "Random Forest (BoW, cleaned)": train_and_evaluate(rf_model, X_train_bow_cleaned, X_test_bow_cleaned, y_train, y_test),
    "Random Forest (TF-IDF, raw)": train_and_evaluate(rf_model, X_train_tfidf_raw, X_test_tfidf_raw, y_train, y_test),
    "Random Forest (TF-IDF, cleaned)": train_and_evaluate(rf_model, X_train_tfidf_cleaned, X_test_tfidf_cleaned, y_train, y_test),
    "XGBoost (BoW, raw)": train_and_evaluate(xgb_model, X_train_bow_raw, X_test_bow_raw, y_train, y_test),
    "XGBoost (BoW, cleaned)": train_and_evaluate(xgb_model, X_train_bow_cleaned, X_test_bow_cleaned, y_train, y_test),
    "XGBoost (TF-IDF, raw)": train_and_evaluate(xgb_model, X_train_tfidf_raw, X_test_tfidf_raw, y_train, y_test),
    "XGBoost (TF-IDF, cleaned)": train_and_evaluate(xgb_model, X_train_tfidf_cleaned, X_test_tfidf_cleaned, y_train, y_test)
}

# Display results
results

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



{'Naive Bayes (BoW, raw)': 0.97847533632287,
 'Naive Bayes (BoW, cleaned)': 0.9757847533632287,
 'Naive Bayes (TF-IDF, raw)': 0.9623318385650225,
 'Naive Bayes (TF-IDF, cleaned)': 0.9506726457399103,
 'Random Forest (BoW, raw)': 0.9748878923766816,
 'Random Forest (BoW, cleaned)': 0.9713004484304932,
 'Random Forest (TF-IDF, raw)': 0.9766816143497757,
 'Random Forest (TF-IDF, cleaned)': 0.9721973094170404,
 'XGBoost (BoW, raw)': 0.9775784753363229,
 'XGBoost (BoW, cleaned)': 0.9757847533632287,
 'XGBoost (TF-IDF, raw)': 0.9820627802690582,
 'XGBoost (TF-IDF, cleaned)': 0.9766816143497757}