In [1]:
import pandas as pd
import re
import string
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the CSV, keep only the label/message columns (v1, v2), give them
# readable names and encode the label as an integer: ham -> 0, spam -> 1.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

In [4]:
def clean_text(text):
    """Normalize a message before vectorization.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, delete runs of digits, collapse any whitespace
    run to a single space and trim both ends.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Use str.translate rather than a regex character class: when
    # string.punctuation is interpolated unescaped into "[...]", the "\]"
    # pair is read as an escaped bracket, so backslashes were never removed.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Store the normalized version of each message alongside the raw text.
data['clean_text'] = data['text'].map(clean_text)

In [5]:
# Split the raw messages 80/20 with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Reuse the row labels of the raw split so the cleaned-text split contains
# exactly the same messages in the same order.
X_train_clean = data['clean_text'].loc[X_train.index]
X_test_clean = data['clean_text'].loc[X_test.index]

In [6]:
# Feature extractors to compare, keyed by the label used in the printed banners.
vectorizers = dict([
    ('BOW', CountVectorizer()),
    ('TF-IDF', TfidfVectorizer()),
])

In [7]:
def _report_metrics(title, y_true, y_pred, separator):
    """Print accuracy and the classification report for one model; return the accuracy."""
    accuracy = accuracy_score(y_true, y_pred)
    print(f'\n=== {title} ===')
    print(f'Accuracy: {accuracy:.4f}')
    print(classification_report(y_true, y_pred, zero_division=0))
    print(separator)
    return accuracy


def train_models(X_train, X_test, y_train, y_test, vectorizer):
    """Fit and evaluate three classifiers plus a hard-voting ensemble.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``. Note that ``vectorizer`` is (re)fitted in place, so the
    object passed by the caller is modified.

    Args:
        X_train: Training documents accepted by ``vectorizer.fit_transform``.
        X_test: Test documents accepted by ``vectorizer.transform``.
        y_train: Training labels.
        y_test: Test labels used for scoring.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        dict mapping model name ('Naive Bayes', 'Random Forest', 'XGBoost',
        'Ensemble') to its test accuracy. Reports are also printed.
    """
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    vectorizer_name = vectorizer.__class__.__name__

    models = {
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        # `use_label_encoder` was dropped: the installed xgboost does not
        # accept it and logged 'Parameters: { "use_label_encoder" } are not
        # used.' on every fit.
        'XGBoost': xgb.XGBClassifier(eval_metric='logloss'),
    }

    accuracies = {}
    for name, model in models.items():
        model.fit(X_train_vec, y_train)
        accuracies[name] = _report_metrics(
            f'{name} Model ({vectorizer_name})',
            y_test,
            model.predict(X_test_vec),
            '-' * 50,
        )

    # The ensemble is fitted separately on the same training matrix and
    # scored like the individual models.
    ensemble = VotingClassifier(estimators=[
        ('nb', models['Naive Bayes']),
        ('rf', models['Random Forest']),
        ('xgb', models['XGBoost'])
    ], voting='hard')
    ensemble.fit(X_train_vec, y_train)

    accuracies['Ensemble'] = _report_metrics(
        f'Ensemble Model ({vectorizer_name})',
        y_test,
        ensemble.predict(X_test_vec),
        '=' * 60,
    )
    return accuracies

In [8]:
# Evaluate every vectorizer twice: first on the raw messages, then on the
# cleaned ones. The label split (y_train / y_test) is shared by both runs.
for name, vectorizer in vectorizers.items():
    raw_banner = f'\n\n##### Training with {name} Features (Raw Text) #####'
    clean_banner = f'\n\n##### Training with {name} Features (Cleaned Text) #####'
    for banner, train_docs, test_docs in (
        (raw_banner, X_train, X_test),
        (clean_banner, X_train_clean, X_test_clean),
    ):
        print(banner)
        train_models(train_docs, test_docs, y_train, y_test, vectorizer)



##### Training with BOW Features (Raw Text) #####

=== Naive Bayes Model (CountVectorizer) ===
Accuracy: 0.9839
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------

=== Random Forest Model (CountVectorizer) ===
Accuracy: 0.9758
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.82      0.90       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115

--------------------------------------------------

=== XGBoost Model (CountVectorizer) ===
Accuracy: 0.9776
        

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== Ensemble Model (CountVectorizer) ===
Accuracy: 0.9830
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.87      0.93       150

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



##### Training with BOW Features (Cleaned Text) #####

=== Naive Bayes Model (CountVectorizer) ===
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------

=== Random Forest Model (CountVectorizer) ===
Accuracy: 0.9686
              precision    recall  f1-score   support


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




=== Ensemble Model (CountVectorizer) ===
Accuracy: 0.9785
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



##### Training with TF-IDF Features (Raw Text) #####

=== Naive Bayes Model (TfidfVectorizer) ===
Accuracy: 0.9623
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

--------------------------------------------------

=== Random Forest Model (TfidfVectorizer) ===
Accuracy: 0.9749
              precision    recall  f1-score   support



Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (TfidfVectorizer) ===
Accuracy: 0.9767
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




=== Ensemble Model (TfidfVectorizer) ===
Accuracy: 0.9749
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.99      0.82      0.90       150

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



##### Training with TF-IDF Features (Cleaned Text) #####

=== Naive Bayes Model (TfidfVectorizer) ===
Accuracy: 0.9516
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       1.00      0.64      0.78       150

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.95      0.95      0.95      1115

--------------------------------------------------

=== Random Forest Model (TfidfVectorizer) ===
Accuracy: 0.9677
              precision    recall  f1-score   suppo

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (TfidfVectorizer) ===
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




=== Ensemble Model (TfidfVectorizer) ===
Accuracy: 0.9695
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

