In [None]:
import pandas as pd
import re
import string
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [None]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the CSV, keep the two columns used below and give them descriptive names.
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
# Encode the class labels as integers: ham -> 0, spam -> 1.
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [None]:
def clean_text(text):
    """Normalize a message for vectorization.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, digit runs are removed, and whitespace runs are collapsed to a
    single space with leading/trailing whitespace stripped.

    Args:
        text (str): The message to clean.

    Returns:
        str: The cleaned message.
    """
    text = text.lower()
    # The punctuation must be escaped before it is placed in a character
    # class: unescaped, the "\]" pair inside string.punctuation is read as an
    # escaped "]" and the backslash itself is never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the cleaned version of every message next to the original text.
data['clean_text'] = data['text'].map(clean_text)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)
# Pick the cleaned text of exactly the same rows through the split's index labels.
X_train_clean = data['clean_text'].loc[X_train.index]
X_test_clean = data['clean_text'].loc[X_test.index]

In [None]:
# One vectorizer per feature representation, keyed by a short display name.
vectorizers = dict([
    ('BOW', CountVectorizer()),
    ('TF-IDF', TfidfVectorizer()),
])

In [None]:
def train_models(X_train, X_test, y_train, y_test, vectorizer):
    """Vectorize the text, then fit and report three classifiers and their hard-voting ensemble.

    ``vectorizer`` is fitted on ``X_train`` only and then used to transform
    ``X_test``. For every model the test accuracy and a classification report
    are printed.

    Args:
        X_train: Training documents, passed to ``vectorizer.fit_transform``.
        X_test: Test documents, passed to ``vectorizer.transform``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
        vectorizer: Feature extractor exposing ``fit_transform`` and
            ``transform``; it is refitted in place on every call.

    Returns:
        dict: Test accuracy keyed by model name ('Naive Bayes',
        'Random Forest', 'XGBoost', 'Ensemble').
    """
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    vectorizer_name = vectorizer.__class__.__name__

    models = {
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        # `use_label_encoder` is intentionally omitted: recent XGBoost releases
        # no longer accept it and only emit a
        # 'Parameters: { "use_label_encoder" } are not used.' warning.
        'XGBoost': xgb.XGBClassifier(eval_metric='logloss')
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        results[name] = accuracy_score(y_test, y_pred)
        print(f'\n=== {name} Model ({vectorizer_name}) ===')
        print(f'Accuracy: {results[name]:.4f}')
        print(classification_report(y_test, y_pred, zero_division=0))
        print('-' * 50)

    # VotingClassifier fits its own clones of the estimators it receives, so
    # the models fitted above are neither reused nor modified by the ensemble.
    ensemble = VotingClassifier(estimators=[
        ('nb', models['Naive Bayes']),
        ('rf', models['Random Forest']),
        ('xgb', models['XGBoost'])
    ], voting='hard')

    ensemble.fit(X_train_vec, y_train)
    y_pred = ensemble.predict(X_test_vec)
    results['Ensemble'] = accuracy_score(y_test, y_pred)

    print(f'\n= Ensemble Model ({vectorizer_name}) =')
    print(f'Accuracy: {results["Ensemble"]:.4f}')
    print(classification_report(y_test, y_pred, zero_division=0))
    print('=' * 60)

    return results

In [None]:
# For each feature representation, evaluate the models on the raw text and
# then on the cleaned text, announcing each run with its own banner.
for name, vectorizer in vectorizers.items():
    runs = (
        (f'\n\n Training with {name} Features (Raw Text) ', X_train, X_test),
        (f'\n\n# Training with {name} Features (Cleaned Text) #', X_train_clean, X_test_clean),
    )
    for banner, train_docs, test_docs in runs:
        print(banner)
        train_models(train_docs, test_docs, y_train, y_test, vectorizer)



 Training with BOW Features (Raw Text) 

=== Naive Bayes Model (CountVectorizer) ===
Accuracy: 0.9839
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------

=== Random Forest Model (CountVectorizer) ===
Accuracy: 0.9758
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.82      0.90       150

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (CountVectorizer) ===
Accuracy: 0.9776
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.87      0.91       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (CountVectorizer) =
Accuracy: 0.9830
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.87      0.93       150

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Training with BOW Features (Cleaned Text) #

=== Naive Bayes Model (CountVectorizer) ===
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------

=== Random Forest Model (CountVectorizer) ===
Accuracy: 0.9686
              precision    recall  f1-score   support

           

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (CountVectorizer) ===
Accuracy: 0.9776
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.86      0.91       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (CountVectorizer) =
Accuracy: 0.9785
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



 Training with TF-IDF Features (Raw Text) 

=== Naive Bayes Model (TfidfVectorizer) ===
Accuracy: 0.9623
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

--------------------------------------------------

=== Random Forest Model (TfidfVectorizer) ===
Accuracy: 0.9749
              precision    recall  f1-score   support

           0  

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (TfidfVectorizer) ===
Accuracy: 0.9767
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (TfidfVectorizer) =
Accuracy: 0.9749
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.99      0.82      0.90       150

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



# Training with TF-IDF Features (Cleaned Text) #

=== Naive Bayes Model (TfidfVectorizer) ===
Accuracy: 0.9516
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       1.00      0.64      0.78       150

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.95      0.95      0.95      1115

--------------------------------------------------

=== Random Forest Model (TfidfVectorizer) ===
Accuracy: 0.9677
              precision    recall  f1-score   support

        

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (TfidfVectorizer) ===
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (TfidfVectorizer) =
Accuracy: 0.9695
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



GEMINI

1. Data Loading and Cleaning

In [1]:
import pandas as pd
import re
import string
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Install a global filter that ignores every FutureWarning
warnings.simplefilter('ignore', FutureWarning)

# Read the CSV file and preview its first five rows
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# Keep only the label and message columns and encode the label as an
# integer: ham -> 0, spam -> 1.
data = data[['Category', 'Message']].assign(
    Category=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1})
)

# Text Cleaning Function
def clean_text(text):
    """Normalize a message for vectorization.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, digit runs are removed, and whitespace runs are collapsed to a
    single space with leading/trailing whitespace stripped.

    Args:
        text (str): The message to clean.

    Returns:
        str: The cleaned message.
    """
    text = text.lower() # Convert to lowercase
    # The punctuation must be escaped before it is placed in a character
    # class: unescaped, the "\]" pair inside string.punctuation is read as an
    # escaped "]" and the backslash itself is never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Add a 'clean_text' column holding the cleaned version of each 'Message'
data['clean_text'] = data['Message'].map(clean_text)

2. Feature Extraction (BOW and TF-IDF)
Next, we create features using Bag-of-Words (BOW) and Term Frequency-Inverse Document Frequency (TF-IDF).

Split data: We split the data into training and testing sets using train_test_split.
Create vectorizers: We initialize CountVectorizer for BOW and TfidfVectorizer for TF-IDF.

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['Category'],
    test_size=0.2,
    random_state=42,
)
# Pick the cleaned text of exactly the same rows through the split's index labels
X_train_clean = data['clean_text'].loc[X_train.index]
X_test_clean = data['clean_text'].loc[X_test.index]

# One vectorizer per feature representation, keyed by a short display name
vectorizers = dict([
    ('BOW', CountVectorizer()),
    ('TF-IDF', TfidfVectorizer()),
])

3. Model Training and Evaluation
Now, we train and evaluate Naive Bayes, Random Forest, and XGBoost models with and without text cleaning. We also explore an ensemble method.

In [4]:
def train_models(X_train, X_test, y_train, y_test, vectorizer):
    """Vectorize the text, then fit and report three classifiers and their hard-voting ensemble.

    ``vectorizer`` is fitted on ``X_train`` only and then used to transform
    ``X_test``. For every model the test accuracy and a classification report
    are printed.

    Args:
        X_train: Training documents, passed to ``vectorizer.fit_transform``.
        X_test: Test documents, passed to ``vectorizer.transform``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
        vectorizer: Feature extractor exposing ``fit_transform`` and
            ``transform``; it is refitted in place on every call.

    Returns:
        dict: Test accuracy keyed by model name ('Naive Bayes',
        'Random Forest', 'XGBoost', 'Ensemble').
    """
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    vectorizer_name = vectorizer.__class__.__name__

    models = {
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        # `use_label_encoder` is intentionally omitted: recent XGBoost releases
        # no longer accept it and only emit a
        # 'Parameters: { "use_label_encoder" } are not used.' warning.
        'XGBoost': xgb.XGBClassifier(eval_metric='logloss')
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        results[name] = accuracy_score(y_test, y_pred)
        print(f'\n=== {name} Model ({vectorizer_name}) ===')
        print(f'Accuracy: {results[name]:.4f}')
        print(classification_report(y_test, y_pred, zero_division=0))
        print('-' * 50)

    # Ensemble Model (Voting Classifier)
    # VotingClassifier fits its own clones of the estimators it receives, so
    # the models fitted above are neither reused nor modified by the ensemble.
    ensemble = VotingClassifier(estimators=[
        ('nb', models['Naive Bayes']),
        ('rf', models['Random Forest']),
        ('xgb', models['XGBoost'])
    ], voting='hard')

    ensemble.fit(X_train_vec, y_train)
    y_pred = ensemble.predict(X_test_vec)
    results['Ensemble'] = accuracy_score(y_test, y_pred)

    print(f'\n= Ensemble Model ({vectorizer_name}) =')
    print(f'Accuracy: {results["Ensemble"]:.4f}')
    print(classification_report(y_test, y_pred, zero_division=0))
    print('=' * 60)

    return results

# For each feature representation (BOW, TF-IDF), evaluate the models on the
# raw text and then on the cleaned text, announcing each run with its banner
for name, vectorizer in vectorizers.items():
    runs = (
        (f'\n\n Training with {name} Features (Raw Text) ', X_train, X_test),
        (f'\n\n# Training with {name} Features (Cleaned Text) #', X_train_clean, X_test_clean),
    )
    for banner, train_docs, test_docs in runs:
        print(banner)
        train_models(train_docs, test_docs, y_train, y_test, vectorizer)



 Training with BOW Features (Raw Text) 

=== Naive Bayes Model (CountVectorizer) ===
Accuracy: 0.9928
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.95      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

--------------------------------------------------

=== Random Forest Model (CountVectorizer) ===
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (CountVectorizer) ===
Accuracy: 0.9821
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.89      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (CountVectorizer) =
Accuracy: 0.9883
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



# Training with BOW Features (Cleaned Text) #

=== Naive Bayes Model (CountVectorizer) ===
Accuracy: 0.9883
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

--------------------------------------------------

=== Random Forest Model (CountVectorizer) ===
Accuracy: 0.9740
              precision    recall  f1-score   support

           

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (CountVectorizer) ===
Accuracy: 0.9758
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.95      0.87      0.91       149

    accuracy                           0.98      1115
   macro avg       0.96      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (CountVectorizer) =
Accuracy: 0.9830
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



 Training with TF-IDF Features (Raw Text) 

=== Naive Bayes Model (TfidfVectorizer) ===
Accuracy: 0.9632
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

--------------------------------------------------

=== Random Forest Model (TfidfVectorizer) ===
Accuracy: 0.9812
              precision    recall  f1-score   support

           0  

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (TfidfVectorizer) ===
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.86      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (TfidfVectorizer) =
Accuracy: 0.9794
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# Training with TF-IDF Features (Cleaned Text) #

=== Naive Bayes Model (TfidfVectorizer) ===
Accuracy: 0.9561
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       966
           1       1.00      0.67      0.80       149

    accuracy                           0.96      1115
   macro avg       0.98      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115

--------------------------------------------------

=== Random Forest Model (TfidfVectorizer) ===
Accuracy: 0.9740
              precision    recall  f1-score   support

        

Parameters: { "use_label_encoder" } are not used.




=== XGBoost Model (TfidfVectorizer) ===
Accuracy: 0.9776
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.




= Ensemble Model (TfidfVectorizer) =
Accuracy: 0.9731
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.80      0.89       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



Conclusions:

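Text cleaning did not improve performance in this run: for every model and both feature sets, accuracy on the cleaned text is slightly lower than on the raw text (e.g. Naive Bayes with BOW drops from 0.9928 to 0.9883). Stripping punctuation and digits likely removes signals that help identify spam.
BOW matches or beats TF-IDF for most models: on raw text, Naive Bayes scores 0.9928 with BOW versus 0.9632 with TF-IDF, and the ensemble scores 0.9883 versus 0.9794; only Random Forest is marginally better with TF-IDF (0.9812 versus 0.9794).
The ensemble does not beat the best single model: hard voting reaches 0.9883 at best (BOW, raw text), below Naive Bayes with BOW on raw text (0.9928), which is the strongest result overall.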

Accuracy:
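The accuracy scores and classification reports are printed in the output of section 3; re-run the notebook to regenerate them.
Use the "Accuracy" value for overall performance and the per-class precision, recall, and F1-score for class-specific performance. Because only 149 of the 1,115 test messages are spam, overall accuracy is dominated by the ham class, so the recall of the spam class (1), which ranges from 0.67 to 0.95 in the reports shown above, is the more informative number.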