In [1]:
!pip install pandas numpy nltk scikit-learn xgboost



In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
df=pd.read_csv('spam.csv',encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df = df.iloc[:, [0, 1]]
df.columns = ['label', 'text']
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

df['cleaned_text'] = df['text'].apply(clean_text)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)
X_train_clean, X_test_clean, _, _ = train_test_split(df['cleaned_text'], df['label'], test_size=0.2, random_state=42)

vectorizer = CountVectorizer()
X_train_bow_raw = vectorizer.fit_transform(X_train_raw)
X_test_bow_raw = vectorizer.transform(X_test_raw)
X_train_bow_clean = vectorizer.fit_transform(X_train_clean)
X_test_bow_clean = vectorizer.transform(X_test_clean)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf_raw = tfidf_vectorizer.fit_transform(X_train_raw)
X_test_tfidf_raw = tfidf_vectorizer.transform(X_test_raw)
X_train_tfidf_clean = tfidf_vectorizer.fit_transform(X_train_clean)
X_test_tfidf_clean = tfidf_vectorizer.transform(X_test_clean)


In [7]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    print(f"Training {name} with BoW (Uncleaned)")
    model.fit(X_train_bow_raw, y_train)
    y_pred = model.predict(X_test_bow_raw)
    print(f"{name} Accuracy (BoW - Uncleaned):", accuracy_score(y_test, y_pred))

    print(f"Training {name} with BoW (Cleaned)")
    model.fit(X_train_bow_clean, y_train)
    y_pred = model.predict(X_test_bow_clean)
    print(f"{name} Accuracy (BoW - Cleaned):", accuracy_score(y_test, y_pred))

    print(f"Training {name} with TF-IDF (Uncleaned)")
    model.fit(X_train_tfidf_raw, y_train)
    y_pred = model.predict(X_test_tfidf_raw)
    print(f"{name} Accuracy (TF-IDF - Uncleaned):", accuracy_score(y_test, y_pred))

    print(f"Training {name} with TF-IDF (Cleaned)")
    model.fit(X_train_tfidf_clean, y_train)
    y_pred = model.predict(X_test_tfidf_clean)
    print(f"{name} Accuracy (TF-IDF - Cleaned):", accuracy_score(y_test, y_pred))

voting_clf = VotingClassifier(estimators=[
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier()),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
], voting='hard')

print("\nTraining Voting Classifier with BoW (Cleaned)")
voting_clf.fit(X_train_bow_clean, y_train)
y_pred = voting_clf.predict(X_test_bow_clean)
print("Voting Classifier Accuracy (BoW - Cleaned):", accuracy_score(y_test, y_pred))

print("\nTraining Voting Classifier with TF-IDF (Cleaned)")
voting_clf.fit(X_train_tfidf_clean, y_train)
y_pred = voting_clf.predict(X_test_tfidf_clean)
print("Voting Classifier Accuracy (TF-IDF - Cleaned):", accuracy_score(y_test, y_pred))

Training Naive Bayes with BoW (Uncleaned)
Naive Bayes Accuracy (BoW - Uncleaned): 0.9838565022421525
Training Naive Bayes with BoW (Cleaned)
Naive Bayes Accuracy (BoW - Cleaned): 0.9802690582959641
Training Naive Bayes with TF-IDF (Uncleaned)
Naive Bayes Accuracy (TF-IDF - Uncleaned): 0.9623318385650225
Training Naive Bayes with TF-IDF (Cleaned)
Naive Bayes Accuracy (TF-IDF - Cleaned): 0.9632286995515695
Training Random Forest with BoW (Uncleaned)
Random Forest Accuracy (BoW - Uncleaned): 0.9757847533632287
Training Random Forest with BoW (Cleaned)
Random Forest Accuracy (BoW - Cleaned): 0.9739910313901345
Training Random Forest with TF-IDF (Uncleaned)
Random Forest Accuracy (TF-IDF - Uncleaned): 0.9757847533632287
Training Random Forest with TF-IDF (Cleaned)
Random Forest Accuracy (TF-IDF - Cleaned): 0.9766816143497757
Training XGBoost with BoW (Uncleaned)


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (BoW - Uncleaned): 0.9775784753363229
Training XGBoost with BoW (Cleaned)


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (BoW - Cleaned): 0.979372197309417
Training XGBoost with TF-IDF (Uncleaned)


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (TF-IDF - Uncleaned): 0.9766816143497757
Training XGBoost with TF-IDF (Cleaned)


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (TF-IDF - Cleaned): 0.9775784753363229

Training Voting Classifier with BoW (Cleaned)


Parameters: { "use_label_encoder" } are not used.



Voting Classifier Accuracy (BoW - Cleaned): 0.9811659192825112

Training Voting Classifier with TF-IDF (Cleaned)


Parameters: { "use_label_encoder" } are not used.



Voting Classifier Accuracy (TF-IDF - Cleaned): 0.97847533632287
