In [19]:
import pandas as pd
import numpy as np
import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords

In [20]:
# Download NLTK stopwords
nltk.download('stopwords')

# Load the dataset
df = pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
df.columns = ['label', 'text']



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Convert labels to binary (spam = 1, ham = 0)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})



In [22]:
# Text preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))
    return text



In [23]:
# Apply text cleaning
df['cleaned_text'] = df['text'].apply(clean_text)



In [24]:
# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()
y = df['label'].values




In [25]:
# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Dataset Loaded & Preprocessed Successfully!")

Dataset Loaded & Preprocessed Successfully!


In [26]:
# Hyperparameter tuning for Naïve Bayes
nb_params = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}
nb_model = GridSearchCV(MultinomialNB(), nb_params, cv=5, scoring='accuracy')
nb_model.fit(X_train, y_train)

# Hyperparameter tuning for Logistic Regression
lr_params = {'C': [0.1, 1, 10, 100], 'max_iter': [100, 200, 300]}
lr_model = GridSearchCV(LogisticRegression(), lr_params, cv=5, scoring='accuracy')
lr_model.fit(X_train, y_train)

# Evaluate the best models
best_nb = nb_model.best_estimator_
best_lr = lr_model.best_estimator_

models = {"Naïve Bayes": best_nb, "Logistic Regression": best_lr}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Best Model Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Save the best model
best_model = best_lr  # Choose Logistic Regression as the best
joblib.dump(best_model, "spam_classifier.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("\nBest Tuned Model Saved as 'spam_classifier.pkl'")



Naïve Bayes Best Model Performance:
Accuracy: 0.9829596412556054
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.91      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Logistic Regression Best Model Performance:
Accuracy: 0.9757847533632287
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.98      0.83      0.90       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.94      1115
weighted avg       0.98      0.98      0.97      1115


Best Tuned Model Saved as 'spam_classifier.pkl'
