In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [3]:
# Load the dataset into a DataFrame.
# NOTE(review): the file carries an .xls extension but is parsed with read_csv,
# so it is presumably delimited text rather than a real Excel workbook — confirm.
df = pd.read_csv('mail_data.xls')

In [4]:
# Preprocessing function
def preprocess_text(text):
    """Normalize a single message by lower-casing it.

    Args:
        text: One message. Normally a ``str``; missing values (``None``, NaN,
            ``pd.NA`` -- e.g. an empty CSV cell) and other scalars are
            tolerated instead of raising ``AttributeError``.

    Returns:
        str: The lower-cased text, or ``""`` when the value is missing.
    """
    if isinstance(text, str):
        return text.lower()
    # Missing cells arrive as float NaN / None rather than str; map them to an
    # empty document so downstream text vectorization still receives a string.
    if pd.isna(text):
        return ""
    # Any other scalar (e.g. a message parsed as a number) is coerced to text.
    return str(text).lower()

# Run every message through preprocess_text and write the result back to the same column.
df['Message'] = df['Message'].map(preprocess_text)

In [5]:
# Separate the features (message text) from the target (0 = ham, 1 = spam)
X = df['Message']
y = df['Category'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value missing from the mapping (e.g. 'Spam',
# stray whitespace, empty cells). Fail fast here with a clear message instead
# of letting NaN labels reach the classifier.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'Category'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Category' column: {unexpected}")

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Turn the messages into TF-IDF features built from unigrams and bigrams,
# ignoring English stop words.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are projected onto that same fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Base Naive Bayes estimator and the candidate values of its `alpha`
# hyperparameter to be explored by the grid search.
param_grid = dict(alpha=[0.01, 0.1, 0.5, 1.0, 2.0, 5.0])
nb_classifier = MultinomialNB()

In [9]:
# 5-fold cross-validated search over the alpha grid, ranking candidates by accuracy.
# NOTE(review): the TF-IDF matrix was fitted on all of X_train before this search,
# so each validation fold shares vocabulary/IDF statistics with its training folds.
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

In [10]:
# Keep the estimator selected by the grid search; every later prediction and
# the saved model use this object.
best_nb_classifier = grid_search.best_estimator_

In [11]:
# Predict labels for the training set (the same rows the model was fitted on)
y_train_pred = best_nb_classifier.predict(X_train_tfidf)

In [12]:
# Predict labels for the held-out test set
y_test_pred = best_nb_classifier.predict(X_test_tfidf)

In [13]:
# Accuracy on the training set (fraction of training labels predicted correctly)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f"Accuracy score on the training set: {train_accuracy:.2f}")

Accuracy score on the training set: 1.00


In [14]:
# Accuracy on the test set (fraction of held-out labels predicted correctly)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Accuracy score on the test set: {test_accuracy:.2f}")

Accuracy score on the test set: 0.99


In [15]:
# Function to classify a new email
def classify_new_email(email):
    """Classify a single raw email text as 'spam' or 'ham'.

    The text is vectorized with the already-fitted ``tfidf_vectorizer`` and
    scored with ``best_nb_classifier``.

    Args:
        email: The raw email text (one message, not a list of messages).

    Returns:
        str: 'spam' if the classifier predicts label 1, otherwise 'ham'.
    """
    # transform expects an iterable of documents, so wrap the single message.
    email_tfidf = tfidf_vectorizer.transform([email])
    # predict returns one label per input row; take the only one explicitly
    # instead of relying on the truthiness of a one-element array comparison.
    prediction = best_nb_classifier.predict(email_tfidf)[0]
    return 'spam' if prediction == 1 else 'ham'

In [16]:
# Try the classifier on a sample message
new_email = "you have won 5 lottory coupens"
predicted_label = classify_new_email(new_email)
print(f"This email is {predicted_label}!")

This email is spam!


In [70]:
# Save the fitted vectorizer alongside the model: the classifier only accepts
# TF-IDF features, so it cannot score raw text after reloading without it.
joblib.dump(tfidf_vectorizer, 'spam_email_tfidf_vectorizer.pkl')

# Save the model (kept as the last expression so the cell output is unchanged)
joblib.dump(best_nb_classifier, 'spam_email_detection.pkl')

['spam_email_detection.pkl']