In [16]:
import os
import re
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Location of the spam dataset CSV. Set the SPAM_CSV_PATH environment variable
# to point at another copy; the original local path remains the default.
SPAM_CSV_PATH = os.environ.get('SPAM_CSV_PATH', r'C:\Users\pm\Downloads\spam.csv')

# Parse only the two columns this notebook uses (skipping any others in the
# file) and give them descriptive names: v1 -> label, v2 -> message.
data = (
    pd.read_csv(SPAM_CSV_PATH, encoding='latin-1', usecols=['v1', 'v2'])
    .loc[:, ['v1', 'v2']]  # pin the column order regardless of file layout
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

def preprocess_text(text):
    """Normalise a raw message before it is vectorised.

    Lower-cases the text, then removes every ASCII punctuation character
    (``string.punctuation``) and every run of digits.

    Args:
        text (str): Raw message text.

    Returns:
        str: The cleaned text. Whitespace is left untouched, so the gaps
        around removed punctuation or digits remain.
    """
    text = text.lower()
    # Escape the punctuation before embedding it in a character class: left
    # raw, the "\]" pair inside string.punctuation is parsed as an escaped "]",
    # which means a literal backslash was never part of the set and survived.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)
    return text

# Encode the labels as integers: ham -> 0, spam -> 1.
encoded_labels = data['label'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value that is not a key of the mapping. Stop
# here with a clear message instead of passing NaN labels on to the split and
# the classifier.
if encoded_labels.isna().any():
    unexpected = sorted(set(data.loc[encoded_labels.isna(), 'label'].astype(str)))
    raise ValueError(f"Unexpected value(s) in the 'label' column: {unexpected}")

data['message'] = data['message'].apply(preprocess_text)
data['label'] = encoded_labels

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'], data['label'], test_size=0.2, random_state=42
)

# TF-IDF features feeding a multinomial Naive Bayes classifier, bundled in one
# pipeline so raw (preprocessed) strings can be passed straight to predict().
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

def predict_email(email):
    """Classify a single message with the fitted ``model``.

    The text goes through ``preprocess_text`` first, then is passed to
    ``model.predict`` as a one-element list.

    Args:
        email (str): Raw message text.

    Returns:
        str: "Spam" when the predicted label equals 1, otherwise "Not Spam".
    """
    cleaned = preprocess_text(email)
    predicted_label = model.predict([cleaned])[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"

# Smoke test: classify one hand-written message and print the verdict.
sample_email = "Congratulations! You have won a lottery. Click here to claim."
print("Sample Email Prediction:", predict_email(email=sample_email))


Accuracy: 0.9515695067264573
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       1.00      0.64      0.78       150

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.95      0.95      0.95      1115

Sample Email Prediction: Spam


In [17]:
# Second smoke test: classify another hand-written message and print the verdict.
sample_email = "Hello, I hope you are doing well. Let's catch up soon."
print("Sample Email Prediction:", predict_email(email=sample_email))

Sample Email Prediction: Not Spam
