In [None]:
import os
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# re-running the notebook does not contact the download server every time.
# nltk.data.find raises LookupError when the resource cannot be located.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:

# Load the labelled messages. Later cells read the 'Message' column (raw text)
# and the 'Spam' column (label), so fail right here with a clear message if
# either one is absent instead of a bare KeyError further down the notebook.
data = pd.read_csv('spam.csv')

missing_columns = {'Message', 'Spam'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"spam.csv is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Cache for the default stop-word set; filled on first use so the NLTK corpus
# file is read once rather than once per message.
_DEFAULT_STOP_WORDS = None


def _default_stop_words():
    """Return NLTK's English stop words, punctuation-stripped, as a frozenset."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        # Strip punctuation from the stop words too ("don't" -> "dont") so they
        # match tokens that have been through the same clean-up.
        _DEFAULT_STOP_WORDS = frozenset(
            word.translate(_PUNCT_TABLE) for word in stopwords.words('english')
        )
    return _DEFAULT_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Normalise a raw message into a single space-separated token string.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, stop words are dropped, and the
    remaining tokens are joined back with single spaces. Token order and
    repeated tokens are kept, so a downstream vectorizer still sees real term
    frequencies.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example ``None`` or the
        NaN pandas produces for an empty CSV cell) is treated as an empty
        message.
    stop_words : iterable of str, optional
        Tokens to discard, compared against the lower-cased,
        punctuation-free tokens. When omitted, NLTK's English stop-word list is
        used (this needs the 'stopwords' corpus to be installed). Pass an empty
        collection to keep every token.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    if not isinstance(text, str):
        return ""

    if stop_words is None:
        stop_words = _default_stop_words()
    else:
        stop_words = frozenset(stop_words)

    tokens = text.lower().translate(_PUNCT_TABLE).split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Build the cleaned-text column used for vectorisation. Missing messages (NaN
# after CSV parsing) are replaced with empty strings and every value is cast
# to str first, because preprocess_text calls str methods on its argument.
data['Cleaned'] = data['Message'].fillna('').astype(str).apply(preprocess_text)



In [None]:
# Labels are taken directly from the 'Spam' column.
y = data['Spam']

# Encode the cleaned messages as a TF-IDF matrix whose vocabulary is capped at
# 3000 terms.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split performed in a later cell, so its vocabulary and IDF
# weights are also learned from the rows that end up in the test set.
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(data['Cleaned'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Multinomial Naive Bayes with the library's default settings, fitted on the
# training split only.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Overall accuracy, shown as a percentage with two decimals.
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

# Each report is printed as a heading line followed by its payload on the
# next line (sep="\n" puts the two arguments on separate lines).
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
def is_spam(email_text, model, tfidf_vectorizer):
    """Label a single message as "Spam" or "Not Spam".

    The message is cleaned with ``preprocess_text``, handed to
    ``tfidf_vectorizer.transform`` as a one-document batch, and scored with
    ``model.predict``.

    Parameters
    ----------
    email_text : str
        Raw message to classify.
    model
        Fitted classifier exposing ``predict``.
    tfidf_vectorizer
        Fitted vectorizer exposing ``transform``.

    Returns
    -------
    str
        "Spam" when the first predicted label equals 1, otherwise "Not Spam".
    """
    features = tfidf_vectorizer.transform([preprocess_text(email_text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Spam"
    return "Not Spam"

In [None]:
# Smoke-test the fitted pipeline on two hand-written messages.
new_emails = ["Get cheap loan now!", "Meeting at 10AM tomorrow."]
for msg in new_emails:
    verdict = is_spam(msg, model, tfidf)
    print(f"Email: '{msg}' is {verdict}")