In [3]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# Load the raw CSV, keep only the two columns used below, and give them
# descriptive names in a single chained expression.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Encode the textual class as an integer: ham -> 0, spam -> 1.
# Any value other than 'ham' / 'spam' becomes NaN (Series.map semantics).
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Function to clean messages
def clean_text(text):
    """Normalise a raw message into lowercase, space-separated word tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Replace every non-word character (anything outside ``[A-Za-z0-9_]``
         and Unicode word characters) with a space, so punctuation and
         symbols become token separators.
      3. Drop stand-alone single ASCII letters that sit at the start of the
         text or between whitespace.
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The cleaned string; may be empty.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    # Lookarounds (rather than consuming the surrounding whitespace) let
    # back-to-back single letters such as "a b" all match, and the `^`
    # alternative covers a single letter at the very start of the text.
    # The earlier pattern r'\^[a-zA-Z]\s+' searched for a literal caret,
    # which can never be present after the \W substitution above.
    text = re.sub(r'(?:^|(?<=\s))[a-zA-Z](?=\s)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every message once so training and later predictions share the
# same text normalisation.
df['cleaned_message'] = df['message'].map(clean_text)

# Inputs are the cleaned messages; targets are the label column.
X, y = df['cleaned_message'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vocabulary (capped at 3000 terms, English stop words removed)
# is learned from the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(
    max_features=3000,
    stop_words=stopwords.words('english'),
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

# Report held-out performance.
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧾 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

def predict_spam(message):
    """Classify a single raw message with the fitted module-level model.

    The message is cleaned with ``clean_text``, vectorised with the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        message: Raw message text.

    Returns:
        ``"Spam"`` when the predicted label equals 1, otherwise
        ``"Not Spam"``.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([clean_text(message)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"


# Quick manual check on two hand-written messages; each is printed with its
# 1-based position followed by the predicted class.
print("\n🔍 Testing:")
for position, sample in enumerate(
    (
        "Congratulations! You’ve won a $1000 Walmart gift card. Click here to claim now!",
        "Hey, are we still meeting for dinner? Let me know!",
    ),
    start=1,
):
    print(f"{position}:", predict_spam(sample))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sadiq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Accuracy: 0.9775784753363229

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


🧾 Confusion Matrix:
 [[965   0]
 [ 25 125]]

🔍 Testing:
1: Spam
2: Not Spam
