In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import re

# Step 1: Data Collection
# Read the labelled message corpus from CSV into a DataFrame.
data = pd.read_csv("/content/spam.csv", encoding='latin1')

# Step 2: Data Preprocessing
# Turn the text column ('v2') into TF-IDF features, dropping English stop
# words; the label column ('v1') is used as the target.
# NOTE(review): the vectorizer is fitted on the raw text, whereas the
# prediction path normalises its input with preprocess_email() first --
# confirm whether the training text should be normalised the same way.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['v2'])
y = data['v1']

# Step 3: Model Training
# Fit a Multinomial Naive Bayes classifier on the TF-IDF features.
model = MultinomialNB().fit(X, y)

# Ordered (pattern, placeholder) pairs applied by preprocess_email().
# Order matters: the phone-number pattern must run before the generic digit
# patterns, otherwise its digits have already been replaced and it can never
# match.
_EMAIL_SUBSTITUTIONS = (
    # E-mail addresses.
    (re.compile(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b'), 'emailaddr'),
    # URLs (explicit http/https, or anything that looks like "word.tld...").
    (re.compile(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)'), 'httpaddr'),
    # Phone numbers (10-11 digits with optional separators).
    (re.compile(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b'), 'phonenumbr'),
    # Stand-alone integers.
    (re.compile(r'\b\d+\b'), 'number'),
    # Currency symbols.
    (re.compile(r'£|\$'), 'moneysymb'),
    # Any digits still left, e.g. embedded in a word such as "KL341".
    (re.compile(r'\d+(\.\d+)?'), 'numbr'),
    # Punctuation becomes a space so neighbouring words stay separated.
    (re.compile(r'[^\w\d\s]'), ' '),
    # Collapse runs of whitespace.
    (re.compile(r'\s+'), ' '),
)


def preprocess_email(email):
    """Normalise raw message text before it is vectorised.

    E-mail addresses, URLs, phone numbers, numbers and currency symbols are
    replaced by placeholder tokens; punctuation is removed, whitespace is
    collapsed, and the result is stripped and lower-cased.

    NOTE(review): the visible training code fits the vectorizer on the raw
    text column without applying this function -- confirm whether training
    and prediction are meant to share the same normalisation.

    Args:
        email: The message text as a ``str``.

    Returns:
        The normalised, lower-cased text.
    """
    for pattern, placeholder in _EMAIL_SUBSTITUTIONS:
        email = pattern.sub(placeholder, email)
    return email.strip().lower()

def predict_spam(email):
    """Classify a single message as "Spam" or "Not Spam".

    The text is normalised with ``preprocess_email``, converted to features
    by the module-level ``vectorizer`` and classified by the module-level
    ``model``.

    Args:
        email: The raw message text.

    Returns:
        "Not Spam" when the predicted label equals 'ham', otherwise "Spam".
    """
    cleaned_text = preprocess_email(email)
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned_text])
    predicted_label = model.predict(features)[0]
    return "Not Spam" if predicted_label == 'ham' else "Spam"

# Prompt on stdin for the message text to classify.
user_input_email = input("Enter the email text: ")

# Classify the entered text with predict_spam() and print the resulting
# label ("Spam" or "Not Spam").
prediction = predict_spam(user_input_email)
print("Prediction:", prediction)


Enter the email text: WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
Prediction: Spam
