In [None]:
import numpy as np
import pandas as pd
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
# Read the SMS CSV (latin-1 encoded) and discard the three "Unnamed" columns,
# leaving exactly two columns that are then renamed to label / message.
messages = (
    pd.read_csv('/content/spam.csv', encoding='latin-1')
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
)
messages.columns = ["label", "message"]

# Encode the class labels as integers: 'ham' -> 0, 'spam' -> 1.
# Any label outside this mapping becomes NaN.
label_to_id = {'ham': 0, 'spam': 1}
messages['label'] = messages['label'].map(label_to_id)

# Define text preprocessing function
# Holds the stop-word set and lemmatizer once they have been built, so they
# are shared by every call instead of being recreated per message / per token.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on first use.

    The English stop-word list is converted to a frozenset so the per-token
    membership check is a hash lookup rather than a scan of a freshly built
    list.
    """
    if not _PREPROCESS_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Populate in one step so a failure above never leaves a half-filled cache.
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def text_preprocess(mess):
    """Normalize one message into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Drop every character found in ``string.punctuation`` and lowercase
         the remaining characters.
      2. Split on whitespace.
      3. Discard English stop words and tokens that are not purely alphabetic.
      4. Lemmatize the surviving tokens with WordNet and re-join with spaces.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; an empty string when no token survives.

    Notes
    -----
    Punctuation is deleted, not replaced by a space, so words separated only
    by punctuation are fused ("Gift.Special" -> "giftspecial"). This matches
    the original behavior and is left unchanged so features stay comparable.
    """
    stop_words, lemmatizer = _preprocess_resources()
    nopunc = ''.join(char.lower() for char in mess if char not in string.punctuation)
    words = [
        lemmatizer.lemmatize(word)
        for word in nopunc.split()
        if word not in stop_words and word.isalpha()
    ]
    return ' '.join(words)  # Convert list back to string

# Apply text preprocessing
messages["message"] = messages["message"].apply(text_preprocess)
y = messages["label"]

# Split data BEFORE fitting the vectorizer, so the held-out messages do not
# contribute vocabulary or IDF statistics to the training features (leakage).
# The split depends only on the number of rows and random_state, so the same
# rows land in train/test as when the feature matrix itself was split.
text_train, text_test, y_train, y_test = train_test_split(
    messages["message"], y, test_size=0.2, random_state=42
)

# Convert text into numerical features (vocabulary/IDF learned from train only)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full-corpus feature matrix, kept so any later cell that reads `X` still works.
X = vectorizer.transform(messages["message"])

# Train model
model = MultinomialNB(alpha=0.1)
model.fit(X_train, y_train)

# Evaluate on the held-out split so the quality of the classifier is visible.
y_pred = model.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Label order follows the encoding used for the label column: 0 = ham, 1 = spam.
print(classification_report(y_test, y_pred, target_names=["ham", "spam"]))

# Classify a single, previously unseen SMS message
def predict_sms(text):
    """Predict whether ``text`` is spam and report the model's confidence.

    The message is cleaned with ``text_preprocess``, turned into features with
    the already-fitted ``vectorizer``, and scored by the trained ``model``.

    Parameters
    ----------
    text : str
        Raw SMS message.

    Returns
    -------
    tuple
        ``(label, confidence)`` where ``label`` is ``"Spam"`` when the
        predicted class equals 1 and ``"Ham"`` otherwise, and ``confidence``
        is the largest class probability from ``predict_proba``.
    """
    features = vectorizer.transform([text_preprocess(text)])
    predicted_class = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    if predicted_class == 1:
        verdict = "Spam"
    else:
        verdict = "Ham"
    return verdict, class_probabilities.max()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Example: classify a sample message and print the predicted label together
# with the confidence, formatted to two decimal places.
sms = "Free Gift.Special Offer. Get now"
label, confidence = predict_sms(sms)
print(f"Prediction: {label}, Confidence: {confidence:.2f}")

Prediction: Spam, Confidence: 0.74


In [None]:
# Example: classify a second sample message and print the predicted label
# together with the confidence, formatted to two decimal places.
sms = "Good Morning. Can we catch up?"
label, confidence = predict_sms(sms)
print(f"Prediction: {label}, Confidence: {confidence:.2f}")

Prediction: Ham, Confidence: 1.00


In [None]:
# Example: classify a third sample message and print the predicted label
# together with the confidence, formatted to two decimal places.
sms = "Do participate in the Webinar.Click the below link"
label, confidence = predict_sms(sms)
print(f"Prediction: {label}, Confidence: {confidence:.2f}")

Prediction: Ham, Confidence: 0.57


In [None]:
# Example: classify a fourth sample message and print the predicted label
# together with the confidence, formatted to two decimal places.
sms = "You have won 50000. Click here to claim!"
label, confidence = predict_sms(sms)
print(f"Prediction: {label}, Confidence: {confidence:.2f}")

Prediction: Spam, Confidence: 0.97
