In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Read the SMS spam collection, keep only the two columns we use, and give
# them readable names ('v1' holds the class label, 'v2' the message body).
# Source: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
data = (
    pd.read_csv("spam_data.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# Splitting data into train and test sets.
# The raw messages are split *before* any feature extraction so that nothing
# about the held-out messages (not even their vocabulary) reaches the model.
text_train, text_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# Vectorization: learn the vocabulary from the training messages only, then
# reuse that fitted vocabulary to encode the test messages.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Training the Naive Bayes classifier
clf = MultinomialNB(alpha=1)  # alpha=1 for Laplace smoothing
clf.fit(X_train, y_train)

# Predicting on test data
y_pred = clf.predict(X_test)

# Evaluating model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Cross-validation.
# The vectorizer and classifier are wrapped in one pipeline so that each fold
# refits the vocabulary on its own training portion; cross-validating a matrix
# that was vectorized on the full corpus would leak every fold's test
# vocabulary into training.
cv_pipeline = make_pipeline(CountVectorizer(), MultinomialNB(alpha=1))
cv_scores = cross_val_score(cv_pipeline, data['text'], data['label'], cv=5)
print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Accuracy: 0.97847533632287

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.91      0.93      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Cross-Validation Scores: [0.98295964 0.98206278 0.97935368 0.98114901 0.97845601]
Mean CV Accuracy: 0.980796225777105


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

# Load the SMS corpus and keep just the label / message columns.
data = (
    pd.read_csv("spam_data.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
y = data['label']

# Bag-of-words counts over every message in the corpus.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])

# Fit the Naive Bayes classifier on the whole corpus (no hold-out in this
# cell); alpha=1 is Laplace smoothing. `fit` returns the estimator itself.
clf = MultinomialNB(alpha=1).fit(X, y)

def classify_message(message):
    """Return the class probabilities the fitted model assigns to one message.

    The message is encoded with the module-level ``vectorizer`` and scored
    with the module-level ``clf``.

    Parameters
    ----------
    message : str
        Raw text of the message to score.

    Returns
    -------
    numpy.ndarray
        One probability per class, in the order of ``clf.classes_``.
    """
    # The vectorizer works on a collection of documents, so wrap the single
    # message in a list and take the only row of the result.
    features = vectorizer.transform([message])
    class_probabilities = clf.predict_proba(features)
    return class_probabilities[0]

# Ask for a message, score it, and report both class probabilities.
# Index 1 is reported as spam and index 0 as not spam.
user_message = input("Enter your message: ")
probabilities = classify_message(user_message)

print(
    f"The message is {probabilities[1]*100:.2f}% likely to be spam.",
    f"The message is {probabilities[0]*100:.2f}% likely to be not spam.",
    sep="\n",
)


Enter your message: Congratulations! you won a 5 day trip to hawaii
The message is 77.82% likely to be spam.
The message is 22.18% likely to be not spam.
