In [25]:
import pandas as pd
import numpy as np
import string
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
df = pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']

df = df[df['label'].isin(['ham', 'spam'])]
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
def clean_text(msg):
    msg = msg.lower()
    msg = ''.join([ch for ch in msg if ch not in string.punctuation])
    words = msg.split()
    words = [word for word in words if word not in stopwords.words('english')]
    return " ".join(words)

df['cleaned'] = df['text'].apply(clean_text)

df = df[df['cleaned'].str.strip() != '']


In [28]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

cv = CountVectorizer()
X = cv.fit_transform(df['cleaned'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9748653500897666


In [30]:
def predict_spam(message):
    cleaned = clean_text(message)
    if cleaned.strip() == "":
        return "Invalid or empty input"
    vector = cv.transform([cleaned])
    result = model.predict(vector)
    return "Spam" if result[0] == 1 else "Ham (Not Spam)"

print(predict_spam("Congratulations! You've won a free iPhone!"))
print(predict_spam("Hey, are you joining the meeting today?"))


Spam
Ham (Not Spam)
