In [None]:
!pip install pandas scikit-learn




In [None]:
# Download and unzip the dataset
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip

# Load the dataset
import pandas as pd

df = pd.read_csv("SMSSpamCollection", sep='\t', names=['label', 'message'])
print(df.head())


--2025-05-20 09:45:42--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip.2’

smsspamcollection.z     [  <=>               ] 198.65K   802KB/s    in 0.2s    

2025-05-20 09:45:43 (802 KB/s) - ‘smsspamcollection.zip.2’ saved [203415]

Archive:  smsspamcollection.zip
replace SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: SMSSpamCollection       
replace readme? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: readme                  
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Encode labels: fit the encoder on the raw label column, then store the integer codes
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

# Split data: hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Create pipeline (Vectorizer + Classifier)
pipeline_steps = [
    ('tfidf', TfidfVectorizer(stop_words='english')),  # text -> TF-IDF features, English stop words dropped
    ('clf', MultinomialNB()),                          # Naive Bayes classifier on those features
]
model = Pipeline(steps=pipeline_steps)

# Train the model (kept as the cell's last expression so the fitted pipeline is displayed)
model.fit(X_train, y_train)


In [None]:
def predict_spam(message):
    """Classify a single text message with the notebook-level `model` pipeline.

    Args:
        message: Raw message text. It is wrapped in a one-element list because
            `model.predict` is called on a batch of inputs.

    Returns:
        "Spam" when the predicted encoded label equals 1, otherwise "Not Spam".
    """
    predicted_label = model.predict([message])[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"


In [None]:
# Hand-written sample messages; the trailing comment on each line is the label we expect.
examples = [
    # Expected: Spam
    "Congratulations! You've won a free iPhone. Click here to claim now!",
    "You have been selected for a $1000 gift card. Reply YES to claim.",
    # Expected: Not Spam
    "Hey, are we still meeting for lunch at 1 PM today?",
    "Please find the attached report for Q2 performance.",
    "Don't forget to submit your assignment by midnight.",
    "Happy birthday! Wishing you a fantastic year ahead.",
    "I'll call you later tonight to discuss the project.",
]

# Print one line per message: the verdict left-aligned in a 9-character column, then the text.
for text in examples:
    verdict = predict_spam(text)
    print(f"{verdict:<9} | {text}")


Spam      | Congratulations! You've won a free iPhone. Click here to claim now!
Spam      | You have been selected for a $1000 gift card. Reply YES to claim.
Not Spam  | Hey, are we still meeting for lunch at 1 PM today?
Not Spam  | Please find the attached report for Q2 performance.
Not Spam  | Don't forget to submit your assignment by midnight.
Not Spam  | Happy birthday! Wishing you a fantastic year ahead.
Not Spam  | I'll call you later tonight to discuss the project.
