In [1]:
import pandas as pd
# Read the raw dataset from the working directory, decoding the file as Latin-1.
df = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

# Preview the first rows of the frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Keep only the label column (v1) and the text column (v2). The explicit .copy()
# gives `df` its own data, so the assignments below modify an independent frame
# rather than a selection of the raw one (avoids pandas' SettingWithCopyWarning).
df = df[['v1', 'v2']].copy()
df.columns = ['label', 'message']

# Encode the target as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value that is not a key of the dict; fail loudly
# here instead of passing NaN targets on to the later cells.
assert df['label'].notna().all(), "unexpected label value (expected 'ham' or 'spam')"
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
import string
from sklearn.feature_extraction.text import TfidfVectorizer

# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Return `text` lower-cased, with all `string.punctuation` characters removed.

    Args:
        text: A single message as a `str`.

    Returns:
        The cleaned message as a `str`.
    """
    return text.translate(_PUNCT_TABLE).lower()


# Remove punctuation and lowercase the messages
df['message_clean'] = df['message'].apply(clean_text)

# Initialize TF-IDF vectorizer, using its built-in English stop-word list
vectorizer = TfidfVectorizer(stop_words='english')

# Convert the cleaned text into numeric TF-IDF features, since the model only works with numbers.
# NOTE(review): the vectorizer is fitted on every row here, before the train/test
# split performed in a later cell, so its vocabulary and IDF weights also reflect
# the held-out messages. Consider splitting first and fitting on the training rows only.
X = vectorizer.fit_transform(df['message_clean'])

# Target variable
y = df['label']
df.head()

Unnamed: 0,label,message,message_clean
0,0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,0,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
from sklearn.linear_model import LogisticRegression

# Build a logistic-regression classifier with default settings and train it on
# the training split. fit() returns the estimator itself, so the chained call
# leaves the fitted model bound to `model`.
model = LogisticRegression().fit(X_train, y_train)

# Bare expression so the notebook displays the fitted estimator.
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Run the trained model on the held-out messages
y_pred = model.predict(X_test)

# Overall fraction of test messages classified correctly
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Per-class precision / recall / F1; names are listed in label order (0, 1)
class_names = ["Ham", "Spam"]
report = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:\n")
print(report)


Accuracy: 0.9399103139013453

Classification Report:

              precision    recall  f1-score   support

         Ham       0.94      1.00      0.97       965
        Spam       0.98      0.57      0.72       150

    accuracy                           0.94      1115
   macro avg       0.96      0.78      0.84      1115
weighted avg       0.94      0.94      0.93      1115



In [7]:
# A few hand-written messages for a quick sanity check of the model
test_messages = [
    "Congratulations! You've won a free ticket to Bahamas. Reply WIN to claim.",
    "Hey, are we still on for lunch today?",
    "URGENT: Your account has been suspended. Click here to verify.",
    "I'll call you when I reach home."
]

# Apply the same cleaning used on the training text (strip punctuation,
# lowercase), then turn the result into TF-IDF features
punctuation_table = str.maketrans('', '', string.punctuation)
test_clean = [msg.translate(punctuation_table).lower() for msg in test_messages]
test_features = vectorizer.transform(test_clean)

# Classify each message with the trained model
predictions = model.predict(test_features)

# Print every original message next to its predicted class
for msg, label in zip(test_messages, predictions):
    verdict = 'Spam' if label == 1 else 'Ham'
    print(f"> \"{msg}\" → {verdict}")


> "Congratulations! You've won a free ticket to Bahamas. Reply WIN to claim." → Spam
> "Hey, are we still on for lunch today?" → Ham
> "URGENT: Your account has been suspended. Click here to verify." → Ham
> "I'll call you when I reach home." → Ham


#### Save the model and vectorizer

In [8]:
import joblib

# Persist the trained model first, then the fitted vectorizer, each to its own file
artifacts = (
    (model, "spam_classifier_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.


#### Load the saved model and Vectorizer

```python
import joblib
# Load the trained model
model = joblib.load("spam_classifier_model.pkl")

# Load the TF-IDF vectorizer
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# The model and vectorizer are now ready to be used for predictions.
```

In [9]:
# Prompt for the message to classify
user_msg = input("📩 Enter a message to check for spam:\n")

# Strip punctuation and lowercase, mirroring the cleaning applied to the training text
clean_msg = user_msg.translate(str.maketrans('', '', string.punctuation)).lower()

# The vectorizer is given a list of documents, hence the one-element list
msg_vector = vectorizer.transform([clean_msg])
prediction = model.predict(msg_vector)

# Label 1 means spam; anything else is reported as ham
if prediction[0] == 1:
    verdict = "📢 Spam Message"
else:
    verdict = "✔️ Not Spam (Ham)"

print("input message:", user_msg)
print("\n✅ Prediction:", verdict)

input message: Hey, are we still on for lunch today?

✅ Prediction: ✔️ Not Spam (Ham)
