In [8]:
import pandas as pd

# Read the raw CSV, retain only the label/text columns ("v1", "v2") and give
# them descriptive names, all in one chain so the cell starts from the file on
# every run.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")
    [["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "message"})
)

# Preview the first rows
df.head()



Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_ids = {"ham": 0, "spam": 1}

# Series.map() turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe out
# all labels. Only encode while the column does not yet hold the integer codes.
if not df["label"].isin(list(label_ids.values())).all():
    encoded = df["label"].map(label_ids)

    # Fail loudly on labels outside the mapping instead of silently carrying
    # NaN targets into the train/test split.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), "label"].astype(str).unique())
        raise ValueError(f"Unexpected label values: {unknown}")

    df["label"] = encoded.astype("int64")

# Class balance after encoding
df["label"].value_counts()



label
0    4825
1     747
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Model input is the raw message text; the target is the 0/1 label column.
X, y = df["message"], df["label"]

# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")

# Learn the vocabulary/IDF weights from the training split only, then apply
# the same fitted transform to the held-out split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic regression with class weights balanced against class frequency
# (fit() returns the estimator itself, so construction and fitting chain).
model = LogisticRegression(class_weight="balanced").fit(X_train_vec, y_train)

# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1206
           1       0.96      0.93      0.94       187

    accuracy                           0.98      1393
   macro avg       0.97      0.96      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [11]:
def predict_spam(text, threshold=0.7):
    """Score one message with the notebook's fitted vectorizer and model.

    Relies on the notebook-level ``vectorizer`` and ``model`` objects, which
    must already be fitted when this function is called.

    Parameters
    ----------
    text : str
        Raw message to score.
    threshold : float, default 0.7
        The message is labelled spam only when its predicted spam probability
        is strictly greater than this value.

    Returns
    -------
    tuple
        ``(prob, verdict)``: ``prob`` is the second column of
        ``model.predict_proba`` for this message (the column for label ``1``,
        i.e. spam, given the 0/1 encoding used for training) and ``verdict``
        is the display string ``"SPAM 🚨"`` or ``"NOT SPAM ✅"``.
    """
    # transform() expects an iterable of documents, hence the one-element list.
    vec = vectorizer.transform([text])
    prob = model.predict_proba(vec)[0][1]

    # The emoji are spelled as escape sequences so the source stays pure ASCII
    # and cannot be corrupted by a wrong file encoding
    # (U+1F6A8 = police car light, U+2705 = check mark).
    verdict = "SPAM \U0001F6A8" if prob > threshold else "NOT SPAM \u2705"
    return prob, verdict

# Try the classifier on one spam-like and one ordinary message.
for sample in (
    "Congratulations! You won a free iPhone",
    "Can you send me the assignment PDF?",
):
    print(predict_spam(sample))


(np.float64(0.8864077100333491), 'SPAM 🚨')
(np.float64(0.37693069369106136), 'NOT SPAM ✅')
