In [14]:
# Email/SMS Spam Detection using Logistic Regression on TF-IDF features
# (trained and evaluated on the SMS Spam Collection dataset)

import csv

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------------
# 1. Load Dataset (SMS Spam Collection)
# -----------------------------------
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

# The file is read as headerless, tab-separated text: <label>\t<message>.
# quoting=csv.QUOTE_NONE makes pandas treat double quotes as ordinary
# characters, so every physical line becomes exactly one row.
# NOTE(review): with the default quote handling, a message that begins with
# a double quote can absorb the lines that follow it; this is presumably why
# a default read yields slightly fewer rows than the dataset's documented
# size -- confirm by checking len(data) after loading.
data = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

# Convert labels: ham = 0 (not spam), spam = 1
label_codes = data['label'].map({'ham': 0, 'spam': 1})

# Series.map turns every value missing from the dict into NaN. Fail here
# with a clear message instead of letting NaN targets reach the model.
unmapped = label_codes.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label value(s) in dataset: {unexpected}")

data['label'] = label_codes

# -----------------------------------
# 2. Train-Test Split
# -----------------------------------
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The fixed random_state makes the split reproducible between runs.
messages, labels = data['message'], data['label']

X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.20,
    random_state=42,
)

# -----------------------------------
# 3. Text Vectorization (TF-IDF)
# -----------------------------------
# English stop words are dropped and the vocabulary is capped at 3000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=3000,
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------------
# 4. Logistic Regression Model
# -----------------------------------
# fit() returns the estimator itself, so construction and training can be
# chained. max_iter=5000 raises the solver's iteration limit.
model = LogisticRegression(max_iter=5000).fit(X_train_tfidf, y_train)

# -----------------------------------
# 5. Accuracy
# -----------------------------------
# Predict hard 0/1 labels for the held-out messages and report the fraction
# that match the true labels.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)

print("\nModel Accuracy:", test_accuracy)

# -----------------------------------
# 6. Predict probability (Sigmoid output)
# -----------------------------------
SAMPLE_SIZE = 30        # number of held-out messages to display
SPAM_THRESHOLD = 0.5    # probabilities strictly above this are shown as SPAM

# .iloc makes the "first SAMPLE_SIZE rows" intent explicit instead of relying
# on how [] treats integer slices on a Series with an integer index.
sample_messages = X_test.iloc[:SAMPLE_SIZE]

# X_test_tfidf already holds the TF-IDF rows for X_test in the same order,
# so slice it rather than vectorising the same messages a second time.
sample_vectors = X_test_tfidf[:SAMPLE_SIZE]
probabilities = model.predict_proba(sample_vectors)

# predict_proba orders its columns like model.classes_; look up the column
# that belongs to label 1 (spam) instead of assuming its position.
spam_column = list(model.classes_).index(1)

print("\nCategories: ['Not Spam', 'Spam']\n")

for position, sigmoid_output in enumerate(probabilities[:, spam_column], start=1):
    prediction = "SPAM" if sigmoid_output > SPAM_THRESHOLD else "NOT SPAM"

    print(
        f"Email {position}: Sigmoid Output = {sigmoid_output:.4f} "
        f"({sigmoid_output * 100:.1f}%) -> Prediction: {prediction}"
    )

print("\n" + "="*60 + "\n")



Model Accuracy: 0.97847533632287

Categories: ['Not Spam', 'Spam']

Email 1: Sigmoid Output = 0.0714 (7.1%) -> Prediction: NOT SPAM
Email 2: Sigmoid Output = 0.1336 (13.4%) -> Prediction: NOT SPAM
Email 3: Sigmoid Output = 0.0398 (4.0%) -> Prediction: NOT SPAM
Email 4: Sigmoid Output = 0.0263 (2.6%) -> Prediction: NOT SPAM
Email 5: Sigmoid Output = 0.1289 (12.9%) -> Prediction: NOT SPAM
Email 6: Sigmoid Output = 0.0409 (4.1%) -> Prediction: NOT SPAM
Email 7: Sigmoid Output = 0.0410 (4.1%) -> Prediction: NOT SPAM
Email 8: Sigmoid Output = 0.0197 (2.0%) -> Prediction: NOT SPAM
Email 9: Sigmoid Output = 0.0216 (2.2%) -> Prediction: NOT SPAM
Email 10: Sigmoid Output = 0.0569 (5.7%) -> Prediction: NOT SPAM
Email 11: Sigmoid Output = 0.0358 (3.6%) -> Prediction: NOT SPAM
Email 12: Sigmoid Output = 0.0394 (3.9%) -> Prediction: NOT SPAM
Email 13: Sigmoid Output = 0.0481 (4.8%) -> Prediction: NOT SPAM
Email 14: Sigmoid Output = 0.0352 (3.5%) -> Prediction: NOT SPAM
Email 15: Sigmoid Output = 0