In [2]:
# 📦 Imports
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

# ✅ Download stopwords
nltk.download('stopwords')

# 🧹 Step 1: Load Data
df = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'message']

# 🧼 Step 2: Text Cleaning
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()
    filtered = [ps.stem(w) for w in words if w not in stop_words]
    return ' '.join(filtered)

df['cleaned'] = df['message'].apply(clean_text)

# 🔢 Step 3: Label Encoding
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# ✅ TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned'])
y = df['label_num']

# ✅ Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Apply SMOTE (handle class imbalance)
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# ✅ Train Naive Bayes Model
model = MultinomialNB()
model.fit(X_train_res, y_train_res)

# ⚠️ Get Prediction Probabilities
y_proba = model.predict_proba(X_test)[:, 1]   # Probability for class "1" (Spam)

# ✅ Threshold Tuning
threshold = 0.60  # You can tune this value
y_pred_thresh = (y_proba > threshold).astype(int)

# ✅ Evaluation
print(f"=== Naive Bayes with Threshold: {threshold} ===")
print("Accuracy:", accuracy_score(y_test, y_pred_thresh))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("\nClassification Report:\n", classification_report(y_test, y_pred_thresh))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== Naive Bayes with Threshold: 0.6 ===
Accuracy: 0.9730941704035875

Confusion Matrix:
 [[951  14]
 [ 16 134]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       965
           1       0.91      0.89      0.90       150

    accuracy                           0.97      1115
   macro avg       0.94      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115



why we use Tuning Threshold



---

### 🎯 **Why Do We Tune the Classification Threshold?**

By default, most classification models (like Logistic Regression or Naive Bayes) use a **threshold of 0.5**:

* If **predicted probability ≥ 0.5 → class = 1**
* If **predicted probability <  0.5 → class = 0**

But in real-world cases (like **spam detection**, **fraud detection**, or **medical diagnosis**) this default isn't always optimal.

---

### 🧠 **Tuning the threshold helps you control model behavior**:

| Goal                                                                  | Action                                               | Why                                               |
| --------------------------------------------------------------------- | ---------------------------------------------------- | ------------------------------------------------- |
| ✅ Reduce **False Positives** (e.g., marking important emails as spam) | **Increase** threshold (e.g., from 0.5 → 0.6 or 0.7) | Be more confident before saying something is spam |
| ✅ Reduce **False Negatives** (e.g., missing spam emails)              | **Decrease** threshold (e.g., from 0.5 → 0.4)        | Detect more spam, even if it risks some mistakes  |

---

### 📊 Example: Spam Detection

| Email Text            | Model Confidence (Spam) | Default Prediction (0.5) | If Threshold = 0.7   |
| --------------------- | ----------------------- | ------------------------ | -------------------- |
| "You won a prize!"    | 0.95                    | Spam                     | Spam                 |
| "Get cheap meds now"  | 0.60                    | Spam                     | ❌ Ham (blocked spam) |
| "Hi, are we meeting?" | 0.45                    | Ham                      | Ham                  |

👉 In the above case, setting **threshold = 0.7** helps **avoid false positives**, but it **misses some spam**.

---

### ✅ Summary:

* **Threshold tuning = More control** over model’s sensitivity.
* Helps **balance precision vs recall** depending on your goal.
* Very useful when **class distribution is imbalanced** (like spam detection, fraud detection).

---

Want me to show a **plot of precision & recall at different thresholds**? It will help you **visually choose the best one**.
