<a href="https://colab.research.google.com/github/snehagada31/Spam_Filter_NLP_dl/blob/main/spam_filter_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the necessary library (only needed if using the SMOTE/ImbPipeline approach)
# If using the FINAL CLASS_WEIGHT approach, you only need pandas and sklearn (pre-installed).
# We run it just in case:
!pip install imbalanced-learn
!pip install pandas scikit-learn

print("Libraries installed/verified.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, classification_report
import pickle
import os

# --- REQUIRED IMPORTS ---
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# ------------------------

# --- 1. Load Data and Initial Cleanup ---
# Reads 'spam.csv', keeps the label/text columns (v1/v2) and encodes the label
# as 0 (ham) / 1 (spam).
try:
    raw_df = pd.read_csv('spam.csv', encoding='latin-1')
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please place the dataset file in the script directory.")
    print("Execution aborted.")
    # exit() is the interactive-shell helper and is not meant for aborting a
    # program or notebook cell; raise SystemExit explicitly, with a non-zero code.
    raise SystemExit(1) from None

# Build the cleaned frame as a new object (rename/assign return copies), so the
# label encoding below never writes into a slice of the raw frame.
df = (
    raw_df[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Series.map turns any value other than 'ham'/'spam' into NaN; fail here with a
# clear message instead of later, inside the weight calculation or model fit.
if df['label'].isna().any():
    raise ValueError("Column 'v1' contains values other than 'ham' and 'spam'.")

print(f"Total samples loaded: {len(df)}")
spam_count = df['label'].sum()
print(f"Spam messages: {spam_count} (The minority class that causes bias)")

# --- 2. Splitting the Data and Calculating Manual Weights ---
X = df['message']
y = df['label']

# Split into 80% training and 20% testing data (fixed seed for reproducibility).
# NOTE(review): the split is not stratified; consider stratify=y for this
# imbalanced dataset - left unchanged here so recorded results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Calculate the actual class ratio in the TRAINING split to determine the class weights
train_counts = y_train.value_counts()
weight_ham = 1.0
# SPAM weight = (count of HAM) / (count of SPAM) within the training split, so
# each SPAM example is weighted that many times higher than a HAM example.
# .loc makes it explicit that 0 and 1 are class labels, not positions.
weight_spam = train_counts.loc[0] / train_counts.loc[1]
# Create the class_weight dictionary for the model
manual_class_weights = {0: weight_ham, 1: weight_spam}

print(f"\nCalculated Manual Weights (Ham/Spam): {weight_ham:.2f} / {weight_spam:.2f}")


# --- 3. Define the NLP Pipeline (FINAL FIX: LOGISTIC REGRESSION + MANUAL CLASS WEIGHTS) ---
# Two stages: word-level TF-IDF features, then a logistic regression that
# receives the manually computed class-weight dictionary.

# Step 1: Feature Extraction - word unigrams and bigrams, lowercased, with
# English stop words removed.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    analyzer='word',
    ngram_range=(1, 2),
)

# Step 2: Classifier Model - Logistic Regression with Manual Class Weights
# (class_weight is the dictionary calculated from the training split).
weighted_classifier = LogisticRegression(
    solver='liblinear',
    C=1.0,
    class_weight=manual_class_weights,
    random_state=42,
)

model_pipeline = Pipeline(steps=[
    ('tfidf', tfidf_vectorizer),
    ('classifier', weighted_classifier),
])

# --- 4. Train the Pipeline ---
# Fit the TF-IDF + classifier pipeline on the training split.
section_rule = "=" * 70

print(f"\n{section_rule}")
print("--- STARTING MODEL TRAINING WITH MANUAL CLASS WEIGHTS (FIXING BIAS) ---")
print(section_rule)
model_pipeline.fit(X_train, y_train)
print("Model training complete. Manual class weights applied.")

# --- 5. Make Predictions and Evaluate ---
# Score the held-out test split; precision is computed for the positive (SPAM) class.
y_pred = model_pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# In the report, check the Spam (label 1) RECALL row: that is the metric the
# class weighting is meant to raise.
evaluation_report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])

print(
    f"\n{section_rule}",
    "--- Model Evaluation on Test Data ---",
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f} (This measures how many predicted SPAM messages were correct)",
    "\nClassification Report:",
    evaluation_report,
    section_rule,
    sep="\n",
)


# --- 6. Save the Trained Pipeline ---
# Persist the fitted pipeline with pickle, then reload it for the interactive
# demo below.
model_filename = 'spam_filter_model.pkl'

# Track whether THIS run wrote the file. If saving fails, an older file with the
# same name may still be on disk and must not be loaded as if it were the model
# that was just trained.
model_saved = False
try:
    with open(model_filename, 'wb') as file:
        pickle.dump(model_pipeline, file)
    model_saved = True
    print(f"\nModel successfully saved to {model_filename}")
except Exception as e:
    print(f"Error saving model: {e}")


# --- 7. Real-Time Prediction Function Setup ---

loaded_pipeline = None
if model_saved:
    try:
        # Safe to unpickle: the file was written by this run a few lines above.
        # Never pickle.load a model file obtained from an untrusted source.
        with open(model_filename, 'rb') as file:
            loaded_pipeline = pickle.load(file)
        print(f"\nModel successfully loaded from {model_filename} for real-time use.")
    except FileNotFoundError:
        print("Error: Saved model file not found for loading.")
        # exit() is the interactive-shell helper; raise SystemExit explicitly.
        raise SystemExit(1) from None
else:
    # Saving failed: fall back to the in-memory pipeline instead of loading a
    # possibly stale file left over from a previous run.
    loaded_pipeline = model_pipeline
    print("\nModel file was not saved; using the in-memory model for real-time use.")

def classify_new_message(message_text, source="LIVE INPUT"):
    """Classify one message with the loaded pipeline and print a short report.

    Args:
        message_text: The message to classify (a single string).
        source: Label printed in the report to show where the message came from.

    Returns:
        None. The report (source, message, SPAM/HAM verdict and the predicted
        class's probability) is written to stdout. Nothing is printed when no
        pipeline has been loaded.
    """
    pipeline = loaded_pipeline
    if pipeline is None:
        return

    # The pipeline works on batches, so wrap the message in a one-element list
    # and take the first (only) row of each result.
    batch = [message_text]
    predicted_label = pipeline.predict(batch)[0]
    class_probabilities = pipeline.predict_proba(batch)[0]

    # Label 1 means SPAM; the confidence is the probability at the index of the
    # predicted class.
    is_spam = predicted_label == 1
    verdict = "SPAM" if is_spam else "HAM"
    confidence = class_probabilities[1] if is_spam else class_probabilities[0]

    report_lines = (
        "-" * 70,
        f"Source: {source}",
        f"Message: '{message_text}'",
        f"Classification: **{verdict}**",
        f"Confidence: {confidence:.4f}",
    )
    print("\n".join(report_lines))

# --- 8. Random Example Validation ---
# Classify one random ham and one random spam message as a quick sanity check.

# Per-class views of the full dataset (kept so any later code that uses them
# still finds them).
spam_df = df[df['label'] == 1]
ham_df = df[df['label'] == 0]

# Randomly select one Ham and one Spam message every time the script runs.
# Draw them from the held-out test split only: sampling from the full dataset
# would mostly pick messages the model was trained on, which makes the check
# look better than it is. X_test and y_test share the same index, so the
# boolean mask selects the test messages of each class.
random_ham = X_test[y_test == 0].sample(n=1).iloc[0]
random_spam = X_test[y_test == 1].sample(n=1).iloc[0]

print("\n" + "*"*70)
print("--- RANDOM DATASET VALIDATION (Two new examples every run) ---")
print("*"*70)

classify_new_message(random_ham, source="DATASET HAM EXAMPLE (Should be HAM)")
classify_new_message(random_spam, source="DATASET SPAM EXAMPLE (Should be SPAM)")


# --- 9. Live Input Loop for Real-Time Testing ---
# Reads messages from the user and classifies each one until the user quits.

print("\n" + "="*70)
print("--- LIVE REAL-TIME CLASSIFIER TEST ---")
print("Enter a message below and press Enter to classify.")
print("Type 'quit' or 'exit' to stop.")
print("="*70)

while True:
    try:
        user_input = input("ENTER MESSAGE: ")
    except (EOFError, KeyboardInterrupt):
        # End of input, or the user interrupted the prompt (e.g. the notebook's
        # "Interrupt" button): end the session cleanly instead of a traceback.
        user_input = 'quit'

    # Normalise once so that stray whitespace around a command ("quit ") still
    # ends the session instead of being sent to the classifier.
    command = user_input.strip().lower()

    if command in ('quit', 'exit'):
        break

    if not command:
        print("Please enter some text.")
        continue

    # Classify the user's input exactly as typed
    classify_new_message(user_input, source="LIVE INPUT")

print("\nClassifier session ended. Thank you for testing!")

Total samples loaded: 5572
Spam messages: 747 (The minority class that causes bias)
Training samples: 4457
Testing samples: 1115

Calculated Manual Weights (Ham/Spam): 1.00 / 6.47

--- STARTING MODEL TRAINING WITH MANUAL CLASS WEIGHTS (FIXING BIAS) ---
Model training complete. Manual class weights applied.

--- Model Evaluation on Test Data ---
Accuracy:  0.9776
Precision: 0.9195 (This measures how many predicted SPAM messages were correct)

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       965
        Spam       0.92      0.91      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Model successfully saved to spam_filter_model.pkl

Model successfully loaded from spam_filter_model.pkl for real-time use.

**********************************************************************
--- RAND