In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report

RANDOM_STATE = 42

# --------------------------
# 1) Load dataset
df = pd.read_csv("spam_ham_dataset.csv")

# Fail early when either of the two required columns is absent.
if not all(column in df.columns for column in ("label", "text")):
    raise ValueError("Dataset must have columns 'label' and 'text'")

# Stable mapping: ham -> 0, spam -> 1. Any other label value maps to NaN,
# which the check below turns into an explicit error.
df["label_encoded"] = df["label"].map({"ham": 0, "spam": 1})
if not df["label_encoded"].notna().all():
    raise ValueError("Found labels other than 'ham'/'spam' in the dataset")

# Features are the raw message texts; targets are the encoded labels.
X, y = df["text"], df["label_encoded"]

# 2) custom keyword transformer
# Keywords whose presence is exposed to the classifier as extra features.
# The order of this list determines the order of the feature columns.
spam_keywords = [
    "free",
    "win",
    "winner",
    "prize",
    "claim",
    "urgent",
    "offer",
    "click",
    "gift",
    "lottery",
    "cash",
    "buy",
    "reward",
]

class KeywordFeatures(BaseEstimator, TransformerMixin):
    """Binary keyword-presence features for raw text messages.

    For every input message one column per entry of ``keywords`` is emitted:
    1 when the lowercased message contains the keyword as a substring,
    0 otherwise. Matching is done against the lowercased message, so
    keywords should be given in lowercase.

    Parameters
    ----------
    keywords : list of str, optional
        Keywords to look for. ``None`` is treated as an empty list.
    """

    def __init__(self, keywords=None):
        self.keywords = keywords if keywords is not None else []

    def fit(self, X, y=None):
        """Do nothing (the transformer learns no state) and return ``self``."""
        return self

    def transform(self, X):
        """Return an ``(n_messages, n_keywords)`` integer indicator array.

        Each element of ``X`` is converted with ``str()`` before matching, so
        non-string entries do not raise.
        """
        # Local import: numpy is not among the module-level imports.
        import numpy as np

        rows = [
            [1 if kw in text else 0 for kw in self.keywords]
            for text in (str(msg).lower() for msg in X)
        ]
        # The explicit reshape keeps the result 2-D even when X is empty, so
        # it can still be stacked next to the other feature blocks.
        return np.array(rows, dtype=int).reshape(len(rows), len(self.keywords))

# 3) build pipeline
# Word uni-/bi-gram TF-IDF, capped at 5000 features, English stop words removed.
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    ngram_range=(1, 2),
)

# TF-IDF and keyword features are computed side by side, concatenated,
# and fed to a class-balanced logistic regression.
pipeline = Pipeline(
    steps=[
        (
            "features",
            FeatureUnion(
                transformer_list=[
                    ("tfidf", tfidf),
                    ("keywords", KeywordFeatures(keywords=spam_keywords)),
                ]
            ),
        ),
        (
            "clf",
            LogisticRegression(
                max_iter=1000,
                class_weight="balanced",
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

# 4) train/test split & fit
# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

pipeline.fit(X_train, y_train)

# 5) evaluation
# Score the held-out rows; target_names follow the 0 -> ham, 1 -> spam encoding.
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred, target_names=["ham", "spam"]))

# 6) Save pipeline and stable label map
# The keys are the integer classes the classifier predicts, so they must
# mirror the training encoding (ham -> 0, spam -> 1); otherwise every decoded
# prediction would be reported as the opposite class.
label_map = {0: "HAM", 1: "SPAM"}
joblib.dump(pipeline, "C:\\Training Project\\models\\pipeline.pkl")
joblib.dump(label_map, "C:\\Training Project\\models\\label_map.pkl")

print("✅ Saved pipeline and label_map successfully!")


Accuracy: 0.972972972972973
              precision    recall  f1-score   support

         ham       0.97      0.98      0.97      1059
        spam       0.98      0.96      0.97       939

    accuracy                           0.97      1998
   macro avg       0.97      0.97      0.97      1998
weighted avg       0.97      0.97      0.97      1998

✅ Saved pipeline and label_map successfully!
