In [21]:
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import os

# --------------------------
# 1) Load Dataset
# --------------------------
# Later steps read two columns from this frame: "v1" (label) and "v2" (message text).
dataset_path = "data/spam.csv"
data = pd.read_csv(dataset_path, encoding='latin-1')

# --------------------------
# 2) Preprocessing Function
# --------------------------
# NLTK's English stop-word list, held as a set because clean_text tests every
# token of every message for membership in it.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

# Translation table that deletes every character in string.punctuation; built
# once instead of re-scanning string.punctuation for each character of each message.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise one message before it is vectorised.

    Lower-cases the text, deletes every character listed in
    ``string.punctuation``, splits on whitespace, drops tokens found in the
    module-level ``stop_words`` set and re-joins what is left with single
    spaces.

    Args:
        text: The raw message. ``None`` and float NaN (what pandas yields for
            an empty CSV field) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned message, or ``""`` when nothing survives.

    Note:
        Punctuation is removed before the stop-word filter runs, so a stop
        word that itself contains punctuation can never match here.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(token for token in stripped.split() if token not in stop_words)

# Cleaned copy of the message text; this is the column the model is trained on.
data["clean_text"] = data["v2"].apply(clean_text)

# Encode the target: ham -> 0, spam -> 1. Series.map turns any value that is
# missing from the mapping into NaN, so check for that here and fail with the
# offending labels instead of letting NaN targets reach the classifier.
label_codes = {"ham": 0, "spam": 1}
data["label"] = data["v1"].map(label_codes)

unmapped_rows = data["label"].isna()
if unmapped_rows.any():
    unexpected_labels = sorted(data.loc[unmapped_rows, "v1"].astype(str).unique())
    raise ValueError(
        f"Unexpected values in label column 'v1': {unexpected_labels}; "
        f"expected one of {sorted(label_codes)}"
    )

# --------------------------
# 3) Train-Test Split
# --------------------------
# Hold out 20% of the rows for evaluation. stratify keeps the ham/spam ratio
# the same in the train and test parts, so neither class can end up missing
# from the test set by chance; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data["clean_text"],
    data["label"],
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)

# --------------------------
# 4) CREATE PIPELINE (This defines pipeline!)
# --------------------------
# Two named stages: TF-IDF features, then a multinomial Naive Bayes classifier.
# The pipeline is fitted on text that already went through clean_text, and
# clean_text is not one of its steps.
pipeline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)

# --------------------------
# 5) Train Model
# --------------------------
pipeline.fit(X_train, y_train)

# --------------------------
# 6) Evaluation
# --------------------------
# Score the held-out split and print three views of the result.
pred = pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, pred)
per_class_report = classification_report(y_test, pred)
confusion = confusion_matrix(y_test, pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", per_class_report)
print("\nConfusion Matrix:\n", confusion)

# --------------------------
# 7) SAVE ONE PIPELINE FILE
# --------------------------
# Make sure the output folder exists, then serialise the fitted pipeline
# (vectorizer and classifier together) into a single joblib file.
output_dir = "models"
pipeline_path = "models/pipeline.joblib"

os.makedirs(output_dir, exist_ok=True)
joblib.dump(pipeline, pipeline_path)

print("\nModel saved as models/pipeline.joblib")


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         1

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


Confusion Matrix:
 [[3 0]
 [0 1]]

Model saved as models/pipeline.joblib


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
import pickle
import os

# Take the fitted vectorizer and classifier from the pipeline trained in the
# previous cell, so this cell does not depend on names left over in the kernel
# from earlier runs.
vectorizer = pipeline.named_steps["tfidf"]
model = pipeline.named_steps["clf"]

# create models folder if not exists
os.makedirs("models", exist_ok=True)

# save vectorizer
with open("models/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# save model
with open("models/model.pkl", "wb") as f:
    pickle.dump(model, f)

print("model.pkl and vectorizer.pkl saved successfully!")

model.pkl and vectorizer.pkl saved successfully!
