In [None]:
# Step 1: Import libraries
import csv
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Step 2: Ensure NLTK resources are available (idempotent)
def ensure_nltk(resource):
    """Make sure an NLTK resource is installed, downloading it if absent.

    ``resource`` is an NLTK data path such as ``'corpora/stopwords'``. When
    the lookup fails, the last path component (``'stopwords'``) is used as
    the package name handed to ``nltk.download``. Returns ``None``.
    """
    try:
        nltk.data.find(resource)
    except LookupError:
        # Not installed locally: fetch it quietly by its package name.
        package = resource.rsplit('/', 1)[-1]
        nltk.download(package, quiet=True)

# Corpora required by the stop-word filter and the lemmatizer below.
ensure_nltk('corpora/stopwords')
ensure_nltk('corpora/wordnet')

# Step 3: Load dataset (tab-separated file inside spam.csv folder)
DATA_PATH = os.path.join('..', 'data', 'spam.csv', 'SMSSpamCollection')

# Validate the file up front so a missing or zero-byte download fails with an
# actionable message instead of pandas' bare "No columns to parse from file".
# The exception types match what pd.read_csv itself would raise.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        "Dataset not found at {!r}; expected the tab-separated "
        "SMSSpamCollection file.".format(os.path.abspath(DATA_PATH))
    )
if os.path.getsize(DATA_PATH) == 0:
    raise pd.errors.EmptyDataError(
        "Dataset file {!r} is empty (0 bytes); re-download "
        "SMSSpamCollection.".format(os.path.abspath(DATA_PATH))
    )

# quoting=csv.QUOTE_NONE: the messages are free text and may contain unpaired
# double quotes. With the default quote handling the parser treats them as
# field delimiters and can merge several physical lines into one record.
df = pd.read_csv(
    DATA_PATH,
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

# Step 4: Clean text
# Built once at module level; clean_text() reads both of these globals.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one message into a space-separated string of lemmas.

    The text is lowercased, every non-letter character is replaced with a
    space, tokens found in ``stop_words`` are dropped, and the remaining
    tokens are lemmatized with ``lemmatizer`` before being joined with
    single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    kept = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Apply cleaning
df['cleaned'] = df['message'].apply(clean_text)

# Encode labels. `map` turns any value other than 'ham'/'spam' into NaN, which
# would otherwise surface as an opaque integer-cast error; name the offenders.
y = df['label'].map({'ham': 0, 'spam': 1})
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError("Unexpected label value(s) in dataset: {}".format(unexpected))
y = y.astype(int)

# Step 5: Train-test split
# Split the cleaned text *before* vectorizing so the vocabulary and IDF weights
# are learned from training messages only (no test-set leakage). The row
# partition depends only on the row count, `y` and `random_state`, so it is the
# same partition as splitting an already-vectorized matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned'], y, test_size=0.2, random_state=42, stratify=y
)

# Step 6: Feature extraction
# Keep the matrices sparse: MultinomialNB accepts scipy sparse input, and a
# dense (n_messages x 3000) float array wastes memory for no benefit.
tfidf = TfidfVectorizer(max_features=3000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)
# Full-corpus features under the train-fitted vectorizer; keeps the name `X`
# defined as before.
X = tfidf.transform(df['cleaned'])

# Step 7: Model training
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 8: Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

# Step 9: Save model and vectorizer
# The pickle holds a (model, vectorizer) tuple. The vectorizer was fitted on
# text produced by clean_text(), so the same cleaning must be applied to any
# input before calling tfidf.transform at inference time.
MODEL_PATH = os.path.join('..', 'app', 'model.pkl')
# Create the target directory if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
with open(MODEL_PATH, 'wb') as f:
    pickle.dump((model, tfidf), f)

print("✅ Model and vectorizer saved successfully at:", os.path.abspath(MODEL_PATH))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sripr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sripr\AppData\Roaming\nltk_data...


EmptyDataError: No columns to parse from file