In [158]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Load Dataset
def load_data(file_path):
    df = pd.read_csv(file_path, sep="\t", names=["label", "text"])
    return df

# Text Preprocessing
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))

    # Remove non-alphabet characters and lowercase
    text = re.sub(r"[^a-zA-Z]", " ", text).lower()
    words = word_tokenize(text)

    # Lemmatization & Stopword removal
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Feature Extraction
def extract_features(corpus):
    vectorizer = CountVectorizer(max_features=2500)
    X = vectorizer.fit_transform(corpus).toarray()
    return X, vectorizer

# Train-Test Split
def split_data(X, y):
    return train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Model
def train_model(X_train, y_train):
    model = MultinomialNB(alpha=0.5)
    model.fit(X_train, y_train)
    return model

# Evaluate Model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Main Function
if __name__ == "__main__":
    file_path = r"E:\spam_classifier\sms+spam+collection\SMSSpamCollection"  # Update this path as needed
    df = load_data(file_path)

    # Encode labels
    label_encoder = LabelEncoder()
    df["label"] = label_encoder.fit_transform(df["label"])

    # Preprocess text
    df["clean_text"] = df["text"].apply(preprocess_text)

    # Extract features
    X, vectorizer = extract_features(df["clean_text"])
    y = df["label"]

    # Split data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Handle class imbalance with SMOTE (only on training data)
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Train and evaluate model
    model = train_model(X_train, y_train)
    evaluate_model(model, X_test, y_test)


Accuracy: 0.9721973094170404
Precision: 0.8831168831168831
Recall: 0.912751677852349
Confusion Matrix:
 [[948  18]
 [ 13 136]]
