# In [None]:
# ML4 - SMS Spam Classification using Naive Bayes & Logistic Regression

import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

# Load dataset (tab-separated, no header row: first field is the label,
# second field is the message text).
# quoting=csv.QUOTE_NONE tells pandas to treat double quotes as ordinary
# characters. With the default quoting, a message containing an unbalanced '"'
# can be parsed as a quoted field and swallow following lines.
data = pd.read_csv(
    "SMSSpamCollection.txt",
    sep="\t",
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)
# A bare `data.head()` is only rendered when it is the last expression of a
# notebook cell, so print the preview explicitly.
print(data.head())

# ---------------------- A. DATA PREPROCESSING ----------------------

# Turn the string labels into integers. LabelEncoder numbers the class names
# in sorted order, so "ham" becomes 0 and "spam" becomes 1.
encoder = LabelEncoder().fit(data['label'])
data['label_encoded'] = encoder.transform(data['label'])

# Text Cleaning
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then gather the English stop words into a
# set so membership tests during cleaning are cheap.
nltk.download('stopwords')
stop_words = set()
stop_words.update(stopwords.words('english'))

def preprocess_text(text, exclude=None):
    """Normalize a message into lowercase, letters-only, stop-word-free text.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example a
            missing value) is treated as an empty message.
        exclude: Optional collection of words to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''
    if exclude is None:
        exclude = stop_words
    # Replace (rather than delete) every non-letter with a space so that
    # punctuation or digits between words do not fuse them into one token
    # ("free!!!call" -> "free call", "don't" -> "don t").
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return ' '.join(word for word in letters_only.split() if word not in exclude)

# Run the cleaning function over every message and store the result alongside
# the original text.
data['cleaned'] = data['message'].map(preprocess_text)

# ---------------------- B. DATA PREPARATION ----------------------

X = data['cleaned']
y = data['label_encoded']

# Train-test split on the cleaned text FIRST, so the vectorizer below is never
# fitted on test messages (fitting on the full corpus leaks test-set
# vocabulary and IDF statistics into the features).
# stratify=y keeps the class proportions the same in both partitions.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Convert cleaned text -> numerical TF-IDF features.
# Vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely transformed with them.
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

# Full-corpus feature matrix, kept for any later code that expects
# X_vectorized. It uses the training-fitted vectorizer.
X_vectorized = vectorizer.transform(X)

# ---------------------- C. APPLY TWO ML MODELS ----------------------

# 1) Naive Bayes
# Fit a multinomial Naive Bayes classifier on the training features and
# predict labels for the held-out test features.
nb_model = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Report accuracy, then the per-class report and the confusion matrix
# (one item per line).
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(
    classification_report(y_test, y_pred_nb),
    "Confusion Matrix (Naive Bayes):",
    confusion_matrix(y_test, y_pred_nb),
    sep="\n",
)

# 2) Logistic Regression
# Fit a logistic regression classifier (liblinear solver, up to 1000
# iterations) and predict labels for the held-out test features.
lr_model = LogisticRegression(max_iter=1000, solver='liblinear').fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Report accuracy, then the per-class report and the confusion matrix
# (one item per line).
print("\nLogistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(
    classification_report(y_test, y_pred_lr),
    "Confusion Matrix (Logistic Regression):",
    confusion_matrix(y_test, y_pred_lr),
    sep="\n",
)
