# In [None]:
# ML7 – Improving Performance of Classifier Models (SMS Spam Classification)

import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# ------------------ LOAD DATA ------------------
# The file is read as headerless, tab-separated records: "<label>\t<message>".
# Quote handling is switched off because the message column is free-form text:
# with the default quoting, a message containing an unbalanced '"' opens a
# quoted field that swallows the following lines, silently merging records.
df = pd.read_csv(
    "/content/SMSSpamCollection.txt",
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
print(df.head())

# ------------------ A. DATA PREPROCESSING ------------------

# Report how many entries are missing in each column
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Replace the string labels with integer codes (ham=0, spam=1)
le = LabelEncoder()
le.fit(df['label'])
df['label'] = le.transform(df['label'])

# Extra column: number of characters in each message
df['length'] = df['message'].map(len)

# ------------------ B. TRAIN-TEST SPLIT ------------------
# Features are the raw message text; the target is the encoded label column.
X = df['message']
y = df['label']

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the label proportions equal in both partitions, so a skewed
# class distribution cannot leave the test set with too few examples of a class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# ------------------ C. APPLY TWO MODELS ------------------


def _text_pipeline(classifier):
    """Return a token-count -> TF-IDF -> ``classifier`` pipeline."""
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ])


# Both pipelines share the same text featurisation; only the final step differs
lr_pipeline = _text_pipeline(LogisticRegression(max_iter=1000))
nb_pipeline = _text_pipeline(MultinomialNB())

# Fit each pipeline on the training messages, then label the held-out messages
lr_pred = lr_pipeline.fit(X_train, y_train).predict(X_test)
nb_pred = nb_pipeline.fit(X_train, y_train).predict(X_test)

# ---- Evaluation ----
# For each baseline model, print test-set accuracy, the confusion matrix and
# the per-class precision/recall/F1 report. The heading carries the intended
# chart emoji as a real character rather than mis-decoded bytes.
for model_name, predictions in (
    ("Logistic Regression", lr_pred),
    ("Naive Bayes", nb_pred),
):
    print(f"\n📊 {model_name} Performance:")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

# ------------------ D. CROSS VALIDATION ------------------
print("\nPerforming 5-Fold Cross Validation:")

# Score both pipelines with 5 folds over the complete dataset (X, y),
# not only the training partition
n_folds = 5
lr_cv = cross_val_score(lr_pipeline, X, y, cv=n_folds)
nb_cv = cross_val_score(nb_pipeline, X, y, cv=n_folds)

# Summarise each model by the average of its per-fold scores
print("Logistic Regression CV Mean Accuracy:", np.mean(lr_cv))
print("Naive Bayes CV Mean Accuracy:", np.mean(nb_cv))

# ------------------ E. HYPERPARAMETER TUNING ------------------

# Candidate settings for the logistic-regression step. The 'clf__' prefix
# addresses parameters of the pipeline step named 'clf'.
param_grid = {
    'clf__C': [0.1, 1, 10],
    'clf__solver': ['liblinear', 'lbfgs']
}

# Try every combination with 5-fold CV, using the training partition only so
# the test partition stays untouched until the final evaluation below.
grid_search = GridSearchCV(lr_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("\nBest Parameters from Grid Search:", grid_search.best_params_)

# Predict on the held-out messages with the best configuration found
# (GridSearchCV refits it on the data passed to fit() by default, refit=True).
y_pred_best = grid_search.predict(X_test)

# Heading uses the intended chart emoji as a real character rather than
# mis-decoded bytes.
print("\n📈 Tuned Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
