### **Setup and Data Loading**

In [3]:
import pandas as pd
import numpy as np
import argparse
import sys
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Download the dataset directly if not uploaded
!wget -N https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
!unzip -o sms+spam+collection.zip

def load_data(file_path='SMSSpamCollection'):
    df = pd.read_csv(file_path, sep='\t', names=['label', 'message'])
    le = LabelEncoder()
    df['label'] = le.fit_transform(df['label']) # ham: 0, spam: 1
    return df

df = load_data()
print(f"Dataset Loaded: {df.shape[0]} messages.")
df.head()

--2026-02-28 14:42:30--  https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘sms+spam+collection.zip’

sms+spam+collection     [ <=>                ] 198.65K  1.05MB/s    in 0.2s    

Last-modified header missing -- time-stamps turned off.
2026-02-28 14:42:30 (1.05 MB/s) - ‘sms+spam+collection.zip’ saved [203415]

Archive:  sms+spam+collection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  
Dataset Loaded: 5572 messages.


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### **Text Preprocessing and Model Definition**

In [4]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(df['message'])
y = df['label']

def get_models():
    # Base Learners
    nb = MultinomialNB()
    lr = LogisticRegression(solver='liblinear')
    svm = SVC(kernel='linear', probability=True)

    base_methods = [('nb', nb), ('lr', lr), ('svm', svm)]

    # Ensemble Strategies
    return {
        "Naive Bayes": nb,
        "Logistic Regression": lr,
        "Linear SVM": svm,
        "Voting (Hard)": VotingClassifier(estimators=base_methods, voting='hard'),
        "Voting (Soft)": VotingClassifier(estimators=base_methods, voting='soft'),
        "Stacking": StackingClassifier(estimators=base_methods, final_estimator=LogisticRegression()),
        "AdaBoost (Stumps)": AdaBoostClassifier(
            estimator=DecisionTreeClassifier(max_depth=1),
            n_estimators=50,
            algorithm='SAMME'
        )
    }

### **Execution and Evaluation**

In [6]:
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
models = get_models()
summary_results = []

print(f"{'Model Name':<20} | {'F1-Score':<10} | {'ROC-AUC':<10}")
print("-" * 45)

for name, model in models.items():
    current_metrics = ['precision', 'recall', 'f1'] # Always include these
    # roc_auc requires predict_proba or decision_function, which hard voting doesn't have.
    if name != "Voting (Hard)":
        current_metrics.append('roc_auc')

    cv_output = cross_validate(model, X, y, cv=skf, scoring=current_metrics)

    res = {"Model": name}
    for m in current_metrics:
        res[f"{m}_mean"] = cv_output[f'test_{m}'].mean()
        res[f"{m}_std"] = cv_output[f'test_{m}'].std()

    f1_score = res['f1_mean']
    roc_auc_str = f"{res['roc_auc_mean']:.4f}" if 'roc_auc_mean' in res else "N/A       " # Pad to match width

    summary_results.append(res)
    print(f"{name:<20} | {f1_score:.4f}     | {roc_auc_str}")

# Save comparison report
pd.DataFrame(summary_results).to_csv("ensemble_comparison.csv", index=False)

Model Name           | F1-Score   | ROC-AUC   
---------------------------------------------
Naive Bayes          | 0.9254     | 0.9912
Logistic Regression  | 0.8505     | 0.9911
Linear SVM           | 0.9332     | 0.9914
Voting (Hard)        | 0.9233     | N/A       
Voting (Soft)        | 0.9316     | 0.9930
Stacking             | 0.9490     | 0.9928




AdaBoost (Stumps)    | 0.4772     | 0.8956


### **Final Predictions and Holdout Testing**

In [7]:
# Train/Test split for the final prediction file
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, df.index, test_size=0.2, random_state=42, stratify=y
)

# Using Stacking as the final model for demonstration
final_clf = models["Stacking"]
final_clf.fit(X_train, y_train)
y_pred = final_clf.predict(X_test)
y_prob = final_clf.predict_proba(X_test)[:, 1]

# Save detailed predictions
final_preds = pd.DataFrame({
    "MessageId": idx_test,
    "Actual": y_test,
    "Predicted": y_pred,
    "Probability": y_prob
})
final_preds.to_csv("final_model_predictions.csv", index=False)

print("\nFinal Model Evaluation (Stacking):")
print(classification_report(y_test, y_pred))


Final Model Evaluation (Stacking):
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

