# ðŸ“Š SMS Spam â€” Test Accuracy (NB, LR, RF)
This notebook loads your `smsspam.csv`, trains three models (Naive Bayes, Logistic Regression, Random Forest), and optionally an ensemble. It includes safety checks so the test split always works.

> **Tip**: If your CSV isn't in the same folder, update `DATA_PATH` in Cell 2.

In [1]:
# ---- Install dependencies (run once if needed) ----
# !pip install scikit-learn pandas matplotlib


In [2]:
# ---- Imports & dataset path ----
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score

# Point this to your CSV. Change if your file sits elsewhere.
DATA_PATH = Path('smsspam.csv')

assert DATA_PATH.exists(), f'Dataset not found at {DATA_PATH.resolve()}'
df = pd.read_csv(DATA_PATH, encoding='latin-1')

# Normalize basic columns (first is label, second is message) if headers vary
df = df.rename(columns={df.columns[0]: 'label', df.columns[1]: 'message'})
df = df[['label', 'message']].dropna()
df['label'] = df['label'].astype(str).str.lower().map({'ham': 0, 'spam': 1})

print('âœ… Loaded:', len(df), 'rows | Class balance:', df['label'].value_counts().to_dict())
df.head()

âœ… Loaded: 2 rows | Class balance: {}


Unnamed: 0,label,message
ham,,thanks for letting me know.
ham,,how are you today?


In [3]:
# ---- Core evaluation function with safety checks ----
def evaluate_models(df: pd.DataFrame,
                    test_size=0.2,
                    max_features=5000,
                    ngram_min=1, ngram_max=2,
                    rf_trees=200,
                    use_ensemble=True,
                    random_state=42):
    """Train NB, LR, RF (and optional Ensemble) and return accuracies."""
    # make sure test set can hold at least 1 item per class and is reasonable
    n = len(df)
    n_classes = df['label'].nunique()
    min_test_fraction = max(test_size, (n_classes + 1) / n)  # ensure >= n_classes
    min_test_fraction = max(min_test_fraction, 0.10)         # at least 10%
    min_test_fraction = min(min_test_fraction, 0.50)         # cap at 50%

    X_train, X_test, y_train, y_test = train_test_split(
        df['message'], df['label'],
        test_size=min_test_fraction,
        stratify=df['label'],
        random_state=random_state
    )

    vect = TfidfVectorizer(max_features=max_features,
                           ngram_range=(ngram_min, ngram_max),
                           stop_words='english')
    X_train_vec = vect.fit_transform(X_train)
    X_test_vec  = vect.transform(X_test)

    nb = MultinomialNB()
    lr = LogisticRegression(max_iter=1000)
    rf = RandomForestClassifier(n_estimators=rf_trees, random_state=random_state)

    nb.fit(X_train_vec, y_train)
    lr.fit(X_train_vec, y_train)
    rf.fit(X_train_vec, y_train)

    results = {
        'Naive Bayes':        accuracy_score(y_test, nb.predict(X_test_vec)),
        'Logistic Regression':accuracy_score(y_test, lr.predict(X_test_vec)),
        'Random Forest':      accuracy_score(y_test, rf.predict(X_test_vec))
    }

    if use_ensemble:
        ens = VotingClassifier(estimators=[('nb', nb), ('lr', lr), ('rf', rf)], voting='soft')
        ens.fit(X_train_vec, y_train)
        results['Ensemble'] = accuracy_score(y_test, ens.predict(X_test_vec))

    return results


In [4]:
# ---- Run once and show accuracies ----
results = evaluate_models(df, test_size=0.2, max_features=5000, ngram_min=1, ngram_max=2, rf_trees=200)
print('ðŸ“ˆ Accuracies:')
for name, acc in results.items():
    print(f'{name}: {acc:.3f}')

ValueError: Input y contains NaN.

In [None]:
# ---- Plot a simple bar chart ----
def plot_accuracy(results):
    plt.figure(figsize=(6.2, 4.2))
    labels = list(results.keys())
    vals = [results[k] for k in labels]
    bars = plt.bar(labels, vals, color='skyblue', edgecolor='black')
    for b, a in zip(bars, vals):
        plt.text(b.get_x() + b.get_width()/2, a + 0.003, f'{a:.3f}', ha='center', va='bottom', fontweight='bold')
    plt.ylim(0.80, 1.00)
    plt.ylabel('Accuracy')
    plt.title('Test Accuracy of Models')
    plt.xticks(rotation=10)
    plt.grid(axis='y', linestyle='--', alpha=0.4)
    plt.tight_layout()
    plt.show()

plot_accuracy(results)