# üìä SMS Spam ‚Äî Live Test Accuracy Dashboard
Interactive notebook that trains **Naive Bayes**, **Logistic Regression**, **Random Forest** and an optional **soft-voting ensemble**.

**Dataset expected at:** `data/smsspam.csv` (columns will be auto-detected).

Use the widget controls to update test split, vectorizer settings, and tree count ‚Äî the accuracy chart refreshes live.

In [1]:
# ‚¨áÔ∏è Install if missing (uncomment and run once)
# !pip install scikit-learn matplotlib pandas ipywidgets
# Classic Notebook only:
# !jupyter nbextension enable --py widgetsnbextension


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

import ipywidgets as W
from IPython.display import display, clear_output

DATA_PATH = Path(r"C:/Users/Asus/Downloads/smsspam.csv")
assert DATA_PATH.exists(), f'Dataset not found at {DATA_PATH.resolve()}'


In [3]:
def load_dataset(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, encoding='latin-1')
    # Detect label and message-like columns
    lowers = {c.lower(): c for c in df.columns}
    label_col = lowers.get('label') or lowers.get('category') or lowers.get('class') or list(df.columns)[0]
    text_col  = lowers.get('message') or lowers.get('text') or list(df.columns)[1]
    df = df[[label_col, text_col]].rename(columns={label_col:'label', text_col:'message'})
    df['label'] = df['label'].astype(str).str.strip().str.lower().map({'ham':0, 'spam':1})
    df = df.dropna().reset_index(drop=True)
    return df

df = load_dataset(DATA_PATH)
print('Rows:', len(df), '| Class counts:', df['label'].value_counts().to_dict())
df.head()


Rows: 4 | Class counts: {0: 2, 1: 2}


Unnamed: 0,label,message
0,0,Hey there! Reached home. Talk later?
1,0,Morning! Meeting at 10am √¢¬Ä¬î don't be late
2,1,WIN cash now! Visit http://x.co/claim to redee...
3,1,URGENT: Verify bank info at example.com or acc...


In [4]:
def evaluate_models(df: pd.DataFrame,
                    test_size=0.2,
                    max_features=5000,
                    ngram_min=1, ngram_max=2,
                    rf_trees=200,
                    use_ensemble=True,
                    random_state=42):

    X_train, X_test, y_train, y_test = train_test_split(
        df['message'], df['label'], test_size=test_size, stratify=df['label'], random_state=random_state
    )

    vect = TfidfVectorizer(max_features=max_features,
                           ngram_range=(ngram_min, ngram_max),
                           stop_words='english')
    X_train_vec = vect.fit_transform(X_train)
    X_test_vec  = vect.transform(X_test)

    nb = MultinomialNB(alpha=0.1)
    lr = LogisticRegression(max_iter=200, solver='lbfgs')
    rf = RandomForestClassifier(n_estimators=rf_trees, random_state=random_state, n_jobs=-1)

    models = {'Naive Bayes': nb, 'Logistic Regression': lr, 'Random Forest': rf}
    if use_ensemble:
        ens = VotingClassifier(estimators=[('nb',nb), ('lr',lr), ('rf',rf)], voting='soft', weights=[1.0,1.2,1.0])
        models['Ensemble (NB+LR+RF)'] = ens

    accuracies, fitted = {}, {}
    for name, mdl in models.items():
        mdl.fit(X_train_vec, y_train)
        y_pred = mdl.predict(X_test_vec)
        accuracies[name] = accuracy_score(y_test, y_pred)
        fitted[name] = mdl

    return accuracies, fitted, (X_test_vec, y_test)


In [5]:
# === Live Accuracy UI ===
test_size_slider   = W.FloatSlider(value=0.2, min=0.1, max=0.4, step=0.05, description='Test size')
max_feat_slider    = W.IntSlider(value=5000, min=1000, max=20000, step=1000, description='Max features')
ngram_dropdown     = W.Dropdown(options=[('1-gram',(1,1)), ('1‚Äì2-gram',(1,2))], value=(1,2), description='N-grams')
rf_trees_slider    = W.IntSlider(value=200, min=50, max=500, step=50, description='RF trees')
ensemble_checkbox  = W.Checkbox(value=True, description='Include Ensemble')

out = W.Output()

def plot_bar(accuracies: dict):
    labels = list(accuracies.keys())
    vals   = [accuracies[k] for k in labels]
    plt.figure(figsize=(7,4.5))
    bars = plt.bar(labels, vals, color='skyblue', edgecolor='black')
    for b,a in zip(bars, vals):
        plt.text(b.get_x()+b.get_width()/2, a+0.001, f'{a:.3f}', ha='center', va='bottom', fontweight='bold')
    plt.ylim(0.9, 1.0)
    plt.ylabel('Accuracy')
    plt.title('Test Accuracy (live)')
    plt.xticks(rotation=12)
    plt.grid(axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

def refresh(_=None):
    with out:
        clear_output(wait=True)
        accs, _, _ = evaluate_models(
            df,
            test_size=test_size_slider.value,
            max_features=max_feat_slider.value,
            ngram_min=ngram_dropdown.value[0],
            ngram_max=ngram_dropdown.value[1],
            rf_trees=rf_trees_slider.value,
            use_ensemble=ensemble_checkbox.value
        )
        print('Live metrics:', {k: round(v,3) for k,v in accs.items()})
        plot_bar(accs)

for w in [test_size_slider, max_feat_slider, ngram_dropdown, rf_trees_slider, ensemble_checkbox]:
    w.observe(refresh, 'value')

display(W.VBox([W.HBox([test_size_slider, max_feat_slider]),
                W.HBox([ngram_dropdown, rf_trees_slider, ensemble_checkbox]),
                out]))
refresh()


VBox(children=(HBox(children=(FloatSlider(value=0.2, description='Test size', max=0.4, min=0.1, step=0.05), In‚Ä¶

In [6]:
# === Optional: show a confusion matrix snapshot for any model ===
cm_out = W.Output()
model_pick = W.Dropdown(options=['Naive Bayes','Logistic Regression','Random Forest','Ensemble (NB+LR+RF)'],
                        value='Ensemble (NB+LR+RF)', description='Model')

def show_cm(_=None):
    with cm_out:
        clear_output(wait=True)
        accs, fitted, (X_test_vec, y_test) = evaluate_models(
            df,
            test_size=test_size_slider.value,
            max_features=max_feat_slider.value,
            ngram_min=ngram_dropdown.value[0],
            ngram_max=ngram_dropdown.value[1],
            rf_trees=rf_trees_slider.value,
            use_ensemble=ensemble_checkbox.value
        )
        name = model_pick.value
        if name not in fitted:
            name = list(fitted.keys())[0]
        y_pred = fitted[name].predict(X_test_vec)
        cm = confusion_matrix(y_test, y_pred, labels=[0,1])
        plt.figure(figsize=(4.5,4))
        plt.imshow(cm, cmap='Blues')
        for (i,j), v in np.ndenumerate(cm):
            plt.text(j,i,str(v),ha='center',va='center',fontsize=12)
        plt.title(f'Confusion Matrix ‚Äî {name}')
        plt.xlabel('Predicted'); plt.ylabel('Actual')
        plt.xticks([0,1], ['ham','spam']); plt.yticks([0,1], ['ham','spam'])
        plt.colorbar(fraction=0.046, pad=0.04)
        plt.tight_layout(); plt.show()

model_pick.observe(show_cm, 'value')
display(W.HBox([model_pick]))
display(cm_out)
show_cm()


HBox(children=(Dropdown(description='Model', index=3, options=('Naive Bayes', 'Logistic Regression', 'Random F‚Ä¶

Output()