# SMS Spam — Tiny‑Dataset Friendly Evaluation
Place **s.csv** (columns: `label,message`) next to this notebook and run the cell below.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

DATA_PATH = Path('siri.csv')
df = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
expected_cols = {'label','message'}
if not expected_cols.issubset(set(df.columns.str.lower())):
    df = pd.read_csv(DATA_PATH, encoding='utf-8-sig', names=['label','message'], header=0)
df.columns = [c.strip().lower() for c in df.columns]
df = df[['label','message']].copy()
df['label'] = df['label'].astype(str).str.strip().str.lower()
df['message'] = df['message'].astype(str).str.strip()
df = df.replace({'': np.nan}).dropna(subset=['label','message'])
df = df[df['label'].isin(['ham','spam'])].copy()
if df.empty:
    raise RuntimeError('No valid rows found; check siri.csv format (label,message with ham/spam).')

y = df['label'].map({'ham':0,'spam':1}).astype(int)
X = df['message']

vect = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
nb  = MultinomialNB(alpha=0.1)
svm = LinearSVC()
rf  = RandomForestClassifier(n_estimators=200, random_state=42)
ensemble = VotingClassifier(estimators=[('nb',nb),('svm',svm),('rf',rf)], voting='hard')
models = {'Naive Bayes':nb,'SVM (LinearSVC)':svm,'Random Forest':rf,'Ensemble (NB+SVM+RF)':ensemble}

n_samples = len(df)
min_class = y.value_counts().min()
use_cv = (min_class < 2 or n_samples < 6)
accuracies = {}

if not use_cv:
    safe_test = max(0.2, 2/n_samples)
    safe_test = min(safe_test, 0.3)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=safe_test, random_state=42)
    (train_idx, test_idx), = sss.split(X, y)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    X_train_vec = vect.fit_transform(X_train)
    X_test_vec  = vect.transform(X_test)
    for name, model in models.items():
        model.fit(X_train_vec, y_train)
        pred = model.predict(X_test_vec)
        accuracies[name] = accuracy_score(y_test, pred)
else:
    n_splits = max(2, min(5, min_class))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    def cv_acc(est):
        vals = []
        for tr, te in skf.split(X, y):
            X_tr = vect.fit_transform(X.iloc[tr])
            X_te = vect.transform(X.iloc[te])
            e = est
            e.fit(X_tr, y.iloc[tr])
            vals.append(accuracy_score(y.iloc[te], e.predict(X_te)))
        return float(np.mean(vals))
    for name, model in models.items():
        accuracies[name] = cv_acc(model)

print('Samples per class:', y.value_counts().to_dict())
print('Mode:', 'Cross-Validation' if use_cv else 'Stratified Split')
for k,v in accuracies.items():
    print(f'{k:24s}: {v:.4f}')

plt.figure(figsize=(9,4.5))
bars = plt.bar(list(accuracies.keys()), list(accuracies.values()), color='#90caf9', edgecolor='#1e88e5')
plt.ylim(0.0,1.05)
plt.title('Model Accuracy' + (' (mean CV)' if use_cv else ' (test split)'))
plt.ylabel('Accuracy')
plt.xticks(rotation=15, ha='right')
for b in bars:
    h = b.get_height()
    plt.text(b.get_x()+b.get_width()/2, h+0.01, f'{h:.3f}', ha='center', va='bottom', fontsize=10)
plt.tight_layout(); plt.show()


RuntimeError: No valid rows found; check siri.csv format (label,message with ham/spam).