In [1]:
# If running in a fresh environment, uncomment installs:
%pip install numpy pandas scikit-learn nltk matplotlib seaborn tqdm joblib imbalanced-learn fastapi uvicorn pydantic

import sys, os, platform, sklearn, numpy as np, pandas as pd
print("Python:", platform.python_version())
print("sklearn:", sklearn.__version__)

Note: you may need to restart the kernel to use updated packages.
Python: 3.10.18
sklearn: 1.7.2


In [2]:
import os
from pathlib import Path

# The project root is the current working directory. If the notebook lives in
# notebooks/, point it at the repository root instead: ROOT = Path.cwd().parent
ROOT = Path.cwd()
DATA_RAW = ROOT.joinpath("data", "raw")
DATA_PROCESSED = ROOT.joinpath("data", "processed")
EXPERIMENTS = ROOT.joinpath("experiments")
REPORTS = EXPERIMENTS.joinpath("reports")
CM_DIR = EXPERIMENTS.joinpath("confusion_matrices")

# Create every working directory up front; directories that already exist are left as they are.
for d in (DATA_RAW, DATA_PROCESSED, EXPERIMENTS, REPORTS, CM_DIR):
    d.mkdir(parents=True, exist_ok=True)

print(f"Root: {ROOT}")

Root: /home/olivia-sabb/MyProjects/spam-detector/notebooks


In [3]:
import pandas as pd
from pathlib import Path

# `__file__` is only defined when this code runs as a script; in that case the
# root is the directory one level above the script's own folder. Otherwise the
# current working directory is used.
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd()
DATA_RAW = ROOT.joinpath("data", "raw")
DATA_PROCESSED = ROOT.joinpath("data", "processed")

# Make sure both data directories exist before anything reads from or writes to them.
for d in (DATA_RAW, DATA_PROCESSED):
    d.mkdir(parents=True, exist_ok=True)

# Location of the raw dataset; change the file name here if yours differs.
raw_csv_path = DATA_RAW.joinpath("spam-email-dataset.csv")

def load_sms_spam_csv(path: Path) -> pd.DataFrame:
    """Load a spam dataset CSV and normalise it to ``spam_or_not`` / ``text`` columns.

    The file is read with latin-1 encoding. The label and text columns are
    located as follows:

    1. Label: the first column whose lower-cased name starts with ``"spam"``.
       Text: the first column named ``"text"`` or whose lower-cased name
       contains ``"message"``.
    2. If either is missing, the ``v1`` (label) / ``v2`` (text) layout is used
       when both of those columns are present.

    Rows with a missing text value are dropped (the original index is kept).
    String labels ``"spam"`` / ``"ham"`` (any case, surrounding whitespace
    ignored) are converted to ``1`` / ``0``; every other label value is cast
    with ``int``.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame with exactly two columns: ``spam_or_not`` (int) and ``text``.

    Raises:
        ValueError: If no label/text column pair can be found, or if a label
            cannot be converted to an integer.
    """
    raw = pd.read_csv(path, encoding="latin-1")

    # str() guards against non-string column labels (e.g. integer headers).
    lowered = [(col, str(col).lower()) for col in raw.columns]
    label_candidates = [col for col, low in lowered if low.startswith("spam")]
    text_candidates = [col for col, low in lowered if low == "text" or "message" in low]

    if label_candidates and text_candidates:
        label_col, text_col = label_candidates[0], text_candidates[0]
    elif "v1" in raw.columns and "v2" in raw.columns:
        # Common SMS Spam Collection layout: v1 = label, v2 = message text.
        label_col, text_col = "v1", "v2"
    else:
        raise ValueError(f"Could not find spam/text columns in {path}. Columns: {raw.columns.tolist()}")

    # Explicit copies keep the column assignment below from writing into a view
    # of `raw` (and from emitting SettingWithCopyWarning).
    df = raw[[label_col, text_col]].copy()
    df.columns = ["spam_or_not", "text"]
    df = df.dropna(subset=["text"]).copy()

    labels = df["spam_or_not"]
    if not pd.api.types.is_numeric_dtype(labels):
        # The v1/v2 layout stores labels as the words "ham"/"spam"; a direct
        # int cast would raise on them, so translate those tokens first and
        # leave every other value for the int cast to handle.
        token_to_int = {"spam": 1, "ham": 0}
        labels = labels.astype(object).map(
            lambda value: token_to_int.get(value.strip().lower(), value)
            if isinstance(value, str)
            else value
        )
    df["spam_or_not"] = labels.astype(int)
    return df

# Load the raw dataset and preview its first five rows.
df_raw = load_sms_spam_csv(path=raw_csv_path)
df_raw.head(5)

Unnamed: 0,spam_or_not,text
0,1,Subject: naturally irresistible your corporate...
1,1,Subject: the stock trading gunslinger fanny i...
2,1,Subject: unbelievable new homes made easy im ...
3,1,Subject: 4 color printing special request add...
4,1,"Subject: do not have money , get software cds ..."


In [4]:
import re

def basic_clean(text: str) -> str:
    """Normalise a message into lower-case, letters-only, single-spaced text.

    The steps run in a fixed order: lower-case the input, replace links
    (``http...`` / ``www....``) with the token ``URL``, replace digit runs with
    the token ``NUM``, turn every remaining non-letter character into a space,
    then collapse whitespace runs and trim both ends.

    Args:
        text: The raw message.

    Returns:
        The cleaned message; ``URL`` and ``NUM`` are its only upper-case tokens.
    """
    # Order matters: links must be replaced before digits and punctuation are
    # touched, otherwise the link pattern would no longer match.
    substitutions = (
        (r"http\S+|www\.\S+", " URL "),
        (r"\d+", " NUM "),
        (r"[^a-zA-Z\s]", " "),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Build the cleaned frame as a new object so df_raw keeps the original text.
df = df_raw.assign(text=df_raw["text"].astype(str).map(basic_clean))
df.head(), df["spam_or_not"].value_counts()

(   spam_or_not                                               text
 0            1  subject naturally irresistible your corporate ...
 1            1  subject the stock trading gunslinger fanny is ...
 2            1  subject unbelievable new homes made easy im wa...
 3            1  subject NUM color printing special request add...
 4            1  subject do not have money get software cds fro...,
 spam_or_not
 0    4360
 1    1368
 Name: count, dtype: int64)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["spam_or_not"],
    stratify=df["spam_or_not"],
    test_size=0.2,
    random_state=42,
)

# Persist each partition as a two-column CSV (text, spam_or_not) without the index.
X_train.to_frame("text").assign(spam_or_not=y_train).to_csv(DATA_PROCESSED / "train.csv", index=False)
X_test.to_frame("text").assign(spam_or_not=y_test).to_csv(DATA_PROCESSED / "test.csv", index=False)

len(X_train), len(X_test)

(4582, 1146)