In [3]:
from pathlib import Path
import pandas as pd

# Load the SMS spam collection CSV. The path is relative, so it is resolved
# against the current working directory of the running kernel/script.
df = pd.read_csv("../DATA/SMSSpamCollection.csv")

def normalize_sms_df(df_in):
    """Return a cleaned copy of an SMS dataframe.

    The input frame is left untouched. ``Label`` values are trimmed and
    lower-cased. ``SMS_Message`` values are trimmed, have every run of
    whitespace collapsed to a single space, and have curly quotes replaced
    by their straight ASCII equivalents. Missing values in either column
    stay missing rather than being converted to the literal string "nan".

    Args:
        df_in: DataFrame containing at least the ``Label`` and
            ``SMS_Message`` columns.

    Returns:
        A new DataFrame with the two columns normalized; any other columns
        are copied through unchanged.

    Raises:
        ValueError: If ``Label`` or ``SMS_Message`` is not a column of
            ``df_in``.
    """
    df = df_in.copy()

    # 1) Validate expected columns from your earlier EDA
    expected = ["Label", "SMS_Message"]
    missing = [c for c in expected if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}. Found: {list(df.columns)}")

    # 2) Standardize labels.
    # astype(str) can stringify NaN/None into "nan", so the original
    # missing-value mask is re-applied after the string operations.
    labels = df["Label"]
    df["Label"] = labels.astype(str).str.strip().str.lower().where(labels.notna())

    # 3) Normalize text
    # One translation table handles all curly -> straight quote mappings in
    # a single pass instead of four chained replace calls.
    curly_to_straight = str.maketrans({"’": "'", "‘": "'", "“": '"', "”": '"'})
    messages = df["SMS_Message"]
    df["SMS_Message"] = (
        messages.astype(str)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)  # collapse whitespace
        .str.translate(curly_to_straight)
        .where(messages.notna())               # keep missing messages missing
    )

    return df



# Destination for the cleaned dataset. The parent directory is created up
# front (including intermediate directories) and an already-existing
# directory is not treated as an error.
clean_path = Path("../DATA/clean/sms_clean.csv")
clean_path.parent.mkdir(parents=True, exist_ok=True)

# Normalize the loaded frame; the original `df` is left as loaded.
df_clean = normalize_sms_df(df)

# Quick sanity prints: row count, per-column null counts, label distribution
print("Rows:", len(df_clean))
print("Null counts:", df_clean.isna().sum().to_dict())
print(df_clean["Label"].value_counts())

# index=False keeps the DataFrame index out of the written CSV.
df_clean.to_csv(clean_path, index=False)
print(f"Wrote cleaned CSV to: {clean_path}")

Rows: 5572
Null counts: {'Label': 0, 'SMS_Message': 0}
Label
ham     4825
spam     747
Name: count, dtype: int64
Wrote cleaned CSV to: ../DATA/clean/sms_clean.csv
