In [4]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

# Input (cleaned CSV) and output (split directory) locations
clean_path = Path("../DATA/clean/sms_clean.csv")
splits_dir = Path("../DATA/splits")
splits_dir.mkdir(parents=True, exist_ok=True)

# Read the cleaned dataset into memory
df = pd.read_csv(clean_path)

# Binary indicator (1 where Label is "spam", case-insensitive; 0 otherwise),
# used to keep the class balance equal across splits
y = df["Label"].str.lower().eq("spam").astype(int)

# Stage 1: hold out 30% of the rows; the remaining 70% becomes the train set
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Stage 2: halve the hold-out into validation and test (15% each overall).
# temp_df keeps df's row index, so the matching indicator values can be
# looked up in y instead of being recomputed from the Label column.
y_temp = y.loc[temp_df.index]
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Persist each split as a CSV without the index column
split_outputs = {"train.csv": train_df, "val.csv": val_df, "test.csv": test_df}
for split_filename, split_frame in split_outputs.items():
    split_frame.to_csv(splits_dir / split_filename, index=False)

# Quick sanity prints
def ratio(d):
    """Return the fraction of rows in ``d`` whose ``Label`` equals "spam".

    The comparison is case-insensitive: labels are lowercased first.
    The result is a plain Python float.
    """
    lowered_labels = d["Label"].str.lower()
    spam_mask = lowered_labels.eq("spam")
    return float(spam_mask.mean())

# Dataset-level summary: total row count and per-label counts
print("Rows (total):", len(df))
print("Label counts:", df["Label"].value_counts().to_dict())

# Per-split size and spam share. The third tuple element is the extra
# padding emitted after the row count for that split.
for tag, frame, gap in (
    ("train", train_df, ""),
    ("val  ", val_df, "  "),
    ("test ", test_df, " "),
):
    print(f"{tag}: n={len(frame)}{gap} | spam_ratio={ratio(frame):.4f}")

print(f"Wrote splits to: {splits_dir}")

Rows (total): 5572
Label counts: {'ham': 4825, 'spam': 747}
train: n=3900 | spam_ratio=0.1341
val  : n=836   | spam_ratio=0.1340
test : n=836  | spam_ratio=0.1340
Wrote splits to: ../DATA/splits
