In [10]:
## ========================= DATA PREPARATION =========================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Read dataset
def read_dataset(path):
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["category", "text"]
    )
    return df


# Clean  and encode
def clean_dataset(df):
    df = df.drop_duplicates().copy()
    encoder = LabelEncoder()
    df["category"] = encoder.fit_transform(df["category"])
    return df


data = read_dataset("/content/SMSSpamCollection")
data = clean_dataset(data)

print("=" * 100)
print("Shape:", data.shape)
print("Columns:", data.columns.tolist())
print("Missing values:\n", data.isna().sum())
print("=" * 100)


# Split dataset
def create_splits(df):
    X = df["text"]
    y = df["category"]

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, stratify=y, random_state=42
    )

    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
    )

    train_df = pd.DataFrame({"text": X_train, "category": y_train})
    val_df   = pd.DataFrame({"text": X_val, "category": y_val})
    test_df  = pd.DataFrame({"text": X_test, "category": y_test})

    return train_df, val_df, test_df


def save_splits(train_df, val_df, test_df):
    train_df.to_csv("train_data.csv", index=False)
    val_df.to_csv("val_data.csv", index=False)
    test_df.to_csv("test_data.csv", index=False)


train_df, val_df, test_df = create_splits(data)
save_splits(train_df, val_df, test_df)

Shape: (5169, 2)
Columns: ['category', 'text']
Missing values:
 category    0
text        0
dtype: int64
