# In [1]:
# Import necessary libraries
import re
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

def read_dataset(file_path):
    """Load the CSV found at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Matches every character that is not a letter (a-z, A-Z), a digit, or whitespace.
_NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z\d\s]")


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    return frozenset(stopwords.words("english"))


def clean_text(text):
    """Normalise a piece of text.

    The text is lower-cased and stripped of surrounding whitespace, every
    character other than a letter, digit, or whitespace is removed, the
    result is tokenised with NLTK's ``word_tokenize``, and English stop
    words are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower().strip()
    # The character class used to read "A_Z", which whitelisted the underscore
    # instead of the uppercase range, so underscores survived cleaning.
    text = _NON_ALPHANUMERIC.sub("", text)
    # Cached frozenset: avoids reloading the corpus and scanning a list per token.
    stop_words = _english_stop_words()
    return " ".join(
        word for word in word_tokenize(text) if word not in stop_words
    )

def clean_data(data):
    """Clean a dataset that has a ``text`` column.

    Rows containing any missing value are dropped, ``clean_text`` is applied
    to every entry of the ``text`` column, and rows whose cleaned text
    duplicates an earlier row are removed.

    Args:
        data: DataFrame with a ``text`` column of strings.

    Returns:
        A new DataFrame; ``data`` itself is left untouched.
    """
    # assign() builds a fresh frame rather than writing a column into the
    # dropna() result, which avoids pandas' SettingWithCopyWarning.
    cleaned = data.dropna().assign(
        text=lambda frame: frame["text"].apply(clean_text)
    )
    return cleaned.drop_duplicates(subset="text")

def partition_data(data):
    """Split ``data`` into train, validation, and test portions.

    20% of the rows are held out first; that hold-out is then halved into
    the validation and test sets. Both splits use ``random_state=42``.

    Returns:
        A ``(train, validation, test)`` tuple.
    """
    train_part, holdout = train_test_split(data, test_size=0.2, random_state=42)
    val_part, test_part = train_test_split(holdout, test_size=0.5, random_state=42)
    return train_part, val_part, test_part

def save_splits(train_set, val_set, test_set):
    """Write the three splits to train.csv, validation.csv, and test.csv.

    Files are written to the current working directory without the index
    column.
    """
    outputs = (
        ("train.csv", train_set),
        ("validation.csv", val_set),
        ("test.csv", test_set),
    )
    for filename, frame in outputs:
        frame.to_csv(filename, index=False)

# Pipeline: load the raw CSV and clean it, split it three ways, then write
# each split to disk.
dataset = clean_data(read_dataset("spam_email_dataset.csv"))
train_set, val_set, test_set = partition_data(dataset)
save_splits(train_set, val_set, test_set)
