In [None]:
# Mount Google Drive at /content/drive (Colab-only API; this cell will not run
# outside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



This script processes and cleans the datasets used for SMART Goals generation. It applies the same preprocessing steps (stripping whitespace, converting text to lowercase, and removing quotes) to every column of each dataset (train, validation, and test) to ensure uniformity across all of them. Rows with missing values in the key columns, 'Augmented Vague Goal' and 'SMART Goal', are removed to maintain data integrity. The cleaned datasets are then saved to new files for subsequent use in model training and evaluation. This preprocessing step is critical for ensuring high-quality inputs throughout the SMART Goals generation pipeline.

In [None]:
import pandas as pd
import warnings
import wandb

# Silence UserWarning messages raised from the "transformers" module.
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# Authenticate this session with Weights & Biases.
wandb.login()

# Raw input CSV for each split, keyed by split name.
file_paths = dict(
    train="/content/drive/My Drive/train_data.csv",
    validation="/content/drive/My Drive/validation_data.csv",
    test="/content/drive/My Drive/test_data.csv",
)

# Destination CSV for the cleaned version of each split (same keys as above).
output_files = dict(
    train="/content/drive/My Drive/train_data_cleaned.csv",
    validation="/content/drive/My Drive/validation_data_cleaned.csv",
    test="/content/drive/My Drive/test_data_cleaned.csv",
)

# Cleaning function for the entire dataset
def clean_entire_dataset(dataset: pd.DataFrame) -> pd.DataFrame:
    """Normalize every column to cleaned text and drop rows lacking a key value.

    Each column is cast to string, stripped of surrounding whitespace,
    lowercased, and has all single and double quote characters removed.
    Rows whose 'Augmented Vague Goal' or 'SMART Goal' is missing are dropped.

    A cell counts as missing when it was null in the input (NaN, None, pd.NA)
    or when it is empty after cleaning (e.g. it held only whitespace/quotes).
    Missing cells are returned as ``pd.NA``. Text that merely spells "nan" is
    kept as text rather than being mistaken for a missing value.

    Args:
        dataset: Input frame; it is not modified.

    Returns:
        A new DataFrame of cleaned strings with the original index labels of
        the surviving rows.

    Raises:
        KeyError: If either key column is absent from ``dataset``.
    """
    # Capture real nulls *before* the string cast: astype(str) can turn them
    # into look-alike text ("nan", "None", "<NA>") that is indistinguishable
    # from genuine content afterwards.
    missing_mask = dataset.isna()

    cleaned = dataset.astype(str)
    for col in cleaned.columns:
        cleaned[col] = (
            cleaned[col]
            .str.strip()
            .str.lower()
            .str.replace(r'["\']', '', regex=True)
        )

    # Blank-after-cleaning cells would be written as empty CSV fields and read
    # back as NaN, so treat them as missing now rather than downstream.
    cleaned = cleaned.mask(missing_mask | cleaned.eq(""), pd.NA)

    return cleaned.dropna(subset=['Augmented Vague Goal', 'SMART Goal'])

# Run every split through the cleaner and write the result next to the input.
for name in file_paths:
    path = file_paths[name]
    split_label = name.capitalize()
    print(f"Processing {split_label} data...")

    # Read the raw CSV, then normalize it (all columns handled as strings).
    data = pd.read_csv(path)
    cleaned_data = clean_entire_dataset(data)

    # Persist the cleaned frame without the index column.
    save_path = output_files[name]
    cleaned_data.to_csv(save_path, index=False)
    print(f"{split_label} data cleaned and saved to {save_path}")

print("All datasets cleaned and saved successfully!")


[34m[1mwandb[0m: Currently logged in as: [33mbraid781[0m ([33mbraid781-northeastern-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Processing Train data...
Train data cleaned and saved to /content/drive/My Drive/train_data_cleaned.csv
Processing Validation data...
Validation data cleaned and saved to /content/drive/My Drive/validation_data_cleaned.csv
Processing Test data...
Test data cleaned and saved to /content/drive/My Drive/test_data_cleaned.csv
All datasets cleaned and saved successfully!
