Title: SAD: A Stress Annotated Dataset for Recognizing Everyday Stressors in SMS-like Conversational Systems

Link: https://github.com/PervasiveWellbeingTech/Stress-Annotated-Dataset-SAD 

Dataset description: The Stress Annotated Dataset (SAD) contains 6,850 anonymized SMS-like sentences labeled into nine categories of everyday stressors, such as work, school, financial problems, family issues, and health. The data was collected from chatbot conversations, crowdsourcing on Amazon Mechanical Turk, and targeted web scraping from LiveJournal posts. Each entry includes metadata such as severity ratings, multiple label votes, and COVID-related tags, making it suitable for nuanced classification tasks. This dataset provides a valuable resource for training and evaluating conversational agents in stress detection and mental health applications.

In [3]:
from pathlib import Path
import pandas as pd
import re

# ========= CONFIG =========
# Run this script from your project root that contains:
#   Data_Lake/
#   Data_Warehouse/
project_dir = Path.cwd()
source_path = project_dir / "Data_Lake" / "Dataset_10" / "SAD_v1.xlsx"
warehouse_dir = project_dir / "Data_Warehouse"
warehouse_dir.mkdir(parents=True, exist_ok=True)

# Sheet to read from the workbook. Leave as None to read the first sheet,
# or set to a sheet name (e.g. "Sheet1") if the data lives elsewhere.
SHEET_NAME = None

# ========= LOAD DATA =========
# pandas interprets sheet_name=None as "load every sheet into a dict", so an
# unset SHEET_NAME is mapped to index 0 (the first sheet) to get a DataFrame.
df = pd.read_excel(
    source_path,
    sheet_name=0 if SHEET_NAME is None else SHEET_NAME,
)

# Basic sanity checks: fail early if a column used below is absent.
required_cols = {"sentence", "is_stressor", "top_label"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# ========= TRANSFORM =========
# Keep only rows where is_stressor == 1; .copy() detaches the result from the
# loaded frame so later column assignments do not target a view.
df = df[df["is_stressor"] == 1].copy()

# Clean sentence
def clean_text(s: str) -> str:
    """Return *s* trimmed, with every run of whitespace collapsed to one space.

    Non-string values are converted with ``str()`` first. Missing values
    (``None`` or a float NaN) are returned as ``""`` instead of the literal
    text ``"None"`` / ``"nan"``, so that a later blank-string filter removes
    them rather than letting them through as fake sentences.
    """
    # NaN is the only float that compares unequal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).strip()
    s = re.sub(r"\s+", " ", s)
    return s

# Normalise each sentence, then discard rows that ended up blank and keep only
# the first occurrence of each distinct sentence.
df = (
    df.assign(sentence=df["sentence"].map(clean_text))
    .loc[lambda frame: frame["sentence"] != ""]
    .drop_duplicates(subset=["sentence"])
)

# Assemble the output table: the cleaned sentence, a constant label, the
# row's top_label (missing values become "Other") and a constant source tag.
sub_source = df["top_label"].fillna("Other").astype(str)
df_final = pd.DataFrame(
    {
        "text": df["sentence"],
        "label": "stress",
        "sub-source": sub_source,
        "source": "dataset_10",
    }
)

# ========= UTILS: unique filename =========
def get_unique_path(base_dir: Path, base_name: str) -> Path:
    """Return a path inside *base_dir* that does not exist yet.

    If ``base_dir / base_name`` is free it is returned unchanged. Otherwise a
    counter starting at 2 is inserted before the last dot of the name
    (``name_2.ext``, ``name_3.ext``, ...) until an unused path is found. A
    name without any dot gets the counter appended (``name_2``).

    The path is only checked, not created, so another process could still
    claim it between this call and the caller's write.
    """
    out_path = base_dir / base_name
    if not out_path.exists():
        return out_path
    stem, dot, ext = base_name.rpartition(".")
    if not dot:
        # With no dot, rpartition returns the whole name in the last slot;
        # treat it as the stem with an empty extension.
        stem, ext = ext, ""
    i = 2
    while True:
        candidate = base_dir / f"{stem}_{i}{dot}{ext}"
        if not candidate.exists():
            return candidate
        i += 1

# ========= SAVE =========
# Pick a destination that does not overwrite an earlier export, then write the
# table without the pandas index column.
out_path = get_unique_path(base_dir=warehouse_dir, base_name="stress_dataset.csv")
df_final.to_csv(out_path, encoding="utf-8", index=False)
print(f"Saved {len(df_final)} rows to {out_path}")


Saved 6476 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\stress_dataset_2.csv
