Title: Dreaddit: A Reddit Dataset for Stress Analysis in Social Media

Link: A direct download link is available in the footnote on the first page of the paper.

Dataset description: The Dreaddit dataset consists of 187,444 Reddit posts collected from ten subreddits across five domains (abuse, anxiety, financial, PTSD, and social/relationships). A subset of 3,553 text segments was annotated through Amazon Mechanical Turk for stress classification, with labels indicating stressful or non-stressful content. Unlike short microblogs, Dreaddit provides long-form, multi-domain narratives, enabling deeper analysis of how stress is expressed in online communities.

In [4]:
from pathlib import Path
import pandas as pd

# ========= CONFIG =========
# All locations are resolved relative to the current working directory.
project_dir = Path.cwd()
source_dir = project_dir.joinpath("Data_Lake", "Dataset_8")
warehouse_dir = project_dir.joinpath("Data_Warehouse")
warehouse_dir.mkdir(parents=True, exist_ok=True)

# ========= LOAD DATA =========
# Read the train split first, then the test split.
train_df, test_df = (
    pd.read_csv(source_dir / csv_name)
    for csv_name in ("dreaddit-train.csv", "dreaddit-test.csv")
)

# Stack both splits into one frame with a fresh 0..n-1 index
df = pd.concat([train_df, test_df], ignore_index=True)

# Retain only the rows whose label equals 1 (stress-labeled); copy so that
# later column assignments do not touch the concatenated frame.
df = df.loc[df["label"] == 1].copy()

# Map subreddits into 3 classes
def map_class(subreddit):
    """Collapse a subreddit name into one of three class labels.

    "anxiety" and "ptsd" keep their own name as the class; any other value
    is assigned to the catch-all "stress" class. The comparison is exact
    (case-sensitive, no trimming).
    """
    for own_class in ("anxiety", "ptsd"):
        if subreddit == own_class:
            return own_class
    return "stress"

# Derive the output columns in one step:
#   label       -> coarse class computed from the subreddit name
#   sub-source  -> the original subreddit name
#   source      -> constant tag identifying this dataset
derived_columns = {
    "label": df["subreddit"].apply(map_class),
    "sub-source": df["subreddit"],
    "source": "Dataset_8",
}
df = df.assign(**derived_columns)

# Project down to the columns that are written out
df_final = df.loc[:, ["text", "label", "sub-source", "source"]]

# ========= FUNCTION: get unique filename =========
def get_unique_path(base_dir: Path, base_name: str) -> Path:
    """Return a path inside ``base_dir`` that does not exist yet.

    ``base_name`` is used unchanged when it is free. Otherwise a numeric
    suffix (``_2``, ``_3``, ...) is inserted before the last ``.`` of the
    name, or appended at the end when the name contains no ``.``, and the
    first free candidate is returned.

    Args:
        base_dir: Directory the candidate paths are built in.
        base_name: Preferred file name, e.g. ``"stress_dataset.csv"``.

    Returns:
        The first candidate path for which ``Path.exists()`` is false.

    Note:
        The existence check and any later write are separate steps, so this
        does not guard against another process creating the file in between.
    """
    out_path = base_dir / base_name
    if not out_path.exists():
        return out_path

    # rpartition never fails on a name without "."; in that case the whole
    # name ends up in the last slot, so treat it as a stem with no extension.
    stem, dot, ext = base_name.rpartition(".")
    if not dot:
        stem, ext = base_name, ""

    # Name is taken: try _2, _3, ... until a free one is found.
    i = 2
    while True:
        out_path = base_dir / f"{stem}_{i}{dot}{ext}"
        if not out_path.exists():
            return out_path
        i += 1

# ========= SAVE SEPARATE FILES =========
# One CSV per class label; sort=False visits the labels in order of first
# appearance in df_final.
for class_name, subset in df_final.groupby("label", sort=False):
    # Candidate name is <class>_dataset.csv; get_unique_path returns a
    # suffixed variant if that file already exists in the warehouse.
    out_path = get_unique_path(warehouse_dir, f"{class_name}_dataset.csv")

    subset.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved {len(subset)} rows to {out_path}")


Saved 414 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\ptsd_dataset_2.csv
Saved 1027 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\stress_dataset_2.csv
Saved 416 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\anxiety_dataset_2.csv
