Title: Dreaddit: A Reddit Dataset for Stress Analysis in Social Media

Link: A direct download link is available in the footnote on the first page of the paper.

Dataset description: The Dreaddit dataset consists of 187,444 Reddit posts collected from ten subreddits across five domains (abuse, anxiety, financial, PTSD, and social/relationships). A subset of 3,553 text segments was annotated through Amazon Mechanical Turk for stress classification, with labels indicating stressful or non-stressful content. Unlike short microblogs, Dreaddit provides long-form, multi-domain narratives, enabling deeper analysis of how stress is expressed in online communities.

In [4]:
from pathlib import Path
import pandas as pd

# ========= CONFIG =========
# All paths are resolved against the directory the notebook is run from.
project_dir = Path.cwd()
source_dir = project_dir.joinpath("Data_Lake", "Dataset_8")
warehouse_dir = project_dir.joinpath("Data_Warehouse")
# Create the output directory up front so the later CSV writes cannot fail on it
warehouse_dir.mkdir(parents=True, exist_ok=True)

# ========= LOAD DATA =========
train_df = pd.read_csv(source_dir.joinpath("dreaddit-train.csv"))
test_df = pd.read_csv(source_dir.joinpath("dreaddit-test.csv"))

# Stack both splits into a single frame with a fresh 0..n-1 index
df = pd.concat((train_df, test_df), ignore_index=True)

# Keep only stress-labeled rows (label == 1); copy so later column
# assignments operate on an independent frame
is_stressed = df["label"].eq(1)
df = df.loc[is_stressed].copy()

# Map subreddits into 3 classes
def map_class(subreddit):
    """Collapse a subreddit name into one of three target classes.

    "anxiety" and "ptsd" keep their own class; every other value is
    labelled "stress". The comparison is an exact, case-sensitive match.
    """
    for dedicated in ("anxiety", "ptsd"):
        if subreddit == dedicated:
            return dedicated
    return "stress"

# Overwrite the label column with the 3-way class derived from the subreddit
subreddit_col = df["subreddit"]
df["label"] = subreddit_col.apply(map_class)

# Add sub-source (original subreddit name) and source (Dataset_8)
df["sub-source"] = subreddit_col
df["source"] = "Dataset_8"

# Keep only relevant columns
output_columns = ["text", "label", "sub-source", "source"]
df_final = df.loc[:, output_columns]

# ========= FUNCTION: get unique filename =========
def get_unique_path(base_dir: Path, base_name: str) -> Path:
    """Return a path inside *base_dir* that does not exist yet.

    ``base_name`` is used unchanged when it is free. Otherwise a counter
    (``_2``, ``_3``, ...) is inserted before the final extension until an
    unused name is found, e.g. ``stress_dataset.csv`` ->
    ``stress_dataset_2.csv``. Names without an extension get the counter
    appended at the end (``README`` -> ``README_2``).

    Args:
        base_dir: Directory the file is going to be written to.
        base_name: Preferred file name, with or without an extension.

    Returns:
        The first candidate path that does not currently exist.

    Note:
        Only existence is checked; nothing is created here, so the name is
        not reserved until the caller actually writes the file.
    """
    out_path = base_dir / base_name
    if not out_path.exists():
        return out_path

    # If exists, increment suffix. rpartition (unlike unpacking rsplit)
    # does not blow up on names that contain no dot at all.
    stem, dot, ext = base_name.rpartition(".")
    if dot:
        suffix = f".{ext}"
    else:
        stem, suffix = base_name, ""

    i = 2
    while True:
        out_path = base_dir / f"{stem}_{i}{suffix}"
        if not out_path.exists():
            return out_path
        i += 1

# ========= SAVE SEPARATE FILES =========
# One CSV per class. sort=False keeps the classes in order of first
# appearance; labels are produced by map_class, so none are missing.
for class_name, subset in df_final.groupby("label", sort=False):
    # Preferred name is <class>_dataset.csv; a numeric suffix is added if taken
    out_path = get_unique_path(warehouse_dir, f"{class_name}_dataset.csv")
    subset.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved {len(subset)} rows to {out_path}")


Saved 414 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\ptsd_dataset_2.csv
Saved 1027 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\stress_dataset_2.csv
Saved 416 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\anxiety_dataset_2.csv


In [None]:
from pathlib import Path
import pandas as pd

# ========= CONFIG =========
# Paths are resolved against the directory the notebook is run from.
project_dir = Path.cwd()
warehouse_dir = project_dir / "Data_Warehouse"

# Find all stress dataset files (sorted so the merge order is reproducible)
stress_files = sorted(warehouse_dir.glob("stress_dataset*.csv"))
print("Found files:", stress_files)

# pd.concat raises a bare ValueError on an empty list; fail early with a
# message that says what is actually missing.
if not stress_files:
    raise FileNotFoundError("No stress_dataset*.csv files found in Data_Warehouse")

# Load and combine all stress datasets
dfs = [pd.read_csv(f) for f in stress_files]
df_stress = pd.concat(dfs, ignore_index=True)

# Check distribution by sub-source (subreddit)
freq = df_stress["sub-source"].value_counts()

print("\nDistribution of stress data by sub-source:")
print(freq)


Found files: [WindowsPath('d:/Sajjad-Workspace/PSS_XAI/Data_Process/Data_Warehouse/stress_dataset.csv'), WindowsPath('d:/Sajjad-Workspace/PSS_XAI/Data_Process/Data_Warehouse/stress_dataset_2.csv')]

Distribution of stress data by sub-source:
sub-source
Work                                 1341
Health, Fatigue, or Physical Pain     782
Family Issues                         741
School                                739
Emotional Turmoil                     667
Financial Problem                     635
Social Relationships                  626
Other                                 608
Everyday Decision Making              337
relationships                         307
domesticviolence                      249
survivorsofabuse                      143
assistance                            126
homeless                               81
almosthomeless                         59
stress                                 45
food_pantry                            17
Name: count, dtype: int64

Saved 

In [8]:
from pathlib import Path
import pandas as pd

# ===== CONFIG =====
# Paths are resolved against the current working directory.
project_dir = Path.cwd()
warehouse_dir = project_dir / "Data_Warehouse"

# Categories to cap around 400
# Only the sub-source values listed here are down-sampled; every other
# sub-source is kept in full, whatever its size.
heavy_subsources = [
    "Work",
    "Health, Fatigue, or Physical Pain",
    "Family Issues",
    "School",
    "Emotional Turmoil",
    "Financial Problem",
    "Social Relationships",
]
CAP = 400  # maximum number of rows kept for each sub-source listed above
RANDOM_STATE = 42  # seed passed to DataFrame.sample so the subset is reproducible

# ===== helpers =====
def get_unique_path(base_dir: Path, base_name: str) -> Path:
    """Return a path inside *base_dir* that does not exist yet.

    ``base_name`` is returned as-is when it is free. Otherwise ``_2``,
    ``_3``, ... is inserted before the final extension until an unused name
    is found. A name without any extension gets the counter appended at the
    end (``notes`` -> ``notes_2``).

    Args:
        base_dir: Directory the file is going to be written to.
        base_name: Preferred file name, with or without an extension.

    Returns:
        The first candidate path that does not currently exist.

    Note:
        Only existence is checked; nothing is created here, so the name is
        not reserved until the caller actually writes the file.
    """
    out_path = base_dir / base_name
    if not out_path.exists():
        return out_path

    # rpartition (unlike unpacking rsplit) tolerates names with no dot.
    stem, dot, ext = base_name.rpartition(".")
    if dot:
        suffix = f".{ext}"
    else:
        stem, suffix = base_name, ""

    i = 2
    while True:
        candidate = base_dir / f"{stem}_{i}{suffix}"
        if not candidate.exists():
            return candidate
        i += 1

# ===== load =====
# Sorted so the files are always merged in the same order.
stress_files = sorted(warehouse_dir.glob("stress_dataset*.csv"))
if len(stress_files) == 0:
    raise FileNotFoundError("No stress_dataset*.csv files found in Data_Warehouse")
dfs = list(map(pd.read_csv, stress_files))
df = pd.concat(dfs, ignore_index=True)

has_subsource = "sub-source" in df.columns
if not has_subsource:
    raise KeyError("Expected column 'sub-source' not found")

# optional de duplicate: drop rows identical in every column, then renumber
df = df.drop_duplicates()
df = df.reset_index(drop=True)

# ===== downsample heavy categories =====
parts = []
removed_counts = {}

# dropna=False keeps rows whose sub-source is missing as their own group
for subsource, rows in df.groupby("sub-source", dropna=False):
    # Rows over the cap; only sub-sources listed as heavy are ever trimmed
    excess = len(rows) - CAP if subsource in heavy_subsources else 0
    if excess > 0:
        rows = rows.sample(CAP, random_state=RANDOM_STATE)
    removed_counts[subsource] = max(excess, 0)
    parts.append(rows)

df_out = pd.concat(parts, ignore_index=True)

# ===== save =====
# NOTE(review): "stres_" (single "s") looks like a typo, but this spelling
# keeps the output from matching the "stress_dataset*.csv" glob in the load
# step above - confirm before renaming.
out_path = get_unique_path(warehouse_dir, "stres_dataset_small_merged.csv")
df_out.to_csv(out_path, index=False, encoding="utf-8")

# ===== report =====
print(f"Saved shortened dataset to: {out_path}")

print("\nRemoved counts from capped categories:")
for category in heavy_subsources:
    # A listed category that never appeared in the data reports 0
    dropped = removed_counts.get(category, 0)
    print(f"{category:35s} {dropped}")

print("\nNew distribution by sub-source:")
new_distribution = df_out["sub-source"].value_counts()
print(new_distribution)



Saved shortened dataset to: d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\stres_dataset_small_merged.csv

Removed counts from capped categories:
Work                                941
Health, Fatigue, or Physical Pain   382
Family Issues                       341
School                              339
Emotional Turmoil                   267
Financial Problem                   235
Social Relationships                226

New distribution by sub-source:
sub-source
Other                                608
Emotional Turmoil                    400
Social Relationships                 400
School                               400
Work                                 400
Health, Fatigue, or Physical Pain    400
Financial Problem                    400
Family Issues                        400
Everyday Decision Making             337
relationships                        307
domesticviolence                     249
survivorsofabuse                     143
assistance                   

In [9]:
# Sanity check: (rows, columns) of the down-sampled frame.
print(df_out.shape)

(4772, 4)
