Title: Deep Learning for Suicide and Depression Identification with Unsupervised Label Correction

Link: https://github.com/ayaanzhaque/SDCNL/blob/main/README.md

Dataset description: The study’s primary dataset was collected from Reddit posts in r/Depression and r/SuicideWatch and contains 1,895 posts labeled according to subreddit membership. To validate the label correction method, the authors also used the Reddit C-SSRS dataset (500 posts labeled by psychologists using the Columbia Suicide Severity Rating Scale) and the IMDB movie review dataset (50,000 reviews for sentiment classification). Additionally, posts from r/CasualConversation were used alongside r/SuicideWatch to construct a comparison dataset for suicide vs. healthy classification.

In [3]:
from pathlib import Path
import pandas as pd

# ========= CONFIG =========
# All locations are resolved relative to the current working directory.
project_dir = Path.cwd()
source_dir = project_dir.joinpath("Data_Lake", "Dataset_12")
warehouse_dir = project_dir.joinpath("Data_Warehouse")
# Create the output folder up front; nothing happens if it already exists.
warehouse_dir.mkdir(parents=True, exist_ok=True)

# ========= LOAD DATA =========
# Swap in the actual file name stored inside Dataset_12 if it differs.
df = pd.read_csv(source_dir.joinpath("combined-set.csv"))

# ========= MAP LABELS =========
def map_label(row):
    """Return the class label for a single row.

    A row whose ``is_suicide`` field equals 1 is labelled ``"suicide"``;
    any other value yields ``"depression"``.
    """
    if row["is_suicide"] == 1:
        return "suicide"
    return "depression"

def map_subsource(row):
    """Return the subreddit name recorded as the sub-source of a row.

    A row whose ``is_suicide`` field equals 1 maps to ``"r/SuicideWatch"``;
    any other value maps to ``"r/Depression"``.
    """
    if row["is_suicide"] == 1:
        return "r/SuicideWatch"
    return "r/Depression"

# Derive the label and sub-source of every row from its `is_suicide` flag.
df["label"] = df.apply(map_label, axis="columns")
df["sub-source"] = df.apply(map_subsource, axis="columns")
# Tag each row with the dataset it came from and copy the post body to `text`.
df["source"] = "Dataset_12"
df["text"] = df["selftext"]

# Keep only required columns
df_final = df.loc[:, ["text", "label", "sub-source", "source"]]

# ========= FUNCTION: get unique filename =========
def get_unique_path(base_dir: Path, base_name: str) -> Path:
    """Return a path inside *base_dir* that does not exist yet.

    ``base_dir / base_name`` is returned when it is free. Otherwise a counter
    starting at 2 is inserted before the last extension (``name_2.ext``,
    ``name_3.ext``, ...) until an unused path is found. A name without any
    dot gets the counter appended at the end (``name_2``).

    Args:
        base_dir: Directory in which the file is going to be created.
        base_name: Preferred file name, with or without an extension.

    Returns:
        The first candidate path that does not exist at the time of the check.
    """
    out_path = base_dir / base_name
    if not out_path.exists():
        return out_path
    # Split on the last dot only. rpartition never fails to unpack, so a name
    # with no extension is handled instead of raising ValueError.
    stem, dot, ext = base_name.rpartition(".")
    if not dot:
        # No dot at all: rpartition put the whole name in `ext`.
        stem, ext = base_name, ""
    i = 2
    while True:
        out_path = base_dir / f"{stem}_{i}{dot}{ext}"
        if not out_path.exists():
            return out_path
        i += 1

# ========= SAVE SEPARATE FILES =========
# Write one CSV per distinct label, never overwriting an existing file.
label_column = df_final["label"]
for class_name in label_column.unique():
    subset = df_final.loc[label_column == class_name]
    out_path = get_unique_path(warehouse_dir, f"{class_name}_dataset.csv")
    subset.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved {len(subset)} rows to {out_path}")


Saved 915 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\depression_dataset_2.csv
Saved 980 rows to d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\suicide_dataset.csv
