In [4]:
from pathlib import Path
import json, re
import pandas as pd

In [6]:
# ===== directories (all resolved against the current working directory) =====
project_dir = Path.cwd()

# raw input: eRisk files and the ground-truth label file
source_dir = project_dir.joinpath("Data_Lake", "Dataset_3")
all_combined_dir = source_dir.joinpath("all_combined")
labels_path = source_dir.joinpath("shuffled_ground_truth_labels.txt")

# processed output: created on first run, left alone if it already exists
warehouse_dir = project_dir.joinpath("Data_Warehouse")
warehouse_dir.mkdir(parents=True, exist_ok=True)

# CSV target inside the warehouse directory
out_csv = warehouse_dir.joinpath("erisk_task2_posts_and_comments.csv")

This script processes the eRisk Task 2 dataset to build a user-level depression vs. non-depression dataset. It:

Reads ground-truth labels from shuffled_ground_truth_labels.txt.

Iterates through all .json files in all_combined/, collecting posts and comments written by the file's target user only (entries from deleted_user, or from users missing in the ground-truth labels, are skipped).

Cleans each text by collapsing runs of whitespace into single spaces, and discards any post or comment that contains a URL.

Concatenates all valid writings per user into a single document.

Applies a length filter to each user's concatenated document: users with fewer than 50 words are dropped, and documents longer than 400 words are trimmed to their first 400 words.

Assigns labels (depression or non depression) using the ground truth.

Saves the result in JSON Lines format (one JSON object per line) to erisk_task2_userlevel.json; each line contains user_id, text, label_id, and label.

The output can then be loaded as a DataFrame or used directly for training and evaluation.

In [1]:
from pathlib import Path
import json, re
import pandas as pd

# ===== project layout (all resolved against the current working directory) =====
project_dir = Path.cwd()

# raw input: per-subject JSON files plus the ground-truth label file
source_dir = project_dir.joinpath("Data_Lake", "Dataset_3")
all_combined_dir = source_dir.joinpath("all_combined")
labels_path = source_dir.joinpath("shuffled_ground_truth_labels.txt")

# processed output: created on first run, left alone if it already exists
warehouse_dir = project_dir.joinpath("Data_Warehouse")
warehouse_dir.mkdir(parents=True, exist_ok=True)

# output file, written as JSON Lines (one record per user)
out_jsonl = warehouse_dir.joinpath("erisk_task2_userlevel.json")

# http(s):// links and bare www. links, matched case-insensitively
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", flags=re.I)
# any run of one or more whitespace characters
WS_RE = re.compile(r"\s+")

def contains_url(text: str) -> bool:
    """Report whether ``text`` contains at least one ``URL_RE`` match.

    A falsy ``text`` (``None`` or the empty string) is searched as ``""``
    and therefore yields ``False``.
    """
    candidate = text if text else ""
    return URL_RE.search(candidate) is not None

def clean_text(s: str) -> str:
    """Normalise whitespace in ``s``.

    Leading and trailing whitespace is stripped and every remaining run
    matched by ``WS_RE`` is replaced with a single space. A falsy input
    (``None`` or ``""``) returns the empty string.
    """
    if s:
        stripped = s.strip()
        return WS_RE.sub(" ", stripped)
    return ""

def word_count(text: str) -> int:
    """Count the words in ``text``.

    A word is a maximal run of regex word characters (letters, digits,
    underscore) delimited by word boundaries, so an apostrophe or hyphen
    splits a token in two. A falsy ``text`` counts as zero words.
    """
    source = text if text else ""
    return sum(1 for _ in re.finditer(r"\b\w+\b", source))

def load_labels(path: Path) -> dict:
    """Read a whitespace-separated label file into a dict.

    Each line is split on whitespace; the first field becomes the key and
    the second field, converted with ``int``, becomes the value. Lines
    with fewer than two fields are ignored and any fields after the
    second are discarded. When a key occurs more than once, the last
    occurrence wins.

    Raises:
        ValueError: if the second field of a line is not an integer.
    """
    mapping = {}
    with path.open("r", encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.split()
            if len(fields) < 2:
                continue
            key, value = fields[0], fields[1]
            mapping[key] = int(value)
    return mapping

# ===== build one record per labelled target user =====
# For every *.json file: gather the target user's URL-free writings, join them
# into a single document, apply the length gate and attach the ground-truth label.

# ----- length gate (word counts of the concatenated per-user document) -----
MIN_WORDS = 50    # documents with fewer words are dropped
MAX_WORDS = 400   # longer documents are cut after this many words
_WORD_RE = re.compile(r"\b\w+\b")  # same word definition as word_count()


def _truncate_to_words(text: str, max_words: int) -> str:
    """Return ``text`` cut immediately after its ``max_words``-th word.

    The result is a prefix of the original string, so punctuation and
    spacing inside the kept part are preserved (re-joining the bare word
    tokens would strip every apostrophe, comma and full stop). Text that
    has ``max_words`` words or fewer is returned unchanged.

    Args:
        text: the document to shorten.
        max_words: maximum number of words to keep; values <= 0 give "".

    Returns:
        The possibly shortened text.
    """
    if max_words <= 0:
        return ""
    cut = None
    for idx, match in enumerate(_WORD_RE.finditer(text)):
        if idx == max_words - 1:
            cut = match.end()
        elif idx == max_words:
            # a word beyond the limit exists -> truncation is needed
            return text[:cut]
    return text


labels = load_labels(labels_path)

records = []
files = sorted(all_combined_dir.glob("*.json"))
for jf in files:
    try:
        blocks = json.loads(jf.read_text(encoding="utf-8"))
    except Exception as e:
        # best effort: report unreadable / invalid files and move on
        print(f"[skip read] {jf.name}: {e}")
        continue

    # cleaned, URL-free writings of labelled target users, keyed by user id.
    # Keeping them per user prevents texts of different users from being
    # merged into one document if a file contains several target users.
    pieces_by_user = {}

    for blk in blocks:
        sub = blk.get("submission") or {}
        # target user's submission?
        if sub.get("target", False):
            uid = sub.get("user_id")
            if uid and uid != "deleted_user" and uid in labels:
                # register the user even if this text ends up being rejected
                user_pieces = pieces_by_user.setdefault(uid, [])
                # combine title+body, exclude if URL present
                title = clean_text(sub.get("title") or "")
                body = clean_text(sub.get("body") or "")
                text = " ".join(t for t in (title, body) if t)
                if text and not contains_url(text):
                    user_pieces.append(text)

        # target user's comments in this thread
        for c in blk.get("comments") or []:
            if not c.get("target", False):
                continue
            uid = c.get("user_id")
            if not uid or uid == "deleted_user" or uid not in labels:
                continue
            user_pieces = pieces_by_user.setdefault(uid, [])
            text = clean_text(c.get("body") or "")
            if text and not contains_url(text):
                user_pieces.append(text)

    # sanity: expect exactly one target user per file
    if not pieces_by_user:
        # no target content found -> skip file
        continue
    if len(pieces_by_user) > 1:
        print(f"[warn] {jf.name}: multiple target users {sorted(pieces_by_user)}; using first")
    uid = sorted(pieces_by_user)[0]

    # only the selected user's own writings go into the document
    pieces = pieces_by_user[uid]
    if not pieces:
        continue
    concat_text = clean_text("\n\n".join(pieces))

    # length gate: MIN_WORDS–MAX_WORDS words (trim to MAX_WORDS if longer)
    wc = word_count(concat_text)
    if wc < MIN_WORDS:
        # too little content after URL filtering
        continue
    if wc > MAX_WORDS:
        # keep the original characters up to the end of the last allowed word
        concat_text = _truncate_to_words(concat_text, MAX_WORDS)

    y = labels[uid]
    records.append({
        "user_id": uid,
        "text": concat_text,
        "label_id": int(y),
        "label": "depression" if y == 1 else "non depression",
    })

# save JSONL (one JSON object per line; non-ASCII characters kept as-is)
with out_jsonl.open("w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"Users written: {len(records)}")
print(f"Saved user-level JSONL to: {out_jsonl.resolve()}")

# (Optional) quick peek as a dataframe
if records:
    df = pd.DataFrame(records)
    print("\nLabel counts:")
    print(df["label"].value_counts())


Users written: 902
Saved user-level JSONL to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\erisk_task2_userlevel.json

Label counts:
label
non depression    800
depression        102
Name: count, dtype: int64


In [3]:
from pathlib import Path
import pandas as pd
import json

# ===== input/output =====
warehouse_dir = Path.cwd() / "Data_Warehouse"
src_jsonl = warehouse_dir / "erisk_task2_userlevel.json"
out_csv = warehouse_dir / "erisk_task2_userlevel_50_50.csv"

# ===== sampling parameters =====
N_PER_CLASS = 50   # rows drawn from each label
SEED = 42          # fixed seed so the drawn subset is reproducible

# ===== load JSONL into dataframe =====
records = []
bad_lines = []  # 1-based line numbers that could not be parsed
with src_jsonl.open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # blank lines carry no record
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # keep going, but remember the line so the loss is visible
            bad_lines.append(line_no)

if bad_lines:
    print(f"[warn] skipped {len(bad_lines)} malformed line(s) in {src_jsonl.name}: {bad_lines[:10]}")

df = pd.DataFrame(records)
if "label" not in df.columns:
    raise ValueError(f"No usable records with a 'label' field in {src_jsonl}")

print("Full dataset counts:")
print(df["label"].value_counts())

# ===== balanced sampling: N_PER_CLASS rows per class =====
# Each class is sampled independently with the same seed; the result keeps
# the classes in the order listed below (it is not shuffled).
dfs = []
for lbl in ["depression", "non depression"]:
    subset = df[df["label"] == lbl]
    if len(subset) < N_PER_CLASS:
        raise ValueError(f"Not enough samples for {lbl}: found {len(subset)}")
    dfs.append(subset.sample(n=N_PER_CLASS, random_state=SEED))

df_small = pd.concat(dfs).reset_index(drop=True)

print("\nSampled dataset counts:")
print(df_small["label"].value_counts())

# ===== save =====
df_small.to_csv(out_csv, index=False)
print(f"\nSaved 50-50 dataset to {out_csv.resolve()}")


Full dataset counts:
label
non depression    800
depression        102
Name: count, dtype: int64

Sampled dataset counts:
label
depression        50
non depression    50
Name: count, dtype: int64

Saved 50-50 dataset to D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\erisk_task2_userlevel_50_50.csv
