### With No Stress

In [1]:
# Step 1 and 2: Load no stress splits and build few shot examples
from pathlib import Path
import pandas as pd
import numpy as np

RANDOM_STATE = 42
FEW_SHOT_K = 3

# ===== locate Data_Warehouse =====
def find_data_warehouse(start: Path) -> Path:
    for p in [start] + list(start.parents):
        dw = p / "Data_Warehouse"
        if dw.exists():
            return dw
    raise FileNotFoundError("Could not locate Data_Warehouse")

try:
    data_warehouse
except NameError:
    try:
        script_dir = Path(__file__).resolve().parent  # if running as script
    except NameError:
        script_dir = Path.cwd()                       # if running in notebook
    data_warehouse = find_data_warehouse(script_dir)

split_dir = data_warehouse / "mental_health_splits_no_stress"

# ===== load no stress test.csv and train.csv =====
test_path = split_dir / "test.csv"
train_path = split_dir / "train.csv"
df_test = pd.read_csv(test_path)
df_train = pd.read_csv(train_path)

print("Test rows:", len(df_test))
print("Train rows:", len(df_train))
print(df_test.head())

# ===== light cleaning =====
for df in (df_train, df_test):
    if "text" not in df.columns or "label" not in df.columns:
        raise ValueError("Expected columns text and label in the split files")
    df["text"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
    df["label"] = df["label"].astype(str).str.strip()

# ===== ensure label overlap between train and test =====
labels_test = sorted(df_test["label"].unique().tolist())
labels_train = sorted(df_train["label"].unique().tolist())
missing_in_train = sorted(set(labels_test) - set(labels_train))
if missing_in_train:
    print("Warning: the following test labels do not appear in train and cannot provide few shot examples:", missing_in_train)

# ===== build few shot dataframe: up to 3 examples per label =====
fewshot_parts = []
for lab in labels_test:
    block = df_train[df_train["label"] == lab]
    take = min(FEW_SHOT_K, len(block))
    if take == 0:
        continue
    fewshot_parts.append(block.sample(n=take, random_state=RANDOM_STATE))

if not fewshot_parts:
    raise RuntimeError("Could not collect any few shot examples. Check your train split.")

df_fewshot = pd.concat(fewshot_parts, ignore_index=True)

# optional truncation to keep prompts compact
MAX_TEXT_CHARS = 400
df_fewshot["text_short"] = df_fewshot["text"].str.slice(0, MAX_TEXT_CHARS)

# save few shot examples for reference
fewshot_out = split_dir / "few_shot_examples_no_stress_k3.csv"
df_fewshot[["label", "text"]].to_csv(fewshot_out, index=False)
print("Saved few shot examples to:", fewshot_out)

Test rows: 470
Train rows: 3756
                                                text       label  label_enc
0  I had a promising academic future. I had a won...     suicide          3
1  It seems that goalkeepers tend to be older com...        none          4
2  "Life has no meaning the moment you lose the i...  depression          1
3  I got way too attached and when she ghosted me...     suicide          3
4  I have severe severe depression, high magnitud...  depression          1
Saved few shot examples to: d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_no_stress\few_shot_examples_no_stress_k3.csv


### With Stress

In [17]:
# Step 1 and 2: Load no stress splits and build few shot examples
from pathlib import Path
import pandas as pd
import numpy as np

RANDOM_STATE = 42
FEW_SHOT_K = 3

# ===== locate Data_Warehouse =====
def find_data_warehouse(start: Path) -> Path:
    for p in [start] + list(start.parents):
        dw = p / "Data_Warehouse"
        if dw.exists():
            return dw
    raise FileNotFoundError("Could not locate Data_Warehouse")

try:
    data_warehouse
except NameError:
    try:
        script_dir = Path(__file__).resolve().parent  # if running as script
    except NameError:
        script_dir = Path.cwd()                       # if running in notebook
    data_warehouse = find_data_warehouse(script_dir)

split_dir = data_warehouse / "mental_health_splits_with_stress"

# ===== load no stress test.csv and train.csv =====
test_path = split_dir / "test.csv"
train_path = split_dir / "train.csv"
df_test = pd.read_csv(test_path)
df_train = pd.read_csv(train_path)

print("Test rows:", len(df_test))
print("Train rows:", len(df_train))
print(df_test.head())

# ===== light cleaning =====
for df in (df_train, df_test):
    if "text" not in df.columns or "label_norm" not in df.columns:
        raise ValueError("Expected columns text and label in the split files")
    df["text"] = df["text"].astype(str).str.replace("\n", " ").str.strip()
    df["label_norm"] = df["label_norm"].astype(str).str.strip()

# ===== ensure label overlap between train and test =====
labels_test = sorted(df_test["label_norm"].unique().tolist())
labels_train = sorted(df_train["label_norm"].unique().tolist())
missing_in_train = sorted(set(labels_test) - set(labels_train))
if missing_in_train:
    print("Warning: the following test labels do not appear in train and cannot provide few shot examples:", missing_in_train)

# ===== build few shot dataframe: up to 3 examples per label =====
fewshot_parts = []
for lab in labels_test:
    block = df_train[df_train["label_norm"] == lab]
    take = min(FEW_SHOT_K, len(block))
    if take == 0:
        continue
    fewshot_parts.append(block.sample(n=take, random_state=RANDOM_STATE))

if not fewshot_parts:
    raise RuntimeError("Could not collect any few shot examples. Check your train split.")

df_fewshot = pd.concat(fewshot_parts, ignore_index=True)

# optional truncation to keep prompts compact
MAX_TEXT_CHARS = 400
df_fewshot["text_short"] = df_fewshot["text"].str.slice(0, MAX_TEXT_CHARS)

# save few shot examples for reference
fewshot_out = split_dir / "few_shot_examples_no_stress_k3.csv"
df_fewshot[["label_norm", "text"]].to_csv(fewshot_out, index=False)
print("Saved few shot examples to:", fewshot_out)

Test rows: 697
Train rows: 5576
                                                text  label_norm  label_enc
0  I use a lot of different ingredients when prep...        none          5
1  When we are at work we joke around but we all ...      stress          4
2  1 year ago, I left for good. I was a reddit lu...  depression          1
3  someone once told me smoking weed and then tou...        none          5
4  The main source of this stress is a scholarshi...      stress          4
Saved few shot examples to: d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_with_stress\few_shot_examples_no_stress_k3.csv


In [18]:
# Step 2: Prompt builder for mental health classification
# zero shot first, few shot supported if you later set is_few_shot=True

from typing import List

def build_full_prompt(
    labels: List[str],
    test_data_for_prompt: str,
    is_few_shot: bool = False,
    train_data_for_prompt: str = ""
) -> str:
    """
    Build a single prompt that can contain multiple test items.
    test_data_for_prompt should be lines like: id | text
    if is_few_shot=True, train_data_for_prompt should be lines like: id | label | text
    """

    instruction = """
Instruction
Your task is to assign exactly one mental health label to each user text. This is a single label classification task only.
"""

    context = f"""
Context
This will be used in a research demo that compares large language models on mental health text classification.
Do not provide medical advice. Do not provide crisis instructions. Only return labels.
Each example contains
    label the ground truth class used for evaluation
    text a short natural language post

Predefined Labels
{", ".join(labels)}
"""

    handling_uncertainty = """
Handling Uncertainty
If the text is unclear or could reasonably match multiple categories, select the closest label by meaning.
If there is no signal of a mental health condition, use the label none.
"""

    training_data = """
Training Data
You will see a few labeled examples inside triple quotes.
Format
id | label | text
"""

    test_data_intro = """
Test Data
You will see unlabeled items inside triple quotes.
For each item, return one predicted label from the predefined list.
Format
id | text
"""

    output_format = """
Output Format
Return one line per item using this exact format
id | predicted_label | text
Do not add explanations. Do not add extra fields. Keep the original text unchanged.
"""

    if is_few_shot:
        full_prompt = "\n\n".join([
            instruction.strip(),
            context.strip(),
            handling_uncertainty.strip(),
            training_data.strip(),
            f'"""\n{train_data_for_prompt.strip()}\n"""',
            test_data_intro.strip(),
            f'"""\n{test_data_for_prompt.strip()}\n"""',
            output_format.strip()
        ])
    else:
        full_prompt = "\n\n".join([
            instruction.strip(),
            context.strip(),
            handling_uncertainty.strip(),
            test_data_intro.strip(),
            f'"""\n{test_data_for_prompt.strip()}\n"""',
            output_format.strip()
        ])

    return full_prompt

In [19]:
# ===== build few-shot training block from train split =====
FEW_SHOT_K = 3
RANDOM_STATE = 42

# get labels from train split
labels_train = sorted(df_train["label_norm"].astype(str).str.strip().unique().tolist())

# collect up to 3 examples per label
train_lines = []
for lab in labels:
    subset = df_train[df_train["label_norm"] == lab]
    take = min(FEW_SHOT_K, len(subset))
    if take == 0:
        continue
    sampled = subset.sample(n=take, random_state=RANDOM_STATE)
    for j, row in sampled.iterrows():
        txt = str(row["text"]).replace("\n", " ").strip()
        train_lines.append(f"{j} | {lab} | {txt}")

train_block = "\n".join(train_lines)

# ===== build test block as before =====
test_lines = []
for i, r in df_test.reset_index(drop=True).iterrows():
    txt = str(r["text"]).replace("\n", " ").strip()
    test_lines.append(f"{i} | {txt}")
test_block = "\n".join(test_lines[:20])  # pick a slice for one batch

# ===== build final few-shot prompt =====
full_prompt = build_full_prompt(
    labels=labels,
    test_data_for_prompt=test_block,
    is_few_shot=True,
    train_data_for_prompt=train_block
)

print(full_prompt[:15000])


Instruction
Your task is to assign exactly one mental health label to each user text. This is a single label classification task only.

Context
This will be used in a research demo that compares large language models on mental health text classification.
Do not provide medical advice. Do not provide crisis instructions. Only return labels.
Each example contains
    label the ground truth class used for evaluation
    text a short natural language post

Predefined Labels
anxiety, depression, none, ptsd, suicide

Handling Uncertainty
If the text is unclear or could reasonably match multiple categories, select the closest label by meaning.
If there is no signal of a mental health condition, use the label none.

Training Data
You will see a few labeled examples inside triple quotes.
Format
id | label | text

"""
442 | anxiety | I always sound like I’m about to cry. I get really hot and sweaty, my hearing starts to go. I had to stop in a presentation one time last semester to say “I’m sorry

In [26]:
# Step 3: Zero shot run and evaluation on mental_health_splits_no_stress/test.csv

import os, re, json, time, math
from pathlib import Path
import pandas as pd
import numpy as np
import requests
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ===== inputs already prepared earlier =====
# df_test -> loaded from Data_Warehouse/mental_health_splits_no_stress/test.csv
# build_full_prompt(...) -> from the previous message

# ===== user knobs =====
llm = "DeepSeek"            # "GPT" or DeepSeek
BATCH_SIZE = 5        # items per request
TEMPERATURE = 0.0
TOP_P = 1.0
RUN_TAG = "few_shot_withstress_test"

OUT_DIR = Path("llm_runs_few_shot")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ===== model routing exactly as you shared =====
if llm == "GPT":
    api_key = os.getenv("GPT_API_KEY")
    api_url = "https://api.openai.com/v1/chat/completions"
    model_name = "gpt-5"
else:
    api_key = os.getenv("DS_API_KEY")
    api_url = "https://api.deepseek.com/v1/chat/completions"
    model_name = "deepseek-chat" #deepseek-reasoner

headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}


In [27]:
# ===== helpers =====
def normalize_label(s: str) -> str:
    return " ".join(str(s).strip().lower().split())

def canonicalize(pred_label: str, allowed: list[str]) -> str | None:
    """map a predicted label string to one of the allowed labels"""
    if pred_label is None:
        return None
    norm = normalize_label(pred_label)
    allowed_norm = {normalize_label(a): a for a in allowed}
    if norm in allowed_norm:
        return allowed_norm[norm]
    # simple token overlap fallback
    toks = set(norm.split())
    if not toks:
        return None
    best, best_score = None, -1
    for a in allowed:
        atoks = set(normalize_label(a).split())
        score = len(toks & atoks)
        if score > best_score:
            best, best_score = a, score
    return best

def build_test_block(df: pd.DataFrame, start: int, end: int) -> str:
    lines = []
    for i, r in df.iloc[start:end].reset_index(drop=True).iterrows():
        txt = str(r["text"]).replace("\n", " ").strip()
        lines.append(f"{start+i} | {txt}")
    return "\n".join(lines)

line_pat = re.compile(r"^\s*(\d+)\s*\|\s*([^\|]+?)\s*\|\s*(.*)$")

def parse_output_to_df(raw_text: str) -> pd.DataFrame:
    """
    expects lines: id | predicted_label | text
    returns a dataframe with columns: id, pred, text
    """
    rows = []
    for line in raw_text.splitlines():
        m = line_pat.match(line)
        if not m:
            continue
        idx = int(m.group(1))
        pred = m.group(2).strip()
        txt  = m.group(3).strip()
        rows.append({"id": idx, "pred_raw": pred, "text_out": txt})
    return pd.DataFrame(rows)

def call_llm(prompt: str) -> str:
    data = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "You are a careful labeler. Classify mental health text. Do not give advice."},
            {"role": "user", "content": prompt}
        ],
        "temperature": TEMPERATURE,
        "top_p": TOP_P
    }
    for attempt in range(5):
        r = requests.post(api_url, headers=headers, json=data, timeout=120)
        if r.status_code == 200:
            return r.json()["choices"][0]["message"]["content"]
        time.sleep(1.5 * (attempt + 1))
    raise RuntimeError(f"LLM API error: {r.status_code} {r.text}")

def evaluate(y_true: list[str], y_pred: list[str], labels: list[str]) -> dict:
    return {
        "n": len(y_true),
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "f1_weighted": f1_score(y_true, y_pred, average="weighted"),
        "report": classification_report(y_true, y_pred, labels=labels, output_dict=True),
        "confusion_matrix": confusion_matrix(y_true, y_pred, labels=labels).tolist(),
        "labels": labels
    }


In [28]:
# ===== label list from split =====
label_list = sorted(df_test["label_norm"].astype(str).str.strip().unique().tolist())

# ===== batch over test set =====
n = len(df_test)
all_preds = {}     # id -> predicted label
all_conf = {}      # optional future use
start_time = time.time()

for b in range(0, n, BATCH_SIZE):
    e = min(b + BATCH_SIZE, n)
    test_block = build_test_block(df_test, b, e)
    prompt = build_full_prompt(labels=label_list, test_data_for_prompt=test_block, is_few_shot=False)

    t0 = time.time()
    raw = call_llm(prompt)
    dt = time.time() - t0
    print(f"Batch {b}-{e} got reply in {dt:.1f}s")

    df_out = parse_output_to_df(raw)

    # map back to predictions
    for _, row in df_out.iterrows():
        rid = int(row["id"])
        pred_norm = canonicalize(row["pred_raw"], label_list)
        if pred_norm is None:
            pred_norm = label_list[0]  # fallback
        all_preds[rid] = pred_norm

elapsed = time.time() - start_time
print(f"Total wall time {elapsed:.1f}s for {n} items")

# ===== build final predictions frame aligned with ground truth =====
df_eval = df_test.copy().reset_index(drop=True)
df_eval["pred"] = [all_preds.get(i, label_list[0]) for i in range(len(df_eval))]

# ===== metrics =====
metrics = evaluate(
    y_true=df_eval["label_norm"].tolist(),
    y_pred=df_eval["pred"].tolist(),
    labels=label_list
)

# ===== save artifacts =====
pred_out = OUT_DIR / f"pred_{RUN_TAG}_{'gpt_5' if llm=='GPT' else 'deepseek'}.csv"
df_eval.to_csv(pred_out, index=False)

metrics_out = OUT_DIR / f"metrics_{RUN_TAG}_{'gpt_5' if llm=='GPT' else 'deepseek'}.json"
with open(metrics_out, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

# ===== pretty print summary =====
print("\nOverall metrics")
print({k: round(v, 4) for k, v in metrics.items() if isinstance(v, float)})

print("\nLabel wise metrics on TEST")
rep = metrics["report"]
for lab in label_list:
    if lab in rep:
        lr = rep[lab]
        print(f"{lab:>12}  p={lr['precision']:.3f}  r={lr['recall']:.3f}  f1={lr['f1-score']:.3f}  support={int(lr['support'])}")

Batch 0-5 got reply in 27.1s
Batch 5-10 got reply in 37.0s
Batch 10-15 got reply in 15.4s
Batch 15-20 got reply in 31.5s
Batch 20-25 got reply in 16.8s
Batch 25-30 got reply in 24.2s
Batch 30-35 got reply in 17.3s
Batch 35-40 got reply in 36.5s
Batch 40-45 got reply in 18.5s
Batch 45-50 got reply in 35.8s
Batch 50-55 got reply in 18.7s
Batch 55-60 got reply in 27.6s
Batch 60-65 got reply in 24.1s
Batch 65-70 got reply in 20.3s
Batch 70-75 got reply in 12.5s
Batch 75-80 got reply in 45.1s
Batch 80-85 got reply in 26.2s
Batch 85-90 got reply in 21.9s
Batch 90-95 got reply in 21.5s
Batch 95-100 got reply in 42.7s
Batch 100-105 got reply in 31.5s
Batch 105-110 got reply in 25.0s
Batch 110-115 got reply in 32.1s
Batch 115-120 got reply in 35.5s
Batch 120-125 got reply in 30.7s
Batch 125-130 got reply in 32.2s
Batch 130-135 got reply in 17.9s
Batch 135-140 got reply in 36.2s
Batch 140-145 got reply in 49.9s
Batch 145-150 got reply in 47.3s
Batch 150-155 got reply in 22.5s
Batch 155-160 got r

### GPT-5 No Stress

Overall metrics
{'accuracy': 0.6745, 'f1_macro': 0.6729, 'f1_weighted': 0.6742}

Label wise metrics on TEST
     anxiety  p=0.571  r=0.762  f1=0.653  support=42
  depression  p=0.906  r=0.539  f1=0.676  support=232
        none  p=0.600  r=0.972  f1=0.742  support=71
        ptsd  p=0.880  r=0.537  f1=0.667  support=41
     suicide  p=0.507  r=0.821  f1=0.627  support=84

### GPT-4.1 No Stress

Overall metrics
{'accuracy': 0.6957, 'f1_macro': 0.6834, 'f1_weighted': 0.6958}

Label wise metrics on TEST
     anxiety  p=0.574  r=0.738  f1=0.646  support=42
  depression  p=0.873  r=0.595  f1=0.708  support=232
        none  p=0.663  r=0.972  f1=0.789  support=71
        ptsd  p=0.840  r=0.512  f1=0.636  support=41
     suicide  p=0.527  r=0.810  f1=0.638  support=84

### DeepSeek No Stress

Overall metrics
{'accuracy': 0.7043, 'f1_macro': 0.6878, 'f1_weighted': 0.7056}

Label wise metrics on TEST
     anxiety  p=0.493  r=0.810  f1=0.613  support=42
  depression  p=0.886  r=0.603  f1=0.718  support=232
        none  p=0.719  r=0.972  f1=0.826  support=71
        ptsd  p=0.759  r=0.537  f1=0.629  support=41
     suicide  p=0.559  r=0.786  f1=0.653  support=84

### GPT-4.1 With Stress

Overall metrics
{'accuracy': 0.5839, 'f1_macro': 0.5715, 'f1_weighted': 0.5764}

Label wise metrics on TEST
     anxiety  p=0.375  r=0.714  f1=0.492  support=42
  depression  p=0.697  r=0.595  f1=0.642  support=232
        none  p=0.446  r=0.986  f1=0.614  support=71
        ptsd  p=0.561  r=0.561  f1=0.561  support=41
      stress  p=0.888  r=0.348  f1=0.500  support=227
     suicide  p=0.508  r=0.798  f1=0.620  support=84

### GPT-5 with Stress

Overall metrics
{'accuracy': 0.5753, 'f1_macro': 0.5597, 'f1_weighted': 0.58}

Label wise metrics on TEST
     anxiety  p=0.326  r=0.714  f1=0.448  support=42
  depression  p=0.812  r=0.522  f1=0.635  support=232
        none  p=0.402  r=0.986  f1=0.571  support=71
        ptsd  p=0.600  r=0.512  f1=0.553  support=41
      stress  p=0.858  r=0.401  f1=0.547  support=227
     suicide  p=0.482  r=0.810  f1=0.604  support=84

### DeepSeek with Stress

Overall metrics
{'accuracy': 0.594, 'f1_macro': 0.5774, 'f1_weighted': 0.5943}

Label wise metrics on TEST
     anxiety  p=0.369  r=0.738  f1=0.492  support=42
  depression  p=0.738  r=0.595  f1=0.659  support=232
        none  p=0.450  r=0.944  f1=0.609  support=71
        ptsd  p=0.548  r=0.561  f1=0.554  support=41
      stress  p=0.803  r=0.414  f1=0.547  support=227
     suicide  p=0.517  r=0.726  f1=0.604  support=84