### Without Stress

In [7]:
# Step 1: Load no-stress test split
from pathlib import Path
import pandas as pd

# ===== locate Data_Warehouse =====
def find_data_warehouse(start: Path) -> Path:
    """
    Return the nearest ``Data_Warehouse`` entry at or above ``start``.

    ``start`` itself is checked first, then each ancestor in turn, so the
    closest match wins.

    Raises:
        FileNotFoundError: if no checked location contains ``Data_Warehouse``.
    """
    candidates = (folder / "Data_Warehouse" for folder in (start, *start.parents))
    # Lazy generator: stop probing the filesystem at the first hit.
    located = next((c for c in candidates if c.exists()), None)
    if located is None:
        raise FileNotFoundError("Could not locate Data_Warehouse")
    return located

# Keep a `data_warehouse` defined by an earlier cell; otherwise search upward
# from the script's folder, or from the working directory inside a notebook
# (where `__file__` is not defined).
if "data_warehouse" not in globals():
    if "__file__" in globals():
        script_dir = Path(__file__).resolve().parent
    else:
        script_dir = Path.cwd()
    data_warehouse = find_data_warehouse(script_dir)

# ===== load no-stress test.csv =====
test_path = data_warehouse.joinpath("mental_health_splits_no_stress", "test.csv")
df_test = pd.read_csv(test_path)

# Quick sanity check: row count and the first few records.
print("Rows:", df_test.shape[0])
print(df_test.head())


Rows: 470
                                                text       label  label_enc
0  I had a promising academic future. I had a won...     suicide          3
1  It seems that goalkeepers tend to be older com...        none          4
2  "Life has no meaning the moment you lose the i...  depression          1
3  I got way too attached and when she ghosted me...     suicide          3
4  I have severe severe depression, high magnitud...  depression          1


### With Stress

In [1]:
# Step 1: Load with-stress test split
from pathlib import Path
import pandas as pd

# ===== locate Data_Warehouse =====
def find_data_warehouse(start: Path) -> Path:
    """
    Walk from ``start`` up through its ancestors and return the first
    ``Data_Warehouse`` path that exists.

    Raises:
        FileNotFoundError: if none of the visited folders contains it.
    """
    for folder in (start, *start.parents):
        candidate = folder / "Data_Warehouse"
        if not candidate.exists():
            continue
        return candidate
    raise FileNotFoundError("Could not locate Data_Warehouse")

# Keep a `data_warehouse` defined by an earlier cell; otherwise search upward
# from the script's folder, or from the working directory inside a notebook
# (where `__file__` is not defined).
if "data_warehouse" not in globals():
    if "__file__" in globals():
        script_dir = Path(__file__).resolve().parent
    else:
        script_dir = Path.cwd()
    data_warehouse = find_data_warehouse(script_dir)

# ===== load with-stress test.csv =====
test_path = data_warehouse.joinpath("mental_health_splits_with_stress", "test.csv")
df_test = pd.read_csv(test_path)

# Quick sanity check: row count and the first few records.
print("Rows:", df_test.shape[0])
print(df_test.head())

Rows: 697
                                                text  label_norm  label_enc
0  I use a lot of different ingredients when prep...        none          5
1  When we are at work we joke around but we all ...      stress          4
2  1 year ago, I left for good. I was a reddit lu...  depression          1
3  someone once told me smoking weed and then tou...        none          5
4  The main source of this stress is a scholarshi...      stress          4


In [8]:
# Step 2: Prompt builder for mental health classification
# zero shot first, few shot supported if you later set is_few_shot=True

from typing import List

def build_full_prompt(
    labels: List[str],
    test_data_for_prompt: str,
    is_few_shot: bool = False,
    train_data_for_prompt: str = ""
) -> str:
    """
    Assemble one classification prompt covering one or more test items.

    Args:
        labels: allowed label names, listed comma-separated in the prompt.
        test_data_for_prompt: unlabeled items, one per line as ``id | text``.
        is_few_shot: when True, a labeled-examples section is placed before
            the test data.
        train_data_for_prompt: labeled examples, one per line as
            ``id | label | text``; only used when ``is_few_shot`` is True.

    Returns:
        The prompt sections joined by blank lines.
    """

    def fenced(block: str) -> str:
        # Data blocks are wrapped in triple quotes, as the section intros announce.
        return '"""\n' + block.strip() + '\n"""'

    instruction = (
        "Instruction\n"
        "Your task is to assign exactly one mental health label to each user text. "
        "This is a single label classification task only."
    )

    context = (
        "Context\n"
        "This will be used in a research demo that compares large language models "
        "on mental health text classification.\n"
        "Do not provide medical advice. Do not provide crisis instructions. Only return labels.\n"
        "Each example contains\n"
        "    label the ground truth class used for evaluation\n"
        "    text a short natural language post\n"
        "\n"
        "Predefined Labels\n"
        + ", ".join(labels)
    )

    handling_uncertainty = (
        "Handling Uncertainty\n"
        "If the text is unclear or could reasonably match multiple categories, "
        "select the closest label by meaning.\n"
        "If there is no signal of a mental health condition, use the label none."
    )

    training_data = (
        "Training Data\n"
        "You will see a few labeled examples inside triple quotes.\n"
        "Format\n"
        "id | label | text"
    )

    test_data_intro = (
        "Test Data\n"
        "You will see unlabeled items inside triple quotes.\n"
        "For each item, return one predicted label from the predefined list.\n"
        "Format\n"
        "id | text"
    )

    output_format = (
        "Output Format\n"
        "Return one line per item using this exact format\n"
        "id | predicted_label | text\n"
        "Do not add explanations. Do not add extra fields. Keep the original text unchanged."
    )

    sections = [instruction, context, handling_uncertainty]
    if is_few_shot:
        # Labeled examples sit between the guidance and the test items.
        sections += [training_data, fenced(train_data_for_prompt)]
    sections += [test_data_intro, fenced(test_data_for_prompt), output_format]

    # Strip every section so stray whitespace (e.g. an empty label list) never
    # leaks into the blank-line separators.
    return "\n\n".join(section.strip() for section in sections)


In [9]:
# Build label list from your split and a compact test block.
# The no-stress split names its ground-truth column "label"; the with-stress
# split names it "label_norm", so pick whichever one is present.
label_col = "label" if "label" in df_test.columns else "label_norm"
labels = sorted(df_test[label_col].astype(str).str.strip().unique().tolist())

# Create lines: id | text  (id is the 0-based row position in df_test)
test_lines = []
for i, raw_text in enumerate(df_test["text"]):
    txt = str(raw_text).replace("\n", " ").strip()  # keep each item on one line
    test_lines.append(f"{i} | {txt}")
test_block = "\n".join(test_lines[:50])  # you can choose any slice size

# Build the final zero shot prompt
full_prompt = build_full_prompt(labels=labels, test_data_for_prompt=test_block, is_few_shot=False)
print(full_prompt[:2000])  # preview


Instruction
Your task is to assign exactly one mental health label to each user text. This is a single label classification task only.

Context
This will be used in a research demo that compares large language models on mental health text classification.
Do not provide medical advice. Do not provide crisis instructions. Only return labels.
Each example contains
    label the ground truth class used for evaluation
    text a short natural language post

Predefined Labels
anxiety, depression, none, ptsd, suicide

Handling Uncertainty
If the text is unclear or could reasonably match multiple categories, select the closest label by meaning.
If there is no signal of a mental health condition, use the label none.

Test Data
You will see unlabeled items inside triple quotes.
For each item, return one predicted label from the predefined list.
Format
id | text

"""
0 | I had a promising academic future. I had a wonderful, sweet partner. I had so much drive. I had so much support. People were pr

In [10]:
# Step 3: Zero shot run and evaluation on mental_health_splits_no_stress/test.csv

import os, re, json, time, math
from pathlib import Path
import pandas as pd
import numpy as np
import requests
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# ===== inputs already prepared earlier =====
# df_test -> loaded from Data_Warehouse/mental_health_splits_no_stress/test.csv
# build_full_prompt(...) -> defined in the Step 2 cell above

# ===== user knobs =====
llm = "GPT"            # backend selector: "GPT" routes to OpenAI, any other value to DeepSeek
BATCH_SIZE = 5         # items per request
TEMPERATURE = 1.0      # sampling temperature sent with every request
TOP_P = 1.0            # nucleus-sampling value sent with every request
RUN_TAG = "zero_shot_nostress_test"

# Folder that receives the prediction and metrics files; created up front.
OUT_DIR = Path("llm_runs_zero_shot")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ===== model routing =====
# Each backend reads its key from its own environment variable.
if llm != "GPT":
    api_key = os.getenv("DS_API_KEY")
    api_url = "https://api.deepseek.com/v1/chat/completions"
    model_name = "deepseek-chat"  # original note mentions deepseek-reasoner as well
else:
    api_key = os.getenv("GPT_API_KEY")
    api_url = "https://api.openai.com/v1/chat/completions"
    model_name = "gpt-5"

# Request headers shared by every API call.
headers = dict([
    ("Authorization", "Bearer {}".format(api_key)),
    ("Content-Type", "application/json"),
])


In [11]:
# ===== helpers =====
def normalize_label(s: str) -> str:
    return " ".join(str(s).strip().lower().split())

def canonicalize(pred_label: str, allowed: list[str]) -> str | None:
    """map a predicted label string to one of the allowed labels"""
    if pred_label is None:
        return None
    norm = normalize_label(pred_label)
    allowed_norm = {normalize_label(a): a for a in allowed}
    if norm in allowed_norm:
        return allowed_norm[norm]
    # simple token overlap fallback
    toks = set(norm.split())
    if not toks:
        return None
    best, best_score = None, -1
    for a in allowed:
        atoks = set(normalize_label(a).split())
        score = len(toks & atoks)
        if score > best_score:
            best, best_score = a, score
    return best

def build_test_block(df: pd.DataFrame, start: int, end: int) -> str:
    """
    Render rows ``start`` (inclusive) to ``end`` (exclusive) of ``df`` as
    prompt lines of the form ``id | text``.

    The id is the row's 0-based position in ``df``, so ids stay unique across
    batches. Every line break inside a text (``\\n``, ``\\r\\n``, ``\\r``, ...)
    is replaced by a space so each item occupies exactly one line.

    Args:
        df: frame with a ``text`` column.
        start: first row position to include.
        end: row position to stop before; values past the end are clamped.

    Returns:
        The lines joined by newlines (empty string when the slice is empty).
    """
    lines = []
    # Read the column directly instead of building a Series per row.
    for offset, raw in enumerate(df["text"].iloc[start:end]):
        flat = " ".join(str(raw).splitlines()).strip()
        lines.append(f"{start + offset} | {flat}")
    return "\n".join(lines)

# One reply line: "<id> | <label> | <text>". The label may not contain "|";
# the text is everything after the second "|" and may contain more pipes.
line_pat = re.compile(r"^\s*(\d+)\s*\|\s*([^\|]+?)\s*\|\s*(.*)$")

def parse_output_to_df(raw_text: str) -> pd.DataFrame:
    """
    Parse a model reply made of lines ``id | predicted_label | text``.

    Lines that do not match the format (blank lines, quote fences, commentary)
    are skipped.

    Args:
        raw_text: the reply text; ``None`` is treated as an empty reply.

    Returns:
        A dataframe with columns ``id`` (int), ``pred_raw`` (label as written
        by the model) and ``text_out`` (echoed text). The columns are present
        even when no line could be parsed.
    """
    rows = []
    for line in (raw_text or "").splitlines():
        m = line_pat.match(line)
        if not m:
            continue
        rows.append({
            "id": int(m.group(1)),
            "pred_raw": m.group(2).strip(),
            "text_out": m.group(3).strip(),
        })
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["id", "pred_raw", "text_out"])

def call_llm(prompt: str) -> str:
    """
    Send ``prompt`` to the configured chat-completions endpoint and return
    the reply text.

    Uses the module-level ``api_url``, ``headers``, ``model_name``,
    ``TEMPERATURE`` and ``TOP_P``. Up to five attempts are made, with a
    linearly growing pause between them. Network errors are retried the same
    way as retryable HTTP errors; client errors that a retry cannot fix
    (4xx other than 408/429) abort immediately.

    Args:
        prompt: the user message content.

    Returns:
        The content of the first choice (empty string if the API sent none).

    Raises:
        RuntimeError: when no attempt produced an HTTP 200 response.
    """
    data = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": "You are a careful labeler. Classify mental health text. Do not give advice."},
            {"role": "user", "content": prompt}
        ],
        "temperature": TEMPERATURE,
        "top_p": TOP_P
    }
    max_attempts = 5
    last_error = "no attempt made"
    for attempt in range(max_attempts):
        try:
            r = requests.post(api_url, headers=headers, json=data, timeout=120)
        except requests.RequestException as exc:
            # Timeouts / connection drops are transient: remember and retry.
            last_error = f"{type(exc).__name__}: {exc}"
        else:
            if r.status_code == 200:
                return r.json()["choices"][0]["message"]["content"] or ""
            last_error = f"{r.status_code} {r.text}"
            # Bad request / auth errors will not succeed on a retry.
            if 400 <= r.status_code < 500 and r.status_code not in (408, 429):
                break
        # Back off before the next attempt, but not after the last one.
        if attempt < max_attempts - 1:
            time.sleep(1.5 * (attempt + 1))
    raise RuntimeError(f"LLM API error: {last_error}")

def evaluate(y_true: list[str], y_pred: list[str], labels: list[str]) -> dict:
    """
    Compute overall and per-label classification metrics.

    ``labels`` is passed to every label-aware metric so the macro/weighted F1
    scores cover the same label set, in the same order, as the report and the
    confusion matrix.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        labels: label names to score.

    Returns:
        A JSON-serialisable dict with keys ``n``, ``accuracy``, ``f1_macro``,
        ``f1_weighted``, ``report`` (per-label dict), ``confusion_matrix``
        (nested lists, rows = true label, columns = predicted label) and
        ``labels``.
    """
    return {
        "n": len(y_true),
        "accuracy": accuracy_score(y_true, y_pred),
        # zero_division=0 scores a never-predicted label as 0 without warnings.
        "f1_macro": f1_score(y_true, y_pred, labels=labels, average="macro", zero_division=0),
        "f1_weighted": f1_score(y_true, y_pred, labels=labels, average="weighted", zero_division=0),
        "report": classification_report(y_true, y_pred, labels=labels, output_dict=True, zero_division=0),
        "confusion_matrix": confusion_matrix(y_true, y_pred, labels=labels).tolist(),
        "labels": labels
    }


In [12]:
# ===== label list from split =====
# The no-stress split names its ground-truth column "label"; the with-stress
# split names it "label_norm", so pick whichever one is present.
label_col = "label" if "label" in df_test.columns else "label_norm"
# Normalise ground truth exactly like label_list so the two always compare equal.
y_true_clean = df_test[label_col].astype(str).str.strip().tolist()
label_list = sorted(set(y_true_clean))

# ===== batch over test set =====
n = len(df_test)
all_preds = {}     # id -> predicted label
all_conf = {}      # optional future use
start_time = time.time()

for b in range(0, n, BATCH_SIZE):
    e = min(b + BATCH_SIZE, n)
    test_block = build_test_block(df_test, b, e)
    prompt = build_full_prompt(labels=label_list, test_data_for_prompt=test_block, is_few_shot=False)

    t0 = time.time()
    raw = call_llm(prompt)
    dt = time.time() - t0
    print(f"Batch {b}-{e} got reply in {dt:.1f}s")

    df_out = parse_output_to_df(raw)

    # map back to predictions
    for rec in df_out.to_dict("records"):
        rid = int(rec["id"])
        if not (b <= rid < e):
            # An id outside this batch would overwrite another item's prediction.
            continue
        pred_norm = canonicalize(rec["pred_raw"], label_list)
        if pred_norm is not None:
            all_preds[rid] = pred_norm

elapsed = time.time() - start_time
print(f"Total wall time {elapsed:.1f}s for {n} items")

# ===== make missing predictions visible =====
# Items with no usable reply still receive the fallback label below, which
# skews that label's precision; report how many there are.
missing_ids = [i for i in range(n) if i not in all_preds]
fallback_label = label_list[0] if label_list else None
if missing_ids:
    print(f"WARNING: {len(missing_ids)} of {n} items had no usable prediction; "
          f"falling back to '{fallback_label}' for them")

# ===== build final predictions frame aligned with ground truth =====
df_eval = df_test.copy().reset_index(drop=True)
df_eval["pred"] = [all_preds.get(i, fallback_label) for i in range(n)]
df_eval["pred_missing"] = [i not in all_preds for i in range(n)]

# ===== metrics =====
metrics = evaluate(
    y_true=y_true_clean,
    y_pred=df_eval["pred"].tolist(),
    labels=label_list
)
metrics["n_missing"] = len(missing_ids)

# ===== save artifacts =====
pred_out = OUT_DIR / f"pred_{RUN_TAG}_{'gpt_5' if llm=='GPT' else 'deepseek'}.csv"
df_eval.to_csv(pred_out, index=False)

metrics_out = OUT_DIR / f"metrics_{RUN_TAG}_{'gpt_5' if llm=='GPT' else 'deepseek'}.json"
with open(metrics_out, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

# ===== pretty print summary =====
print("\nOverall metrics")
print({k: round(v, 4) for k, v in metrics.items() if isinstance(v, float)})

print("\nLabel wise metrics on TEST")
rep = metrics["report"]
for lab in label_list:
    if lab in rep:
        lr = rep[lab]
        print(f"{lab:>12}  p={lr['precision']:.3f}  r={lr['recall']:.3f}  f1={lr['f1-score']:.3f}  support={int(lr['support'])}")

Batch 0-5 got reply in 18.8s
Batch 5-10 got reply in 13.8s
Batch 10-15 got reply in 20.8s
Batch 15-20 got reply in 24.3s
Batch 20-25 got reply in 7.8s
Batch 25-30 got reply in 31.9s
Batch 30-35 got reply in 21.8s
Batch 35-40 got reply in 27.4s
Batch 40-45 got reply in 14.0s
Batch 45-50 got reply in 34.4s
Batch 50-55 got reply in 14.7s
Batch 55-60 got reply in 21.6s
Batch 60-65 got reply in 24.2s
Batch 65-70 got reply in 18.1s
Batch 70-75 got reply in 11.6s
Batch 75-80 got reply in 13.1s
Batch 80-85 got reply in 17.3s
Batch 85-90 got reply in 14.0s
Batch 90-95 got reply in 7.8s
Batch 95-100 got reply in 29.7s
Batch 100-105 got reply in 13.0s
Batch 105-110 got reply in 10.9s
Batch 110-115 got reply in 18.1s
Batch 115-120 got reply in 23.1s
Batch 120-125 got reply in 16.8s
Batch 125-130 got reply in 22.3s
Batch 130-135 got reply in 18.9s
Batch 135-140 got reply in 13.4s
Batch 140-145 got reply in 13.5s
Batch 145-150 got reply in 13.0s
Batch 150-155 got reply in 9.1s
Batch 155-160 got repl

### DeepSeek No Stress

Overall metrics
{'accuracy': 0.7149, 'f1_macro': 0.694, 'f1_weighted': 0.7151}

Label wise metrics on TEST
     anxiety  p=0.554  r=0.738  f1=0.633  support=42
  depression  p=0.875  r=0.634  f1=0.735  support=232
        none  p=0.693  r=0.986  f1=0.814  support=71
        ptsd  p=0.759  r=0.537  f1=0.629  support=41
     suicide  p=0.569  r=0.786  f1=0.660  support=84

### GPT-4.1 with Stress

Overall metrics
{'accuracy': 0.5753, 'f1_macro': 0.5616, 'f1_weighted': 0.5661}

Label wise metrics on TEST
     anxiety  p=0.361  r=0.714  f1=0.480  support=42
  depression  p=0.685  r=0.591  f1=0.634  support=232
        none  p=0.449  r=0.986  f1=0.617  support=71
        ptsd  p=0.568  r=0.512  f1=0.538  support=41
      stress  p=0.864  r=0.335  f1=0.483  support=227
     suicide  p=0.504  r=0.798  f1=0.618  support=84

### DeepSeek with Stress

Overall metrics
{'accuracy': 0.5911, 'f1_macro': 0.579, 'f1_weighted': 0.5903}

Label wise metrics on TEST
     anxiety  p=0.378  r=0.738  f1=0.500  support=42
  depression  p=0.717  r=0.591  f1=0.648  support=232
        none  p=0.453  r=0.944  f1=0.612  support=71
        ptsd  p=0.571  r=0.585  f1=0.578  support=41
      stress  p=0.802  r=0.410  f1=0.542  support=227
     suicide  p=0.508  r=0.714  f1=0.594  support=84

### GPT-5 with Stress

Overall metrics
{'accuracy': 0.571, 'f1_macro': 0.5611, 'f1_weighted': 0.5738}

Label wise metrics on TEST
     anxiety  p=0.337  r=0.714  f1=0.458  support=42
  depression  p=0.797  r=0.491  f1=0.608  support=232
        none  p=0.395  r=0.986  f1=0.565  support=71
        ptsd  p=0.611  r=0.537  f1=0.571  support=41
      stress  p=0.844  r=0.405  f1=0.548  support=227
     suicide  p=0.490  r=0.833  f1=0.617  support=84

### GPT-5 No Stress

Overall metrics
{'accuracy': 0.6638, 'f1_macro': 0.6621, 'f1_weighted': 0.6623}

Label wise metrics on TEST
     anxiety  p=0.544  r=0.738  f1=0.626  support=42
  depression  p=0.909  r=0.517  f1=0.659  support=232
        none  p=0.574  r=0.986  f1=0.725  support=71
        ptsd  p=0.880  r=0.537  f1=0.667  support=41
     suicide  p=0.515  r=0.821  f1=0.633  support=84