In [2]:
from pathlib import Path
import os
import re
import argparse
import random
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [11]:
# ---------------------------
# Config & paths
# ---------------------------
def find_data_warehouse(start: Path) -> Path:
    """Locate the nearest ``Data_Warehouse`` directory at or above ``start``.

    ``start`` itself is checked first, then each of its parents from the
    closest one up to the filesystem root; the first match wins.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        Path of the first ``Data_Warehouse`` directory found.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents
            contains a directory named ``Data_Warehouse``.
    """
    for candidate_root in (start, *start.parents):
        dw = candidate_root / "Data_Warehouse"
        # is_dir() rather than exists(): a stray regular file carrying this
        # name is not a usable warehouse and must not end the search early.
        if dw.is_dir():
            return dw
    raise FileNotFoundError("Could not locate a folder named Data_Warehouse")

# Anchor for the upward search: the folder of this file when __file__ is
# defined, otherwise (NameError branch, e.g. an interactive session) the
# current working directory.
try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    SCRIPT_DIR = Path.cwd()

# Dataset and checkpoint locations, all rooted at the Data_Warehouse folder.
DATA_WAREHOUSE = find_data_warehouse(SCRIPT_DIR)
SPLIT_DIR = DATA_WAREHOUSE / "mental_health_splits_no_stress"
MODEL_BASE = SPLIT_DIR / "all_roberta_large_v1_multiclass"
BEST_DIR = MODEL_BASE / "best"
# Use the "best" sub-folder when it exists, otherwise the base folder.
MODEL_DIR = MODEL_BASE
if BEST_DIR.exists():
    MODEL_DIR = BEST_DIR

# Inference defaults.
SEED = 42
PRED_BATCH = 4            # default batch size of predict_batch
MAX_LEN = 256             # tokenizer max_length; kept short to save memory
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Seed Python's random module, NumPy and PyTorch with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x11a10d4c570>

In [12]:
# ---------------------------
# IO helpers
# ---------------------------
def load_test_and_mapping():
    """Read the test split and build the label <-> id lookup tables.

    The mapping comes from ``label_classes.csv`` in ``SPLIT_DIR`` when that
    file exists (read without a header row). Otherwise it is derived from
    the lower-cased labels found in ``test.csv``: every label except
    ``"none"`` is numbered in sorted order and ``"none"`` is given id 4.

    Returns:
        Tuple ``(df_test, class_to_id, id_to_class)``: the test DataFrame,
        a dict from lower-cased class name to integer id, and its inverse.

    Raises:
        FileNotFoundError: If ``test.csv`` is missing from ``SPLIT_DIR``.
        ValueError: If the mapping lacks ``"suicide"`` or ``"depression"``.
    """
    test_csv = SPLIT_DIR / "test.csv"
    if not test_csv.exists():
        raise FileNotFoundError(f"Missing test.csv at {test_csv}")
    df_test = pd.read_csv(test_csv)

    mapping_csv = SPLIT_DIR / "label_classes.csv"
    if mapping_csv.exists():
        df_map = pd.read_csv(mapping_csv, header=None)
        # Exactly two columns: (name, id). Any other width: take the last
        # two columns as (name, id).
        name_col, id_col = (0, 1) if df_map.shape[1] == 2 else (-2, -1)
        class_to_id = {}
        for row in range(len(df_map)):
            name = str(df_map.iloc[row, name_col]).strip().lower()
            class_to_id[name] = int(df_map.iloc[row, id_col])
    else:
        lowered = df_test["label"].astype(str).str.lower().unique()
        uniq = sorted(lbl for lbl in lowered if lbl != "none")
        class_to_id = dict(zip(uniq, range(len(uniq))))
        # "none" always receives the fixed id 4 in this fallback.
        class_to_id["none"] = 4

    id_to_class = {idx: name for name, idx in class_to_id.items()}
    has_required = "suicide" in class_to_id and "depression" in class_to_id
    if not has_required:
        raise ValueError(f"Mapping must contain 'suicide' and 'depression'. Found: {list(class_to_id.keys())}")
    return df_test, class_to_id, id_to_class

In [17]:
# ---------------------------
# Model + prediction
# ---------------------------
def load_model_and_tokenizer():
    """Load the tokenizer and sequence-classification model from ``MODEL_DIR``.

    The model is moved to ``DEVICE`` and put into evaluation mode.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
    model.to(DEVICE)
    model.eval()
    return tokenizer, model

def predict_batch(texts, tokenizer, model, batch_size=PRED_BATCH):
    """Classify ``texts`` in consecutive slices of ``batch_size`` items.

    Each slice is tokenized (padded, truncated to ``MAX_LEN``), moved to
    ``DEVICE`` and run through ``model`` without gradient tracking. The
    softmax of the logits gives the probabilities; the arg-max over the
    last axis gives the predicted class id.

    Args:
        texts: Sliceable sequence of input strings.
        tokenizer: Callable tokenizer returning an object with ``.to()``.
        model: Model whose output exposes ``.logits``.
        batch_size: Number of texts per forward pass.

    Returns:
        Tuple ``(preds, probs)`` of NumPy arrays built from the per-text
        predicted ids and per-text probability rows.
    """
    import gc

    all_preds, all_probs = [], []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        ).to(DEVICE)
        with torch.no_grad():
            logits = model(**encoded).logits
            batch_probs = torch.softmax(logits, dim=-1).cpu().numpy()
        batch_preds = batch_probs.argmax(axis=1)
        all_preds.extend(batch_preds.tolist())
        all_probs.extend(batch_probs.tolist())
        # Release the per-batch tensors before the next slice is processed.
        del encoded, logits
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return np.array(all_preds), np.array(all_probs)

# ---------------------------
# Text cleaning (optional)
# ---------------------------
# Matches runs of characters that are NOT word characters, whitespace,
# straight/curly apostrophes, hyphens or periods.
ARTIFACT_RX = re.compile(r"[^\w\s'’\-\.]+")
def clean_text(t: str) -> str:
    """Normalise one document before keyphrase extraction.

    Steps, in order:
      1. known mojibake sequences are repaired or removed;
      2. every run of characters matched by ``ARTIFACT_RX`` becomes a space;
      3. whitespace runs collapse to a single space and the ends are trimmed.

    Args:
        t: Raw text. Anything that is not a ``str`` is treated as empty.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(t, str):
        return ""
    # Fix common mojibake seen in the data.
    t = t.replace("âĢĻ", "'")
    t = t.replace("âĿ¤ï¸ı", "")
    # NOTE(review): "Ċ" looks like a byte-level-BPE line-break marker. It is
    # replaced by a space (not an empty string) so that the words on either
    # side of it are not fused into one token; step 3 collapses any extra
    # spaces this introduces.
    t = t.replace("Ċ", " ")
    t = ARTIFACT_RX.sub(" ", t)
    # Collapse whitespace.
    return re.sub(r"\s+", " ", t).strip()

# ---------------------------
# KeyBERT extraction
# ---------------------------
def build_keybert(backend_model: str = "all-MiniLM-L6-v2"):
    """Create a KeyBERT extractor backed by a SentenceTransformer model.

    Args:
        backend_model: Model name or path passed to ``SentenceTransformer``.
            The argument is honoured rather than being overridden by a
            hard-coded model name, so callers control which embedding
            model is loaded.

    Returns:
        A ``KeyBERT`` instance wrapping the loaded embedding model.
    """
    st = SentenceTransformer(backend_model)
    return KeyBERT(model=st)

def extract_keyphrases_for_docs(
    docs,
    kb: KeyBERT,
    top_n=10,
    ngram_range=(1, 3),
    use_mmr=True,
    diversity=0.7,
    min_df_len=10
):
    """Run KeyBERT on every document and aggregate the results per phrase.

    Each document is passed through ``clean_text``; documents with fewer
    than ``min_df_len`` whitespace-separated words after cleaning are
    skipped. Phrases are stripped and lower-cased before aggregation.

    Args:
        docs: Iterable of raw document strings.
        kb: KeyBERT instance used for extraction.
        top_n: Number of keyphrases requested per document.
        ngram_range: ``(min_n, max_n)`` passed as ``keyphrase_ngram_range``.
        use_mmr: Forwarded to ``extract_keywords``.
        diversity: Forwarded to ``extract_keywords``.
        min_df_len: Minimum word count a cleaned document must have.

    Returns:
        DataFrame with columns ``phrase``, ``freq`` (number of times the
        phrase was returned), ``avg_score`` and ``total_score``, sorted by
        ``total_score``, ``freq``, ``avg_score`` descending. The columns are
        present even when no phrase was extracted.

    Warns:
        RuntimeWarning: Once, after the loop, if extraction failed for any
            document. Failing documents are skipped (best effort) but are
            counted and reported instead of being dropped silently.
    """
    import warnings

    columns = ["phrase", "freq", "avg_score", "total_score"]
    agg_score = defaultdict(float)
    agg_count = Counter()
    n_failed = 0
    first_error = None

    for d in docs:
        d = clean_text(d or "")
        if len(d.split()) < min_df_len:
            continue
        try:
            kws = kb.extract_keywords(
                d,
                keyphrase_ngram_range=ngram_range,
                stop_words="english",
                use_maxsum=False,
                use_mmr=use_mmr,
                diversity=diversity,
                top_n=top_n
            )
            # Normalise into a per-document list first so that a failure
            # half-way through cannot leave partial counts in the totals.
            doc_hits = []
            for phrase, score in kws:
                phrase_c = phrase.strip().lower()
                if phrase_c:
                    doc_hits.append((phrase_c, float(score)))
        except Exception as exc:
            # Best effort: skip the problematic document, but remember it.
            n_failed += 1
            if first_error is None:
                first_error = exc
            continue

        for phrase_c, score in doc_hits:
            agg_score[phrase_c] += score
            agg_count[phrase_c] += 1

    if n_failed:
        warnings.warn(
            f"KeyBERT extraction failed for {n_failed} document(s); they were skipped. "
            f"First error: {first_error!r}",
            RuntimeWarning,
            stacklevel=2,
        )

    rows = [
        {
            "phrase": ph,
            "freq": agg_count[ph],
            "avg_score": total / max(1, agg_count[ph]),
            "total_score": total,
        }
        for ph, total in agg_score.items()
    ]
    # Explicit columns keep the schema stable for an empty result.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        df = df.sort_values(["total_score", "freq", "avg_score"], ascending=False)
    return df


In [14]:
# ---------------------------
# Buckets & main
# ---------------------------
def main(args):
    """Predict on the test split and export top keyphrases per error bucket.

    Four buckets are built from the true label and the predicted label,
    restricted to the ``suicide`` and ``depression`` classes. For each
    bucket up to ``args.max_docs`` randomly chosen documents are run through
    KeyBERT and the first ``args.save_k`` aggregated phrases are written to
    ``keybert_<bucket>_top.csv`` inside ``MODEL_DIR / "keybert_phrases"``.

    Args:
        args: Object exposing the attributes ``pred_batch``, ``max_docs``,
            ``backend``, ``topn``, ``ngram``, ``no_mmr``, ``diversity``,
            ``min_doc_len`` and ``save_k``.
    """
    print("Using model dir:", MODEL_DIR)
    out_dir = MODEL_DIR / "keybert_phrases"
    out_dir.mkdir(parents=True, exist_ok=True)

    df_test, class_to_id, _ = load_test_and_mapping()
    tokenizer, model = load_model_and_tokenizer()

    texts = df_test["text"].astype(str).tolist()
    y_true = df_test["label"].astype(str).str.lower().map(class_to_id).to_numpy()
    y_pred, _ = predict_batch(texts, tokenizer, model, batch_size=args.pred_batch)

    idx_dep = class_to_id["depression"]
    idx_su = class_to_id["suicide"]

    # (bucket label, true class id, predicted class id). The label is used
    # both in the progress message and in the output file name.
    bucket_specs = [
        ("FN_suicide", idx_su, idx_dep),                 # suicide but predicted depression
        ("TP_suicide", idx_su, idx_su),                  # suicide and predicted suicide
        ("TN_depression", idx_dep, idx_dep),             # depression and predicted depression
        ("FP_depression_to_suicide", idx_dep, idx_su),   # depression but predicted suicide
    ]

    # sample to cap runtime if desired
    def sample_idx(arr, k):
        arr = list(arr)
        random.shuffle(arr)
        return arr[:min(len(arr), k)]

    # Select every bucket first and only then shuffle, in the listed order,
    # before any KeyBERT work, so the sequence of random draws is fixed.
    bucket_hits = [
        (label, np.where((y_true == true_id) & (y_pred == pred_id))[0])
        for label, true_id, pred_id in bucket_specs
    ]
    bucket_docs = [
        (label, [texts[i] for i in sample_idx(hits, args.max_docs)])
        for label, hits in bucket_hits
    ]

    # KeyBERT
    kb = build_keybert(args.backend)

    for label, docs in bucket_docs:
        print(f"KeyBERT on {label} docs: {len(docs)}")
        df_top = extract_keyphrases_for_docs(
            docs, kb, top_n=args.topn, ngram_range=tuple(args.ngram), use_mmr=not args.no_mmr,
            diversity=args.diversity, min_df_len=args.min_doc_len
        )
        df_top.head(args.save_k).to_csv(out_dir / f"keybert_{label}_top.csv", index=False)

    print("Saved CSVs to:", out_dir.resolve())

In [18]:
class Args:
    """Run settings read by ``main`` (a plain attribute container)."""

    # --- prediction ---
    pred_batch = 4        # batch size handed to predict_batch

    # --- document selection ---
    max_docs = 10000      # per-bucket cap on sampled documents
    min_doc_len = 10      # minimum word count of a cleaned document

    # --- KeyBERT ---
    backend = "all-mpnet-base-v2"   # embedding model passed to build_keybert
    ngram = [1, 3]        # keyphrase n-gram range (min, max)
    topn = 8              # keyphrases requested per document
    no_mmr = False        # False -> MMR is enabled
    diversity = 0.7       # forwarded to KeyBERT together with use_mmr

    # --- output ---
    save_k = 50           # rows kept in each exported CSV

# Build the settings object and run the whole pipeline: prediction on the
# test split, bucket selection, KeyBERT extraction and CSV export.
args = Args()
main(args)

Using model dir: d:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_no_stress\all_roberta_large_v1_multiclass\best
KeyBERT on FN_suicide docs: 19
KeyBERT on TP_suicide docs: 64
KeyBERT on TN_depression docs: 207
KeyBERT on FP_depression_to_suicide docs: 23
Saved CSVs to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\mental_health_splits_no_stress\all_roberta_large_v1_multiclass\best\keybert_phrases
