In [1]:
import os, re, json, warnings
from datetime import datetime
from typing import Any, Optional, Tuple, Dict, List

import numpy as np
import pandas as pd

# ------------------- CONFIG -------------------
# Directory that holds the optional trained model and receives the prediction
# files written by save_predictions().
OUT_DIR    = r"C:\Users\sagni\Downloads\Suicidal Detection"
# Optional joblib-serialized model; when the file is missing or fails to load,
# the script falls back to zero-shot classification.
MODEL_PATH = os.path.join(OUT_DIR, "ms_model.joblib")
# Path of a .txt / .csv / .json file to run inference on, or None to use
# SAMPLE_TEXTS below.
INFER_SOURCE = None  # e.g., r"C:\Users\sagni\Downloads\Suicidal Detection\new_messages.csv"

# If INFER_SOURCE is None, these sample texts will be used:
SAMPLE_TEXTS = [
    "I feel okay lately, just a bit stressed about exams.",
    "Sometimes I just want to disappear.",
    "I am exhausted but I will be fine after some rest."
]

# Zero-shot threshold for labeling as suicidal (after heuristic boost).
# A boosted probability >= this value is labeled 1 (suicidal).
ZS_SUICIDAL_THRESHOLD = 0.50

# Column detection candidates for CSV, tried in this order against the
# lower-cased column names of the input file; the first match wins.
TEXT_CANDS = [
    "text", "message", "content", "body", "post", "comment", "clean_text",
    "utterance", "selftext", "title"
]

# ------------------- SAFETY / ETHICS -------------------
# If someone may be in danger or expressing intent to self-harm,
# contact local emergency services or appropriate crisis hotlines immediately.

# ------------------- OPTIONAL: LOAD TRAINED MODEL -------------------
from joblib import load as joblib_load
from sklearn.pipeline import Pipeline

def load_trained_model_if_available() -> Optional[Pipeline]:
    """Return the model stored at MODEL_PATH, or None when it cannot be used.

    A short status message is printed in every case: model loaded, model
    file absent, or loading failed (the error is reported, not raised).

    Returns:
        The object deserialized by joblib from MODEL_PATH, or None when the
        file does not exist or deserialization raised an exception.
    """
    # NOTE(review): joblib deserialization executes pickled code; only load
    # model files from a trusted location.
    if not os.path.exists(MODEL_PATH):
        print("[INFO] No trained model found. Using zero-shot fallback.")
        return None

    try:
        model = joblib_load(MODEL_PATH)
        print("[INFO] Loaded trained model:", MODEL_PATH)
        return model
    except Exception as err:
        # Best effort: a broken model file must not abort inference.
        print("[WARN] Could not load model; falling back to zero-shot. Error:", err)
        return None

# ------------------- ZERO-SHOT FALLBACK -------------------
# Uses Hugging Face pipeline (facebook/bart-large-mnli).
# FutureWarning messages are silenced globally from this point on.
warnings.filterwarnings("ignore", category=FutureWarning)
# Module-level cache for the zero-shot classifier; stays None until
# ensure_zero_shot() creates the pipeline on first use.
_zs_pipeline = None

def ensure_zero_shot():
    """Create the module-level zero-shot pipeline if it does not exist yet.

    The pipeline is built once and stored in ``_zs_pipeline``; later calls
    return immediately. ``transformers`` is imported lazily so the rest of
    the script works without it when a trained model is available.

    Raises:
        RuntimeError: if importing ``transformers`` fails.
    """
    global _zs_pipeline
    if _zs_pipeline is not None:
        return

    try:
        from transformers import pipeline as hf_pipeline
    except Exception as err:
        raise RuntimeError(
            "Transformers not installed. Install with: pip install transformers"
        ) from err

    # device=-1 is passed through to the pipeline factory unchanged.
    _zs_pipeline = hf_pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=-1,
    )

def zero_shot_score(texts: List[str]) -> np.ndarray:
    """Score each text with the zero-shot classifier.

    Every text is classified against the two candidate labels
    "suicidal thoughts" and "no suicidal thoughts"; the score reported for
    "suicidal thoughts" is kept.

    Args:
        texts: Texts to score, one classifier call per text.

    Returns:
        Float array with one score per input text, in input order. A text
        whose result does not list the target label gets 0.0.
    """
    ensure_zero_shot()
    target = "suicidal thoughts"
    candidate_labels = [target, "no suicidal thoughts"]

    scores = []
    for text in texts:
        result = _zs_pipeline(
            text, candidate_labels, hypothesis_template="This text expresses {label}."
        )
        # The result pairs labels with scores; pick the target label's score.
        paired = zip(result["labels"], result["scores"])
        scores.append(
            next((float(score) for name, score in paired if name == target), 0.0)
        )
    return np.array(scores, dtype=float)

# ------------------- HEURISTIC BOOST + HIGHLIGHTS -------------------
# Maps a tag name to the regular expression that detects it.
# highlight_spans() applies each pattern case-insensitively and reports the
# tag with every match; heuristic_boost() adds 0.20 when "self_harm" or
# "method_access" is present and 0.10 when "ideation_soft" or "immediacy" is.
# NOTE(review): the patterns only contain the ASCII apostrophe, so a
# typographic apostrophe (as in "can’t go on") will not match.
CRISIS_PATTERNS = {
    "self_harm": r"\b(kill myself|suicide|end it|cutting|self[- ]?harm|i want to die)\b",
    "ideation_soft": r"\b(disappear|tired of everything|no way out|give up|can't go on)\b",
    "method_access": r"\b(rope|pills|overdose|blade|train tracks)\b",
    "immediacy": r"\b(today|tonight|now|right now|immediately|this is the end)\b"
}

def highlight_spans(text: str) -> List[Tuple[int,int,str]]:
    """Locate every CRISIS_PATTERNS match in ``text``.

    Args:
        text: The message to scan (matching is case-insensitive).

    Returns:
        A list of ``(start, end, tag)`` tuples, grouped by tag in the
        iteration order of CRISIS_PATTERNS and, within a tag, in order of
        appearance. ``start``/``end`` are the match offsets in ``text``.
    """
    return [
        (found.start(), found.end(), tag)
        for tag, pattern in CRISIS_PATTERNS.items()
        for found in re.finditer(pattern, text, flags=re.I)
    ]

def heuristic_boost(p_suicidal: float, spans: List[Tuple[int,int,str]]) -> float:
    """Raise a base probability according to the tags found in ``spans``.

    Args:
        p_suicidal: Base probability before any adjustment.
        spans: Highlight tuples whose last element is the tag name.

    Returns:
        The adjusted value clamped to [0, 1]: +0.20 if a "self_harm" or
        "method_access" tag is present, and +0.10 if an "ideation_soft" or
        "immediacy" tag is present. Each bonus is applied at most once.
    """
    present = {span[-1] for span in spans}

    score = p_suicidal
    if present & {"self_harm", "method_access"}:
        score += 0.20
    if present & {"ideation_soft", "immediacy"}:
        score += 0.10

    # Clamp so stacked bonuses never leave the probability range.
    upper_bounded = min(1.0, score)
    return max(0.0, upper_bounded)

# ------------------- I/O HELPERS -------------------
def load_texts_from_file(path: str) -> List[str]:
    """Read the messages to analyze from a .txt, .json or .csv file.

    The format is chosen from the file extension (case-insensitive):

    * ``.txt``  - one message per line; blank lines are skipped and
      surrounding whitespace is stripped. Undecodable bytes are ignored.
    * ``.json`` - either a list, or an object with a ``"messages"`` entry.
      An item that is a dict with a ``"text"`` key contributes that value
      as-is; any other item contributes ``str(item)``.
    * ``.csv``  - the first TEXT_CANDS name that matches a column
      (case-insensitively) is used; a single-column file uses that column.
      Missing cells and cells that are empty after stripping are dropped.

    Args:
        path: Path of the input file.

    Returns:
        The extracted texts in file order.

    Raises:
        ValueError: for an unsupported extension, an unsupported JSON
            layout, or a multi-column CSV without a recognizable text column.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return [ln.strip() for ln in f if ln.strip()]

    if ext == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict) and "messages" in data:
            data = data["messages"]
        elif not isinstance(data, list):
            raise ValueError("Unsupported JSON schema. Use list or {'messages': [{'text': ...}, ...]}")
        return [
            item["text"] if isinstance(item, dict) and "text" in item else str(item)
            for item in data
        ]

    if ext == ".csv":
        df = pd.read_csv(path)
        # str() guards against non-string column labels.
        by_lower = {str(c).lower(): c for c in df.columns}
        text_col = next((by_lower[c] for c in TEXT_CANDS if c in by_lower), None)
        if text_col is None:
            if df.shape[1] != 1:
                raise ValueError("CSV must contain a text column (e.g., 'text').")
            text_col = df.columns[0]
        # Drop missing cells BEFORE converting to str: astype(str) would turn
        # NaN into the literal text "nan", which would then be analyzed as a
        # real message.
        cleaned = df[text_col].dropna().astype(str).str.strip()
        return [t for t in cleaned.tolist() if t]

    raise ValueError("Supported input files: .txt, .csv, .json")

def save_predictions(df: pd.DataFrame, base_name: str = "mindshield_suicide_pred"):
    """Write the predictions to ``<OUT_DIR>/<base_name>.csv`` and ``.json``.

    The CSV holds the DataFrame as-is (without the index). The JSON holds a
    UTC creation timestamp, the rows as records, and a summary with the
    total row count and the number of rows per predicted label.

    Args:
        df: Prediction table; must contain a ``pred_label`` column where
            0 means non-suicidal and 1 means suicidal.
        base_name: File name (without extension) used for both outputs.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    os.makedirs(OUT_DIR, exist_ok=True)
    csv_path  = os.path.join(OUT_DIR, f"{base_name}.csv")
    json_path = os.path.join(OUT_DIR, f"{base_name}.json")
    df.to_csv(csv_path, index=False, encoding="utf-8")

    # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC
    # timestamp and keep the previous "...Z" text format.
    created_utc = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Build the payload before opening the file so a failure here does not
    # leave an empty/truncated JSON file behind.
    payload = {
        "created_utc": created_utc,
        "items": df.to_dict(orient="records"),
        "summary": {
            "total": int(len(df)),
            "pred_counts": {
                "non-suicidal": int((df["pred_label"]==0).sum()),
                "suicidal": int((df["pred_label"]==1).sum())
            }
        }
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    print("[OUT] Saved CSV:", csv_path)
    print("[OUT] Saved JSON:", json_path)
# ------------------- CORE PREDICTORS -------------------
def predict_with_trained_model(pipe: Pipeline, texts: List[str]) -> pd.DataFrame:
    """Classify ``texts`` with the trained pipeline.

    Args:
        pipe: Fitted pipeline; its ``predict`` output is cast to int.
        texts: Messages to classify.

    Returns:
        DataFrame with one row per text and the columns ``text``,
        ``pred_label``, ``pred_name``, ``prob_suicidal`` and ``highlights``.
        ``prob_suicidal`` is taken from column 1 of ``predict_proba`` when
        the final pipeline step provides it, otherwise it is None.
    """
    labels = pipe.predict(texts).astype(int)

    final_step = pipe[-1]
    if hasattr(final_step, "predict_proba"):
        # NOTE(review): assumes column 1 corresponds to the suicidal class.
        probabilities = pipe.predict_proba(texts)[:, 1]
    else:
        probabilities = np.full(len(labels), np.nan, dtype=float)

    def build_row(text, label, probability):
        # Non-finite probabilities (the NaN placeholder) are reported as None.
        return {
            "text": text,
            "pred_label": int(label),
            "pred_name": "suicidal" if label == 1 else "non-suicidal",
            "prob_suicidal": float(probability) if np.isfinite(probability) else None,
            "highlights": highlight_spans(text),
        }

    return pd.DataFrame(
        [build_row(t, y, p) for t, y, p in zip(texts, labels, probabilities)]
    )

def predict_zero_shot(texts: List[str]) -> pd.DataFrame:
    """Classify ``texts`` with the zero-shot model plus the keyword boost.

    Each zero-shot score is adjusted by heuristic_boost() using the text's
    highlight spans; a boosted score >= ZS_SUICIDAL_THRESHOLD yields label 1.

    Args:
        texts: Messages to classify.

    Returns:
        DataFrame with one row per text and the columns ``text``,
        ``pred_label``, ``pred_name``, ``prob_suicidal`` (boosted score
        rounded to 4 decimals) and ``highlights``.
    """
    zero_shot_probs = zero_shot_score(texts)     # p(suicidal) from zero-shot

    def build_row(text, base_prob):
        spans = highlight_spans(text)
        score = heuristic_boost(base_prob, spans)
        flagged = score >= ZS_SUICIDAL_THRESHOLD
        return {
            "text": text,
            "pred_label": 1 if flagged else 0,
            "pred_name": "suicidal" if flagged else "non-suicidal",
            "prob_suicidal": round(float(score), 4),
            "highlights": spans,
        }

    return pd.DataFrame([build_row(t, p) for t, p in zip(texts, zero_shot_probs)])

def identify_suicidal_thoughts(texts: List[str]) -> pd.DataFrame:
    """Main entrypoint: predict a 0/1 label and a probability for each text.

    The trained model is used when load_trained_model_if_available() returns
    one; otherwise the zero-shot classifier is used.

    Args:
        texts: Messages to classify.

    Returns:
        The DataFrame produced by the selected predictor.
    """
    model = load_trained_model_if_available()
    if model is None:
        return predict_zero_shot(texts)
    return predict_with_trained_model(model, texts)

# ------------------- MAIN -------------------
def main():
    """Run inference on INFER_SOURCE (or SAMPLE_TEXTS) and save the results.

    Prints up to ten predictions and writes the full table through
    save_predictions(). Returns early, after a warning, when there is
    nothing to analyze.

    Raises:
        FileNotFoundError: if INFER_SOURCE is set but does not exist.
    """
    # Collect the input texts.
    if INFER_SOURCE is not None:
        if not os.path.exists(INFER_SOURCE):
            raise FileNotFoundError(INFER_SOURCE)
        texts = load_texts_from_file(INFER_SOURCE)
        print(f"[INFO] Loaded {len(texts)} texts from:", INFER_SOURCE)
    else:
        texts = [sample for sample in SAMPLE_TEXTS if sample.strip()]
        print(f"[INFO] Using SAMPLE_TEXTS ({len(texts)} items). Set INFER_SOURCE to run on a file.")

    if not texts:
        print("[WARN] No texts to analyze.")
        return

    # Classify.
    results = identify_suicidal_thoughts(texts)

    # Show a short preview; each text is cut to 120 characters.
    print("\n=== Predictions (first 10) ===")
    preview = results.head(10)
    for _, row in preview.iterrows():
        print(f"[{row['pred_name']}] p={row['prob_suicidal']}: {row['text'][:120]}")

    # Persist the complete table.
    save_predictions(results)

# Run the inference flow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


[INFO] Using SAMPLE_TEXTS (3 items). Set INFER_SOURCE to run on a file.
[INFO] Loaded trained model: C:\Users\sagni\Downloads\Suicidal Detection\ms_model.joblib

=== Predictions (first 10) ===
[non-suicidal] p=0.2104050537994849: I feel okay lately, just a bit stressed about exams.
[non-suicidal] p=0.4390754524264109: Sometimes I just want to disappear.
[non-suicidal] p=0.3667858837171598: I am exhausted but I will be fine after some rest.
[OUT] Saved CSV: C:\Users\sagni\Downloads\Suicidal Detection\mindshield_suicide_pred.csv
[OUT] Saved JSON: C:\Users\sagni\Downloads\Suicidal Detection\mindshield_suicide_pred.json
