In [1]:
import os, re, json, warnings
from datetime import datetime
from typing import Any, Optional, Tuple, Dict, List

import numpy as np
import pandas as pd
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report
)

# Hide FutureWarning messages only; every other warning category is still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

# ------------------- USER PATHS -------------------
# Machine-specific absolute paths; edit before running on another machine.
CSV_PATH   = r"C:\Users\sagni\Downloads\Suicidal Detection\archive\suicidal_ideation_reddit_annotated.csv"
OUT_DIR    = r"C:\Users\sagni\Downloads\Suicidal Detection"
# Where the fitted pipeline is dumped; main() retrains only when this file is absent.
MODEL_PATH = os.path.join(OUT_DIR, "ms_model.joblib")

# If None, we evaluate on the TEST split created from the training CSV.
# Otherwise, set to a path of .txt / .csv / .json to run predictions on that file.
INFER_SOURCE = None
# Examples:
# INFER_SOURCE = r"C:\Users\sagni\Downloads\Suicidal Detection\some_new_texts.txt"
# INFER_SOURCE = r"C:\Users\sagni\Downloads\Suicidal Detection\new_messages.csv"
# INFER_SOURCE = r"C:\Users\sagni\Downloads\Suicidal Detection\messages.json"
# --------------------------------------------------

# Seed passed to both train_test_split calls and to the classifier.
RANDOM_STATE = 42
SPLIT_FRAC = (0.70, 0.15, 0.15)  # train/valid/test

# Lower-cased column names tried in this order when looking for the text
# column; the first one present in the CSV wins.
TEXT_CANDS = [
    "text", "message", "content", "body", "post", "comment", "clean_text", "utterance",
    "selftext", "title"
]
# Lower-cased column names tried in this order when looking for the label column.
LABEL_CANDS = ["label", "class", "target", "is_suicidal", "suicidal", "suicide", "risk", "y"]

# Label strings (compared after strip + lower) mapped to class 1 / class 0.
POS_TOKENS = {"1","true","t","yes","y","suicidal","suicide","positive","pos","high","at risk"}
NEG_TOKENS = {"0","false","f","no","n","non-suicidal","non suicidal","negative","neg","low","not at risk"}

# ------------------- UTILITIES -------------------
def ensure_out() -> None:
    """Create OUT_DIR (including missing parents); do nothing if it already exists."""
    os.makedirs(OUT_DIR, exist_ok=True)

def load_csv(path: str) -> pd.DataFrame:
    """Read ``path`` as CSV, retrying with Latin-1 when the default decode fails.

    Raises:
        FileNotFoundError: if ``path`` does not exist (raised with the path
            as its only argument).
    """
    if os.path.exists(path):
        try:
            frame = pd.read_csv(path)
        except UnicodeDecodeError:
            # Second attempt with an explicit single-byte encoding.
            frame = pd.read_csv(path, encoding="latin-1")
        return frame
    raise FileNotFoundError(path)

def normalize_labels(series: pd.Series) -> pd.Series:
    """Map raw label values to integer classes 0 / 1, dropping unmappable rows.

    Numeric values are accepted only when they are exactly 0 or 1. Strings are
    stripped and lower-cased, then matched against POS_TOKENS / NEG_TOKENS, a
    few regex spellings, and finally parsed as a number that must again be
    exactly 0 or 1.

    Args:
        series: Raw label column.

    Returns:
        An int Series holding only the rows that could be mapped; it keeps the
        original index labels of those rows, so it may be shorter than the input.

    Raises:
        ValueError: if no value at all could be mapped to 0 or 1.
    """
    def to01(x: Any) -> Optional[int]:
        if pd.isna(x):
            return None
        if isinstance(x, (int, np.integer, float, np.floating)):
            # Exact match only: truncating with int() would turn 0.7 or -0.4
            # into class 0, and int(inf) raises OverflowError.
            return int(x) if x in (0, 1) else None
        xs = str(x).strip().lower()
        if xs in POS_TOKENS:
            return 1
        if xs in NEG_TOKENS:
            return 0
        if re.fullmatch(r"(non[-\s]?suicidal|no risk|not suicidal)", xs):
            return 0
        if re.fullmatch(r"(suicidal|at[-\s]?risk|high risk|suicide)", xs):
            return 1
        try:
            parsed = float(xs)
        except ValueError:
            return None
        # "1.0" is fine, "0.5" is not a class label.
        return int(parsed) if parsed in (0.0, 1.0) else None

    mapped = series.map(to01)
    mask = mapped.isin([0, 1])
    if mask.sum() == 0:
        raise ValueError("Could not map labels to {0,1}. Ensure labels indicate suicidal vs non.")
    return mapped[mask].astype(int)

def detect_train_columns(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series, str, str]:
    """Locate the text and label columns of a training frame and return them aligned.

    Text column choice, in order:
      1. the first TEXT_CANDS name present (case-insensitive), except that
      2. when the best match is only "title"/"selftext" (or nothing matched)
         and both columns exist, the two are concatenated with a space;
      3. otherwise the first object-dtype column, preferring one whose name is
         not a LABEL_CANDS entry.

    Label column choice: the first LABEL_CANDS name present, else the first
    non-text column with at most 6 distinct values of which at least one is a
    known POS/NEG token.

    Args:
        df: Raw training data.

    Returns:
        ``(text, labels, text_column_name, label_column_name)``. ``text`` and
        ``labels`` share the same index: rows with blank/missing text or an
        unmappable label are removed from both. ``text_column_name`` is
        ``"title+selftext"`` when the two columns were concatenated.

    Raises:
        ValueError: if no text column or no label column can be found, or if
            normalize_labels cannot map any label.
    """
    cols_lower = {str(c).lower(): c for c in df.columns}

    # ---- Text ----
    matched = next((c for c in TEXT_CANDS if c in cols_lower), None)
    has_title_and_body = "title" in cols_lower and "selftext" in cols_lower
    text_col = None
    if has_title_and_body and matched in (None, "title", "selftext"):
        # Both names are in TEXT_CANDS, so testing "nothing matched" alone
        # would never reach this branch; combine whenever no more specific
        # candidate column was found.
        title_col = cols_lower["title"]
        st_col = cols_lower["selftext"]
        text_sources = {title_col, st_col}
        text = df[title_col].fillna("").astype(str) + " " + df[st_col].fillna("").astype(str)
    else:
        if matched is not None:
            text_col = cols_lower[matched]
        else:
            obj_cols = [c for c in df.columns if df[c].dtype == object]
            if not obj_cols:
                raise ValueError("No obvious text column. Rename to 'text'.")
            # Avoid picking a string-valued label column as the text.
            non_label = [c for c in obj_cols if str(c).lower() not in LABEL_CANDS]
            text_col = (non_label or obj_cols)[0]
        text_sources = {text_col}
        # Fill missing values BEFORE casting: astype(str) turns NaN into "nan".
        text = df[text_col].fillna("").astype(str)

    text = text.str.replace(r"\s+", " ", regex=True).str.strip()
    keep = text != ""
    text = text[keep]
    df = df.loc[keep]

    # ---- Label ----
    label_col = next((cols_lower[c] for c in LABEL_CANDS if c in cols_lower), None)
    if label_col is None:
        known_tokens = list(POS_TOKENS | NEG_TOKENS)
        for c in df.columns:
            if c in text_sources:
                continue
            uniq = pd.Series(df[c].dropna().unique()).astype(str).str.lower().str.strip()
            if len(uniq) <= 6 and uniq.isin(known_tokens).any():
                label_col = c
                break
    if label_col is None:
        raise ValueError("No label column found. Rename to 'label' or add to LABEL_CANDS.")

    labels = normalize_labels(df[label_col])
    # normalize_labels drops unmappable rows, so shrink the text to the rows
    # that still have a label (indexing labels by text.index would raise
    # KeyError for every dropped row).
    text = text.loc[labels.index]
    return text, labels, (text_col if text_col is not None else "title+selftext"), label_col

def make_splits(text: pd.Series, y: pd.Series):
    """Build stratified train / valid / test frames from aligned text and labels.

    Rows with a missing text or label are dropped first. The split is done in
    two stages driven by SPLIT_FRAC: train vs. the rest, then the rest into
    valid vs. test. Both stages shuffle with RANDOM_STATE and stratify on the
    label.

    Returns:
        ``(train_df, valid_df, test_df)``, each with columns ``text`` and
        ``label`` and a fresh 0..n-1 index.
    """
    frame = pd.DataFrame({"text": text, "label": y}).dropna()

    # Stage 1: everything that is not training data goes to the holdout.
    train_part, holdout = train_test_split(
        frame,
        test_size=(1.0 - SPLIT_FRAC[0]),
        stratify=frame["label"],
        random_state=RANDOM_STATE,
        shuffle=True,
    )

    # Stage 2: share of the holdout that becomes the validation set.
    valid_size = SPLIT_FRAC[1] / (SPLIT_FRAC[1] + SPLIT_FRAC[2])
    valid_part, test_part = train_test_split(
        holdout,
        test_size=(1.0 - valid_size),
        stratify=holdout["label"],
        random_state=RANDOM_STATE,
        shuffle=True,
    )

    train_part = train_part.reset_index(drop=True)
    valid_part = valid_part.reset_index(drop=True)
    test_part = test_part.reset_index(drop=True)
    return train_part, valid_part, test_part

def build_pipeline() -> Pipeline:
    """Return an unfitted TF-IDF -> logistic-regression pipeline.

    The steps are named ``"tfidf"`` and ``"clf"``.
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),       # unigrams + bigrams
        min_df=2,                 # a term must appear in at least 2 documents
        max_features=200000,      # upper bound on vocabulary size
        strip_accents="unicode",
    )
    classifier = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=1.0,
        class_weight="balanced",  # reweight classes to offset imbalance
        max_iter=300,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    return Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

# ------------------- TRAIN + SAVE -------------------
def train_and_save_model() -> Dict[str, Any]:
    """Train on CSV_PATH, persist the model, and record test-split metrics.

    Side effects (all under OUT_DIR): dumps the fitted pipeline to MODEL_PATH,
    writes ``ms_classification_report.txt`` and writes ``ms_infer_summary.json``
    (replacing any existing file of that name).

    Returns:
        Dict with keys ``"pipe"`` (fitted pipeline), ``"test_df"`` (held-out
        split) and ``"summary"`` (the dict written to the summary JSON).
    """
    raw = load_csv(CSV_PATH)
    text, y, text_col, label_col = detect_train_columns(raw)
    train_df, valid_df, test_df = make_splits(text, y)

    # The validation rows are folded into the fitting data; only the test
    # split is held out.
    fit_texts = np.concatenate([train_df["text"].values, valid_df["text"].values])
    fit_labels = np.concatenate([train_df["label"].values, valid_df["label"].values])

    pipe = build_pipeline()
    pipe.fit(fit_texts, fit_labels)

    dump(pipe, MODEL_PATH)

    # Reference metrics on the held-out test split.
    y_true = test_df["label"].values
    y_pred = pipe.predict(test_df["text"].values)
    acc = float(accuracy_score(y_true, y_pred))
    f1 = float(f1_score(y_true, y_pred, zero_division=0))
    prec = float(precision_score(y_true, y_pred, zero_division=0))
    rec = float(recall_score(y_true, y_pred, zero_division=0))

    # Human-readable per-class report.
    report_text = classification_report(
        y_true,
        y_pred,
        target_names=["Non-suicidal (0)", "Suicidal (1)"],
        zero_division=0,
    )
    report_path = os.path.join(OUT_DIR, "ms_classification_report.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    split_sizes = {
        "train": int(len(train_df)),
        "valid": int(len(valid_df)),
        "test": int(len(test_df)),
    }
    test_metrics = {
        "accuracy": round(acc, 4),
        "f1": round(f1, 4),
        "precision": round(prec, 4),
        "recall": round(rec, 4),
    }
    summary = {
        "created_utc": datetime.utcnow().isoformat() + "Z",
        "source_csv": CSV_PATH,
        "text_column_used": text_col,
        "label_column_used": label_col,
        "splits_sizes": split_sizes,
        "test_metrics": test_metrics,
        "model_path": MODEL_PATH,
    }
    summary_path = os.path.join(OUT_DIR, "ms_infer_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("[TRAIN] Saved model to:", MODEL_PATH)
    print("[TRAIN] Test metrics:", summary["test_metrics"])
    return {"pipe": pipe, "test_df": test_df, "summary": summary}

# ------------------- INFERENCE HELPERS -------------------
def load_model() -> Pipeline:
    """Load the pipeline stored at MODEL_PATH.

    Raises:
        FileNotFoundError: if MODEL_PATH does not exist.
    """
    if os.path.exists(MODEL_PATH):
        return load(MODEL_PATH)
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Run training first.")

def _infer_from_texts(pipe: Pipeline, texts: List[str]) -> pd.DataFrame:
    preds = pipe.predict(texts)
    if hasattr(pipe[-1], "predict_proba"):
        prob1 = pipe.predict_proba(texts)[:, 1]
    else:
        prob1 = np.full(len(preds), np.nan, dtype=float)

    df = pd.DataFrame({
        "text": texts,
        "pred_label": preds.astype(int),
        "prob_suicidal": prob1
    })
    df["pred_name"] = np.where(df["pred_label"] == 1, "suicidal", "non-suicidal")
    return df

def _texts_from_json(data: Any) -> List[str]:
    """Extract non-blank text strings from a decoded JSON inference payload."""
    # accept {"messages":[{"text": ...}, ...]} or list[str]/list[dict{text:...}]
    if isinstance(data, dict) and "messages" in data:
        data = data["messages"]
    if not isinstance(data, list):
        raise ValueError("Unsupported JSON schema for inference.")

    texts = []
    for item in data:
        value = item["text"] if isinstance(item, dict) and "text" in item else item
        if value is None:
            # A JSON null carries no text; str(None) would score the word "None".
            continue
        cleaned = str(value).strip()
        if cleaned:
            texts.append(cleaned)
    return texts


def infer_from_file(pipe: Pipeline, path: str) -> pd.DataFrame:
    """Predict on every text found in a ``.txt``, ``.json`` or ``.csv`` file.

    * ``.txt``: one text per non-blank line.
    * ``.json``: ``{"messages": [...]}`` or a top-level list; each element is
      either a string or an object with a ``"text"`` key.
    * ``.csv``: the first TEXT_CANDS column present (case-insensitive), or
      the only column of a single-column file.

    In all three formats texts are stripped and blank or missing entries are
    skipped.

    Args:
        pipe: Fitted pipeline passed through to ``_infer_from_texts``.
        path: File to read; the format is chosen from its extension.

    Returns:
        The prediction frame produced by ``_infer_from_texts``.

    Raises:
        ValueError: for an unsupported extension, an unsupported JSON shape,
            or a multi-column CSV without a recognisable text column.
        FileNotFoundError: if the file does not exist.
    """
    p = str(path)
    ext = os.path.splitext(p)[1].lower()

    if ext == ".txt":
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            lines = [ln.strip() for ln in f if ln.strip()]
        return _infer_from_texts(pipe, lines)

    if ext == ".json":
        with open(p, "r", encoding="utf-8") as f:
            data = json.load(f)
        return _infer_from_texts(pipe, _texts_from_json(data))

    if ext == ".csv":
        # Same reader as training, so the Latin-1 fallback applies here too.
        df = load_csv(p)
        lower = {str(c).lower(): c for c in df.columns}
        text_col = next((lower[c] for c in TEXT_CANDS if c in lower), None)
        if text_col is None:
            # Single-column CSV fallback
            if df.shape[1] == 1:
                text_col = df.columns[0]
            else:
                raise ValueError("CSV must contain a text column (e.g., 'text').")
        # Fill missing cells BEFORE casting: astype(str) turns NaN into "nan",
        # which would then be scored as a real message.
        texts = df[text_col].fillna("").astype(str).str.strip().tolist()
        texts = [t for t in texts if t]
        return _infer_from_texts(pipe, texts)

    raise ValueError("Supported inference files: .txt, .csv, .json")

def save_predictions(df: pd.DataFrame, base_name: str = "ms_predictions"):
    """Write ``df`` to ``<base_name>.csv`` and ``<base_name>.json`` in OUT_DIR.

    The JSON file holds a creation timestamp, every row as a record, and a
    summary with the row total and the count of each predicted label. ``df``
    must contain a ``pred_label`` column. Both output paths are printed.
    """
    csv_path = os.path.join(OUT_DIR, f"{base_name}.csv")
    json_path = os.path.join(OUT_DIR, f"{base_name}.json")

    df.to_csv(csv_path, index=False, encoding="utf-8")

    with open(json_path, "w", encoding="utf-8") as f:
        stamp = datetime.utcnow().isoformat() + "Z"
        records = df.to_dict(orient="records")
        labels = df["pred_label"]
        counts = {
            "non-suicidal": int((labels == 0).sum()),
            "suicidal": int((labels == 1).sum()),
        }
        payload = {
            "created_utc": stamp,
            "items": records,
            "summary": {"total": int(len(df)), "pred_counts": counts},
        }
        json.dump(payload, f, ensure_ascii=False, indent=2)

    print("[INFER] Saved predictions:")
    print("  CSV :", csv_path)
    print("  JSON:", json_path)

# ------------------- MAIN FLOW -------------------
def _load_test_split() -> pd.DataFrame:
    """Rebuild the test split from CSV_PATH (same seed, so same rows as training)."""
    df = load_csv(CSV_PATH)
    text, y, *_ = detect_train_columns(df)
    _, _, test_df = make_splits(text, y)
    return test_df


def _read_summary(summary_path: str) -> Dict[str, Any]:
    """Best-effort load of the summary JSON; return {} if absent or unusable."""
    if not os.path.exists(summary_path):
        return {}
    try:
        with open(summary_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print("[WARN] Ignoring unreadable summary file:", err)
        return {}
    # Anything other than a JSON object cannot take the "inference_runs" key.
    return loaded if isinstance(loaded, dict) else {}


def main():
    """Train (or load) the model, then evaluate on the test split or score a file.

    * If MODEL_PATH is missing the model is trained and saved first.
    * If INFER_SOURCE is None, predictions and metrics for the test split are
      saved (``ms_predictions.*``) and a run entry is appended to
      ``ms_infer_summary.json``.
    * Otherwise predictions for INFER_SOURCE are saved as
      ``ms_predictions_external.*``; the training CSV is not read when a saved
      model already exists.
    """
    ensure_out()

    # Train model if missing; else load
    test_df = None
    if not os.path.exists(MODEL_PATH):
        train_out = train_and_save_model()
        pipe = train_out["pipe"]
        test_df = train_out["test_df"]
    else:
        print("[INFO] Loading existing model:", MODEL_PATH)
        pipe = load_model()

    if INFER_SOURCE is not None:
        # Predict on external file
        infer_df = infer_from_file(pipe, INFER_SOURCE)
        save_predictions(infer_df, base_name="ms_predictions_external")
        print(f"[INFER] Ran predictions on: {INFER_SOURCE}")
        print(infer_df.head(10))
        return

    # Evaluate on TEST split and save predictions. The split is only rebuilt
    # here, so external inference above never needs the training CSV.
    if test_df is None:
        test_df = _load_test_split()

    infer_df = _infer_from_texts(pipe, test_df["text"].tolist())
    infer_df.insert(1, "y_true", test_df["label"].values)

    y_true = test_df["label"].values
    y_pred = infer_df["pred_label"].values
    acc  = float(accuracy_score(y_true, y_pred))
    f1   = float(f1_score(y_true, y_pred, zero_division=0))
    prec = float(precision_score(y_true, y_pred, zero_division=0))
    rec  = float(recall_score(y_true, y_pred, zero_division=0))

    # Append metrics to summary file
    summary_path = os.path.join(OUT_DIR, "ms_infer_summary.json")
    base_summary = _read_summary(summary_path)
    runs = base_summary.get("inference_runs")
    if not isinstance(runs, list):
        runs = []
        base_summary["inference_runs"] = runs
    runs.append({
        "run_utc": datetime.utcnow().isoformat() + "Z",
        "source": "TEST_SPLIT",
        "count": int(len(infer_df)),
        "metrics": {
            "accuracy": round(acc, 4), "f1": round(f1, 4),
            "precision": round(prec, 4), "recall": round(rec, 4)
        }
    })
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(base_summary, f, ensure_ascii=False, indent=2)

    save_predictions(infer_df, base_name="ms_predictions")
    print("[TEST] Metrics:", {"accuracy": acc, "f1": f1, "precision": prec, "recall": rec})

if __name__ == "__main__":
    main()


[TRAIN] Saved model to: C:\Users\sagni\Downloads\Suicidal Detection\ms_model.joblib
[TRAIN] Test metrics: {'accuracy': 0.8662, 'f1': 0.8713, 'precision': 0.8758, 'recall': 0.8669}
[INFER] Saved predictions:
  CSV : C:\Users\sagni\Downloads\Suicidal Detection\ms_predictions.csv
  JSON: C:\Users\sagni\Downloads\Suicidal Detection\ms_predictions.json
[TEST] Metrics: {'accuracy': 0.866245392311743, 'f1': 0.8713272543059777, 'precision': 0.8757637474541752, 'recall': 0.8669354838709677}
