In [1]:
import os, re, json, warnings
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# ------------------- CONFIG -------------------
# Working folder: the trained artifacts are read from here and the
# prediction files are written here.
OUT_DIR = r"C:\Users\sagni\Downloads\Tox Watch Hinglish"
MODEL_PATH = os.path.join(OUT_DIR, "twh_model.joblib")
LABELMAP_PATH = os.path.join(OUT_DIR, "twh_label_map.json")

# File to run inference on; leave as None to score SAMPLE_TEXTS instead.
# e.g., r"C:\Users\sagni\Downloads\Tox Watch Hinglish\new_comments.csv"
INFER_SOURCE = None

# Demo inputs, used only when INFER_SOURCE is None.
SAMPLE_TEXTS = [
    "Aaj ka match mast tha, sab chill!",
    "I hate that community, they should go back.",
    "Yaar calm down, arguments se kuch solve nahi hota.",
]

# Decision cut-off for the zero-shot path, applied after the heuristic boost.
ZS_HATE_THRESHOLD = 0.50

# Fragments that identify a hateful class name in the label map
# (case-insensitive substring match). Extend if your dataset uses
# different wording.
HATE_NAME_HINTS = {"hate", "hatespeech", "hs"}

# CSV column names tried, in this order, when looking for the text column.
TEXT_CANDS = [
    "text",
    "tweet",
    "sentence",
    "content",
    "message",
    "post",
    "comment",
    "clean_text",
    "utterance",
    "selftext",
    "title",
]

# ------------------- OPTIONAL: LOAD TRAINED MODEL -------------------
from joblib import load as joblib_load
from sklearn.pipeline import Pipeline

def load_trained_model_if_available() -> Tuple[Optional[Pipeline], Optional[Dict[int,str]]]:
    """
    Try to load the saved classifier and its label map from disk.

    Returns:
        (pipe, id2label) when both MODEL_PATH and LABELMAP_PATH exist and load
        cleanly; id2label has its keys converted to int. Otherwise (None, None),
        after printing why the zero-shot fallback will be used.
    """
    artifacts_present = os.path.exists(MODEL_PATH) and os.path.exists(LABELMAP_PATH)
    if not artifacts_present:
        print("[INFO] No saved model found; using zero-shot fallback.")
        return None, None

    try:
        # NOTE(security): joblib unpickles the file, which can execute arbitrary
        # code. Only point MODEL_PATH at artifacts you produced or trust.
        model = joblib_load(MODEL_PATH)
        with open(LABELMAP_PATH, "r", encoding="utf-8") as fh:
            label_maps = json.load(fh)
        # JSON object keys are always strings; convert them back to int ids.
        id2label = {}
        for raw_id, label_name in label_maps.get("id2label", {}).items():
            id2label[int(raw_id)] = label_name
        print("[INFO] Loaded trained model & labels.")
        return model, id2label
    except Exception as e:
        # Deliberately best-effort: any load problem degrades to zero-shot.
        print("[WARN] Could not load model/labels; falling back to zero-shot.", e)
        return None, None

# ------------------- ZERO-SHOT FALLBACK (multilingual) -------------------
# Silence FutureWarning messages. The filter is installed when this line runs
# and applies to all code executed afterwards in this interpreter.
# NOTE(review): presumably aimed at noise from the ML libraries - confirm.
warnings.filterwarnings("ignore", category=FutureWarning)
# Module-level slot for the zero-shot pipeline; stays None until
# ensure_zero_shot() fills it on first use.
_zs_pipeline = None

def ensure_zero_shot():
    """
    Build the shared zero-shot classification pipeline on first use.

    Later calls return immediately once the module-level `_zs_pipeline` is
    set. The model is a multilingual XLM-RoBERTa NLI checkpoint, chosen by the
    author for Hinglish/Indic code-mixed text.

    Raises:
        RuntimeError: if `transformers` cannot be imported.
    """
    global _zs_pipeline
    if _zs_pipeline is not None:
        return  # already initialised

    try:
        from transformers import pipeline as build_pipeline
    except Exception as import_error:
        raise RuntimeError(
            "Transformers not installed. Install with: pip install transformers"
        ) from import_error

    # device=-1 follows the transformers convention for "run on CPU".
    _zs_pipeline = build_pipeline(
        "zero-shot-classification",
        model="joeddav/xlm-roberta-large-xnli",
        device=-1,
    )

def zero_shot_hate_prob(texts: List[str]) -> np.ndarray:
    """
    Score each text with the zero-shot pipeline.

    Each text is classified against the two candidate labels below and the
    score reported for "hateful language" is kept (0.0 if that label is
    missing from the result).

    Returns:
        1-D float array with one score per input text, in input order.
    """
    ensure_zero_shot()
    target_label = "hateful language"
    candidate_labels = [target_label, "not hateful"]

    scores = []
    for text in texts:
        result = _zs_pipeline(
            text, candidate_labels, hypothesis_template="This text contains {label}."
        )
        # The pipeline returns parallel "labels"/"scores" lists; take the first
        # entry whose label is the target.
        labelled_scores = zip(result["labels"], result["scores"])
        score = next(
            (float(value) for label, value in labelled_scores if label == target_label),
            0.0,
        )
        scores.append(score)
    return np.array(scores, dtype=float)

# ------------------- HEURISTICS + HIGHLIGHTS -------------------
# Tag -> regex source. Explicit slurs are intentionally left out of this demo;
# extend the table with a private lexicon if you need them.
HATE_PATTERNS = {
    # Calls for violence or removal.
    "violent_intent": r"\b(kill|wipe\s+(them|those)|exterminate|get rid of|eliminate)\b",
    # Demands that a group leave or be expelled.
    "exclusion": r"\b(go back|kick (them|those) out|throw (them|those) out|send (them|those) back)\b",
    # Sweeping or dehumanizing references to a group.
    "broad_dehumanizing": r"\b(these people|that community|vermin|animals|dirty)\b",
    # Direct statements of hatred.
    "explicit_hate_word": r"\bI hate\b|\bhate (them|those|that)\b",
}

def highlight_spans(text: str, patterns: Optional[Dict[str, str]] = None) -> List[Tuple[int,int,str]]:
    """
    Locate the phrases in `text` that match the heuristic patterns.

    Args:
        text: The message to scan. Empty or None yields no spans.
        patterns: Optional mapping of tag -> regex source to use instead of
            the module-level HATE_PATTERNS (e.g. a private lexicon).

    Returns:
        A list of (start, end, tag) tuples, grouped by pattern in the mapping's
        order and by position within each pattern. Matching ignores case.
        Spans from different patterns may overlap.
    """
    if not text:
        return []
    active = HATE_PATTERNS if patterns is None else patterns

    hits = []
    for tag, pattern in active.items():
        for match in re.finditer(pattern, text, flags=re.I):
            # A zero-width match highlights nothing; skip it so callers never
            # receive an empty span.
            if match.end() > match.start():
                hits.append((match.start(), match.end(), tag))
    return hits

def heuristic_boost(p_hate: float, spans: List[Tuple[int,int,str]]) -> float:
    """
    Raise a base probability according to which heuristic tags were matched.

    Each rule fires at most once, however many spans carry its tag(s):
    "violent_intent" adds 0.25, "exclusion" adds 0.15, and either of
    "broad_dehumanizing" / "explicit_hate_word" adds 0.10.

    Returns:
        The boosted probability, clamped to [0, 1].
    """
    matched_tags = set()
    for *_, tag in spans:
        matched_tags.add(tag)

    # (tags that trigger the rule, amount added when any of them is present)
    rules = (
        (("violent_intent",), 0.25),
        (("exclusion",), 0.15),
        (("broad_dehumanizing", "explicit_hate_word"), 0.10),
    )
    bonus = 0.0
    for triggers, amount in rules:
        if not matched_tags.isdisjoint(triggers):
            bonus += amount

    capped = min(1.0, p_hate + bonus)
    return max(0.0, capped)

# ------------------- FILE LOADING -------------------
def load_texts_from_file(path: str, text_columns: Optional[List[str]] = None) -> List[str]:
    """
    Read the texts to score from a .txt, .json or .csv file.

    Formats (selected by file extension, case-insensitive):
      * .txt  - one text per line; blank lines are dropped.
      * .json - a list, or an object with a "messages" list. Dict entries
        contribute their "text" value; anything else is passed through str().
      * .csv  - the first column whose lower-cased name appears in the
        candidate list; a single-column CSV is accepted whatever its header.

    Args:
        path: File to read.
        text_columns: Optional candidate column names for CSV input, tried in
            order. Defaults to the module-level TEXT_CANDS.

    Returns:
        The texts in file order. Missing CSV cells become "" so that the
        output stays aligned with the CSV rows.

    Raises:
        ValueError: unsupported extension, unsupported JSON layout, or a
            multi-column CSV without a recognisable text column.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return [ln.strip() for ln in f if ln.strip()]

    if ext == ".json":
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict) and "messages" in data:
            msgs = data["messages"]
            return [m["text"] if isinstance(m, dict) and "text" in m else str(m) for m in msgs]
        if isinstance(data, list):
            return [d["text"] if isinstance(d, dict) and "text" in d else str(d) for d in data]
        raise ValueError("Unsupported JSON schema. Use list or {'messages':[{'text':...}]} .")

    if ext == ".csv":
        candidates = TEXT_CANDS if text_columns is None else text_columns
        df = pd.read_csv(path)
        by_lower = {str(c).lower(): c for c in df.columns}
        tcol = next(
            (by_lower[c.lower()] for c in candidates if c.lower() in by_lower),
            None,
        )
        if tcol is None:
            if df.shape[1] != 1:
                raise ValueError("CSV must contain a text-like column (e.g., 'text').")
            tcol = df.columns[0]
        # Fill missing cells BEFORE casting: astype(str) turns NaN into the
        # literal text "nan", after which fillna has nothing left to replace.
        return df[tcol].fillna("").astype(str).str.strip().tolist()

    raise ValueError("Supported inputs: .txt / .csv / .json")

# ------------------- PREDICTION CORE -------------------
# Label-name prefixes that mark a class as the negation of a hateful one
# (e.g. "non-hate", "not_hate"); such names contain a hint but are not hateful.
_NEGATED_LABEL_PREFIXES = ("non", "not", "no-", "no_", "no ")


def _resolve_label(raw: Any, id2label: Dict[int, str]) -> str:
    """Translate a raw classifier output (int id or string label) to a label name."""
    try:
        key = int(raw)
    except (TypeError, ValueError):
        # The classifier emits string labels directly; use them as-is.
        return str(raw)
    return id2label.get(key, str(key))


def _is_hateful_label(name: str) -> bool:
    """Return True when a label name matches HATE_NAME_HINTS and is not negated."""
    lowered = name.strip().lower()
    if lowered.startswith(_NEGATED_LABEL_PREFIXES):
        return False
    return any(hint in lowered for hint in HATE_NAME_HINTS)


def hateful_from_model(pipe: Pipeline, id2label: Dict[int,str], texts: List[str]) -> pd.DataFrame:
    """
    Score texts with the trained pipeline and apply the heuristic boost.

    The base hateful probability of a text is the sum of `predict_proba`
    over every class whose label name counts as hateful. If the final
    estimator has no `predict_proba`, the base is 1.0 when the predicted class
    is hateful and 0.0 otherwise. The base is then passed through
    heuristic_boost() and thresholded at 0.5.

    Args:
        pipe: Fitted sklearn pipeline ending in a classifier.
        id2label: Mapping from integer class id to label name.
        texts: Messages to score.

    Returns:
        DataFrame with one row per text and the columns text, is_hateful (0/1),
        hateful_score (rounded to 4 decimals), model_label and highlights.
    """
    preds = pipe.predict(texts)
    final_step = pipe[-1]
    proba = np.asarray(pipe.predict_proba(texts)) if hasattr(final_step, "predict_proba") else None

    hate_cols: List[int] = []
    if proba is not None:
        # predict_proba columns follow the estimator's classes_ order, which is
        # not necessarily the label id, so map columns through classes_.
        classes = getattr(final_step, "classes_", None)
        if classes is None:
            # No classes_ exposed: fall back to "column j is class id j".
            classes = range(proba.shape[1])
        hate_cols = [
            col for col, cls in enumerate(classes)
            if _is_hateful_label(_resolve_label(cls, id2label))
        ]

    rows = []
    for idx, t in enumerate(texts):
        spans = highlight_spans(t)
        predicted_label = _resolve_label(preds[idx], id2label)
        if proba is not None:
            base = float(proba[idx, hate_cols].sum()) if hate_cols else 0.0
        else:
            # No class probabilities available: approximate from the hard label.
            base = 1.0 if _is_hateful_label(predicted_label) else 0.0
        p_hate = heuristic_boost(base, spans)

        rows.append({
            "text": t,
            "is_hateful": 1 if p_hate >= 0.5 else 0,
            "hateful_score": round(p_hate, 4),
            "model_label": predicted_label,
            "highlights": spans
        })
    return pd.DataFrame(rows)

def hateful_zero_shot(texts: List[str]) -> pd.DataFrame:
    """
    Score texts with the zero-shot model plus the heuristic boost.

    Returns:
        DataFrame with one row per text and the columns text, is_hateful
        (1 when the boosted score reaches ZS_HATE_THRESHOLD, else 0),
        hateful_score (rounded to 4 decimals), model_label (always
        "zero-shot") and highlights.
    """
    def build_row(text, raw_score):
        found = highlight_spans(text)
        score = heuristic_boost(float(raw_score), found)
        return {
            "text": text,
            "is_hateful": int(score >= ZS_HATE_THRESHOLD),
            "hateful_score": round(score, 4),
            "model_label": "zero-shot",
            "highlights": found,
        }

    raw_scores = zero_shot_hate_prob(texts)
    return pd.DataFrame([build_row(text, raw) for text, raw in zip(texts, raw_scores)])

def detect_hateful_language(texts: List[str]) -> pd.DataFrame:
    """
    Score texts with the trained model when it is usable, else zero-shot.

    The trained path is taken only when a pipeline was loaded AND its label
    map is non-empty; every other outcome uses the zero-shot fallback.
    """
    pipe, id2label = load_trained_model_if_available()
    trained_model_usable = pipe is not None and bool(id2label)
    if not trained_model_usable:
        return hateful_zero_shot(texts)
    return hateful_from_model(pipe, id2label, texts)

# ------------------- SAVE -------------------
def save_predictions(df: pd.DataFrame, base_name: str = "twh_hate_pred", out_dir: Optional[str] = None):
    """
    Write the predictions as `<base_name>.csv` and `<base_name>.json`.

    The JSON file holds a UTC creation timestamp, the rows as records, and a
    summary with the total / hateful / not-hateful counts.

    Args:
        df: Prediction frame; must contain an "is_hateful" column.
        base_name: File name stem shared by both output files.
        out_dir: Destination folder (created if missing). Defaults to the
            module-level OUT_DIR.
    """
    # Local import: the file's top-level import only brings in `datetime`.
    from datetime import timezone

    target_dir = OUT_DIR if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)
    csv_path  = os.path.join(target_dir, f"{base_name}.csv")
    json_path = os.path.join(target_dir, f"{base_name}.json")
    df.to_csv(csv_path, index=False, encoding="utf-8")

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the text keeps the original "...Z" form (no "+00:00").
    created = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    flags = df["is_hateful"]
    # Build the payload before opening the file so that a failure here does
    # not leave an empty JSON file behind.
    payload = {
        "created_utc": created,
        "items": df.to_dict(orient="records"),
        "summary": {
            "total": int(len(df)),
            "hateful": int((flags == 1).sum()),
            "not_hateful": int((flags == 0).sum())
        }
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

    print("[OUT] Saved CSV:", csv_path)
    print("[OUT] Saved JSON:", json_path)

# ------------------- MAIN -------------------
def _mark_highlights(text: str, spans: List[Tuple[int,int,str]]) -> str:
    """Return `text` with every highlighted range wrapped in ** markers."""
    # Spans from different patterns can overlap (e.g. "hate that" and
    # "that community"). Inserting markers span by span would shift the later
    # indices and garble the output, so merge overlapping ranges first.
    merged: List[List[int]] = []
    for start, end, _tag in sorted(spans, key=lambda span: (span[0], span[1])):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append("**" + text[start:end] + "**")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def main():
    """
    Run the full flow: load texts, score them, print a preview, save results.

    Texts come from INFER_SOURCE when it is set, otherwise from SAMPLE_TEXTS.
    Nothing is scored or saved when there are no texts.

    Raises:
        FileNotFoundError: INFER_SOURCE is set but does not exist.
    """
    # Load texts
    if INFER_SOURCE is None:
        texts = [t for t in SAMPLE_TEXTS if t.strip()]
        print(f"[INFO] Using SAMPLE_TEXTS ({len(texts)} items). Set INFER_SOURCE to run on a file.")
    else:
        if not os.path.exists(INFER_SOURCE):
            raise FileNotFoundError(INFER_SOURCE)
        texts = load_texts_from_file(INFER_SOURCE)
        print(f"[INFO] Loaded {len(texts)} texts from:", INFER_SOURCE)

    if not texts:
        print("[WARN] No texts to analyze.")
        return

    # Predict
    df = detect_hateful_language(texts)

    # Pretty print sample
    print("\n=== Predictions (first 10) ===")
    for _, r in df.head(10).iterrows():
        print(f"[{'HATE' if r['is_hateful']==1 else 'OK '}] score={r['hateful_score']}: {r['text'][:120]}")
        if r["highlights"]:
            # Make spans visible with **markers**
            print("  highlights:", _mark_highlights(r["text"], r["highlights"]))

    # Save
    save_predictions(df)

# Run the flow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


[INFO] Using SAMPLE_TEXTS (3 items). Set INFER_SOURCE to run on a file.
[INFO] Loaded trained model & labels.

=== Predictions (first 10) ===
[OK ] score=0.2355: Aaj ka match mast tha, sab chill!
[OK ] score=0.4855: I hate that community, they should go back.
  highlights: **I hate** **that community**, they should **go back**.
[OK ] score=0.2396: Yaar calm down, arguments se kuch solve nahi hota.
[OUT] Saved CSV: C:\Users\sagni\Downloads\Tox Watch Hinglish\twh_hate_pred.csv
[OUT] Saved JSON: C:\Users\sagni\Downloads\Tox Watch Hinglish\twh_hate_pred.json
