# In [1]:
import json
import os
import re
import warnings
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
from joblib import dump, load

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report
)

# Suppress every warning of category FutureWarning for the rest of the run.
warnings.filterwarnings("ignore", category=FutureWarning)

# ------------------ USER PATHS ------------------
# Absolute, machine-specific locations of the three dataset splits and of the
# directory that receives every artifact written by this script.
TRAIN_PATH = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\archive\Hatespeech-Hindi_Train.csv"
VALID_PATH = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\archive\Hatespeech-Hindi_Valid.csv"
TEST_PATH  = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\archive\Hatespeech-Hindi_Test.csv"
OUT_DIR    = r"C:\Users\sagni\Downloads\Tox Watch Hinglish"

# Persisted pipeline and the label <-> id mapping saved next to it.
MODEL_PATH    = os.path.join(OUT_DIR, "twh_model.joblib")
LABELMAP_PATH = os.path.join(OUT_DIR, "twh_label_map.json")

# Path of a file (.txt/.csv/.json) to run inference on, or None for no external
# inference. NOTE: with None, the test split is only evaluated as part of
# training, i.e. when no saved model exists yet; an existing model is just loaded.
INFER_SOURCE = None
# Examples:
# INFER_SOURCE = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\new_comments.csv"
# INFER_SOURCE = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\snippets.txt"
# INFER_SOURCE = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\batch.json"

# Seed handed to the classifier's random_state.
RANDOM_STATE = 42

# Lower-cased column names tried, in this order, when looking for the text column.
TEXT_CANDS = [
    "text","tweet","sentence","content","message","post","comment",
    "clean_text","utterance","selftext","title"
]
# Lower-cased column names tried, in this order, when looking for the label column.
LABEL_CANDS = [
    "label","category","class","target","task_1","task_2","subtask_a","hs_label","y"
]

# ------------------ HELPERS ------------------
def ensure_out() -> None:
    """Create the ``OUT_DIR`` directory tree; a no-op when it already exists."""
    os.makedirs(OUT_DIR, exist_ok=True)

def read_csv(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, retrying as latin-1 on decode errors.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.

    Raises:
        FileNotFoundError: If *path* does not exist (the path is the sole arg).
    """
    if os.path.exists(path):
        try:
            frame = pd.read_csv(path)
        except UnicodeDecodeError:
            # Not decodable with the default encoding: fall back to latin-1.
            frame = pd.read_csv(path, encoding="latin-1")
        return frame
    raise FileNotFoundError(path)

def detect_text_and_label(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series, str, str]:
    """Locate the text and label columns of *df* and return them row-aligned.

    Text column: the first ``TEXT_CANDS`` name present (case-insensitive). When
    the frame has both ``title`` and ``selftext`` and no other candidate, the
    two are joined with a space. Otherwise the first object/string column is
    used. Whitespace runs are collapsed and rows whose text is missing or empty
    are dropped.

    Label column: the first ``LABEL_CANDS`` name present, else the first
    non-text column of object/string/integer dtype with 1..50 distinct values.

    Args:
        df: Raw split as loaded from disk.

    Returns:
        ``(text, labels, text_column_used, label_column)`` where *text* and
        *labels* cover the same rows in the same order. *labels* may still
        contain missing values.

    Raises:
        ValueError: If no text column or no label column can be determined.
    """
    # str() guards against non-string headers (e.g. integer column names).
    lower = {str(c).lower(): c for c in df.columns}

    # "title" and "selftext" are themselves in TEXT_CANDS; when both exist they
    # are skipped here so that the combined branch below is actually reachable.
    has_title_pair = "title" in lower and "selftext" in lower
    tcol = None
    for cand in TEXT_CANDS:
        if cand not in lower:
            continue
        if has_title_pair and cand in ("title", "selftext"):
            continue
        tcol = lower[cand]
        break

    if tcol is None and has_title_pair:
        title_col, body_col = lower["title"], lower["selftext"]
        text = (df[title_col].fillna("").astype(str) + " " +
                df[body_col].fillna("").astype(str))
        tcol_used = "title+selftext"
        text_sources = {title_col, body_col}
    else:
        if tcol is None:
            obj = [
                c for c in df.columns
                if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
            ]
            if not obj:
                raise ValueError("No obvious text column. Rename a column to 'text'.")
            tcol = obj[0]
        raw = df[tcol]
        # Blank out missing cells explicitly: astype(str) alone would turn NaN
        # into the literal "nan", which then survives the empty-text filter.
        text = raw.astype(str).where(raw.notna(), "")
        tcol_used = tcol
        text_sources = {tcol}

    text = text.str.replace(r"\s+", " ", regex=True).str.strip()
    # Filter with a boolean mask (not .loc[index]) so a non-unique index cannot
    # duplicate rows and break the text/label alignment.
    keep = text != ""
    text = text[keep]
    df = df[keep]

    # label detection (low-cardinality fallback)
    lcol = next((lower[c] for c in LABEL_CANDS if c in lower), None)
    if lcol is None:
        for c in df.columns:
            if c in text_sources:
                continue  # never mistake the text itself for the label
            col = df[c]
            label_like = (
                pd.api.types.is_object_dtype(col)
                or pd.api.types.is_string_dtype(col)
                or pd.api.types.is_integer_dtype(col)
            )
            if label_like and 0 < col.nunique(dropna=True) <= 50:
                lcol = c
                break
    if lcol is None:
        raise ValueError("No label column found. Rename to 'label' or add to LABEL_CANDS.")

    labels = df[lcol]
    return text, labels, tcol_used, lcol

def build_label_map(all_labels: Dict[str, pd.Series]) -> Dict[Any, int]:
    """Assign a consecutive integer id to every distinct label across splits.

    Missing values are ignored. Labels are ordered by their lower-cased string
    form; labels that tie under that key keep first-seen order.

    Args:
        all_labels: Mapping of split name to that split's raw label values.

    Returns:
        Mapping of raw label value to its integer id (0-based).
    """
    first_seen: Dict[Any, None] = {}
    for split_values in all_labels.values():
        for value in pd.Series(split_values).dropna().unique():
            # A dict keeps insertion order, so this de-duplicates in place.
            first_seen.setdefault(value, None)

    ordered = sorted(first_seen, key=lambda value: str(value).lower())
    return dict(zip(ordered, range(len(ordered))))

def apply_label_map(series: pd.Series, label2id: Dict[Any, int]) -> pd.Series:
    """Translate raw labels to integer ids, dropping rows without a mapping.

    Args:
        series: Raw label values.
        label2id: Mapping of raw label value to integer id.

    Returns:
        Integer Series holding only the rows whose label is in *label2id*;
        the surviving rows keep their original index.
    """
    ids = series.map(label2id.get)  # unknown labels come back as missing
    known = ids.notna()
    return ids[known].astype(int)

def build_pipeline() -> Pipeline:
    """Build the unfitted TF-IDF + logistic-regression text classifier.

    Returns:
        A two-step ``Pipeline``: ``"tfidf"`` (lower-cased, accent-stripped
        uni+bi-gram features) followed by ``"clf"`` (class-balanced,
        L2-regularised logistic regression using the saga solver).
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),       # uni+bi
        min_df=2,
        max_features=300_000,
        strip_accents="unicode",
    )
    # `multi_class` is intentionally not passed: "auto" is what scikit-learn
    # does by default, and the parameter is deprecated since 1.5 and scheduled
    # for removal, so naming it explicitly would eventually break construction.
    classifier = LogisticRegression(
        solver="saga",
        penalty="l2",
        C=1.0,
        max_iter=400,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        class_weight="balanced",
    )
    return Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

def sanitize_label_for_key(s: str) -> str:
    """Turn a label into a token containing only ASCII letters, digits and ``_``.

    Surrounding whitespace is stripped first, then every run of other
    characters collapses to a single underscore. An empty result is replaced
    by ``"label"``.
    """
    trimmed = s.strip()
    cleaned = re.sub(r"[^A-Za-z0-9_]+", "_", trimmed)
    if cleaned:
        return cleaned
    return "label"

# ------------------ PREDICTION I/O ------------------
def _json_item_to_text(item: Any) -> str:
    """Return the text of one JSON entry (its ``"text"`` field if it is a dict that has one)."""
    value = item["text"] if isinstance(item, dict) and "text" in item else item
    # Always hand back a str: the vectorizer cannot process None or numbers.
    return "" if value is None else str(value)


def load_texts_for_infer(path: str) -> List[str]:
    """Load the texts to classify from a ``.txt``, ``.json`` or ``.csv`` file.

    * ``.txt``  - one text per line; blank lines are skipped.
    * ``.json`` - either a list, or ``{"messages": [...]}``; each entry is a
      dict with a ``"text"`` field or a bare value.
    * ``.csv``  - the first ``TEXT_CANDS`` column present (case-insensitive),
      or the only column of a single-column file. One output string per row,
      with missing cells returned as ``""`` so positions match the input rows.

    Args:
        path: File to read; the format is chosen from its extension.

    Returns:
        The texts as a list of ``str``.

    Raises:
        ValueError: For an unsupported extension, an unsupported JSON layout,
            or a CSV without a recognisable text column.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        # utf-8-sig reads plain UTF-8 too, but also removes a leading BOM that
        # would otherwise be glued onto the first text.
        with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return [ln.strip() for ln in f if ln.strip()]

    if ext == ".json":
        # json.load rejects input that starts with a BOM, hence utf-8-sig.
        with open(path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        if isinstance(data, dict) and "messages" in data:
            return [_json_item_to_text(m) for m in data["messages"]]
        if isinstance(data, list):
            return [_json_item_to_text(d) for d in data]
        raise ValueError("Unsupported JSON schema. Use list or {'messages': [{'text': ...}, ...]}")

    if ext == ".csv":
        # Shared helper: same latin-1 fallback as the training splits get.
        df = read_csv(path)
        lower = {str(c).lower(): c for c in df.columns}
        tcol = next((lower[c] for c in TEXT_CANDS if c in lower), None)
        if tcol is None:
            if df.shape[1] != 1:
                raise ValueError("CSV must contain a text-like column (e.g., 'text').")
            tcol = df.columns[0]
        col = df[tcol]
        # Blank out missing cells explicitly; astype(str) alone yields "nan".
        return col.astype(str).where(col.notna(), "").str.strip().tolist()

    raise ValueError("Supported inference files: .txt, .csv, .json")

def save_predictions_dataframe(df: pd.DataFrame, base_name: str):
    """Write *df* to ``OUT_DIR`` as ``<base_name>.csv`` and ``<base_name>.json``.

    The JSON file holds a UTC creation timestamp, the rows as a list of
    records, and a summary with the row count and the number of rows per
    predicted label.

    Args:
        df: Prediction table; must contain a ``pred_label`` column.
        base_name: File name without extension, shared by both outputs.
    """
    csv_path = os.path.join(OUT_DIR, f"{base_name}.csv")
    json_path = os.path.join(OUT_DIR, f"{base_name}.json")

    df.to_csv(csv_path, index=False, encoding="utf-8")

    # datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC time and
    # drop tzinfo so the text keeps the "...Z" form instead of "+00:00Z".
    created = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    payload = {
        "created_utc": created,
        "items": df.to_dict(orient="records"),
        "summary": {
            "total": int(len(df)),
            "pred_counts": df["pred_label"].value_counts().to_dict(),
        },
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

    print("[OUT] Saved:", csv_path)
    print("[OUT] Saved:", json_path)

# ------------------ TRAIN / LOAD ------------------
def train_and_save(TRAIN_PATH, VALID_PATH, TEST_PATH):
    """Train on train+valid, persist the model, and evaluate it on the test split.

    Files written: the fitted pipeline (``MODEL_PATH``), the label map
    (``LABELMAP_PATH``), and in ``OUT_DIR`` a classification report, the test
    predictions (CSV + JSON) and an entry appended to ``twh_infer_summary.json``.

    Args:
        TRAIN_PATH: CSV file with the training split.
        VALID_PATH: CSV file with the validation split (merged into training).
        TEST_PATH: CSV file with the held-out test split.

    Returns:
        ``(pipeline, id2label)`` - the fitted pipeline and the mapping of
        integer class id to label name.
    """
    # Load splits
    df_train = read_csv(TRAIN_PATH)
    df_valid = read_csv(VALID_PATH)
    df_test = read_csv(TEST_PATH)

    # Detect columns (the detected column names are not needed here)
    t_tr, y_tr_raw, _, _ = detect_text_and_label(df_train)
    t_va, y_va_raw, _, _ = detect_text_and_label(df_valid)
    t_te, y_te_raw, _, _ = detect_text_and_label(df_test)

    # Build consistent label map across all splits
    label2id = build_label_map({"train": y_tr_raw, "valid": y_va_raw, "test": y_te_raw})
    id2label = {int(v): str(k) for k, v in label2id.items()}

    y_tr = apply_label_map(y_tr_raw, label2id)
    y_va = apply_label_map(y_va_raw, label2id)
    y_te = apply_label_map(y_te_raw, label2id)

    # apply_label_map drops rows whose label is missing. Restrict the texts to
    # the surviving rows, otherwise X and y differ in length / pairing.
    t_tr = t_tr.loc[y_tr.index]
    t_va = t_va.loc[y_va.index]
    t_te = t_te.loc[y_te.index]

    X_train = pd.concat([t_tr, t_va], axis=0).astype(str).values
    y_train = pd.concat([y_tr, y_va], axis=0).values
    X_test = t_te.astype(str).values
    y_test = y_te.values

    # Build & fit
    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    # Save model + label map
    dump(pipe, MODEL_PATH)
    with open(LABELMAP_PATH, "w", encoding="utf-8") as f:
        json.dump({"label2id": {str(k): int(v) for k, v in label2id.items()},
                   "id2label": {str(k): v for k, v in id2label.items()}},
                  f, ensure_ascii=False, indent=2)
    print("[SAVE] Model:", MODEL_PATH)
    print("[SAVE] Label map:", LABELMAP_PATH)

    # Evaluate on TEST
    y_pred = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test) if hasattr(pipe[-1], "predict_proba") else None

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
    # Pass `labels` explicitly so each name in target_names is tied to its id.
    present = sorted({int(i) for i in y_test} | {int(i) for i in y_pred})
    report = classification_report(
        y_test, y_pred,
        labels=present,
        target_names=[id2label[i] for i in present],
        zero_division=0
    )
    with open(os.path.join(OUT_DIR, "twh_classification_report.txt"), "w", encoding="utf-8") as f:
        f.write(report)

    # Save predictions on test
    preds_df = pd.DataFrame({
        "text": X_test,
        "y_true": [id2label[int(i)] for i in y_test],
        "pred_label": [id2label[int(i)] for i in y_pred],
    })

    # Probability column j belongs to class id classes_[j], not to id j: the
    # two differ when some label id never occurs in the training data.
    if proba is not None:
        for col, class_id in enumerate(pipe.classes_):
            lab = id2label[int(class_id)]
            preds_df[f"p_{sanitize_label_for_key(lab)}"] = proba[:, col]

    save_predictions_dataframe(preds_df, base_name="twh_predictions_test")

    # Update / write summary (best effort: an unreadable file starts a new history)
    summary_path = os.path.join(OUT_DIR, "twh_infer_summary.json")
    base = {}
    if os.path.exists(summary_path):
        try:
            with open(summary_path, "r", encoding="utf-8") as f:
                base = json.load(f)
        except (OSError, ValueError) as exc:  # ValueError covers invalid JSON / bad encoding
            print(f"[WARN] Could not read {summary_path} ({exc}); starting a new run history.")
            base = {}
    if not isinstance(base, dict):
        base = {}
    if not isinstance(base.get("runs"), list):
        base["runs"] = []
    base["runs"].append({
        # utcnow() is deprecated; aware UTC minus tzinfo keeps the "...Z" format.
        "run_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
        "mode": "TEST_EVAL",
        "sources": {"train": TRAIN_PATH, "valid": VALID_PATH, "test": TEST_PATH},
        "metrics": {
            "accuracy": round(float(acc), 4),
            "macro_precision": round(float(p), 4),
            "macro_recall": round(float(r), 4),
            "macro_f1": round(float(f1), 4)
        }
    })
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(base, f, ensure_ascii=False, indent=2)

    print("[TEST] Acc:", round(acc, 4), "| Macro-F1:", round(f1, 4))
    return pipe, id2label

def load_model_and_labels():
    """Load the saved pipeline and its id-to-label map from disk.

    Returns:
        ``(pipeline, id2label)`` when both ``MODEL_PATH`` and ``LABELMAP_PATH``
        exist, otherwise ``(None, None)``. The keys of ``id2label`` are
        converted to ``int``.
    """
    if not os.path.exists(MODEL_PATH) or not os.path.exists(LABELMAP_PATH):
        return None, None

    model = load(MODEL_PATH)

    with open(LABELMAP_PATH, "r", encoding="utf-8") as fh:
        stored = json.load(fh)

    # The saved JSON has string keys; turn them back into integer ids.
    names_by_id = {}
    for key, name in stored.get("id2label", {}).items():
        names_by_id[int(key)] = name
    return model, names_by_id

# ------------------ MAIN ------------------
def main():
    """Entry point: load (or train) the model, then optionally classify a file.

    If no saved model and label map are found, a model is trained, saved and
    evaluated on the test split. If ``INFER_SOURCE`` is set, its texts are
    classified, the predictions are written to ``OUT_DIR`` and the run is
    appended to ``twh_infer_summary.json``.

    Raises:
        FileNotFoundError: If ``INFER_SOURCE`` is set but does not exist.
    """
    ensure_out()

    # Train if model not present; else load
    pipe, id2label = load_model_and_labels()
    if pipe is None:
        print("[INFO] No saved model found — training now.")
        pipe, id2label = train_and_save(TRAIN_PATH, VALID_PATH, TEST_PATH)
    else:
        print("[INFO] Loaded model and labels from disk.")

    if INFER_SOURCE is None:
        print("[INFO] INFER_SOURCE is None — no external inference requested.")
        return

    # INFER_SOURCE is provided: run external predictions
    if not os.path.exists(INFER_SOURCE):
        raise FileNotFoundError(INFER_SOURCE)
    texts = load_texts_for_infer(INFER_SOURCE)
    if not texts:
        print("[WARN] No texts found to predict.")
        return

    preds = pipe.predict(texts)
    proba = pipe.predict_proba(texts) if hasattr(pipe[-1], "predict_proba") else None

    rows = {
        "text": texts,
        "pred_label": [id2label[int(i)] for i in preds],
    }
    # Probability column j belongs to class id classes_[j], not to id j: the
    # two differ when some label id was absent from the training data.
    if proba is not None:
        for col, class_id in enumerate(pipe.classes_):
            lab = id2label[int(class_id)]
            rows[f"p_{sanitize_label_for_key(lab)}"] = proba[:, col]

    out_df = pd.DataFrame(rows)
    save_predictions_dataframe(out_df, base_name="twh_predictions_external")

    # Append to run history (best effort: an unreadable file starts a new history)
    summary_path = os.path.join(OUT_DIR, "twh_infer_summary.json")
    base = {}
    if os.path.exists(summary_path):
        try:
            with open(summary_path, "r", encoding="utf-8") as f:
                base = json.load(f)
        except (OSError, ValueError) as exc:  # ValueError covers invalid JSON / bad encoding
            print(f"[WARN] Could not read {summary_path} ({exc}); starting a new run history.")
            base = {}
    if not isinstance(base, dict):
        base = {}
    if not isinstance(base.get("runs"), list):
        base["runs"] = []
    base["runs"].append({
        # utcnow() is deprecated; aware UTC minus tzinfo keeps the "...Z" format.
        "run_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
        "mode": "EXTERNAL_INFER",
        "source_file": INFER_SOURCE,
        "count": int(len(out_df))
    })
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(base, f, ensure_ascii=False, indent=2)

    print(f"[INFER] Ran predictions on: {INFER_SOURCE}")
    print(out_df.head(10))


if __name__ == "__main__":
    main()


# Output:
# [INFO] No saved model found — training now.
# [SAVE] Model: C:\Users\sagni\Downloads\Tox Watch Hinglish\twh_model.joblib
# [SAVE] Label map: C:\Users\sagni\Downloads\Tox Watch Hinglish\twh_label_map.json
# [OUT] Saved: C:\Users\sagni\Downloads\Tox Watch Hinglish\twh_predictions_test.csv
# [OUT] Saved: C:\Users\sagni\Downloads\Tox Watch Hinglish\twh_predictions_test.json
# [TEST] Acc: 0.4737 | Macro-F1: 0.133
