In [1]:
import os
import re
import json
import pickle
from datetime import datetime
from typing import Dict, Any, Tuple, List

import numpy as np
import pandas as pd
import h5py

try:
    import yaml
    HAVE_YAML = True
except Exception:
    HAVE_YAML = False

# --------- USER PATHS ----------
# Absolute, machine-specific locations of the three CSV splits, plus the
# directory that receives every generated artifact. Edit before running.
TRAIN_PATH = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\archive\Hatespeech-Hindi_Train.csv"
VALID_PATH = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\archive\Hatespeech-Hindi_Valid.csv"
TEST_PATH  = r"C:\Users\sagni\Downloads\Tox Watch Hinglish\archive\Hatespeech-Hindi_Test.csv"
OUT_DIR    = r"C:\Users\sagni\Downloads\Tox Watch Hinglish"
# --------------------------------

# Candidate names for the text column, tried in this order against the
# lower-cased column names of each CSV; the first match wins.
TEXT_CANDS = [
    "text","tweet","sentence","content","message","post","comment",
    "clean_text","utterance","selftext","title"
]
# Candidate names for the label column, tried in this order against the
# lower-cased column names of each CSV; the first match wins.
LABEL_CANDS = [
    "label","category","class","target","task_1","task_2","subtask_a","hs_label","y"
]

def ensure_out_dir(path: str):
    """Create directory ``path`` with any missing parents.

    An already existing directory is left untouched and is not an error.
    """
    os.makedirs(name=path, exist_ok=True)

def load_csv(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, tolerating invalid UTF-8 bytes.

    The file is first read with pandas' defaults. If that fails with a
    ``UnicodeDecodeError`` it is re-read as UTF-8 with undecodable bytes
    dropped. If the result has no rows, a best-effort latin-1 read is tried.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame (possibly empty).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"CSV not found: {path}")
    try:
        df = pd.read_csv(path)
    except UnicodeDecodeError:
        # read_csv has no ``errors=`` keyword (passing one raises TypeError),
        # so decode leniently ourselves and hand pandas the open text handle.
        # This works on every pandas version, unlike ``encoding_errors=``.
        with open(path, "r", encoding="utf-8", errors="ignore", newline="") as fh:
            df = pd.read_csv(fh)
    # If still messy (no rows), try latin-1 as a best-effort fallback.
    if df.empty:
        try:
            df = pd.read_csv(path, encoding="latin-1")
        except (ValueError, OSError):
            # Parser/empty-data errors are ValueError subclasses; keep the
            # first result rather than failing on a purely optional retry.
            pass
    return df

def detect_text_and_label(df: pd.DataFrame) -> Tuple[pd.Series, pd.Series, str, str]:
    """Locate the text and label columns of ``df`` and return cleaned text.

    Text column: the first name in ``TEXT_CANDS`` present in the frame
    (case-insensitive). When the best match is ``title`` or ``selftext`` and
    both exist, the two are concatenated. With no match, the first
    object-dtype column is used.

    Label column: the first name in ``LABEL_CANDS`` present in the frame.
    With no match, the first non-text column with at most 20 distinct
    non-null values and an object or integer dtype is used.

    Whitespace runs in the text are collapsed to one space, and rows whose
    text is missing or empty after cleaning are dropped.

    Args:
        df: Raw frame for one split.

    Returns:
        ``(text, labels_raw, text_column_name, label_column_name)``. ``text``
        and ``labels_raw`` share the same index; labels are not yet mapped.

    Raises:
        ValueError: If no text column or no label column can be detected.
    """
    # str() guards against non-string column labels (e.g. integer headers).
    cols_lower = {str(c).lower(): c for c in df.columns}

    # Text
    text_col = next((cols_lower[c] for c in TEXT_CANDS if c in cols_lower), None)
    title_col = cols_lower.get("title")
    selftext_col = cols_lower.get("selftext")
    # "title" and "selftext" are themselves candidates, so a check that only
    # fires when nothing matched could never trigger. Concatenate whenever
    # both exist and no higher-priority candidate was found.
    reddit_style = (
        title_col is not None
        and selftext_col is not None
        and (text_col is None or text_col in (title_col, selftext_col))
    )
    if reddit_style:
        # Reddit-style concat
        text = (df[title_col].fillna("").astype(str) + " " +
                df[selftext_col].fillna("").astype(str)).str.strip()
        chosen_text_name = "title+selftext"
        text_sources = {title_col, selftext_col}
    else:
        if text_col is None:
            # fallback: first object/string column
            obj_cols = [c for c in df.columns if df[c].dtype == object]
            if not obj_cols:
                raise ValueError("Could not detect a text column. Rename one column to 'text'.")
            text_col = obj_cols[0]
        raw = df[text_col]
        # Blank out missing values explicitly: astype(str) alone would turn
        # NaN into the literal string "nan", which would survive as "text".
        text = raw.astype(str).where(raw.notna(), "")
        chosen_text_name = text_col
        text_sources = {text_col}

    # Clean text
    text = text.str.replace(r"\s+", " ", regex=True).str.strip()
    # Drop empty rows after cleaning (boolean mask is safe with any index)
    mask_nonempty = text != ""
    text = text[mask_nonempty]
    df = df[mask_nonempty]

    # Label
    label_col = next((cols_lower[c] for c in LABEL_CANDS if c in cols_lower), None)
    if label_col is None:
        # Heuristic: find a low-cardinality column (<= 20 unique) that looks categorical
        for c in df.columns:
            if c in text_sources:
                # Never let the text column double as the label column.
                continue
            col = df[c]
            looks_categorical = col.dtype == object or pd.api.types.is_integer_dtype(col)
            if col.nunique(dropna=True) <= 20 and looks_categorical:
                label_col = c
                break
    if label_col is None:
        raise ValueError("Could not detect a label column. Rename one column to 'label' or add to LABEL_CANDS.")

    # df was filtered with the same mask as text, so the indices already agree.
    labels_raw = df[label_col]
    return text, labels_raw, chosen_text_name, label_col

def build_global_label_map(splits_raw: Dict[str, pd.Series]) -> Dict[Any, int]:
    """Assign one integer id to every distinct non-null label across all splits.

    Labels keep their original type as dictionary keys. Ids follow the
    case-insensitive ordering of each label's string form; labels that tie
    under that ordering keep first-seen order.
    """
    first_seen: Dict[Any, None] = {}
    for series in splits_raw.values():
        for token in pd.Series(series).dropna().unique():
            # setdefault keeps the first occurrence, preserving encounter order
            first_seen.setdefault(token, None)

    ordered = list(first_seen)
    # Stable sort on the lower-cased string form for reproducible ids
    ordered.sort(key=lambda token: str(token).lower())

    return dict(zip(ordered, range(len(ordered))))

def apply_label_map(series: pd.Series, label2id: Dict[Any, int]) -> pd.Series:
    """Translate raw labels to integer ids, dropping rows with unknown labels.

    The returned Series keeps the original index of the rows that survive.
    """
    # dict.get yields None for labels missing from the mapping
    ids = series.map(label2id.get)
    known = ids.dropna()
    return known.astype(int)

def class_counts(df: pd.DataFrame) -> Dict[str, int]:
    """Count rows per value of the ``label`` column.

    Keys are the integer label ids rendered as strings; values are plain ints.
    """
    tally = df["label"].value_counts()
    result: Dict[str, int] = {}
    for label_id, count in tally.items():
        result[str(int(label_id))] = int(count)
    return result

def write_h5(path: str, splits: Dict[str, pd.DataFrame]) -> None:
    """Write every split to an HDF5 file, one group per split.

    Each group holds a gzip-compressed ``text`` dataset (UTF-8 strings) and a
    gzip-compressed ``label`` dataset (int16). An existing file at ``path``
    is overwritten because the file is opened in ``"w"`` mode.
    """
    utf8_str = h5py.string_dtype(encoding="utf-8")
    with h5py.File(path, "w") as out:
        for name in splits:
            frame = splits[name]
            group = out.create_group(name)
            # Materialise both arrays before creating either dataset
            text_values = frame["text"].astype(str).values
            label_values = frame["label"].astype(np.int16).values  # support >2 classes
            group.create_dataset(
                "text", data=text_values, dtype=utf8_str, compression="gzip"
            )
            group.create_dataset(
                "label", data=label_values, dtype=np.int16, compression="gzip"
            )

def write_pkl(path: str, splits: Dict[str, pd.DataFrame], meta: Dict[str, Any]) -> None:
    """Pickle the splits and their metadata into a single file.

    The stored object is ``{"splits": {name: DataFrame copy}, "meta": meta}``,
    written with the highest pickle protocol available.
    """
    frames = {}
    for name, frame in splits.items():
        # Copy so the stored frames are independent of the caller's objects
        frames[name] = frame.copy()
    bundle = {"splits": frames, "meta": meta}
    with open(path, "wb") as sink:
        pickle.dump(bundle, sink, protocol=pickle.HIGHEST_PROTOCOL)

def write_yaml(path: str, meta: Dict[str, Any]) -> None:
    """Write ``meta`` to ``path`` as YAML, or as JSON text if PyYAML is missing.

    Which format is written depends on the module-level ``HAVE_YAML`` flag.
    The file is UTF-8 encoded either way.
    """
    with open(path, "w", encoding="utf-8") as sink:
        if not HAVE_YAML:
            # Fallback: JSON string into .yaml file
            sink.write(json.dumps(meta, ensure_ascii=False, indent=2))
        else:
            yaml.safe_dump(meta, sink, sort_keys=False, allow_unicode=True)

def write_jsonl(path: str, splits: Dict[str, pd.DataFrame]) -> None:
    """Write all splits to one JSON Lines file.

    Each line is a JSON object with the keys ``split``, ``text`` and ``label``
    (label coerced to ``int``). Non-ASCII characters are written verbatim.

    Args:
        path: Destination file; overwritten if it exists.
        splits: Mapping of split name to a frame with ``text`` and ``label``
            columns.
    """
    with open(path, "w", encoding="utf-8") as f:
        for split, df in splits.items():
            # Walk the two columns directly; iterrows() would build a
            # throwaway Series for every single row.
            for text, label in zip(df["text"], df["label"]):
                rec = {"split": split, "text": text, "label": int(label)}
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

def main():
    """Convert the three CSV splits into the ToxWatch artifact set.

    Steps: load each CSV, detect its text and label columns, build a single
    label-to-id mapping across all splits, apply it, then write the cleaned
    splits as HDF5, pickle, YAML config, JSONL and a JSON summary into
    ``OUT_DIR``. A short summary is printed at the end.
    """
    # Local import: the file only imports ``datetime`` from the datetime module.
    from datetime import timezone

    ensure_out_dir(OUT_DIR)

    # Load and detect columns for each split
    dfs_raw = {
        "train": load_csv(TRAIN_PATH),
        "valid": load_csv(VALID_PATH),
        "test":  load_csv(TEST_PATH)
    }

    split_texts_raw: Dict[str, pd.Series] = {}
    split_labels_raw: Dict[str, pd.Series] = {}
    text_cols_used: Dict[str, str] = {}
    label_cols_used: Dict[str, str] = {}

    for split, df in dfs_raw.items():
        text, labels_raw, text_col, label_col = detect_text_and_label(df)
        split_texts_raw[split] = text
        split_labels_raw[split] = labels_raw.loc[text.index]
        text_cols_used[split] = text_col
        label_cols_used[split] = label_col

    # Build a single label2id mapping across all splits
    label2id = build_global_label_map(split_labels_raw)
    id2label = {int(v): str(k) for k, v in label2id.items()}

    # Apply mapping and build clean dataframes
    splits_clean: Dict[str, pd.DataFrame] = {}
    for split in ["train", "valid", "test"]:
        t = split_texts_raw[split]
        y_id = apply_label_map(split_labels_raw[split], label2id)
        # Align indices (drop any rows that lost labels after mapping)
        aligned = pd.DataFrame({"text": t, "label": y_id}).dropna()
        # Rows without a label make the aligned column float (NaN padding);
        # restore an integer dtype so every artifact stores integer ids.
        aligned["label"] = aligned["label"].astype(int)
        splits_clean[split] = aligned.reset_index(drop=True)

    # datetime.utcnow() is deprecated; take an aware UTC timestamp and strip
    # the tzinfo so the "<iso>Z" format stays exactly as before.
    created_utc = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    # Meta/summary
    meta = {
        "dataset_name": "toxwatch_hinglish",
        "source_csv": {
            "train": TRAIN_PATH,
            "valid": VALID_PATH,
            "test": TEST_PATH
        },
        "created_utc": created_utc,
        "text_columns_used": text_cols_used,
        "label_columns_used": label_cols_used,
        "label2id": {str(k): int(v) for k, v in label2id.items()},
        "id2label": {str(k): v for k, v in id2label.items()},
        "sizes": {k: int(v.shape[0]) for k, v in splits_clean.items()},
        "class_balance": {split: class_counts(df) for split, df in splits_clean.items()},
        "notes": "Labels are mapped consistently across splits using label2id/id2label."
    }

    # Paths
    out_h5    = os.path.join(OUT_DIR, "toxwatch_hinglish.h5")
    out_pkl   = os.path.join(OUT_DIR, "toxwatch_hinglish.pkl")
    out_yaml  = os.path.join(OUT_DIR, "toxwatch_config.yaml")
    out_jsonl = os.path.join(OUT_DIR, "toxwatch_hinglish.jsonl")
    out_sum   = os.path.join(OUT_DIR, "toxwatch_summary.json")

    # Write artifacts
    write_h5(out_h5, splits_clean)
    write_pkl(out_pkl, splits_clean, meta)
    write_yaml(out_yaml, meta)
    write_jsonl(out_jsonl, splits_clean)
    with open(out_sum, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)

    # Console summary
    print("=== ToxWatch Hinglish Artifacts Written ===")
    print(f"H5:    {out_h5}")
    print(f"PKL:   {out_pkl}")
    print(f"YAML:  {out_yaml}")
    print(f"JSONL: {out_jsonl}")
    print(f"SUM:   {out_sum}")
    print("\nSizes:", meta["sizes"])
    print("Class balance:", meta["class_balance"])
    print("Label2ID:", meta["label2id"])

# Run the conversion pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


=== ToxWatch Hinglish Artifacts Written ===
H5:    C:\Users\sagni\Downloads\Tox Watch Hinglish\toxwatch_hinglish.h5
PKL:   C:\Users\sagni\Downloads\Tox Watch Hinglish\toxwatch_hinglish.pkl
YAML:  C:\Users\sagni\Downloads\Tox Watch Hinglish\toxwatch_config.yaml
JSONL: C:\Users\sagni\Downloads\Tox Watch Hinglish\toxwatch_hinglish.jsonl
SUM:   C:\Users\sagni\Downloads\Tox Watch Hinglish\toxwatch_summary.json

Sizes: {'train': 5728, 'valid': 811, 'test': 1653}
Class balance: {'train': {'14': 3050, '8': 1009, '12': 478, '15': 405, '0': 305, '13': 163, '7': 81, '5': 74, '1': 34, '6': 28, '11': 28, '9': 27, '4': 24, '2': 9, '3': 9, '10': 4}, 'valid': {'14': 435, '8': 144, '12': 68, '15': 57, '0': 43, '13': 23, '7': 11, '5': 10, '11': 4, '6': 4, '1': 4, '4': 3, '9': 3, '2': 1, '3': 1}, 'test': {'14': 873, '8': 289, '12': 138, '15': 117, '0': 89, '13': 47, '7': 24, '5': 22, '1': 11, '6': 9, '9': 9, '4': 8, '11': 8, '2': 3, '10': 3, '3': 3}}
Label2ID: {'defamation': 0, 'defamation,fake': 1, 'def