In [1]:
# Load the LMArena human-preference dataset by its Hugging Face repo id.
# In the recorded run below this fetched 7 parquet shards and produced a
# 135,634-row "train" split.
from datasets import load_dataset
dataset_1 = load_dataset("lmarena-ai/arena-human-preference-140k")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00007.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

data/train-00001-of-00007.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

data/train-00002-of-00007.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

data/train-00003-of-00007.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

data/train-00004-of-00007.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

data/train-00005-of-00007.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

data/train-00006-of-00007.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/135634 [00:00<?, ? examples/s]

In [2]:
# Materialize the train split as a pandas DataFrame; prep_sft_from_df consumes it row by row.
# NOTE(review): the normalization helpers test `isinstance(fc, list)` on the
# `full_conversation` field, so they presumably rely on nested fields arriving
# as plain Python lists/dicts here — confirm before swapping in a different
# Dataset -> DataFrame conversion.
import pandas as pd
df2 = pd.DataFrame(dataset_1['train'])

In [5]:
import json, random, os
from typing import Dict, Optional, Tuple


import numpy as np
import pandas as pd
from datasets import Dataset as HFDataset


def _as_text(x):
    """Arena fields sometimes lists or JSON-encoded lists; return plain text."""
    if isinstance(x, list):
        return "\n".join(s for s in x if isinstance(s, str)).strip()
    if isinstance(x, str):
        s = x.strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                arr = json.loads(s)
                if isinstance(arr, list):
                    return "\n".join(str(t) for t in arr).strip()
            except Exception:
                pass
        return s
    return "" if x is None else str(x)




# Primary category labels. The list order is the precedence documented in
# _infer_primary_category and is also used as the sort order when categories
# are enumerated for stratified sampling in prep_sft_from_df.
PRIMARY_CATS = ["Code", "Math", "IF", "Creative Writing", "General"]
# Flags looked up under category_tag["criteria_v0.1"] by _extract_criteria;
# each one becomes a boolean `crit_<name>` column in the normalized frame.
CRITERIA_KEYS = [
    "complexity", "creativity", "domain_knowledge",
    "problem_solving", "real_world", "specificity", "technical_accuracy"
]




def _infer_primary_category(row: dict) -> str:
    """Priority: Code > Math > IF > Creative Writing > General."""
    ct = row.get("category_tag") or {}


    def get_nested(d, *keys, default=False):
        cur = d
        for k in keys:
            if not isinstance(cur, dict) or k not in cur:
                return default
            cur = cur[k]
        return bool(cur)


    if bool(row.get("is_code", False)):
        return "Code"
    if get_nested(ct, "math_v0.1", "math"):
        return "Math"
    if get_nested(ct, "if_v0.1", "if"):
        return "IF"
    if get_nested(ct, "creative_writing_v0.1", "creative_writing"):
        return "Creative Writing"
    return "General"




def _extract_criteria(row: dict) -> Dict[str, bool]:
    """Return a ``{criterion: bool}`` dict covering every name in CRITERIA_KEYS.

    Values are read from ``row["category_tag"]["criteria_v0.1"]``; any missing
    or non-dict level yields ``False`` for all criteria.
    """
    tags = row.get("category_tag") or {}
    section = tags.get("criteria_v0.1", {}) if isinstance(tags, dict) else {}
    if not isinstance(section, dict):
        return {name: False for name in CRITERIA_KEYS}
    return {name: bool(section.get(name, False)) for name in CRITERIA_KEYS}




def _get_prompt_from_full_conversation(fc) -> str:
    """
    LM-Arena-140k style: take the *first* user turn text as the instruction.
    """
    try:
        if isinstance(fc, list) and len(fc) > 0:
            first = fc[0]
            u = first.get("user", {}) if isinstance(first, dict) else {}
            content = u.get("content", [])
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    t = item.get("text", "")
                    if t:
                        return t.strip()
    except Exception:
        pass
    return ""




def _get_model_text_from_turn(turn: dict, key: str) -> str:
    """Extract model text from 'model_side_a' or 'model_side_b' in a single turn."""
    side = turn.get(key, {}) if isinstance(turn, dict) else {}
    content = side.get("content", [])
    for item in content:
        if isinstance(item, dict) and item.get("type") == "text":
            t = item.get("text", "")
            if t:
                return _as_text(t)
    return ""




def _get_two_responses_from_full_conversation(fc) -> Tuple[str, str]:
    """Take the *first* block's responses."""
    if not isinstance(fc, list) or len(fc) == 0:
        return "", ""
    first = fc[0]
    a_text = _get_model_text_from_turn(first, "model_side_a")
    b_text = _get_model_text_from_turn(first, "model_side_b")
    return a_text, b_text




def _determine_winner(row: dict) -> Optional[str]:
    """
    Return 'a', 'b', or None (tie/unknown).
    Supports 'winner' str, boolean flags, and 'vote' dict.
    """
    w = row.get("winner", None)
    if isinstance(w, str) and w:
        ws = w.lower().strip()
        if ws in {"a", "model_a", "side_a"}:
            return "a"
        if ws in {"b", "model_b", "side_b"}:
            return "b"
        if ws in {"tie", "both", "both_bad", "equal"}:
            return None


    for key in ["winner_model_a", "winner_a", "is_model_a_winner"]:
        if bool(row.get(key, False)):
            return "a"
    for key in ["winner_model_b", "winner_b", "is_model_b_winner"]:
        if bool(row.get(key, False)):
            return "b"
    if bool(row.get("winner_tie", False)):
        return None


    vote = row.get("vote", None)
    if isinstance(vote, dict):
        a_cnt = vote.get("a") or vote.get("A") or vote.get("model_a") or 0
        b_cnt = vote.get("b") or vote.get("B") or vote.get("model_b") or 0
        try:
            a_cnt = int(a_cnt)
            b_cnt = int(b_cnt)
            if a_cnt > b_cnt:
                return "a"
            if b_cnt > a_cnt:
                return "b"
        except Exception:
            pass


    return None




def _format_prompt(text: str) -> str:
    return f"### Instruction:\n{text.strip()}\n\n### Response:\n"



def prep_sft_from_df(
    df: pd.DataFrame,
    seed: int = 25,
    n_train: int = 2000,
    n_eval: int = 100,
    sample_per_category: Optional[int] = None,
    require_language: Optional[str] = "en",
    dedupe: bool = True,
    drop_eval_prompt_overlap: bool = False,
):
    """
    Normalize an LM-Arena-140k-style frame into SFT (prompt, completion) rows and
    draw a category-stratified train split plus a disjoint eval split.

    Pipeline:
      1. Optionally keep only rows whose ``language`` equals ``require_language``.
      2. Per row, take the first user turn as the prompt and the winning side's
         first-block answer as the completion. Rows with no prompt, a missing
         response on either side, or a tie/unknown winner are dropped.
      3. Optionally drop exact duplicate (prompt, completion) pairs.
      4. Sample ``sample_per_category`` rows per primary category, top up at
         random from the remaining rows until ``n_train`` is reached, and
         down-sample if the per-category quota overshoots ``n_train``.
      5. Sample up to ``n_eval`` rows from the rows not selected for training.

    Args:
        df: Source frame. Must contain a ``language`` column when
            ``require_language`` is set; rows are read through the module's
            normalization helpers (``full_conversation``, ``winner``, ...).
        seed: ``random_state`` for every pandas sampling call.
        n_train: Target training size. ``<= 0`` with no explicit
            ``sample_per_category`` selects every normalized row for training
            (leaving no eval pool).
        n_eval: Maximum eval size; falsy disables the eval split.
        sample_per_category: Per-category quota. ``None`` means
            ``max(1, n_train // number_of_categories)`` when ``n_train > 0``.
        require_language: Language code to keep; ``None``/``""`` disables the filter.
        dedupe: Drop duplicate (prompt, completion) pairs before sampling.
        drop_eval_prompt_overlap: Train/eval disjointness is enforced on row
            ids only, so the same prompt (with a different completion) can land
            in both splits. Set to ``True`` to also exclude from the eval pool
            every row whose prompt appears in the training split. Defaults to
            ``False`` so existing splits stay reproducible.

    Returns:
        ``(train_df, train_ds, eval_ds, norm_df)`` where ``train_df`` and
        ``norm_df`` keep the audit columns (``primary_category``, ``crit_*``,
        ``_row_id``), ``train_ds`` / ``eval_ds`` are HF Datasets with only
        ``prompt`` and ``completion``, and ``eval_ds`` is ``None`` when no eval
        rows were drawn.

    Raises:
        ValueError: If no row survives normalization.

    NOTE(review): prompt and responses come from the first conversation block
    only, while the winner is read from the row as a whole — confirm that label
    is meaningful for first-block text when a conversation has several turns.
    """
    # 1) Language filter (boolean indexing already yields a new frame, and
    #    df_work is never mutated, so no defensive copy is needed)
    if require_language:
        df_work = df[df["language"] == require_language]
    else:
        df_work = df

    # 2) Normalize per-row into prompt/completion/primary_category/criteria.
    #    Rows are materialized as plain dicts, which is what the helpers are
    #    typed for and avoids building a Series per row as iterrows() does.
    records = []
    for row in df_work.to_dict("records"):
        fc = row.get("full_conversation", None)
        prompt_text = _get_prompt_from_full_conversation(fc)
        a_text, b_text = _get_two_responses_from_full_conversation(fc)
        winner = _determine_winner(row)

        # usable prompt + both responses present
        if not prompt_text or not a_text or not b_text:
            continue

        # pick chosen as completion
        if winner == "a":
            completion = a_text
        elif winner == "b":
            completion = b_text
        else:
            # tie/unknown -> drop
            continue

        if not completion.strip():
            continue

        prim_cat = _infer_primary_category(row)
        criteria = _extract_criteria(row)

        records.append({
            "prompt": _format_prompt(_as_text(prompt_text)),
            "completion": _as_text(completion),
            "primary_category": prim_cat,
            **{f"crit_{k}": v for k, v in criteria.items()}
        })

    norm_df = pd.DataFrame.from_records(records)
    if norm_df.empty:
        raise ValueError("No usable rows after normalization (SFT).")

    if dedupe:
        before = len(norm_df)
        norm_df = norm_df.drop_duplicates(subset=["prompt", "completion"]).reset_index(drop=True)
        after = len(norm_df)
        print(f"[sft] Deduped pairs: kept {after:,}/{before:,}")

    # Stable IDs for correct top-ups and disjoint eval
    norm_df = norm_df.reset_index(drop=True)
    norm_df["_row_id"] = np.arange(len(norm_df), dtype=int)

    # 3) Stratified sampling by primary_category (stable IDs)
    cats_present = sorted(
        norm_df["primary_category"].unique().tolist(),
        key=lambda x: PRIMARY_CATS.index(x) if x in PRIMARY_CATS else 999
    )
    if sample_per_category is None and n_train > 0 and len(cats_present) > 0:
        # Even split of the training budget across the categories present.
        sample_per_category = max(1, n_train // len(cats_present))

    def stratified_ids(df_in: pd.DataFrame, total_n: int) -> list[int]:
        selected: list[int] = []
        # first pass: per-category quota (whole category when no quota is set)
        for cat in cats_present:
            dcat = df_in[df_in["primary_category"] == cat]
            take = min(sample_per_category, len(dcat)) if sample_per_category is not None else len(dcat)
            if take > 0:
                ids = dcat.sample(n=take, random_state=seed)["_row_id"].tolist()
                selected.extend(ids)
        # dedupe while preserving order
        selected = list(dict.fromkeys(selected))
        # top-up: small categories leave the quota short, so fill from the rest
        short = max(0, total_n - len(selected)) if total_n > 0 else 0
        if short > 0:
            remaining_ids = df_in.loc[~df_in["_row_id"].isin(selected), "_row_id"]
            if len(remaining_ids) > 0:
                topup = remaining_ids.sample(n=min(short, len(remaining_ids)), random_state=seed).tolist()
                selected.extend(topup)
        # downsample if the per-category quota overshot the requested total
        if total_n > 0 and len(selected) > total_n:
            selected = pd.Series(selected).sample(n=total_n, random_state=seed).tolist()
        return selected

    sel_ids = stratified_ids(norm_df, n_train if n_train > 0 else len(norm_df))
    train_df = norm_df[norm_df["_row_id"].isin(sel_ids)].copy()

    eval_mask = ~norm_df["_row_id"].isin(sel_ids)
    if drop_eval_prompt_overlap:
        # Prompt-level disjointness: an eval prompt must not also be a train prompt.
        eval_mask &= ~norm_df["prompt"].isin(train_df["prompt"])
    eval_pool = norm_df[eval_mask].copy()
    if n_eval and len(eval_pool) > 0:
        eval_ids = eval_pool["_row_id"].sample(n=min(n_eval, len(eval_pool)), random_state=seed).tolist()
        eval_df = eval_pool[eval_pool["_row_id"].isin(eval_ids)].copy()
    else:
        eval_df = pd.DataFrame(columns=norm_df.columns)

    if len(eval_df):
        assert set(train_df["_row_id"]).isdisjoint(set(eval_df["_row_id"])), "Train/Eval overlap!"

    # 4) Convert to HF Datasets (prompt, completion)
    train_ds = HFDataset.from_pandas(train_df[["prompt", "completion"]].reset_index(drop=True), preserve_index=False)
    eval_ds = HFDataset.from_pandas(eval_df[["prompt", "completion"]].reset_index(drop=True), preserve_index=False) if len(eval_df) else None

    return train_df, train_ds, eval_ds, norm_df




def show_examples_sft(ds: HFDataset, k: int = 2, title: str = "SFT samples") -> None:
    """Print up to ``k`` examples from an SFT dataset.

    Examples are picked with a fixed-seed shuffle (seed 13), so repeated calls
    on the same dataset print the same rows. Each example shows the full prompt
    and the first 300 characters of the completion. Nothing is printed when
    ``ds`` is ``None`` or empty, or when ``k`` is not positive.
    """
    if ds is None:
        return
    if len(ds) == 0 or k <= 0:
        return

    print(f"\n{title}:")
    order = list(range(len(ds)))
    np.random.default_rng(13).shuffle(order)
    for position in order[: min(k, len(order))]:
        sample = ds[position]
        print("=" * 80)
        print(sample["prompt"])
        print("--- completion (first 300) ---")
        print(sample["completion"][:300])






def save_sft_parquet(
    train_df: pd.DataFrame,
    eval_df: pd.DataFrame,
    norm_df: pd.DataFrame,
    outdir: str = "prepared_sft_dataset",
    lang_tag: str = "en",
    n_train: int = 0,
    n_eval: int = 0,
    seed: int = 25,
):
    """
    Write the normalized pool, the train/eval splits and a small metadata file.

    Files created inside ``outdir`` (created if missing):
      * the full ``norm_df`` with its audit columns;
      * the train split, restricted to ``prompt`` and ``completion``;
      * the eval split (same two columns), only when ``eval_df`` is non-empty;
      * ``meta_sft.json`` describing language, sizes, seed and column layout.

    The sizes used in file names and metadata are ``n_train`` / ``n_eval`` when
    they are non-zero, otherwise the actual lengths of the frames passed in.
    """
    os.makedirs(outdir, exist_ok=True)

    split_cols = ["prompt", "completion"]
    train_n = n_train or len(train_df)
    eval_n = n_eval or len(eval_df)

    norm_path = os.path.join(outdir, f"arena140k_sft_normalized_lang-{lang_tag}.parquet")
    train_path = os.path.join(outdir, f"train_sft_{lang_tag}_n{train_n}.parquet")
    eval_path = os.path.join(outdir, f"eval_sft_{lang_tag}_n{eval_n}.parquet")

    # Normalized pool keeps the audit columns; the splits do not.
    norm_df.to_parquet(norm_path, index=False)
    train_df[split_cols].to_parquet(train_path, index=False)
    has_eval = bool(len(eval_df))
    if has_eval:
        eval_df[split_cols].to_parquet(eval_path, index=False)

    extra_cols = ["primary_category"] + [f"crit_{k}" for k in CRITERIA_KEYS]
    meta = {
        "language": lang_tag,
        "n_train": int(train_n),
        "n_eval": int(eval_n),
        "seed": seed,
        "columns": {"splits": split_cols, "normalized_extra": extra_cols},
    }
    meta_path = os.path.join(outdir, "meta_sft.json")
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    written = [norm_path, train_path]
    if has_eval:
        written.append(eval_path)
    print("[sft] Wrote:")
    for path in written:
        print(" -", path)






In [7]:
# Single source of truth for the split configuration, so the prep call and the
# save call (file names + meta_sft.json) cannot drift apart.
SFT_SEED = 25            # keep same seed as DPO
SFT_N_TRAIN = 12000      # match your DPO train size
SFT_N_EVAL = 512         # match your DPO eval size
SFT_LANG = "en"
SFT_OUTDIR = "prepared_sft_dataset"

train_df_sft, train_ds_sft, eval_ds_sft, norm_df_sft = prep_sft_from_df(
    df2,
    seed=SFT_SEED,
    n_train=SFT_N_TRAIN,
    n_eval=SFT_N_EVAL,
    require_language=SFT_LANG,
)


show_examples_sft(train_ds_sft, k=2, title="Train SFT Samples")
show_examples_sft(eval_ds_sft,  k=1, title="Eval SFT Samples")


# prep_sft_from_df returns the eval split only as an HF Dataset (or None),
# so rebuild a DataFrame for the parquet writer.
eval_df_sft = (
    pd.DataFrame(eval_ds_sft)
    if eval_ds_sft is not None
    else pd.DataFrame(columns=train_df_sft.columns)
)

# Save Parquet splits you can train from
save_sft_parquet(
    train_df=train_df_sft,
    eval_df=eval_df_sft,
    norm_df=norm_df_sft,
    outdir=SFT_OUTDIR,
    lang_tag=SFT_LANG,
    n_train=SFT_N_TRAIN,
    n_eval=SFT_N_EVAL,
    seed=SFT_SEED,
)



[sft] Deduped pairs: kept 48,148/51,616

Train SFT Samples:
### Instruction:
create an svg of a left handed baseball player for the texas rangers on an october night, close to halloween

### Response:

--- completion (first 300) ---
Below is a compact, self-contained SVG that depicts a *left-handed* Texas Rangers batter on a crisp late-October night—complete with a full moon, twinkling stars, and a cheeky little pumpkin to hint at Halloween.  
Just copy everything between the triple back-ticks into an `.svg` file and open it in
### Instruction:
Tell me a joke that has multiple layers of meaning, as much as you can

### Response:

--- completion (first 300) ---
Here’s a multi-layered joke for you:  

**"A man walks into a library and asks the librarian for books on paranoia. The librarian whispers, 'They’re right behind you...'"**  

### **Layers of Meaning:**  
1. **Surface-Level Humor:** It’s a classic misdirection joke—the librarian’s whisper implies th

Eval SFT Samples:
### Instruc

In [9]:
# Inspect the sampled training frame, including the audit columns
# (primary_category, crit_*, _row_id) that are not written to the split files.
train_df_sft

Unnamed: 0,prompt,completion,primary_category,crit_complexity,crit_creativity,crit_domain_knowledge,crit_problem_solving,crit_real_world,crit_specificity,crit_technical_accuracy,_row_id
1,### Instruction:\nDo you need to be folkbord(r...,"For Swedish citizenship, you generally need to...",General,False,False,True,True,True,False,False,1
2,### Instruction:\nWhat are Tricky the Clown's ...,Tricky the Clown is a character from the *Madn...,General,False,False,True,False,False,False,False,2
8,"### Instruction:\ndf with cols\n'perf', 'n_tri...",I'll help you create a scatter plot comparing ...,Code,True,False,True,True,True,True,True,8
17,### Instruction:\ncreate ifc file with a wall ...,Creating an **IFC (Industry Foundation Classes...,Code,True,False,True,True,True,True,True,17
18,### Instruction:\ni store the data of store th...,"Okay, this is a common challenge, and the ""mat...",Code,True,True,True,True,True,True,True,18
...,...,...,...,...,...,...,...,...,...,...,...
48138,### Instruction:\nWhat are 20 examples of high...,I can't provide guidance on greyhat methods fo...,Code,True,True,True,True,True,True,True,48138
48139,### Instruction:\nHow would you improve LLMs w...,"Okay, let's break down how Reinforcement Learn...",IF,True,True,True,True,True,True,True,48139
48143,### Instruction:\nhttps://github.com/NousResea...,"Okay, let's break down how you might approach ...",Code,True,True,True,True,True,True,True,48143
48145,### Instruction:\nwrite a funny fictional answ...,Behold! The **Official 7-Step Guide to Feline ...,Creative Writing,True,True,False,False,False,False,False,48145


In [10]:
# Show GPU status (devices, memory in use, utilization) on this machine.
!nvidia-smi

Mon Sep  1 23:48:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10G                    On  |   00000000:00:1B.0 Off |                    0 |
|  0%   33C    P0             62W /  300W |   15071MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A10G                    On  |   00