In [None]:
# For licensing see accompanying LICENSE file.
# Copyright (C) 2025 Apple Inc. All Rights Reserved.

# Generating JSONL file for human annotation

Convert a standard CSV file into a JSONL file for use with the POEM platform.

In [3]:
import pandas as pd
import json
import pathlib

# Input CSV of preference pairs and the directory the JSONL export is written to.
DATA_PATH = pathlib.Path("../../data/external/truthful_qa/truthful_qa_5.csv")
SAVE_PATH = pathlib.Path("./tutorial_outputs/")
# parents=True so the cell still works if an intermediate directory is missing.
SAVE_PATH.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(DATA_PATH)

# One annotation record (dict) per exported CSV row.
json_dicts = []

# optional: to skip already annotated rows
start_index = 0

for i, row in df.iterrows():
    if i < start_index:
        continue

    # "preferred_text" holds the NAME of the column containing the preferred
    # response; the other of the two text columns is the non-preferred one.
    pref_text_col = row["preferred_text"]
    assert pref_text_col in ("text_a", "text_b"), (
        f"unexpected preferred_text value: {pref_text_col!r}"
    )
    nonpref_text_col = "text_a" if pref_text_col == "text_b" else "text_b"

    pref_text = row[pref_text_col]
    nonpref_text = row[nonpref_text_col]

    # A pair with two identical responses cannot be meaningfully annotated.
    assert nonpref_text != pref_text, f"{nonpref_text} == {pref_text}"

    json_dicts.append(
        {
            "prompt_id": f"ageval-test-20240813-{i:07}",
            "messages": [
                {
                    "role": "user",
                    "content": row["prompt"],
                }
            ],
            "enrichments": {
                "task_type": "Open Q&A",
                "topic": "other_v1.1",
                "task_type_prob": 1,
                "topic_prob": 1,
                # The preferred response is always stored under "text_1" below.
                "meta": {"longfact-preferred": "text_1"},
            },
            "responses": {
                "text_1": {
                    "response": pref_text,
                    "metadata": {},
                },
                "text_2": {
                    "response": nonpref_text,
                    "metadata": {},
                },
            },
        }
    )

# JSON Lines: exactly one compact JSON object per line. Dumping the whole list
# with indent=4 would instead produce a single multi-line JSON array, which is
# not valid JSONL despite the file extension.
with open(SAVE_PATH / "test.jsonl", "w", encoding="utf-8") as f:
    for record in json_dicts:
        f.write(json.dumps(record) + "\n")