In [None]:
import datasets
import numpy as np
import time


# Columns holding the two competing conversations of each comparison row.
_CONVERSATION_COLUMNS = ("conversation_a", "conversation_b")
# Marker appended to a conversation string that was cut short.
_TRUNCATION_SUFFIX = "... (conversation truncated)"


def _strip_token_counts(conv_list):
    """Remove the ``num_tokens`` key, in place, from every turn of a conversation."""
    for turn in conv_list:
        turn.pop("num_tokens", None)


def _has_empty_content(conv_list):
    """Return True if any turn's ``content`` is a zero-length numpy array."""
    for turn in conv_list:
        content = turn.get("content")
        if isinstance(content, np.ndarray) and len(content) == 0:
            return True
    return False


def _content_to_text(content):
    """Return the plain text of one turn's ``content``.

    A ``str`` is returned unchanged. A numpy array must hold exactly one
    element whose ``"type"`` is ``"text"``; that element's ``"text"`` is
    returned.

    Raises:
        ValueError: if the array has a length other than 1, if its single
            element is not of type ``"text"``, or if ``content`` is neither a
            ``str`` nor a numpy array.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, np.ndarray):
        # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
        if len(content) != 1:
            raise ValueError(f"Expected single turn but got multiple turns: {content}")
        part = content[0]
        if part["type"] != "text":
            raise ValueError(f"Expected text turn but got {part['type']}: {part}")
        return part["text"]
    raise ValueError(f"Unexpected content type: {type(content)}")


def _flatten_turn_contents(conv_list):
    """Replace, in place, every turn's ``content`` with its plain text."""
    for turn in conv_list:
        turn["content"] = _content_to_text(turn.get("content"))


def _truncate_conversations(conversations, max_chars):
    """Truncate the string form of each conversation to ``max_chars`` characters.

    Returns a pair ``(values, flags)`` of 1-D arrays aligned with
    ``conversations``. Where ``str(conversation)`` is longer than ``max_chars``
    the value is the cut string plus a truncation marker and the flag is True;
    otherwise the value is the original, untouched conversation object and the
    flag is False.
    """
    values = np.empty(len(conversations), dtype=object)
    flags = np.zeros(len(conversations), dtype=bool)
    for pos, conv_list in enumerate(conversations):
        conv_str = str(conv_list)
        if len(conv_str) > max_chars:
            values[pos] = conv_str[:max_chars] + _TRUNCATION_SUFFIX
            flags[pos] = True
        else:
            # Integer-index assignment stores the object itself in the cell.
            values[pos] = conv_list
    return values, flags


def process_data(
        hf_path: str = "lmarena-ai/arena-human-preference-140k",
        remove_token_counts: bool = True,
        max_conv_char_length: int = 5000,
        sample_size: int = 10000,
        seed: int = 42,
        ) -> None:
    """Build a sampled, cleaned CSV of pairwise preference data.

    Steps, in order:

    1. Load the ``train`` split of ``hf_path`` and convert it to a DataFrame.
    2. Keep only rows whose ``winner`` is ``model_a`` or ``model_b`` and whose
       ``language`` is ``en``.
    3. Optionally drop the ``num_tokens`` key from every conversation turn.
    4. Drop rows in which any turn's ``content`` is an empty numpy array.
    5. Draw ``sample_size`` rows using ``seed``.
    6. Reduce each turn's ``content`` to plain text.
    7. Truncate the string form of each conversation to
       ``max_conv_char_length`` characters, recording the outcome in the
       ``truncated_conversation_a`` / ``truncated_conversation_b`` columns.
    8. Rename the conversation columns to ``text_a`` / ``text_b``, add a
       ``preferred_text`` column naming the winning one, and write the result
       to ``../data/output/`` (created if missing).

    Args:
        hf_path: Dataset identifier passed to ``datasets.load_dataset``.
        remove_token_counts: Whether to strip ``num_tokens`` from each turn.
        max_conv_char_length: Maximum length, in characters, of a
            conversation's string form before it is truncated.
        sample_size: Number of rows to sample.
        seed: Random state used for sampling.

    Raises:
        ValueError: if ``max_conv_char_length`` is negative, if a turn's
            content has an unexpected shape or type, or (raised by pandas) if
            ``sample_size`` exceeds the number of rows left after filtering.
    """
    import os  # local import: only needed to create the output directory

    if max_conv_char_length < 0:
        raise ValueError(
            f"max_conv_char_length must be non-negative, got {max_conv_char_length}"
        )

    print(f"Loading {hf_path} dataset")
    dataset = datasets.load_dataset(hf_path)
    df = dataset["train"].to_pandas()
    print(f"Dataset original size: {len(df)}")
    # Timing deliberately starts after the load, so it covers processing only.
    start_time = time.time()

    ### Data subset selection

    # limit to non-tie votes
    df = df[df["winner"].isin(["model_a", "model_b"])]
    print(f"Dataset size after filtering to non-tie votes: {len(df)}")
    # limit to English-language conversations
    df = df[df["language"] == "en"]
    print(f"Dataset size after filtering language to English: {len(df)}")

    ### Data cleaning and preprocessing

    if remove_token_counts:
        print("Removing token counts from each turn")
        # The turn dicts are mutated in place, so no column reassignment is
        # needed. Datasets that keep token counts in a separate column (e.g.
        # lmarena-ai/arena-human-preference-140k) have no such key to remove.
        for column in _CONVERSATION_COLUMNS:
            for conv_list in df[column]:
                _strip_token_counts(conv_list)

    print("Checking if conversations are empty")
    keep = np.ones(len(df), dtype=bool)
    row_pairs = zip(*(df[column] for column in _CONVERSATION_COLUMNS))
    for pos, conv_pair in enumerate(row_pairs):
        for column, conv_list in zip(_CONVERSATION_COLUMNS, conv_pair):
            if _has_empty_content(conv_list):
                print(f"Conversation {column} is empty ({conv_list})")
                keep[pos] = False
                break  # one empty conversation is enough to drop the row

    print("Filtering out conversations marked for deletion")
    df = df[keep]
    print(f"Dataset size after filtering out conversations marked for deletion: {len(df)}")

    print(f"Sampling {sample_size} conversations out of {len(df)}")
    # Copy so the column assignments below act on an independent frame.
    df = df.sample(sample_size, random_state=seed).copy()

    print("Extracting text from each turn")
    for column in _CONVERSATION_COLUMNS:
        for conv_list in df[column]:
            _flatten_turn_contents(conv_list)

    print(f"Truncating conversations to {max_conv_char_length} characters")
    truncated_flags = {}
    for column in _CONVERSATION_COLUMNS:
        values, flags = _truncate_conversations(df[column], max_conv_char_length)
        df[column] = values
        truncated_flags[column] = flags
    # Flag columns are appended after both conversations have been processed.
    for column in _CONVERSATION_COLUMNS:
        df[f"truncated_{column}"] = truncated_flags[column]

    df["text_a"] = df["conversation_a"]
    df["text_b"] = df["conversation_b"]
    df = df.drop(columns=list(_CONVERSATION_COLUMNS))

    print("Transforming winner to preferred text")
    df["preferred_text"] = df["winner"].map(
        lambda winner: "text_a" if winner == "model_a" else "text_b"
    )

    print("Saving to CSV")
    output_dir = "../data/output"
    # to_csv does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f"{output_dir}/{hf_path.split('/')[-1]}_{sample_size}samples_{max_conv_char_length}chars_english.csv", index=False)

    end_time = time.time()
    print(f"Time taken: {end_time - start_time:.2f} seconds")


In [None]:
# Run the pipeline for lmarena-ai/arena-human-preference-140k:
# 5000 sampled rows, seed 42, max_conv_char_length of 5000.
process_data(
    "lmarena-ai/arena-human-preference-140k",
    seed=42,
    sample_size=5000,
    max_conv_char_length=5000,
)