In [None]:
# Some of the code in this file is adapted from:
# https://github.com/rdnfn/icai/blob/main/notebooks/01_data_prepocessing.ipynb
# Licensed under the Apache License, Version 2.0 (the "License").

import alpaca_eval.constants
import numpy as np
import pathlib

# Fetch the AlpacaFarm gold cross-annotation table exposed by alpaca_eval.constants.
# The code below groups it by instruction/output_1/output_2 and reads its
# 'preference' column (one row per individual annotation, presumably — confirm).
ae_df = alpaca_eval.constants.ALPACAFARM_GOLD_CROSSANNOTATIONS()

def get_vote(group):
    """Return the majority preference among the annotations of one group.

    Args:
        group: DataFrame holding the rows of a single groupby group; only its
            'preference' column (one vote per row) is read.

    Returns:
        The most frequent preference value, or np.nan when the highest vote
        count is shared by several values (a tie) or when the group contains
        no non-null vote at all.
    """
    vote_counts = group['preference'].value_counts()
    if vote_counts.empty:  # all votes missing: nothing to decide on
        return np.nan
    top_count = vote_counts.max()
    # A tie means the top count is shared, whatever the number of annotators.
    # Checking `max() == 2` is only right for exactly four binary votes: it
    # would drop a clear 2-1 majority and miss a 1-1 split.
    if (vote_counts == top_count).sum() > 1:
        return np.nan
    return vote_counts.idxmax()
   
# Majority vote per (instruction, output_1, output_2) triple. Only the
# 'preference' column is handed to get_vote, so the result does not depend on
# whether pandas passes the grouping columns to apply (deprecated since 2.2).
df = (
    ae_df.groupby(['instruction', 'output_1', 'output_2'])[['preference']]
    .apply(get_vote)
    .reset_index(name="preference")
)

# drop ties (rows for which get_vote returned NaN)
df = df.dropna()

# change format: each candidate text is the instruction followed by one output
for col, source_col in [("text_a", "output_1"), ("text_b", "output_2")]:
    df[col] = "Instruction:\n" + df["instruction"] + "\n\n\nAssistant:\n" + df[source_col]

# get preferred text column (values text_a, text_b) based on preference column (which has values 1 or 2)
df["preferred_text"] = np.where(df['preference'] == 1, 'text_a', 'text_b')

# shuffle the data (fixed seed keeps the row order reproducible)
df = df.sample(frac=1, random_state=42)

# select only relevant columns
df = df[["text_a", "text_b", "preferred_text"]]

# save data; create the target directory first so a fresh checkout does not fail
save_path = pathlib.Path("../../data/alpacaeval_crossannotated_rand42_noties.csv")
save_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, index_label="index")

In [None]:
# ALTERNATIVE VERSION WITH SEPARATE PROMPT
# Adapted from/based on https://github.com/rdnfn/icai/blob/main/notebooks/01_data_prepocessing.ipynb

import alpaca_eval.constants
import numpy as np
import pathlib

# Fetch the AlpacaFarm gold cross-annotation table exposed by alpaca_eval.constants.
# The code below groups it by instruction/output_1/output_2 and reads its
# 'preference' column (one row per individual annotation, presumably — confirm).
ae_df = alpaca_eval.constants.ALPACAFARM_GOLD_CROSSANNOTATIONS()

def get_vote(group):
    """Return the majority preference among the annotations of one group.

    Args:
        group: DataFrame holding the rows of a single groupby group; only its
            'preference' column (one vote per row) is read.

    Returns:
        The most frequent preference value, or np.nan when the highest vote
        count is shared by several values (a tie) or when the group contains
        no non-null vote at all.
    """
    vote_counts = group['preference'].value_counts()
    if vote_counts.empty:  # all votes missing: nothing to decide on
        return np.nan
    top_count = vote_counts.max()
    # A tie means the top count is shared, whatever the number of annotators.
    # Checking `max() == 2` is only right for exactly four binary votes: it
    # would drop a clear 2-1 majority and miss a 1-1 split.
    if (vote_counts == top_count).sum() > 1:
        return np.nan
    return vote_counts.idxmax()
   
# Majority vote per (instruction, output_1, output_2) triple. Only the
# 'preference' column is handed to get_vote, so the result does not depend on
# whether pandas passes the grouping columns to apply (deprecated since 2.2).
df = (
    ae_df.groupby(['instruction', 'output_1', 'output_2'])[['preference']]
    .apply(get_vote)
    .reset_index(name="preference")
)

# drop ties (rows for which get_vote returned NaN)
df = df.dropna()

# change format: outputs stay as-is, the instruction is kept separately as the prompt
df[["text_a", "text_b"]] = df[["output_1", "output_2"]]
df["prompt"] = df["instruction"]

# get preferred text column (values text_a, text_b) based on preference column (which has values 1 or 2)
df["preferred_text"] = np.where(df['preference'] == 1, 'text_a', 'text_b')

# shuffle the data (fixed seed keeps the row order reproducible)
df = df.sample(frac=1, random_state=42)

# select only relevant columns
df = df[["text_a", "text_b", "prompt", "preferred_text"]]

# save data; create the target directory first so a fresh checkout does not fail
save_path = pathlib.Path("../../data/alpacaeval_crossannotated_rand42_noties_prompt.csv")
save_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(save_path, index_label="index")