In [None]:
# Notebook to preprocess feedback data for the Inverse Constitutional AI algorithm

import pathlib
import inverse_cai
import numpy as np
import pandas as pd

# Root of the data tree. The path is relative, so it is resolved against the
# working directory the notebook kernel is started from.
DATA_DIR = pathlib.Path('../data')

In [None]:
# Utility functions for loading and saving data in same order
# create random shuffle of the data
# that can be saved and reproduced

def save_random_index(df, path, seed=42):
    """Write a reproducible random permutation of ``df.index`` to ``path``.

    The file contains one index label per line, with no header and no index
    column, which is the layout ``load_random_order_and_apply_to_df`` reads.

    Args:
        df: DataFrame whose index labels are shuffled.
        path: Destination CSV path. Missing parent directories are created.
        seed: Seed for the shuffle; the same seed and index always give the
            same order.
    """
    # A private legacy RandomState yields the same stream as
    # np.random.seed(seed) followed by np.random.permutation(...), but does
    # not reseed the process-wide NumPy generator as a side effect.
    rng = np.random.RandomState(seed)
    # Shuffle a copy so the frame's own index buffer can never be touched.
    shuffled_labels = rng.permutation(df.index.to_numpy(copy=True))
    # The order files live in their own directory tree, which may not exist yet.
    pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True)
    pd.Series(shuffled_labels).to_csv(path, index=False, header=False)

def load_random_order_and_apply_to_df(df, path):
    """Reorder ``df`` according to a saved list of index labels.

    Args:
        df: DataFrame to reorder.
        path: Headerless CSV with one index label per line.

    Returns:
        The rows of ``df`` selected with ``.loc`` in the order given by the file.

    Raises:
        AssertionError: If the number of saved labels differs from ``len(df)``.
    """
    # Select the first column explicitly. A bare .squeeze() would collapse a
    # one-row file to a scalar, and len() on that scalar raises TypeError.
    rand_index = pd.read_csv(path, header=None).iloc[:, 0]
    assert len(rand_index) == len(df), f"Length of random index {len(rand_index)} does not match length of dataframe {len(df)}"
    return df.loc[rand_index.values]

In [None]:
# OPTIONAL: Fetch the anthropic dataset if not downloaded yet
import gzip
from urllib.request import urlopen
# Pinned commit of the hh-rlhf repository, so the download is reproducible
dataset_revision = "354e5c3cb8960630860dd5774a6ac2a58313bf4d"

anthropic_target_dir = DATA_DIR / "raw/anthropic/anthropic_helpful_base"
anthropic_target_dir.mkdir(parents=True, exist_ok=True)

url = f"https://github.com/anthropics/hh-rlhf/raw/{dataset_revision}/helpful-base/train.jsonl.gz"
file_path = anthropic_target_dir / "train.jsonl"

if file_path.exists():
    # Re-running the notebook should not hit the network again
    print(f"{file_path} already exists, skipping download")
else:
    # Download and unzip the file
    with urlopen(url) as response:
        with gzip.open(response, 'rb') as gz:
            file_content = gz.read()

    # Write to a sibling file first and rename afterwards, so an interrupted
    # write never leaves a truncated train.jsonl that later runs would accept.
    partial_path = file_path.with_name(file_path.name + ".part")
    with open(partial_path, 'wb') as f:
        f.write(file_content)
    partial_path.replace(file_path)

In [None]:
# Preprocess feedback data from Anthropic
# Origin: https://github.com/anthropics/hh-rlhf
ANTH_PATH = DATA_DIR / "raw/anthropic/anthropic_helpful_base/train.jsonl"
anth_df = inverse_cai.data.loader.anthropic.load_original_jsonl_file(ANTH_PATH)
anth_canonical_order_path = DATA_DIR / "meta_data/anthropic/anthropic_helpful_base_train_canonical_rand_order.csv"

# OPTIONAL: set to True to generate and save a NEW random order.
# This rewrites the canonical order file, so keep it False to reproduce the
# existing processed data.
REGENERATE_CANONICAL_ORDER = False
if REGENERATE_CANONICAL_ORDER:
    save_random_index(anth_df, anth_canonical_order_path, seed=42)

anth_df_canonical_order = load_random_order_and_apply_to_df(anth_df, anth_canonical_order_path)

# Keep only the first LEN_PROCESSED rows of the canonical random order
LEN_PROCESSED = 1000
processed_path = DATA_DIR / f"processed/anthropic/anthropic_helpful_base_train_{LEN_PROCESSED}canonrand.csv"
processed_path.parent.mkdir(parents=True, exist_ok=True)
anth_df_canonical_order[:LEN_PROCESSED].to_csv(processed_path, index_label='index')

In [None]:
# OPTIONAL: load data from HuggingFace if not downloaded yet
# Make sure you have run huggingface-cli login, since this dataset is gated
# The CSV written by this cell is the file the Chatbot Arena preprocessing
# cell loads.
from datasets import load_dataset
import pandas as pd
# NOTE(review): no `split` argument is passed, so `dataset` is whatever
# `load_dataset` returns for the whole repository, and the resulting
# DataFrame/CSV layout depends on how pandas interprets that object.
# Confirm the layout against the lmsys loader before changing these two lines.
dataset = load_dataset("lmsys/chatbot_arena_conversations")
chatbot_df = pd.DataFrame(dataset)
# Ensure dir exists
chatbot_df_path = DATA_DIR / 'raw/lmsys/chatbot_arena_conversations.csv'
chatbot_df_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the DataFrame index is not written to the CSV
chatbot_df.to_csv(chatbot_df_path, index=False)

In [None]:
# Preprocess feedback data from Chatbot Arena
import pandas as pd

# Defined here as well, so this cell also works on a fresh kernel when the
# OPTIONAL download cell was skipped because the raw CSV already exists.
chatbot_df_path = DATA_DIR / 'raw/lmsys/chatbot_arena_conversations.csv'
chatbot_df = inverse_cai.data.loader.lmsys.load_raw(chatbot_df_path, remove_ties=True)

chatbot_canonical_order_path = DATA_DIR / "meta_data/lmsys/chatbot_arena_conversations_canonical_rand_order.csv"

# OPTIONAL: set to True to generate and save a NEW random order.
# This rewrites the canonical order file, so keep it False to reproduce the
# existing processed data.
REGENERATE_CANONICAL_ORDER = False
if REGENERATE_CANONICAL_ORDER:
    save_random_index(chatbot_df, chatbot_canonical_order_path, seed=42)

chatbot_df_canonical_order = load_random_order_and_apply_to_df(chatbot_df, chatbot_canonical_order_path)

# Keep only the first LEN_PROCESSED rows of the canonical random order
LEN_PROCESSED = 1000
processed_path = DATA_DIR / f"processed/lmsys/chatbot_arena_conversations_{LEN_PROCESSED}canonrand.csv"
processed_path.parent.mkdir(parents=True, exist_ok=True)
chatbot_df_canonical_order[:LEN_PROCESSED].to_csv(processed_path, index_label='index')


In [None]:
import alpaca_eval.constants
import numpy as np
import pandas as pd

# Fetch the gold cross-annotations and keep an untouched copy of the raw frame
df = alpaca_eval.constants.ALPACAFARM_GOLD_CROSSANNOTATIONS()
original_df = df.copy()  # DataFrame.copy() is a deep copy by default

# Seed NumPy's global RNG so the random tie-breaking below is reproducible
np.random.seed(42)


### Part 1: combining four preferences into a single consensus preference
# NOTE: AlpacaEval has 4 preferences per instruction-output pair, for our purposes
# we will combine these into a single preference.

# Running count of tied votes; majority_vote increments it
TIES = 0

# Function to determine the preferred output based on a simple majority vote
def majority_vote(group):
    """Return the consensus preference for one group of annotations.

    Args:
        group: DataFrame with a 'preference' column holding the individual
            votes for one (instruction, output_1, output_2) combination.

    Returns:
        The most frequent preference. When several preferences share the top
        count, one of the tied labels is drawn with ``np.random.choice`` and
        the module-level ``TIES`` counter is incremented.
    """
    global TIES
    counts = group['preference'].value_counts()
    # value_counts sorts by count descending, so a tie for first place shows
    # up as equal top-two counts. Unlike `counts.max() == 2`, this stays
    # correct when a group does not have exactly four votes: a unanimous 2-0
    # or a 2-1 majority is not a tie, while a 4-4 split is.
    if len(counts) > 1 and counts.iloc[0] == counts.iloc[1]:
        TIES += 1
        # Sort the tied labels so a 1-vs-2 tie draws from [1, 2], whatever
        # order the votes appeared in.
        tied_labels = sorted(counts[counts == counts.iloc[0]].index)
        return np.random.choice(tied_labels)
    return counts.idxmax()

# Group by the unique sets of outputs per instruction and aggregate preferences
group_keys = ['instruction', 'output_1', 'output_2']
aggregated_df = df.groupby(group_keys).apply(majority_vote).reset_index(name='preference')

# Add metadata columns from the original DataFrame: exactly one metadata row
# per group. Deduplicating on the full group key (not on whole rows) keeps the
# merge below from multiplying aggregated rows when one instruction has
# several distinct metadata rows.
metadata_columns = ['dataset', 'datasplit', 'time_per_example', 'price_per_example']
aggregated_metadata = df[group_keys + metadata_columns].drop_duplicates(subset=group_keys)

# Merge the metadata into the aggregated DataFrame
print(f"Number of ties: {TIES}")
# Both sides are unique on group_keys by construction; validate enforces it.
df = pd.merge(aggregated_df, aggregated_metadata, on=group_keys, validate='one_to_one')


### Part 2: move dataset into our standard format

# Both candidate texts start with the same header built from the instruction
prompt_header = "Instruction:\n" + df["instruction"] + "\n\n\nAssistant:\n"
df["text_a"] = prompt_header + df["output_1"]
df["text_b"] = prompt_header + df["output_2"]

# Name of the preferred text column: 'text_a' when preference is 1, otherwise 'text_b'
prefers_first = df['preference'] == 1
df["preferred_text"] = np.where(prefers_first, 'text_a', 'text_b')

# Shuffle all rows with a fixed seed
df = df.sample(frac=1, random_state=42)

processed_path = DATA_DIR / "processed/tatsu_lab/alpacaeval_goldcrossannotations_rand.csv"
processed_path.parent.mkdir(parents=True, exist_ok=True)
output_columns = ["text_a", "text_b", "preferred_text"]
df[output_columns].to_csv(processed_path, index_label='index')


In [None]:
# sanity check
def add_combined_instruction_outputs_column(df):
    """Add an 'instruction_output' column to ``df`` in place and return ``df``.

    The new column is the space-separated concatenation of the 'instruction',
    'output_1' and 'output_2' columns.
    """
    separator = ' '
    combined = df['instruction'] + separator + df['output_1']
    combined = combined + separator + df['output_2']
    df['instruction_output'] = combined
    return df

df = add_combined_instruction_outputs_column(df)
original_df = add_combined_instruction_outputs_column(original_df)

# NOTE(review): this check assumes the raw annotations are laid out as
# consecutive blocks of four rows per (instruction, output_1, output_2)
# combination; confirm against the dataset.
ANNOTATIONS_PER_EXAMPLE = 4
for i in range(5):
    start = ANNOTATIONS_PER_EXAMPLE * i
    stop = start + ANNOTATIONS_PER_EXAMPLE
    # .iloc keeps both lookups positional, so the check does not depend on
    # original_df carrying a default 0..n-1 index.
    value_counts = original_df['preference'].iloc[start:stop].value_counts()
    instruction_output = original_df['instruction_output'].iloc[start]
    if value_counts.max() == 2:
        continue  # Tie: resolved randomly, so there is nothing to compare
    aggregated_preference = df.loc[df['instruction_output'] == instruction_output, 'preference'].values[0]
    assert value_counts.idxmax() == aggregated_preference, (
        f"Aggregated preference {aggregated_preference} does not match the majority "
        f"{value_counts.idxmax()} for annotation block {i}"
    )
