In [1]:
import json

# Relative paths to the raw PRISM inputs; both are read as JSON Lines files
# (one JSON record per line) by the loading cell.
PATH = "../data/raw/prism/conversations.jsonl"  # conversation records
USER_PATH = "../data/raw/prism/survey.jsonl"  # per-user survey records

In [2]:
import pandas as pd
import numpy as np

# Load both raw inputs; `lines=True` parses each line as one JSON record.
df = pd.read_json(PATH, lines=True)            # conversations
user_df = pd.read_json(USER_PATH, lines=True)  # survey answers


In [3]:
# normalize all dict columns inside dataframe

def json_normalize_columns(df):
    """Return a copy of ``df`` with every dict-valued column flattened.

    A column is treated as dict-valued when its first row (by position) holds
    a ``dict``. Each such column is expanded with ``pd.json_normalize`` and
    replaced by one column per (possibly nested) key, named
    ``"<column>_<flattened key>"``. Nested keys are joined by
    ``json_normalize``'s default separator, so ``{"b": {"c": 1}}`` in column
    ``meta`` becomes ``meta_b.c``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the flattened columns appended after the remaining
        original columns. The row index of ``df`` is preserved.

    Notes
    -----
    Only the first row is inspected to decide whether a column holds dicts.
    Columns that mix dicts with other values are not handled specially.
    """
    df = df.copy()
    # Snapshot the column labels: `df` is rebound inside the loop.
    for col in list(df.columns):
        values = df[col]
        # Positional access (`iloc`) instead of the label `0`, so frames with a
        # non-default index (or no rows at all) do not raise KeyError.
        if len(values) == 0 or not isinstance(values.iloc[0], dict):
            continue
        normalized_col = pd.json_normalize(values.tolist())
        normalized_col.columns = [f"{col}_{sub_col}" for sub_col in normalized_col.columns]
        # json_normalize returns a fresh RangeIndex; restore the original index
        # so the column-wise concat lines rows up instead of aligning on labels.
        normalized_col.index = df.index
        df = pd.concat([df.drop(columns=[col]), normalized_col], axis=1)
    return df

In [4]:
# Flatten the dict-valued survey columns into "<column>_<key>" columns.
user_df = user_df.pipe(json_normalize_columns)

In [5]:
# Attach each user's survey answers to their conversations (default inner join
# on `user_id`, so rows without a match on either side are dropped).
convs = df.merge(user_df, on='user_id')

In [6]:
def get_chosen_rejected_first_response(row):
    """Split the first-turn model responses of a conversation into chosen / rejected.

    Parameters
    ----------
    row : mapping
        Must provide ``row['conversation_history']``, an iterable of dicts that
        each have ``'turn'`` and ``'role'``. Entries with ``turn == 0`` and
        ``role == 'model'`` must also have ``'content'``, ``'model_name'`` and
        ``'if_chosen'``.

    Returns
    -------
    tuple
        ``(chosen_response, chosen_model, rejected_responses, rejected_models,
        rejected_response, rejected_model)`` where

        * ``chosen_response`` / ``chosen_model`` come from the first first-turn
          model response flagged as chosen, or are ``None`` if there is none;
        * ``rejected_responses`` / ``rejected_models`` are parallel lists of all
          first-turn model responses flagged as not chosen;
        * ``rejected_response`` / ``rejected_model`` are one entry of those
          lists picked uniformly at random with ``np.random.choice``, or
          ``None`` when the lists are empty.

    Notes
    -----
    The random pick uses NumPy's global RNG; seed it with ``np.random.seed``
    before calling if reproducible output is required.
    """
    first_model_responses = [
        response
        for response in row['conversation_history']
        if response['turn'] == 0 and response['role'] == 'model'
    ]

    # Equality (not truthiness) is intentional: an `if_chosen` of None counts as
    # neither chosen nor rejected, exactly as in the original comparison.
    chosen = next(
        (response for response in first_model_responses if response['if_chosen'] == True),  # noqa: E712
        None,
    )
    # Fall back to None when nothing was chosen, so the caller gets missing
    # values (which it can drop) instead of an UnboundLocalError.
    chosen_response = chosen['content'] if chosen is not None else None
    chosen_model = chosen['model_name'] if chosen is not None else None

    rejected = [
        response for response in first_model_responses if response['if_chosen'] == False  # noqa: E712
    ]
    rejected_responses = [response['content'] for response in rejected]
    rejected_models = [response['model_name'] for response in rejected]

    # Choose one rejected response at random (if there is any).
    if rejected:
        rejected_index = np.random.choice(len(rejected))
        rejected_response = rejected_responses[rejected_index]
        rejected_model = rejected_models[rejected_index]
    else:
        rejected_response = None
        rejected_model = None

    return chosen_response, chosen_model, rejected_responses, rejected_models, rejected_response, rejected_model

In [7]:
# get_chosen_rejected_first_response samples the rejected response with
# np.random.choice, which draws from NumPy's global RNG. Seed it here so the
# sampled `rejected_response` / `rejected_model` columns are identical on every
# Restart & Run All (the stdlib `random` seed does not affect NumPy).
np.random.seed(42)

# The function returns one 6-tuple per row; zip(*...) transposes them into six
# column-length sequences.
(
    convs['chosen_response'],
    convs['chosen_model'],
    convs['rejected_responses'],
    convs['rejected_models'],
    convs['rejected_response'],
    convs['rejected_model'],
) = zip(*convs.apply(get_chosen_rejected_first_response, axis=1))

In [8]:
import random

# Seed the stdlib `random` module so the text_a / text_b assignment made by
# create_texta_textb_preftext is repeatable. This seeds only `random`;
# NumPy's RNG keeps its own, separate state.
random.seed(42)

# Build text_a / text_b: the opening prompt (instruction) is prepended to both responses.

def create_texta_textb_preftext(row):
    """Build a randomly ordered (text_a, text_b) pair plus the preferred label.

    Both texts consist of the row's ``opening_prompt`` followed by one of the
    two responses. A coin flip from the stdlib ``random`` module decides
    whether the chosen response goes into ``text_a`` or ``text_b``.

    Parameters
    ----------
    row : mapping
        Must provide string values for ``"opening_prompt"``,
        ``"chosen_response"`` and ``"rejected_response"``.

    Returns
    -------
    tuple of str
        ``(text_a, text_b, label)`` where ``label`` is ``"text_a"`` or
        ``"text_b"`` and names the slot holding the chosen response.
    """
    prompt, chosen, rejected = (
        row[key] for key in ("opening_prompt", "chosen_response", "rejected_response")
    )

    # Same header for both candidates; only the assistant part differs.
    header = "Instruction:\n" + prompt + "\n\n\nAssistant:\n"
    chosen_full = header + chosen
    rejected_full = header + rejected

    # True -> the chosen response is presented first (as text_a).
    if random.choice([True, False]):
        return chosen_full, rejected_full, "text_a"
    return rejected_full, chosen_full, "text_b"

# Rows lacking either a chosen or a sampled rejected response cannot form a
# pair; drop them first, since the pair builder concatenates both as strings.
convs.dropna(
    subset=['chosen_response', 'rejected_response'],
    inplace=True,
)

# One (text_a, text_b, preferred_text) tuple per row, transposed into columns.
(
    convs['text_a'],
    convs['text_b'],
    convs['preferred_text'],
) = zip(*convs.apply(create_texta_textb_preftext, axis=1))

In [10]:
# Number of rows to export; set to None to export every row.
NUM_ROWS = 2000

# Shuffle all rows with a fixed seed, then renumber the index from 0.
convs = convs.sample(frac=1, random_state=42).reset_index(drop=True)

if NUM_ROWS is None:
    convs_csv = convs
    file_name = "prism_rand_incl_metadata_v2.csv"
else:
    # Keep the first NUM_ROWS rows of the shuffled frame.
    convs_csv = convs.iloc[:NUM_ROWS]
    file_name = f"prism_rand{NUM_ROWS}_incl_metadata_v2.csv"

# The row index is written as a column named "index".
convs_csv.to_csv("../data/processed/prism/" + file_name, index_label="index")

