In [1]:
import pandas as pd
import numpy as np
import ast
import re

df = pd.read_csv("../data/raw/lmsys/chatbot_arena_kaggle2024_train.csv", dtype={"prompt": str, "response_a": str, "response_b": str, "model_a": str, "model_b": str})

print(f"Original dataset size: {len(df)}")

# get sum of all appearances of each model
vc_a = df["model_a"].value_counts()
vc_b = df["model_b"].value_counts()
combined_vc = pd.concat([vc_a, vc_b]).groupby(level=0).sum()
combined_vc = combined_vc.sort_values(ascending=False)
print("Model counts (appearances as either model A or B):")
pd.set_option('display.max_rows', None)
print(combined_vc)
pd.reset_option('display.max_rows')

# select datapoints with subset of models
model_list = ['gpt-4-1106-preview', 'gpt-3.5-turbo-0613', 'claude-2.1', 'claude-2.0', 'claude-1', 'gpt-4-0314', 'llama-2-70b-chat', 'llama-2-13b-chat', 'llama-2-7b-chat', 'mixtral-8x7b-instruct-v0.1', "gpt-4-0613", "tulu-2-dpo-70b", "vicuna-33b"]
df = df[df['model_a'].isin(model_list) & df['model_b'].isin(model_list)]

# remove ties
df = df[df['winner_tie'] == 0]

# get number of prompts and responses per model
def convert_to_list(list_str):
    try:
        # fiz invalid escape sequence issue
        list_str = list_str.replace(r'\/', '/')
        return ast.literal_eval(list_str)
    except Exception as e:
        try:
            # replace null with None, note this may very slightly affect the string meaning
            list_str = re.sub("null", "None", list_str)
            return ast.literal_eval(list_str)
        except Exception as e2:
            print("Error parsing:", list_str)
            print(e)
            print(e2)
            return []

for col in ["prompt", "response_a", "response_b"]:
    df[col] = df[col].apply(convert_to_list)
    df[col + "_length"] = df[col].apply(len)
    df[col] = df[col].apply(lambda x: x[0] if len(x) > 0 else None)

# filter out row with either prompt/response_a/response_b being len more than 1
df = df[(df["prompt_length"] == 1) & (df["response_a_length"] == 1) & (df["response_b_length"] == 1)]

# drop rows with na in prompt, response_a, response_b
size_with_na = len(df)
df = df.dropna(subset=["prompt", "response_a", "response_b"])
size_without_na = len(df)
print(f"Size with na: {size_with_na}, size without na: {size_without_na}, number of rows removed: {size_with_na - size_without_na}")

# add winner_model and loser_model columns
df['winner_model'] = df.apply(lambda x: x['model_a'] if x['winner_model_a'] == 1 else x['model_b'], axis=1)
df['loser_model'] = df.apply(lambda x: x['model_a'] if x['winner_model_a'] == 0 else x['model_b'], axis=1)

# check if each col of prompt, response_a, response_b is encodable in utf-8
# drop rows that are not encodable
import warnings
warnings.filterwarnings("error")

for col in ["prompt", "response_a", "response_b"]:
    for i, row in df.iterrows():
        try:
            row[col].encode("utf-8", errors="strict")
        except Exception as e:
            print(f"Error encoding {col} for row {i}: {e}")
            df.drop(i, inplace=True)


Original dataset size: 57477
Model counts (appearances as either model A or B):
gpt-4-1106-preview                7387
gpt-3.5-turbo-0613                7083
gpt-4-0613                        6165
claude-2.1                        5583
claude-instant-1                  4136
gpt-4-0314                        4122
claude-1                          3978
vicuna-33b                        3720
mixtral-8x7b-instruct-v0.1        3545
vicuna-13b                        3448
llama-2-70b-chat                  3428
gpt-3.5-turbo-1106                3352
mistral-medium                    3315
llama-2-13b-chat                  2607
claude-2.0                        2456
zephyr-7b-beta                    2401
palm-2                            1977
llama-2-7b-chat                   1793
wizardlm-70b                      1644
openchat-3.5                      1632
mistral-7b-instruct               1617
koala-13b                         1598
vicuna-7b                         1591
wizardlm-13b           

In [2]:
df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,prompt_length,response_a_length,response_b_length,winner_model,loser_model
33,2298796,gpt-4-1106-preview,claude-2.1,make a haiku on bacon the food,"Crisp strips of delight,\nSizzling dance, morn...",Here is a haiku about bacon:\n\nSizzling in pa...,1,0,0,1,1,1,gpt-4-1106-preview,claude-2.1
35,2802516,claude-2.0,claude-1,i have this: import logo from './ressources/lo...,The navigation is only visible on the root pat...,This is happening because you have defined the...,0,1,0,1,1,1,claude-1,claude-2.0
37,2912862,gpt-4-1106-preview,gpt-4-0613,"texts = [text.split() for text in df[""text_pro...",The code snippet you provided is used to creat...,The code you've provided is writing a vocab fi...,0,1,0,1,1,1,gpt-4-0613,gpt-4-1106-preview
45,3499263,gpt-4-0613,claude-1,write the lyrics to a rap song about some dude...,"(Verse 1)\nYo, let's take a minute, prepare to...",Here's an attempt at some rap lyrics about a d...,1,0,0,1,1,1,gpt-4-0613,claude-1
46,3503031,llama-2-13b-chat,gpt-4-0613,tell me a joke containing Tiger and Mobile phone?,"Sure, here's a joke for you:\n\nWhy did the ti...",Why don't tigers use mobile phones? \n\nBecaus...,0,1,0,1,1,1,gpt-4-0613,llama-2-13b-chat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57464,4294068603,gpt-4-0613,gpt-3.5-turbo-0613,"Sarah (female) has three brothers, her brother...",Sarah has two sisters.,Sarah has one sister.,0,1,0,1,1,1,gpt-3.5-turbo-0613,gpt-4-0613
57465,4294081229,gpt-4-0314,gpt-4-0613,please describe how the following two statemen...,The two statements can both be correct and not...,These two statements may seem contradictory at...,1,0,0,1,1,1,gpt-4-0314,gpt-4-0613
57467,4294254797,gpt-4-1106-preview,claude-2.0,can you provide a modular python code that wil...,Certainly! Below is a modular Python example t...,Here is a modular Python code to format SQL co...,1,0,0,1,1,1,gpt-4-1106-preview,claude-2.0
57472,4294656694,gpt-4-0613,claude-1,"A simple mnemonic for π:\n""How I wish I could ...","Sure, let's break it down:\n\n1. ""How"" has 3 l...",Here is how that mnemonic represents the digit...,1,0,0,1,1,1,gpt-4-0613,claude-1


In [3]:
df["winner_model"].value_counts()

winner_model
gpt-4-1106-preview            1929
gpt-4-0613                    1122
claude-2.1                     846
gpt-3.5-turbo-0613             768
gpt-4-0314                     684
claude-1                       627
vicuna-33b                     474
mixtral-8x7b-instruct-v0.1     441
llama-2-70b-chat               404
claude-2.0                     341
llama-2-13b-chat               175
tulu-2-dpo-70b                 135
llama-2-7b-chat                124
Name: count, dtype: int64

In [4]:
df["loser_model"].value_counts()

loser_model
claude-2.1                    1472
gpt-3.5-turbo-0613            1133
gpt-4-0613                    1046
gpt-4-1106-preview             639
claude-1                       591
vicuna-33b                     556
mixtral-8x7b-instruct-v0.1     540
gpt-4-0314                     540
claude-2.0                     457
llama-2-70b-chat               408
llama-2-13b-chat               283
llama-2-7b-chat                215
tulu-2-dpo-70b                 190
Name: count, dtype: int64

In [5]:
len(df)


8070

In [6]:
# subselect 2k samples
# set random seed
#df_final = df.sample(2000, random_state=42)
df_final = df


In [8]:
# bucket ids into 8 evenly distributed buckets, to detect trends over time
df_final["id_bucket"] = pd.qcut(df_final["id"], 8, labels=False)
df_final["id_bucket"].value_counts()


id_bucket
0    1009
1    1009
3    1009
4    1009
6    1009
7    1009
2    1008
5    1008
Name: count, dtype: int64

In [9]:
import ast

#df_final = df_final.rename(columns={"response_a": "text_a", "response_b": "text_b"})
df_final["text_a"] = df_final["response_a"]
df_final["text_b"] = df_final["response_b"]
df_final["preferred_text"] = df_final.apply(lambda x: "text_a" if x["winner_model_a"] == 1 else "text_b", axis=1)

# format into correct text
# prepend instruction column to both text_a and text_b
for col in ["text_a", "text_b"]:
    df_final[col] = "Instruction:\n" + df_final["prompt"] + "\n\n\nAssistant:\n" + df_final[col]

# shuffle
df_final: pd.DataFrame = df_final.sample(frac=1, random_state=42)

df_final.to_csv(f"../data/processed/lmsys/chatbot_arena_kaggle2024_train_{len(df_final)}random_v3.csv", index=False)