In [1]:
import pandas as pd

# Load the raw training CSV that every later cell filters and samples from.
RAW_TRAIN_CSV = "../data/raw/lmsys/chatbot_arena_kaggle2024_train.csv"
df = pd.read_csv(RAW_TRAIN_CSV)

In [2]:
# We randomly sample datapoints such that each selected model has at least 100 wins and at least 100 losses; if fewer than 100 samples are available for a model, we take all of them.

In [3]:
# select all datapoints where both model_a and model_b are in the list
# [gpt-4-0314 (highest ranking gpt-4), gpt-4-1106-preview (highest ranking gpt-4 turbo), llama-2-70b-chat, llama-2-13b-chat, llama-2-7b-chat, mixtral-8x7b-instruct-v0.1]

# Models of interest; a battle is kept only when both sides are in this list.
model_list = [
    'gpt-4-0314',
    'gpt-4-1106-preview',
    'llama-2-70b-chat',
    'llama-2-13b-chat',
    'llama-2-7b-chat',
    'mixtral-8x7b-instruct-v0.1',
]

both_sides_selected = df['model_a'].isin(model_list) & df['model_b'].isin(model_list)
df = df[both_sides_selected]


In [None]:
import ast

# keep only decisive battles, i.e. rows whose tie flag is 0
df = df.loc[df['winner_tie'].eq(0)]

# parse the stringified prompt/response lists and record how many entries each one has per datapoint
def convert_to_list(list_str):
    """Parse a stringified Python list literal.

    Args:
        list_str: Text such as '["hi", "there"]'.

    Returns:
        Whatever ``ast.literal_eval`` produces for ``list_str``. When the input
        cannot be parsed as a Python literal (this includes JSON-only tokens
        such as ``null``) the input is printed and an empty list is returned.
    """
    try:
        return ast.literal_eval(list_str)
    # Catch only the errors literal_eval raises for bad input. A bare ``except``
    # would also swallow KeyboardInterrupt/SystemExit, which makes a long
    # ``Series.apply`` over this function impossible to interrupt.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        print("Error parsing:", list_str)
        return []

# For every text column: parse the stringified list, remember its length,
# then keep only its first entry (None when the parsed list is empty).
for column in ("prompt", "response_a", "response_b"):
    parsed = df[column].apply(convert_to_list)
    df[column + "_length"] = parsed.apply(len)
    df[column] = parsed.apply(lambda items: items[0] if len(items) > 0 else None)

# keep only rows whose prompt, response_a and response_b lists each had exactly one entry
length_columns = ["prompt_length", "response_a_length", "response_b_length"]
has_single_entries = df[length_columns].eq(1).all(axis=1)
df = df[has_single_entries]

# add winner_model and loser_model columns
# Vectorised with Series.where (keep model_a where the condition holds, else
# model_b) instead of a row-wise apply(axis=1): same values, no per-row Python
# call, and it also works when the frame has no rows.
df['winner_model'] = df['model_a'].where(df['winner_model_a'] == 1, df['model_b'])
df['loser_model'] = df['model_a'].where(df['winner_model_a'] == 0, df['model_b'])

In [5]:
# check if each col of prompt, response_a, response_b is encodable in utf-8
# drop rows that are not encodable
def _is_utf8_encodable(text):
    """Return True when ``text`` can be encoded as UTF-8.

    Values without an ``encode`` method (e.g. None) count as not encodable, so
    their rows are dropped as well.
    """
    try:
        text.encode("utf-8")
    except (UnicodeError, AttributeError):
        return False
    return True


# Build one boolean mask and filter once, instead of dropping rows from `df`
# in place while iterating over it.
encodable_rows = pd.Series(True, index=df.index)
for col in ["prompt", "response_a", "response_b"]:
    encodable_rows &= df[col].map(_is_utf8_encodable).astype(bool)

df = df[encodable_rows]

In [None]:
# number of remaining battles won by each model, most frequent first
df["winner_model"].value_counts()

In [None]:
# number of remaining battles lost by each model, most frequent first
df["loser_model"].value_counts()

In [None]:
# For each model we select up to 100 wins and 100 losses, visiting the models in the order built here.
# NOTE(review): the original note said selection starts from the *least* frequent model,
# but value_counts() orders the *most* frequent model first — confirm which order is intended.

# models ordered by how often they appear as winner
winner_counts = df["winner_model"].value_counts()
models_winner = list(winner_counts.index)

# models ordered by how often they appear as loser
loser_counts = df["loser_model"].value_counts()
models_loser = list(loser_counts.index)

# number of random candidate subsets to generate
num_samples = 10000

def generate_dataframe_options(df, models_winner, models_loser, n_per_model=100, random_state=None):
    """Randomly pick a subset of battles in which every model reaches a win/loss quota.

    Losses are filled first (iterating ``models_loser``), then wins (iterating
    ``models_winner``). For each model only the rows still missing to reach
    ``n_per_model`` are drawn, uniformly at random, from rows that were not
    selected yet; a model with fewer rows available contributes all of them.

    Args:
        df: Battles with at least the columns ``id``, ``winner_model`` and
            ``loser_model``. It is only read, never modified.
        models_winner: Models to balance on the winning side, in processing order.
        models_loser: Models to balance on the losing side, in processing order.
        n_per_model: Target number of wins and of losses per model (default 100).
        random_state: ``None`` (default) lets pandas draw from numpy's global
            random state; an int seed or ``numpy.random.Generator`` makes the
            selection reproducible.

    Returns:
        DataFrame holding the selected rows of ``df`` (original index labels kept).
    """
    import numpy as np  # local import: only needed to build a seeded generator

    # One generator for the whole call, so successive draws are not all made
    # with the same seed.
    rng = None if random_state is None else np.random.default_rng(random_state)

    # Start from an empty frame that already has df's columns, so the column
    # lookups below work before anything was selected (e.g. when
    # ``models_loser`` is empty). No copy of ``df`` is needed: it is never mutated.
    selected = df.iloc[0:0]

    passes = (("loser_model", models_loser), ("winner_model", models_winner))
    for role_column, models in passes:
        for model in models:
            # rows already selected in which this model has this role
            n_already = int((selected[role_column] == model).sum())
            n_missing = n_per_model - n_already
            if n_missing <= 0:
                continue

            # make sure we only consider new datapoints
            candidates = df[df[role_column] == model]
            candidates = candidates[~candidates["id"].isin(selected["id"])]
            if candidates.empty:
                continue

            # a uniform sample of size k is equivalent to "shuffle, then take the first k"
            n_take = min(n_missing, len(candidates))
            picked = candidates.sample(n=n_take, random_state=rng)
            selected = picked if selected.empty else pd.concat([selected, picked])

    # drop any duplicates
    return selected.drop_duplicates()

import tqdm

# Generate `num_samples` independent random candidate subsets (progress shown by tqdm).
df_final_options = [
    generate_dataframe_options(df, models_winner, models_loser)
    for _ in tqdm.tqdm(range(num_samples))
]


In [None]:
# order the candidates from fewest to most rows and keep the 10 smallest
df_final_options = sorted(df_final_options, key=len)[:10]

# report the row count of every kept candidate
for i, df_final in enumerate(df_final_options):
    print(f"Option {i}: {df_final.shape[0]}")

In [None]:
# continue with a copy of the first (smallest) candidate and show its per-model balance
smallest_option = df_final_options[0]
df_final = smallest_option.copy()
for role_column in ("winner_model", "loser_model"):
    print(df_final[role_column].value_counts())


In [15]:
import ast

# Build the comparison columns in one vectorised step (no row-wise apply, no
# in-place mutation of the frame):
#   text_a / text_b  -> the prompt followed by the respective response
#   preferred_text   -> name of the column holding the winning response
instruction_prefix = "Instruction:\n" + df_final["prompt"] + "\n\n\nAssistant:\n"
df_final = df_final.assign(
    text_a=instruction_prefix + df_final["response_a"],
    text_b=instruction_prefix + df_final["response_b"],
    preferred_text=(df_final["winner_model_a"] == 1).map({True: "text_a", False: "text_b"}),
)

# shuffle (fixed seed, so the exported row order is reproducible)
df_final = df_final.sample(frac=1, random_state=42)

df_final.to_csv("../data/processed/lmsys/chatbot_arena_kaggle2024_train_balanced.csv", index=False)


In [None]:
# check how long the prompt lists are and if they are the same for all datapoints
#
# `prompt` already holds the first list entry as plain text at this point, and
# `prompt_length` was recorded when the lists were parsed. Re-parsing the
# prompts with ast.literal_eval would raise on ordinary text, so the stored
# length is used instead.
multi_entry_prompts = df_final.loc[df_final["prompt_length"] != 1, "prompt"]

# show the first offending prompt, or nothing when every prompt list had exactly one entry
multi_entry_prompts.iloc[0] if len(multi_entry_prompts) > 0 else None

In [None]:
# how often each model appears as model_a, printed one model per line
val_counts = df["model_a"].value_counts().to_dict()

for key in val_counts:
    value = val_counts[key]
    print(f"{key}: {value}")

In [None]:
# number of distinct models appearing as model_a
print(df["model_a"].nunique())

In [None]:
path = "./api_calls.jsonl"

import json

# Count the characters of the entire jsonl file and collect the parsed
# "return_value" records for the token-usage summary.

total_chars = 0
line_dicts = []
with open(path, "r") as f:
    for line in f:
        total_chars += len(line)
        if '"type":"return_value"' in line:
            # Each line is a JSON document, so parse it with json.loads.
            # ast.literal_eval only understands Python literals and raises on
            # the JSON tokens true / false / null.
            line_dicts.append(json.loads(line))

# rough estimate: 0.150 per one million characters of the file
# NOTE(review): presumably a per-million-token price applied to characters — confirm
print((total_chars / 1000000) * 0.150)

In [None]:
# add up the token usage reported by every collected record
total_prompt_tokens = sum(
    record["message"]["token_usage"]["prompt_tokens"] for record in line_dicts
)
total_completion_tokens = sum(
    record["message"]["token_usage"]["completion_tokens"] for record in line_dicts
)

# NOTE(review): 0.6 and 0.15 look like prices per one million completion / prompt tokens — confirm
total_cost = (total_completion_tokens * 0.6 +  total_prompt_tokens * 0.15) / 1000000

print(f"total cost: {total_cost}")