In [None]:
# Setting up prompts from MultiPref

import numpy as np
import random
import pathlib
import pandas as pd

import json
import os

# --- Load MultiPref and collect its unique prompts ---------------------------
multipref_path = pathlib.Path("../data/input/allenai/multipref_original.csv")
df = pd.read_csv(multipref_path)
print(f"Number of unique data sources: {len(df['source'].unique())}")

# Rows coming from the Anthropic helpful/harmless sources are excluded.
excluded_sources = ["anthropic/helpful-base", "anthropic/harmless-base"]
keep_row = ~df["source"].isin(excluded_sources)
df = df[keep_row]

prompts = df["text"].unique()
num_prompts = len(prompts)
print(f"Number of prompts: {num_prompts}")

# --- Seeded shuffle -----------------------------------------------------------
# Both the stdlib and the NumPy generators are seeded for reproducibility;
# only `random.shuffle` is used for the shuffle itself.
random_seed = 4321
random.seed(random_seed)
np.random.seed(random_seed)

shuffled_prompts = list(prompts)
random.shuffle(shuffled_prompts)

# Make sure the output directory exists.
os.makedirs("../data/output", exist_ok=True)

# --- Persist / reuse the shuffled order ----------------------------------------
# A previously saved shuffle takes precedence over the one computed above, so
# the prompt order stays fixed once the JSON file has been written.
output_path = pathlib.Path("../data/input/allenai/multipref_shuffled_prompts.json")
if output_path.exists():
    with open(output_path, "r") as prompts_file:
        shuffled_prompts = json.load(prompts_file)
else:
    with open(output_path, "w") as prompts_file:
        json.dump(shuffled_prompts, prompts_file, indent=2)

# Only the first 500 shuffled prompts are used downstream.
chosen_prompts = shuffled_prompts[:500]

In [None]:
from feedback_forensics.tools.model_comparison import run_model_on_prompts_async

output_path = pathlib.Path("../data/output/model_comparison_v2/")
# not using all models to limit costs
model_list = [
    #"openai/gpt-4o-mini-2024-07-18",
    #"openai/gpt-4o-2024-11-20",
    "openai/gpt-4o-2024-08-06", # current default version
    #"openai/gpt-4o-2024-05-13",
    #"openai/chatgpt-4o-latest", # unfortuantely no pinned version available
    "openai/gpt-4.1-2025-04-14",
    #"openai/gpt-4.1-mini-2025-04-14",
    "openai/gpt-3.5-turbo",
    "openrouter/mistralai/mistral-medium-3",
    "openrouter/mistralai/mistral-medium",
    "openrouter/mistralai/mistral-7b-instruct-v0.1",
    "openrouter/meta-llama/llama-4-maverick",
    "openrouter/meta-llama/llama-3.3-70b-instruct",
    "openrouter/meta-llama/llama-3-70b-instruct",
    "openrouter/meta-llama/llama-2-70b-chat",
]

print(f"Running {len(model_list)} models")

for model in model_list:
    print(f"Running '{model}'")
    await run_model_on_prompts_async(
        prompts=chosen_prompts,
        model_name=model,
        output_path=output_path,
        max_concurrent=30, # Adjust based on your needs
        max_tokens=4096, # max of gpt-3.5-turbo and gpt-4o-2024-05-13
    )

In [None]:
# Gather every generation record stored as JSON lines below `output_path`.
generation_records = []
for file in output_path.glob("**/generations/**/*.jsonl"):
    print(file)
    with open(file, "r") as jsonl_handle:
        for raw_line in jsonl_handle:
            # A malformed line is reported and skipped; the rest of the file is still read.
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line in {file}: {e}")
                continue
            generation_records.append(record)

# One row per successfully parsed record.
generations = pd.DataFrame(generation_records)
generations.head()

In [None]:
# Build a pairwise comparison table: every non-reference model's response
# (text_a) is paired with the reference model's response (text_b) for each
# prompt_id that both models have a generation for.
reference_model = "openai/gpt-4o-2024-08-06" # default version of gpt-4o as of 11.05.2025

comparison_columns = ["prompt_id", "prompt", "text_a", "text_b", "model_a", "model_b"]

# Keep only the first generation per (model, prompt_id). This mirrors picking
# the first matching row when a prompt was generated more than once.
first_generations = generations.drop_duplicates(
    subset=["model", "prompt_id"], keep="first"
)

# The reference responses do not depend on the loop variable, so compute them once.
ref_responses = first_generations.loc[
    first_generations["model"] == reference_model, ["prompt_id", "response"]
]

pair_frames = []
for model in model_list:
    if model == reference_model:
        continue

    model_responses = first_generations.loc[
        first_generations["model"] == model, ["prompt_id", "prompt", "response"]
    ]

    # The inner join keeps exactly the prompt_ids present for both models.
    # The overlapping `response` column becomes `response_a` (model) and
    # `response_b` (reference). Row order follows `model_responses`, so the
    # output is deterministic.
    pairs = (
        model_responses
        .merge(ref_responses, on="prompt_id", how="inner", suffixes=("_a", "_b"))
        .rename(columns={"response_a": "text_a", "response_b": "text_b"})
        .assign(model_a=model, model_b=reference_model)
    )[comparison_columns]

    if not pairs.empty:
        pair_frames.append(pairs)

# `pd.concat` rejects an empty list, so fall back to an empty frame that still
# carries the expected columns.
if pair_frames:
    comparison_df = pd.concat(pair_frames, ignore_index=True)
else:
    comparison_df = pd.DataFrame(columns=comparison_columns)
print(f"Created comparison dataset with {len(comparison_df)} pairs")

# `to_csv` does not create missing parent directories, and this directory
# differs from the generation output directory, so create it explicitly.
comparison_csv_path = pathlib.Path("../data/output/model_comparison/comparison_data_v2.csv")
comparison_csv_path.parent.mkdir(parents=True, exist_ok=True)
comparison_df.to_csv(comparison_csv_path, index=False)

In [None]:
# Run the `icai-exp` command-line tool on the comparison CSV (same path the
# comparison table is written to). The overrides pass "[v4]" as the added
# standard principles to test and set both `annotator.skip` and
# `s0_skip_principle_generation` to true; their exact effect is defined by
# icai-exp itself. NOTE(review): confirm against the icai-exp configuration docs.
!icai-exp data_path="../data/output/model_comparison/comparison_data_v2.csv" s0_added_standard_principles_to_test="[v4]" annotator.skip=true s0_skip_principle_generation=true