In [1]:
# Loading the Arena data
# Llama-4-Maverick-03-26-Experimental vs other models
# From https://huggingface.co/spaces/lmarena-ai/Llama-4-Maverick-03-26-Experimental_battles/tree/main/data

import pathlib
import pandas as pd
import json
import os

# Location of the cleaned Arena battles export (JSON Lines: one record per line).
arena_path = pathlib.Path("../data/input/clean-llama4.jsonl")

# Read explicitly as UTF-8 instead of the platform default encoding: the
# conversations are multilingual (French, Chinese, ...), so a non-UTF-8 default
# would corrupt or fail to decode them.
# NOTE(review): assumes the export is UTF-8 encoded (the JSON default) - confirm.
# Whitespace-only lines (e.g. a trailing blank line) are skipped rather than
# crashing json.loads.
with open(arena_path, 'r', encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]

arena_df = pd.DataFrame(data)
print(f"Columns:{list(arena_df.columns)}")
print(f"Length:{len(arena_df)}")
arena_df.head(3)


Columns:['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn', 'anony', 'language', 'tstamp', 'conv_metadata', 'is_code', 'is_refusal', 'metadata_a', 'metadata_b', 'dedup_tag', 'category_tag', 'category', 'outcome', 'opponent', 'redacted_messages_a', 'redacted_messages_b', 'redacted']
Length:2382


Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,anony,language,...,metadata_a,metadata_b,dedup_tag,category_tag,category,outcome,opponent,redacted_messages_a,redacted_messages_b,redacted
0,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,Llama-4-Maverick-03-26-Experimental,model_b,46c8580211c21011fbb509d9d2bc9945,"[{'role': 'user', 'content': 'Sténose de la ca...","[{'role': 'user', 'content': 'Sténose de la ca...",1,True,French,...,,,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'specificity': False, 'doma...",Llama-4-Maverick-03-26-Experimental battles,Llama-4-Maverick-03-26-Experimental Won,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...","[{'role': 'user', 'content': 'Sténose de la ca...",False
1,e8745be6affc47bfb94120b193600a66,llama-3.3-70b-instruct,Llama-4-Maverick-03-26-Experimental,model_b,8d99ff45127a5424099074ab1107dff2,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...","[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",1,True,Chinese,...,,,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'specificity': True, 'domai...",Llama-4-Maverick-03-26-Experimental battles,Llama-4-Maverick-03-26-Experimental Won,llama-3.3-70b-instruct,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...","[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",False
2,65aaba9761bf4dd2be96cb77c6f5f8a6,Llama-4-Maverick-03-26-Experimental,phi-4,tie (bothbad),11ab977c0f5b55a32f7a7b26a5fa54f1,"[{'role': 'user', 'content': 'Why do most scor...","[{'role': 'user', 'content': 'Why do most scor...",1,True,English,...,,,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'specificity': True, 'domai...",Llama-4-Maverick-03-26-Experimental battles,Tie,phi-4,"[{'role': 'user', 'content': 'Why do most scor...","[{'role': 'user', 'content': 'Why do most scor...",False


In [2]:
# Data cleaning: remove the token number from the conversations
def remove_num_tokens(x):
    """
    Remove the 'num_tokens' field from conversation data.

    Accepted input formats:
    - string representation of a list of dicts or of a single dict, written
      either as JSON or as a Python literal
    - list of dictionaries
    - dictionary

    Args:
        x: Conversation data in one of the formats above.

    Returns:
        A new list of dicts (list input, or a string that parses to a list) or
        a new dict (dict input, or a string that parses to a dict) with the
        'num_tokens' key removed. Any other value is reported on stdout and
        returned unchanged.

    Raises:
        ValueError, SyntaxError: if a string input is neither valid JSON nor a
            valid Python literal.
    """
    import ast  # local import: only the string branch needs it

    def _strip(message):
        # Copy the message without its token count.
        return {k: v for k, v in message.items() if k != 'num_tokens'}

    if isinstance(x, str):
        # Handle string representation of data
        try:
            parsed_data = json.loads(x)
        except ValueError:
            # Not JSON, so treat it as a Python literal. ast.literal_eval only
            # accepts literals, so (unlike eval) a crafted string cannot run
            # arbitrary code. The JSON round trip normalises the result
            # (e.g. tuples become lists).
            parsed_data = json.loads(json.dumps(ast.literal_eval(x)))
        if isinstance(parsed_data, dict):
            return _strip(parsed_data)
        return [_strip(item) for item in parsed_data]
    elif isinstance(x, list):
        # Handle list of dictionaries
        return [_strip(item) for item in x]
    elif isinstance(x, dict):
        # Handle a single message dictionary
        return _strip(x)
    else:
        # Return unchanged if none of the above. The preview is taken from
        # repr(x) so that non-sliceable values (None, NaN, numbers) can be
        # reported without raising.
        preview = repr(x)
        print(f"Unknown type: {type(x)}")
        print(f"beginning of x: {preview[:100]}")
        print(f"end of x: {preview[-100:]}")
        return x

# Strip the token counts from the conversations of both battle sides
arena_df["conversation_a"] = arena_df["conversation_a"].apply(remove_num_tokens)
arena_df["conversation_b"] = arena_df["conversation_b"].apply(remove_num_tokens)


# Spot check on the first row only: which keys does the first message of each
# conversation carry after cleaning?
sample_a, sample_b = (
    arena_df[column].iloc[0] for column in ("conversation_a", "conversation_b")
)

# A sample can only be inspected if it is a non-empty list of messages
inspectable_a = isinstance(sample_a, list) and len(sample_a) > 0
inspectable_b = isinstance(sample_b, list) and len(sample_b) > 0

if inspectable_a:
    print(f"Sample conversation_a keys: {sample_a[0].keys()}")
if inspectable_b:
    print(f"Sample conversation_b keys: {sample_b[0].keys()}")

# Report whether num_tokens survived the cleaning step
has_num_tokens_a = inspectable_a and 'num_tokens' in sample_a[0]
has_num_tokens_b = inspectable_b and 'num_tokens' in sample_b[0]
print(f"num_tokens still present in conversation_a: {has_num_tokens_a}")
print(f"num_tokens still present in conversation_b: {has_num_tokens_b}")


Sample conversation_a keys: dict_keys(['role', 'content'])
Sample conversation_b keys: dict_keys(['role', 'content'])
num_tokens still present in conversation_a: False
num_tokens still present in conversation_b: False


In [3]:
# Restrict to single-turn battles (responses are only regenerated for those,
# for simplicity) and pull out the user prompt of each side.
arena_singleturn_df = arena_df[arena_df["turn"] == 1].copy()
print(f"Len of only single turn conversation: {len(arena_singleturn_df)}")


def _first_message_content(conversation):
    """Return the 'content' of the first message, or None if it cannot be read."""
    if not isinstance(conversation, list) or len(conversation) == 0:
        return None
    opening_message = conversation[0]
    if isinstance(opening_message, dict) and 'content' in opening_message:
        return opening_message['content']
    return None


for side in ("a", "b"):
    arena_singleturn_df.loc[:, f"prompt_{side}"] = (
        arena_singleturn_df[f"conversation_{side}"].apply(_first_message_content)
    )

# Both sides of a battle answer the same question, so the prompts must agree
assert arena_singleturn_df["prompt_a"].equals(arena_singleturn_df["prompt_b"])


Len of only single turn conversation: 2069


In [4]:
# generate new responses using openrouter Llama 4 Maverick
import json
from tqdm import tqdm
import os
import inverse_cai.models

# Model handle used to generate the new responses (Llama 4 Maverick via OpenRouter)
model = inverse_cai.models.get_model("openrouter/meta-llama/llama-4-maverick", max_tokens=100000)

# JSONL file the generated responses are appended to (one record per line)
output_file = "../data/output/llama4_maverick_openrouter_responses.jsonl"

# Create the directory that actually contains output_file. A hard-coded
# "data/output" would be resolved against the current working directory and
# would not match the "../data/output" location the file is written to.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Process each prompt and save the response
# Check if file exists to continue from where we left off
import os

# Function to get already processed prompt IDs
def get_processed_ids(path=None):
    """
    Collect the question IDs that already have a saved record.

    Args:
        path: JSONL file to scan. Defaults to the notebook-level `output_file`.

    Returns:
        Set of the "question_id" values found in the file. The set is empty
        when the file does not exist.

    Lines that cannot be used are skipped, so that one damaged line does not
    block resuming: malformed JSON, records without a "question_id", and JSON
    values that are not objects.
    """
    if path is None:
        path = output_file

    processed_ids = set()
    if not os.path.exists(path):
        return processed_ids

    with open(path, "r") as f:
        for line in f:
            try:
                processed_ids.add(json.loads(line)["question_id"])
            except (ValueError, KeyError, TypeError):
                # ValueError: malformed JSON; KeyError: no "question_id";
                # TypeError: the line is valid JSON but not an object.
                # Only these are caught, so KeyboardInterrupt and genuine
                # bugs are no longer swallowed by a bare `except`.
                continue
    return processed_ids

# Get already processed prompt IDs so an interrupted run can be resumed
processed_ids = get_processed_ids()

# Open file in append mode to continue where we left off
with open(output_file, "a") as f:
    for _, row in tqdm(arena_singleturn_df.iterrows(), total=len(arena_singleturn_df)):
        question_id = row["question_id"]
        prompt = row["prompt_a"]

        # Skip if already processed
        if question_id in processed_ids:
            continue

        try:
            # Create message and generate response
            msg = {"role": "user", "content": prompt}
            generation = model.invoke([msg])

            # Create a record with prompt, response, and metadata
            # NOTE(review): dict(generation) presumably exposes the response
            # metadata (finish_reason is read from it later) - confirm that it
            # is always JSON-serialisable.
            record = {
                "question_id": question_id,
                "prompt": prompt,
                "response": generation.content,
                "full_response": dict(generation),
                "model": "meta-llama/llama-4-maverick"
            }

            # Serialise first so a failure never leaves a partial line behind,
            # then flush so a completed generation reaches the file even if
            # the kernel is killed before the file is closed.
            serialized = json.dumps(record)
            f.write(serialized + "\n")
            f.flush()

            # Add to processed set
            processed_ids.add(question_id)

        except Exception as e:
            # Best effort: report the failure and move on to the next prompt.
            # The ID is not marked as processed, so a later run retries it.
            print(f"Error processing prompt {question_id}: {e}")

100%|██████████| 2069/2069 [00:00<00:00, 27072.27it/s]


In [5]:
# Read back the responses generated with Llama 4 Maverick via openrouter
# (JSON Lines: one record per line)
data = []
with open(output_file, 'r') as file:
    data.extend(json.loads(raw_line) for raw_line in file)

gen_df = pd.DataFrame(data)
gen_df.head(3)



Unnamed: 0,question_id,prompt,response,full_response,model
0,caedec12e65841c5857d9879f892abb9,Sténose de la carotide interne extra crânienne...,La sténose de la carotide interne extra-crânie...,{'content': 'La sténose de la carotide interne...,meta-llama/llama-4-maverick
1,e8745be6affc47bfb94120b193600a66,续写接下来三章，共8500字，第一章写主角与红豆的，懵懂的恋爱，要读者心痒痒的感觉。团藏出手...,## 第一章 懵懂的恋爱与初次任务\n\n春日的暖阳洒在木叶村的街道上，给宁静的村庄带来了一...,{'content': '## 第一章 懵懂的恋爱与初次任务 春日的暖阳洒在木叶村的街道上...,meta-llama/llama-4-maverick
2,65aaba9761bf4dd2be96cb77c6f5f8a6,Why do most scores written for brass neglect t...,You're right; many scores written for brass in...,{'content': 'You're right; many scores written...,meta-llama/llama-4-maverick


In [6]:
def _to_conversation(record):
    """Wrap a prompt/response pair as a user message followed by an assistant message."""
    user_turn = {'role': 'user', 'content': record['prompt']}
    assistant_turn = {'role': 'assistant', 'content': record['response']}
    return [user_turn, assistant_turn]


def _extract_finish_reason(full_response):
    """Read response_metadata.finish_reason; None if the payload is not a dict."""
    if not isinstance(full_response, dict):
        return None
    return full_response.get("response_metadata", {}).get("finish_reason", None)


# Bring the generations into the same message-list form as the arena conversations
gen_df["conversation"] = gen_df.apply(_to_conversation, axis=1)

# Keep only generations that were not cut off by external factors
# (finish_reason "length", i.e. the max-token limit)
gen_df["finish_reason"] = gen_df["full_response"].apply(_extract_finish_reason)

print(gen_df["finish_reason"].value_counts())
not_truncated = gen_df["finish_reason"] != "length"
gen_df = gen_df[not_truncated]


finish_reason
stop    2069
Name: count, dtype: int64


In [7]:
# Start the merged frame from the arena battles: one row per battle holding the
# two original conversations and their model names. The newly generated
# response is attached as a third one in the next step (where available).
merged_df = arena_df[
    ["question_id", "conversation_a", "conversation_b", "model_a", "model_b"]
].rename(
    columns={"conversation_a": "response_a", "conversation_b": "response_b"}
)

merged_df.head()

Unnamed: 0,question_id,response_a,response_b,model_a,model_b
0,caedec12e65841c5857d9879f892abb9,"[{'role': 'user', 'content': 'Sténose de la ca...","[{'role': 'user', 'content': 'Sténose de la ca...",llama-3.3-70b-instruct,Llama-4-Maverick-03-26-Experimental
1,e8745be6affc47bfb94120b193600a66,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...","[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",llama-3.3-70b-instruct,Llama-4-Maverick-03-26-Experimental
2,65aaba9761bf4dd2be96cb77c6f5f8a6,"[{'role': 'user', 'content': 'Why do most scor...","[{'role': 'user', 'content': 'Why do most scor...",Llama-4-Maverick-03-26-Experimental,phi-4
3,1d620078d63e4becb3bef25ef545ace3,"[{'role': 'user', 'content': 'create me fun we...","[{'role': 'user', 'content': 'create me fun we...",command-a-03-2025,Llama-4-Maverick-03-26-Experimental
4,b31f569c0eec4f75ad52ff6ff1996610,"[{'role': 'user', 'content': 'Comment puis-je ...","[{'role': 'user', 'content': 'Comment puis-je ...",mistral-small-24b-instruct-2501,Llama-4-Maverick-03-26-Experimental


In [8]:
import numpy as np

# Attach the generated conversation to every battle via question_id.
# A left join keeps battles that have no generation (response_c stays missing).
generated_conversations = gen_df[["question_id", "conversation"]]
merged_df = merged_df.merge(generated_conversations, on="question_id", how="left")

# Name the new column like the other two responses
merged_df = merged_df.rename(columns={"conversation": "response_c"})

# model_c names the source of response_c and stays None where there is none
has_generation = merged_df["response_c"].notna()
merged_df["model_c"] = np.where(has_generation, "llama-4-maverick-openrouter", None)

print(f"Merged dataframe shape: {merged_df.shape}")
print(f"Number of rows with response_c: {merged_df['response_c'].notna().sum()}")

# Group each model name with its response
column_order = [
    'question_id',
    'model_a', 'response_a',
    'model_b', 'response_b',
    'model_c', 'response_c',
]
merged_df = merged_df[column_order]


merged_df.head(3)


Merged dataframe shape: (2382, 7)
Number of rows with response_c: 2069


Unnamed: 0,question_id,model_a,response_a,model_b,response_b,model_c,response_c
0,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca..."
1,e8745be6affc47bfb94120b193600a66,llama-3.3-70b-instruct,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第..."
2,65aaba9761bf4dd2be96cb77c6f5f8a6,Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Why do most scor...",phi-4,"[{'role': 'user', 'content': 'Why do most scor...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Why do most scor..."


In [9]:
# Expand every battle into up to three pairwise rows: a vs b, a vs c and b vs c.
# A pairing is only skipped when one of its two responses is missing.
PAIRINGS = (("a", "b"), ("a", "c"), ("b", "c"))


def _has_response(value):
    """A response counts as available if it is a list or any other non-NA value."""
    return isinstance(value, list) or not pd.isna(value)


pairwise_comparisons = []

for _, row in merged_df.iterrows():
    # Availability of each of the three responses of this battle
    available = {slot: _has_response(row[f'response_{slot}']) for slot in "abc"}

    for first, second in PAIRINGS:
        if not (available[first] and available[second]):
            continue
        # Within the pairing, the first slot always becomes "a", the second "b"
        pairwise_comparisons.append({
            'question_id': row['question_id'],
            'model_a': row[f'model_{first}'],
            'text_a': row[f'response_{first}'],
            'model_b': row[f'model_{second}'],
            'text_b': row[f'response_{second}'],
            'pairing': f'{first}_vs_{second}'
        })

# Create the pairwise comparisons dataframe
pairwise_df = pd.DataFrame(pairwise_comparisons)

# Summary of what was produced
print(f"Total number of pairwise comparisons: {len(pairwise_df)}")
print(f"Number of unique questions: {pairwise_df['question_id'].nunique()}")
print(f"Pairings distribution:\n{pairwise_df['pairing'].value_counts()}")

# Display the first few rows
pairwise_df.head()


Total number of pairwise comparisons: 6520
Number of unique questions: 2382
Pairings distribution:
pairing
a_vs_b    2382
a_vs_c    2069
b_vs_c    2069
Name: count, dtype: int64


Unnamed: 0,question_id,model_a,text_a,model_b,text_b,pairing
0,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",a_vs_b
1,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca...",a_vs_c
2,caedec12e65841c5857d9879f892abb9,Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca...",b_vs_c
3,e8745be6affc47bfb94120b193600a66,llama-3.3-70b-instruct,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",a_vs_b
4,e8745be6affc47bfb94120b193600a66,llama-3.3-70b-instruct,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': '续写接下来三章，共8500字，第...",a_vs_c


In [10]:
# now add pseudo model annotators based on the models column
# e.g. if "model X" is model_b, then there should be a column named "model X" with entry "text_b"
# (column names are the lower-cased model names)
unique_models = set(pairwise_df['model_a'].tolist() + pairwise_df['model_b'].tolist())
print(f"Num of unique models: {len(unique_models)}")
print("List of unique models:")
# The loop variable is deliberately not called `model`: that name holds the
# generation model object created earlier in the notebook.
for model_name in sorted(unique_models):
    print(f"- {model_name}")

# Lower-cased model names per row; these are the annotator column names
lowered_a = pairwise_df['model_a'].str.lower()
lowered_b = pairwise_df['model_b'].str.lower()

# Column order = order of first appearance when reading the rows top to bottom
# (model_a before model_b within a row), so the column layout matches what a
# row-by-row fill would produce.
annotator_columns = list(dict.fromkeys(
    name for pair in zip(lowered_a, lowered_b) for name in pair
))

# Build each column in one vectorised step instead of one .loc write per cell
for column in annotator_columns:
    votes = pd.Series(index=pairwise_df.index, dtype=object)  # all missing
    votes[lowered_a == column] = "text_a"
    # model_b is assigned last so it wins if a model appears on both sides
    votes[lowered_b == column] = "text_b"
    pairwise_df[column] = votes

# Report the columns actually created. This is counted after lower-casing,
# which is what the column names use.
print(f"Added {len(annotator_columns)} model-specific columns")
pairwise_df.head(3)

Num of unique models: 31
List of unique models:
- Llama-4-Maverick-03-26-Experimental
- amazon-nova-lite-v1.0
- amazon-nova-micro-v1.0
- amazon-nova-pro-v1.0
- chatgpt-4o-latest-20250129
- chatgpt-4o-latest-20250326
- claude-3-5-haiku-20241022
- claude-3-5-sonnet-20241022
- claude-3-7-sonnet-20250219
- command-a-03-2025
- deepseek-r1
- deepseek-v3-0324
- gemini-2.0-flash-001
- gemini-2.0-flash-lite-preview-02-05
- gemini-2.0-flash-thinking-exp-01-21
- gemini-2.5-pro-exp-03-25
- gemma-3-27b-it
- gpt-4.5-preview-2025-02-27
- gpt-4o-mini-2024-07-18
- grok-3-preview-02-24
- llama-3.1-405b-instruct-bf16
- llama-3.3-70b-instruct
- llama-4-maverick-openrouter
- mistral-large-2411
- mistral-small-24b-instruct-2501
- o1-2024-12-17
- o3-mini
- o3-mini-high
- phi-4
- qwen2.5-max
- qwq-32b
Added 31 model-specific columns


Unnamed: 0,question_id,model_a,text_a,model_b,text_b,pairing,llama-3.3-70b-instruct,llama-4-maverick-03-26-experimental,llama-4-maverick-openrouter,phi-4,...,amazon-nova-lite-v1.0,o1-2024-12-17,gpt-4.5-preview-2025-02-27,grok-3-preview-02-24,mistral-large-2411,gemini-2.0-flash-lite-preview-02-05,gemini-2.5-pro-exp-03-25,gemini-2.0-flash-thinking-exp-01-21,amazon-nova-micro-v1.0,deepseek-v3-0324
0,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",a_vs_b,text_a,text_b,,,...,,,,,,,,,,
1,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca...",a_vs_c,text_a,,text_b,,...,,,,,,,,,,
2,caedec12e65841c5857d9879f892abb9,Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca...",b_vs_c,,text_a,text_b,,...,,,,,,,,,,


In [11]:
# Add human annotations from arena_df: the `winner` recorded for each arena
# battle is attached to the matching pairwise row.
print("Adding human annotations from the original dataset...")

# Function to flip winner if models are reversed
def get_winner(match, row):
    """
    Express the winner of `match` in the model order of `row`.

    Args:
        match: Mapping with the keys 'model_a' and 'winner'.
        row: Mapping with the key 'model_a'.

    Returns:
        match['winner'] unchanged when both share the same 'model_a'.
        Otherwise 'model_a' and 'model_b' are swapped, and any other value
        (e.g. a tie) is returned as is.
    """
    same_order = match['model_a'] == row['model_a']
    verdict = match['winner']
    if same_order:
        return verdict
    # The models are reversed, so mirror a decisive verdict
    if verdict == 'model_a':
        return 'model_b'
    if verdict == 'model_b':
        return 'model_a'
    return verdict  # ties carry no side and need no flipping

# Merge datasets on question_id; the arena columns get the suffix "_arena"
# where their names clash with the pairwise columns (model_a, model_b)
merged = pd.merge(
    pairwise_df,
    arena_df[['question_id', 'model_a', 'model_b', 'winner']],
    on='question_id',
    suffixes=('', '_arena')
)

# Filter for matching model pairs (in either order)
merged = merged[
    ((merged['model_a'] == merged['model_a_arena']) & (merged['model_b'] == merged['model_b_arena'])) |
    ((merged['model_a'] == merged['model_b_arena']) & (merged['model_b'] == merged['model_a_arena']))
]

# Apply the winner transformation.
# get_winner compares the arena battle's model_a with the pairwise row's
# model_a, so it must be given the arena side explicitly. Passing the merged
# row for both arguments would compare model_a with itself, and a pair stored
# in reversed order would never have its winner flipped.
merged['human_annotation'] = merged.apply(
    lambda x: get_winner({'model_a': x['model_a_arena'], 'winner': x['winner']}, x),
    axis=1
)

# Update the original dataframe; pairings without a human vote keep NaN
pairwise_df = pd.merge(
    pairwise_df,
    merged[['question_id', 'model_a', 'model_b', 'human_annotation']],
    on=['question_id', 'model_a', 'model_b'],
    how='left'
)

# rename human_annotation to human
pairwise_df.rename(columns={'human_annotation': 'human'}, inplace=True)

# rename "model_a" values in "human" col to "text_a" and "model_b" values to "text_b"
print(pairwise_df['human'].value_counts())
pairwise_df['human'] = pairwise_df['human'].replace({'model_a': 'text_a', 'model_b': 'text_b'})

# Print statistics
annotation_counts = pairwise_df['human'].value_counts(dropna=True)
print(f"Added {pairwise_df['human'].notna().sum()} human annotations")
print(f"Annotation distribution:\n{annotation_counts}")

# Print original stats
print(f"Original stats: {arena_df['winner'].value_counts()}")



Adding human annotations from the original dataset...
human
model_b          895
model_a          879
tie              320
tie (bothbad)    288
Name: count, dtype: int64
Added 2382 human annotations
Annotation distribution:
human
text_b           895
text_a           879
tie              320
tie (bothbad)    288
Name: count, dtype: int64
Original stats: winner
model_b          895
model_a          879
tie              320
tie (bothbad)    288
Name: count, dtype: int64


In [12]:
# Preview the first rows of pairwise_df, including the newly added `human` column
pairwise_df.head(3)

Unnamed: 0,question_id,model_a,text_a,model_b,text_b,pairing,llama-3.3-70b-instruct,llama-4-maverick-03-26-experimental,llama-4-maverick-openrouter,phi-4,...,o1-2024-12-17,gpt-4.5-preview-2025-02-27,grok-3-preview-02-24,mistral-large-2411,gemini-2.0-flash-lite-preview-02-05,gemini-2.5-pro-exp-03-25,gemini-2.0-flash-thinking-exp-01-21,amazon-nova-micro-v1.0,deepseek-v3-0324,human
0,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",a_vs_b,text_a,text_b,,,...,,,,,,,,,,text_b
1,caedec12e65841c5857d9879f892abb9,llama-3.3-70b-instruct,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca...",a_vs_c,text_a,,text_b,,...,,,,,,,,,,
2,caedec12e65841c5857d9879f892abb9,Llama-4-Maverick-03-26-Experimental,"[{'role': 'user', 'content': 'Sténose de la ca...",llama-4-maverick-openrouter,"[{'role': 'user', 'content': 'Sténose de la ca...",b_vs_c,,text_a,text_b,,...,,,,,,,,,,


In [13]:
## Add another annotator that selects all models that do not have "llama-4-maverick" in their name

# Create a function to determine if a model is a Llama 4 Maverick variant
def is_llama4_maverick(model_name):
    """Return True if the name contains 'llama-4-maverick', ignoring case."""
    normalized = model_name.lower()
    return normalized.find('llama-4-maverick') != -1

# Create a new column for the anti-maverick annotator
def anti_maverick_preference(row):
    """
    Vote for whichever side is NOT a Llama 4 Maverick variant.

    Args:
        row: Mapping with the model names under 'model_a' and 'model_b'.

    Returns:
        'text_a' or 'text_b' for the non-Maverick side, or None when both
        sides are Maverick variants or neither is.
    """
    a_is_maverick = is_llama4_maverick(row['model_a'])
    b_is_maverick = is_llama4_maverick(row['model_b'])

    # No preference unless exactly one side is a Maverick variant
    if a_is_maverick == b_is_maverick:
        return None

    # Exactly one side is Maverick: pick the other one
    return 'text_b' if a_is_maverick else 'text_a'

# Let the anti-maverick annotator vote on every pairing and store its votes
non_llama4_votes = pairwise_df.apply(anti_maverick_preference, axis=1)
pairwise_df['non-llama4'] = non_llama4_votes

# Report how many pairings received a vote and how the votes are split
print(f"Added {pairwise_df['non-llama4'].notna().sum()} non-llama4-maverick annotations")
print(f"Non-maverick annotation distribution:\n{pairwise_df['non-llama4'].value_counts(dropna=True)}")


Added 4451 non-llama4-maverick annotations
Non-maverick annotation distribution:
non-llama4
text_a    3215
text_b    1236
Name: count, dtype: int64


In [14]:
## Add another annotator that only annotates llama-4-maverick-03-26-experimental vs llama-4-maverick-openrouter comparisons

# Create a function to identify experimental vs public maverick comparisons
def is_exp_vs_public_maverick(row):
    """
    Return True if the row pairs the experimental and the openrouter Maverick.

    The two model names are compared case-insensitively and may appear in
    either order.
    """
    contestants = {row['model_a'].lower(), row['model_b'].lower()}
    required = {'llama-4-maverick-03-26-experimental', 'llama-4-maverick-openrouter'}
    return required.issubset(contestants)

# Apply the annotator - always prefer the experimental version
def exp_vs_public_preference(row):
    """
    Vote for the experimental Maverick in experimental-vs-public pairings.

    Returns:
        'text_a' if model_a is the experimental version, 'text_b' otherwise.
        None for any pairing that is not experimental vs public Maverick.
    """
    if is_exp_vs_public_maverick(row):
        experimental_is_a = 'llama-4-maverick-03-26-experimental' in row['model_a'].lower()
        return 'text_a' if experimental_is_a else 'text_b'
    return None

# Store the votes of the experimental-vs-public annotator in their own column
key = 'llama4-maverick-exp-vs-public'
exp_vs_public_votes = pairwise_df.apply(exp_vs_public_preference, axis=1)
pairwise_df[key] = exp_vs_public_votes

# Report how many pairings received a vote and how the votes are split
print(f"Added {pairwise_df[key].notna().sum()} experimental vs public maverick annotations")
print(f"Experimental vs public annotation distribution:\n{pairwise_df[key].value_counts(dropna=True)}")


Added 2069 experimental vs public maverick annotations
Experimental vs public annotation distribution:
llama4-maverick-exp-vs-public
text_a    2069
Name: count, dtype: int64


In [15]:
# Persist the annotated pairwise comparisons as CSV (row index not written)
output_filename = "../data/output/llama4_exp_vs_public_vs_other_v2.csv"
pairwise_df.to_csv(path_or_buf=output_filename, index=False)

# Confirm the destination and list the columns that were written
print(f"Saved pairwise comparisons to {output_filename}")
print(pairwise_df.columns)


Saved pairwise comparisons to ../data/output/llama4_exp_vs_public_vs_other_v2.csv
Index(['question_id', 'model_a', 'text_a', 'model_b', 'text_b', 'pairing',
       'llama-3.3-70b-instruct', 'llama-4-maverick-03-26-experimental',
       'llama-4-maverick-openrouter', 'phi-4', 'command-a-03-2025',
       'mistral-small-24b-instruct-2501', 'chatgpt-4o-latest-20250326',
       'qwq-32b', 'amazon-nova-pro-v1.0', 'qwen2.5-max', 'o3-mini',
       'gemini-2.0-flash-001', 'gemma-3-27b-it', 'claude-3-7-sonnet-20250219',
       'o3-mini-high', 'llama-3.1-405b-instruct-bf16',
       'claude-3-5-haiku-20241022', 'claude-3-5-sonnet-20241022',
       'gpt-4o-mini-2024-07-18', 'deepseek-r1', 'chatgpt-4o-latest-20250129',
       'amazon-nova-lite-v1.0', 'o1-2024-12-17', 'gpt-4.5-preview-2025-02-27',
       'grok-3-preview-02-24', 'mistral-large-2411',
       'gemini-2.0-flash-lite-preview-02-05', 'gemini-2.5-pro-exp-03-25',
       'gemini-2.0-flash-thinking-exp-01-21', 'amazon-nova-micro-v1.0',
       