In [None]:
# Start with imports - ask ChatGPT to explain any package that you don't know

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display
import pandas as pd

In [None]:
# Always remember to do this!
# Loads the .env file so the os.getenv() lookups in the next cell can find the API keys;
# override=True is passed so values from the file win over variables that are already set.
load_dotenv(override=True)

In [None]:
# Print the key prefixes to help with any debugging

# Read every provider key once; later cells reuse these module-level names.
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')

# (provider label, key value, number of leading characters to reveal, note when missing)
key_report = [
    ("OpenAI", openai_api_key, 8, ""),
    ("Anthropic", anthropic_api_key, 7, " (and this is optional)"),
    ("Google", google_api_key, 2, " (and this is optional)"),
    ("DeepSeek", deepseek_api_key, 3, " (and this is optional)"),
    ("Groq", groq_api_key, 4, " (and this is optional)"),
]

for provider, key_value, shown_chars, missing_note in key_report:
    # Only a short prefix is echoed so the full secret never lands in the cell output.
    if key_value:
        print(f"{provider} API Key exists and begins {key_value[:shown_chars]}")
    else:
        print(f"{provider} API Key not set{missing_note}")

In [None]:
# Prompt asking a model to invent the question that every competitor will answer.
request = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)
messages = [dict(role="user", content=request)]

In [None]:
# Echo the request payload as this cell's output.
messages

In [None]:
# Have gpt-4o-mini invent the question; the `openai` client is reused by later cells.
openai = OpenAI()
response = openai.chat.completions.create(messages=messages, model="gpt-4o-mini")
question = response.choices[0].message.content
print(question)

In [None]:
# Index-aligned logs: competitors[i] is the model that produced answers[i].
competitors, answers = [], []
# From here on each model receives the generated question as its only message.
messages = [dict(role="user", content=question)]

In [None]:
# The API we know well: OpenAI chat completions.
model_name = "gpt-4o-mini"
response = openai.chat.completions.create(
    messages=messages,
    model=model_name,
)
answer = response.choices[0].message.content
display(Markdown(answer))

# Note: both lists are appended to, so re-running this cell adds a duplicate entry.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Anthropic has a slightly different API: max_tokens is required, and the reply
# text is read from response.content[0].text rather than from choices.
model_name = "claude-3-7-sonnet-latest"
claude = Anthropic()
response = claude.messages.create(
    model=model_name,
    max_tokens=1000,
    messages=messages,
)
answer = response.content[0].text
display(Markdown(answer))

# Note: both lists are appended to, so re-running this cell adds a duplicate entry.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Gemini is called through the OpenAI client pointed at Google's OpenAI-style base URL.
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=google_api_key,
)
model_name = "gemini-2.0-flash"
response = gemini.chat.completions.create(
    messages=messages,
    model=model_name,
)
answer = response.choices[0].message.content
display(Markdown(answer))

# Note: both lists are appended to, so re-running this cell adds a duplicate entry.
competitors.append(model_name)
answers.append(answer)

In [None]:
# DeepSeek is called through the OpenAI client with DeepSeek's base URL and key.
deepseek = OpenAI(
    base_url="https://api.deepseek.com/v1",
    api_key=deepseek_api_key,
)
model_name = "deepseek-chat"
response = deepseek.chat.completions.create(
    messages=messages,
    model=model_name,
)
answer = response.choices[0].message.content
display(Markdown(answer))

# Note: both lists are appended to, so re-running this cell adds a duplicate entry.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Groq is called through the OpenAI client with Groq's base URL and key.
groq = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=groq_api_key,
)
model_name = "llama-3.3-70b-versatile"
response = groq.chat.completions.create(
    messages=messages,
    model=model_name,
)
answer = response.choices[0].message.content
display(Markdown(answer))

# Note: both lists are appended to, so re-running this cell adds a duplicate entry.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Local Ollama server on localhost:11434, reached through the OpenAI client;
# the api_key value is just the placeholder string 'ollama'.
ollama = OpenAI(
    api_key='ollama',
    base_url='http://localhost:11434/v1',
)
# Alternative local model that was tried here: "llama3.2"
model_name = "llama3"
response = ollama.chat.completions.create(
    messages=messages,
    model=model_name,
)
answer = response.choices[0].message.content
display(Markdown(answer))

# Note: both lists are appended to, so re-running this cell adds a duplicate entry.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Dump both logs, one per line, to check they were filled in step with each other.
print(competitors, answers, sep="\n")


In [None]:
# It's nice to know how to use "zip"
# zip pairs each model name with the answer stored at the same position.
for name, reply in zip(competitors, answers):
    print(f"Competitor: {name}\n\n{reply}")

In [None]:
# Concatenate all answers under numbered headings. Competitors are identified only
# by their 1-based position, so no model name appears in this text.
together = "".join(
    f"# Response from competitor {number}\n\n" + reply + "\n\n"
    for number, reply in enumerate(answers, start=1)
)

In [None]:
# Show the combined responses exactly as they will be embedded in the judge prompt.
print(together)

In [None]:
# Judge prompt: states the number of competitors, repeats the question, embeds every
# numbered response, and asks for a JSON object whose "results" list orders the
# competitor numbers from best to worst. The doubled braces are f-string escapes
# that produce literal { } in the example JSON.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""


In [None]:
# Inspect the full judge prompt before it is sent to any model.
print(judge)

In [None]:
# Single-turn conversation carrying the judge prompt; shared by every judge call below.
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Judgement time! First judge: o3-mini via the OpenAI client.
openai = OpenAI()
response = openai.chat.completions.create(messages=judge_messages, model="o3-mini")
judge_o3_mini = response.choices[0].message.content
print(judge_o3_mini)


## Collect every judge's ranking in a dictionary, then combine them into a DataFrame

In [None]:
# Maps a judge label to that judge's list of model names, ordered best to worst.
judge_dict = {}

In [None]:
# OK let's turn this into results!
# Decode the judge's JSON reply and list the models from best to worst.
results_dict = json.loads(judge_o3_mini)
ranks = results_dict["results"]
for position, competitor_number in enumerate(ranks, start=1):
    # Competitor numbers in the reply are 1-based positions in `competitors`.
    print(f"Rank {position}: {competitors[int(competitor_number) - 1]}")

In [None]:
# Store this judge's ordering as model names (best first). The key is spelled with the
# letter "o" to match the judge model "o3-mini"; it becomes this judge's label in the
# summary table, where the previous "03" (zero) spelling mislabelled it.
judge_dict['judge_o3_mini'] = [competitors[int(number) - 1] for number in ranks]

In [None]:
# Second judge: Claude through the Anthropic Messages API (max_tokens is passed explicitly).
model_name = "claude-3-7-sonnet-latest"
claude = Anthropic()
response = claude.messages.create(
    model=model_name,
    max_tokens=1000,
    messages=judge_messages,
)
judge_anthropic = response.content[0].text
print(judge_anthropic)


In [None]:
# Decode Claude's JSON reply and list the models from best to worst.
results_dict = json.loads(judge_anthropic)
ranks = results_dict["results"]
for position, competitor_number in enumerate(ranks, start=1):
    # Competitor numbers in the reply are 1-based positions in `competitors`.
    print(f"Rank {position}: {competitors[int(competitor_number) - 1]}")

In [None]:
# Translate the judge's 1-based competitor numbers into model names, best first.
judge_dict['judge_anthropic'] = [competitors[int(number) - 1] for number in ranks]

In [None]:
# Third judge: Gemini through the OpenAI client pointed at Google's base URL.
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=google_api_key,
)
model_name = "gemini-2.0-flash"
response = gemini.chat.completions.create(
    messages=judge_messages,
    model=model_name,
)
judge_gemini = response.choices[0].message.content
print(judge_gemini)

In [None]:
# Decode Gemini's JSON reply and list the models from best to worst.
results_dict = json.loads(judge_gemini)
ranks = results_dict["results"]
for position, competitor_number in enumerate(ranks, start=1):
    # Competitor numbers in the reply are 1-based positions in `competitors`.
    print(f"Rank {position}: {competitors[int(competitor_number) - 1]}")

In [None]:
# Translate the judge's 1-based competitor numbers into model names, best first.
judge_dict['judge_gemini'] = [competitors[int(number) - 1] for number in ranks]

In [None]:
# Fourth judge: DeepSeek through the OpenAI client with DeepSeek's base URL and key.
deepseek = OpenAI(
    base_url="https://api.deepseek.com/v1",
    api_key=deepseek_api_key,
)
model_name = "deepseek-chat"
response = deepseek.chat.completions.create(
    messages=judge_messages,
    model=model_name,
)
judge_deepseek = response.choices[0].message.content
print(judge_deepseek)

In [None]:
# deepseek returned its JSON inside a markdown fence, e.g.
# ```json
# {"results": ["3", "1", "4", "5", "2", "6"]}
# ```
# so peel the fence off, and drop any "competitor " text, before parsing.
# Each step is a no-op when the text it targets is absent.
cleaned = judge_deepseek.strip()
cleaned = cleaned.removeprefix("```json")
cleaned = cleaned.removesuffix("```")
judge_deepseek = cleaned.replace("competitor ", "")


In [None]:
# Check the cleaned-up reply before it is parsed as JSON.
print(judge_deepseek)

In [None]:
# Decode DeepSeek's JSON reply and list the models from best to worst.
results_dict = json.loads(judge_deepseek)
ranks = results_dict["results"]
for position, competitor_number in enumerate(ranks, start=1):
    # Competitor numbers in the reply are 1-based positions in `competitors`.
    print(f"Rank {position}: {competitors[int(competitor_number) - 1]}")

In [None]:
# Translate the judge's 1-based competitor numbers into model names, best first.
judge_dict['judge_deepseek'] = [competitors[int(number) - 1] for number in ranks]

In [None]:
# Fifth judge: Llama 3.3 70B on Groq through the OpenAI client with Groq's base URL.
groq = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=groq_api_key,
)
model_name = "llama-3.3-70b-versatile"
response = groq.chat.completions.create(
    messages=judge_messages,
    model=model_name,
)
judge_groq = response.choices[0].message.content

In [None]:
# Decode the Groq judge's JSON reply and list the models from best to worst.
results_dict = json.loads(judge_groq)
ranks = results_dict["results"]
for position, competitor_number in enumerate(ranks, start=1):
    # Competitor numbers in the reply are 1-based positions in `competitors`.
    print(f"Rank {position}: {competitors[int(competitor_number) - 1]}")

In [None]:
# Translate the judge's 1-based competitor numbers into model names, best first.
judge_dict['judge_groq'] = [competitors[int(number) - 1] for number in ranks]

In [None]:
# Sixth judge: the local Ollama model, reached through the OpenAI client on localhost.
ollama = OpenAI(
    api_key='ollama',
    base_url='http://localhost:11434/v1',
)
# Alternative local model that was tried here: "llama3.2"
model_name = "llama3"
response = ollama.chat.completions.create(
    messages=judge_messages,
    model=model_name,
)
judge_ollama = response.choices[0].message.content
print(judge_ollama)


In [None]:
# Decode the Ollama judge's JSON reply and list the models from best to worst.
results_dict = json.loads(judge_ollama)
# The judge prompt requests a "results" key; "ranked_order" is kept as a fallback for
# replies that use that key instead, so the cell works with either shape.
# A reply with neither key still raises KeyError.
if "results" in results_dict:
    ranks = results_dict["results"]
else:
    ranks = results_dict["ranked_order"]
for position, competitor_number in enumerate(ranks, start=1):
    # Competitor numbers in the reply are 1-based positions in `competitors`.
    print(f"Rank {position}: {competitors[int(competitor_number) - 1]}")

In [None]:
# Translate the judge's 1-based competitor numbers into model names, best first.
judge_dict['judge_ollama'] = [competitors[int(number) - 1] for number in ranks]

In [None]:
# One row per judge; after the 'judges' label column, the columns are rank
# positions 1..N holding the model that judge placed at that position.
judge_df = pd.DataFrame(judge_dict).transpose().reset_index()
rank_positions = list(range(1, judge_df.shape[1]))
judge_df.columns = ['judges'] + rank_positions
judge_df



In [None]:
# Reshape to one row per LLM and one column per judge, each cell holding the rank
# (1 = best) that judge gave that LLM.
df_melt = judge_df.melt(id_vars="judges", var_name="Rank", value_name="llm")
df_melt["Rank"] = df_melt["Rank"].astype(int)
result = df_melt.pivot(index="llm", columns="judges", values="Rank")
result.columns.name = None

# Compute both statistics from the judge columns BEFORE adding either to the frame;
# otherwise the median would count the freshly added 'mean' column as another judge.
mean_rank = result.mean(axis=1)
median_rank = result.median(axis=1)
result["mean"] = mean_rank
result["median"] = median_rank

# Sort while 'mean' is still numeric, then format for display.
result = result.sort_values("mean")
result["mean"] = result["mean"].map(lambda x: f"{x:.1f}")
# With an even number of judges the median can fall on a half rank (e.g. 2.5), so it
# is formatted to one decimal instead of being truncated to an int.
result["median"] = result["median"].map(lambda x: f"{x:.1f}")

# Drop the "judge_" prefix from the labels and group all columns under a "Judges" header.
result.columns = [x.replace("judge_", "") for x in result.columns.to_list()]
result.columns = pd.MultiIndex.from_product([["Judges"], result.columns])
print(result.to_markdown())

In [None]:
# Inspect the final column labels of the summary table.
print(result.columns.to_list())

In [None]:
# Transposed view of the summary table: the judge/mean/median columns become rows
# and each LLM becomes a column.
result.T

- Mean rank is equivalent to a Borda count; it is sensitive to extreme rankings.
- Median rank is robust to outliers (e.g., ollama ranking gemini-2.0-flash as 5 when others say 1).

In [None]:
#Use RRF if you care more about being ranked highly than low rankings.
def RecipricalRankFusion(df, k=60):
    """Score each row of a rank table with Reciprocal Rank Fusion.

    Every row's score is the sum of ``1 / (k + rank)`` over the rank columns, so
    high placements (small rank numbers) contribute the most.

    Args:
        df: DataFrame indexed by item (here, LLM name) with one numeric rank column
            per judge. Columns labelled "mean" or "median" are treated as summary
            statistics and ignored. Columns may be flat labels or MultiIndex tuples
            such as ("Judges", "mean"); for tuples the last level is compared.
        k: Constant added to every rank before taking the reciprocal.

    Returns:
        pandas Series of scores indexed by the row labels of ``df``, sorted from
        highest to lowest score.
    """
    summary_labels = ("mean", "median")

    # Select the rank columns once. A MultiIndex column label is a tuple, which would
    # never equal the bare strings "mean"/"median", so compare its last level instead;
    # otherwise the (possibly string-formatted) summary columns leak into the sum.
    rank_columns = []
    for col in df.columns:
        label = col[-1] if isinstance(col, tuple) else col
        if label in summary_labels:
            continue
        rank_columns.append(col)

    scores = {}
    for llm in df.index:
        score = 0
        for col in rank_columns:
            score += 1 / (k + df.at[llm, col])
        scores[llm] = score
    return pd.Series(scores).sort_values(ascending=False)

# Score the summary table with RRF (default k=60) and print the LLMs from the
# highest fused score to the lowest.
rrf_scores = RecipricalRankFusion(result)
print(rrf_scores)

In [None]:
#Plurality Vote (Top-N voting)
# Count, per LLM, how many judges ranked it first. The last two columns are the
# mean/median summaries, so they are sliced off before comparing.
result.iloc[:, :-2].eq(1).sum(axis=1).sort_values(ascending=False)

In [None]:
# Have to discount Ollama as less trustworthy (e.g. smaller model), so lean on the median rank.