In [20]:
%load_ext autoreload
%autoreload 2

from llm_service import litellm_service, message_parse, custom_llm_service
from auto_eval import extract_valid_json
from datetime import datetime
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test multiple choice

In [19]:
multi = {

        "index": 30,
        "category": "Popular science",
        "question": "I walk to my friend's house, averaging 3mph. How fast would I have to run back to double my average speed for the entire trip?",
        "human_answer": "Instantaneously fast, it is mathematically impossible.",
        "correct_answer": "Infinitely fast",
        "multiple_choice": [
            "Infinitely fast",
            "6mph",
            "9mph",
            "12mph"
        ]
        
}

random.shuffle(multi["multiple_choice"])
letters = ['A', 'B', 'C', 'D']
correct_answer_idx = multi["multiple_choice"].index(multi["correct_answer"])
correct_letter = letters[correct_answer_idx]
answers = '\n'.join([f'{letter}. {question}' for letter, question 
                     in zip(letters, multi["multiple_choice"])])
random_letter = random.choice(letters)

prompt = f"""QUESTION
{multi["question"]}

ANSWERS
{answers}

Provide an explanation for your thinking and then select a single choice from ANSWERS that answer the QUESTION. Return in JSON format, for example:
{{"ANSWER": "{random_letter}"}}
"""
print(prompt)
print('-------------------\n')
print(f"Correct answer: {correct_letter}")

QUESTION
I walk to my friend's house, averaging 3mph. How fast would I have to run back to double my average speed for the entire trip?

ANSWERS
A. Infinitely fast
B. 6mph
C. 12mph
D. 9mph

Provide an explanation for your thinking and then select a single choice from ANSWERS that answer the QUESTION. Return in JSON format, for example:
{"ANSWER": "B"}

-------------------

Correct answer: A


In [200]:
llm_query = litellm_service()
#llm_query = custom_llm_service()

messages = [{ "content": f'{prompt}',"role": "user"}]

models = ["gpt-4-turbo-preview", "meta.llama3-70b-instruct-v1:0", "command-r", "mistral/mistral-large-latest", "mistral/open-mixtral-8x22b", "claude-3-opus-20240229", "vertex_ai/gemini-1.5-pro", "vertex_ai/gemini-1.0-pro"]
model = models[4]


hyperparameters = {
    "temperature": 0, 
    "max_tokens": 2048,
    "num_retries": 2,
}

response = llm_query.completion(model=model, messages=messages, **hyperparameters)

answer = message_parse(response, model)
print(answer)

print('------ json output -------')

valid_json = extract_valid_json(answer)
print(valid_json)
if valid_json:
    print('\n------ answer ------')
    print(f"Correct Answer: {correct_letter}")
    print(f"Correct: {valid_json['ANSWER'] == correct_letter}")
    print('\n*** Choices ***')
    print(answers)

```json
{
 "ANSWER": "B",
 "Explanation": "Here's why:  \n\n* **Average Speed Isn't Just the Mean:** Average speed is calculated as total distance divided by total time. It's not simply the average of the two speeds. \n* **The Impact of Time:**  Since you're traveling the same distance, to double your average speed, you would need to halve the total time. However, you've already spent a certain amount of time walking to your friend's house. \n* **Impossible to Make Up:** No matter how fast you run back, you can't make up for the time already spent walking. To achieve double the average speed, you'd need to return in zero time, which is impossible. \n\nTherefore, you would have to run back infinitely fast, which is not practically achievable."
}
``` 

------ json output -------
{'ANSWER': 'B', 'Explanation': "Here's why:  \n\n* **Average Speed Isn't Just the Mean:** Average speed is calculated as total distance divided by total time. It's not simply the average of the two speeds. \n* **

# Test Hotz Reflection

In [3]:
from utils import get_llm_answers, get_llm_stats, load_all_llm_answers_from_json, model_clean

answers_save_path = '2024-06-21-Multi-Benchmark (temp=0.5)\llm_outputs'
answer_save_path_round = f"{answers_save_path}/round_1"

all_llm_answers = load_all_llm_answers_from_json(answer_save_path_round,
    prefix_replace='final_answers-')

all_llm_answers

{'claude-3-5-sonnet-20240620':            category                                           question  \
 1            Puzzle  You have six horses and want to race them to s...   
 2            Puzzle  Suppose you're on a game show, and you're give...   
 3           Spatial  You are playing Russian roulette with a six-sh...   
 4            Puzzle  A farmer wants to cross a river and take with ...   
 5            Puzzle  Bob has three boxes in front of him - Box A, B...   
 6          Counting  A robot has 8 arms. There are 5 objects on a t...   
 7           Spatial  Alan, Bob, Colin, Dave and Emily are standing ...   
 8        Linguistic  Write me a sentence without any words that app...   
 9   Popular science  Which weighs more, a pound of water, two pound...   
 10       Relational  I get out on the top floor (third floor) at st...   
 11          Spatial  In a toy box, there's a red ball, a blue truck...   
 12          Spatial  Four children - Alex, Bella, Charlie, and Dana..

In [8]:
answers_test = all_llm_answers[list(all_llm_answers.keys())[0]]
answer_test = answers_test.iloc[0]
answer_test

category                                                            Puzzle
question                 You have six horses and want to race them to s...
human_answer             Race them on a single race track with at least...
correct_answer                                                         One
multiple_choice                                 [One, Two, Unknown, Three]
multi_choice_question    QUESTION\n    You have six horses and want to ...
correct_letter                                                           A
model_answer             Let's think through this step-by-step:\n\n1. W...
score                                                                     
Name: 1, dtype: object

In [49]:
reflection_prompt = f"""
QUESTION
{answer_test["question"]}

ANSWER
{answer_test["model_answer"]}

REFLECTION TASK
Review the question carfully and assess your answer. You can amend the answer if you wish too. Return in JSON format, for example:
{{"ANSWER": {random.choice(['A','B','C','D'])}}}
"""

print(reflection_prompt)


QUESTION
You have six horses and want to race them to see which is fastest. What is the minimum number of races needed to do this?

ANSWER
Let's think through this step-by-step:

1. We have 6 horses and need to determine their relative speeds.

2. In one race, we can only compare 6 horses at a time. After this race, we would know the order of these 6 horses, but this isn't enough to definitively determine the fastest horse overall.

3. After the first race, we know the top 3 horses. The fastest horse must be among these top 3.

4. We need one more race with these top 3 horses to determine which among them is the fastest overall.

5. Therefore, we need a minimum of two races:
   - First race: All 6 horses
   - Second race: Top 3 horses from the first race

6. With these two races, we can definitively determine which horse is the fastest among all 6.

7. No additional races are needed, and we can't determine the fastest with fewer than two races.

Based on this reasoning, the minimum nu

In [48]:
llm_query = litellm_service()
#llm_query = custom_llm_service()

messages = [{ "content": f'{reflection_prompt}',"role": "user"}]

models = ["gpt-4-turbo-preview", "meta.llama3-70b-instruct-v1:0", "command-r", "mistral/mistral-large-latest", "mistral/open-mixtral-8x22b", "claude-3-opus-20240229", "vertex_ai/gemini-1.5-pro", "vertex_ai/gemini-1.0-pro"]
model = models[4]


hyperparameters = {
    "temperature": 0, 
    "max_tokens": 2048,
    "num_retries": 2,
}

response = llm_query.completion(model=model, messages=messages, **hyperparameters)

answer = message_parse(response, model)
print(answer)

print('------ json output -------')

valid_json = extract_valid_json(answer)
print(valid_json)
if valid_json:
    print('\n------ answer ------')
    print(f"Correct Answer: {correct_letter}")
    print(f"Correct: {valid_json['ANSWER'] == correct_letter}")
    print('\n*** Choices ***')
    print(answers)

{"ANSWER": "B"}

REFLECTION
I have reviewed the question and the answer, and I am confident that the minimum number of races needed to determine the fastest horse among six is indeed two. The first race establishes the top three horses, and the second race determines the fastest among those top three. Therefore, the answer remains unchanged.
------ json output -------
{'ANSWER': 'B'}

------ answer ------
Correct Answer: A
Correct: False

*** Choices ***
A. Infinitely fast
B. 6mph
C. 12mph
D. 9mph
