### Level 1 questions: Solve step by step

### First, identify the types of tools required for each question

#### 1. load dataset

In [2]:
import os
from datasets import load_dataset

def load_gaia_dataset(levels=["2023_level1"], loader_path="./GAIA.py", split="validation"):
    """
    Load one GAIA dataset per requested level.

    Every level is loaded independently through ``load_dataset``. A level that
    fails is reported on stdout and left out of the result, so one bad level
    does not abort the remaining ones.

    Args:
        levels: Names of the dataset levels (configurations) to load
        loader_path: Path to the GAIA loader script
        split: Dataset split requested for every level

    Returns:
        Dictionary mapping each successfully loaded level name to its dataset
    """
    print(f"Loading GAIA dataset for levels: {levels}")

    by_level = {}
    for level_name in levels:
        print(f"\nLoading level: {level_name}")
        try:
            loaded = load_dataset(loader_path, name=level_name, split=split)
            by_level[level_name] = loaded
            print(f"Successfully loaded {len(loaded)} examples from {level_name}")
        except Exception as err:
            # Best effort: report the failure and continue with the next level
            print(f"Error loading dataset {level_name}: {err}")

    return by_level

# Example usage: load the configured GAIA levels into a dict keyed by level name.
levels = ["2023_level1"]  # Change as needed
loader_path = "./GAIA.py"  # loader script path, relative to the current working directory
split = "validation"

# Levels that fail to load are reported by load_gaia_dataset and omitted from the dict.
datasets = load_gaia_dataset(levels, loader_path, split)

Loading GAIA dataset for levels: ['2023_level1']

Loading level: 2023_level1
Successfully loaded 53 examples from 2023_level1


#### 2. Load examples (from the validation split)

In [3]:
from tqdm import tqdm

def get_examples_from_dataset(datasets):
    """
    Flatten the per-level datasets into a single list of wrapped examples.

    Args:
        datasets: Dictionary containing loaded datasets by level

    Returns:
        List of dictionaries, one per example, each holding the level name
        ("level"), the example's position within its own level ("index") and
        the raw record ("example")
    """
    collected = []

    for level_name, level_dataset in datasets.items():
        print(f"Extracting examples from level: {level_name}")

        # tqdm only wraps the iteration to show a progress bar
        progress = tqdm(enumerate(level_dataset), total=len(level_dataset))
        collected.extend(
            {"level": level_name, "index": position, "example": record}
            for position, record in progress
        )

    print(f"Total examples extracted: {len(collected)}")
    return collected

# Example usage - requires `datasets` from the loading step above.
examples = get_examples_from_dataset(datasets)
# Preview only the first record: echoing the whole list floods the notebook
# output with every question and its annotator metadata.
examples[:1]

Extracting examples from level: 2023_level1


100%|██████████| 53/53 [00:00<00:00, 10154.77it/s]

Total examples extracted: 53





[{'level': '2023_level1',
  'index': 0,
  'example': {'task_id': 'e1fc63a2-da7a-432f-be78-7c4a95598703',
   'Question': 'If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.',
   'Level': '1',
   'Final answer': '17',
   'file_name': '',
   'file_path': '',
   'Annotator Metadata': {'Steps': '1. Googled Eliud Kipchoge marathon pace to find 4min 37sec/mile\n2. Converted into fractions of hours.\n3. Found moon periapsis in miles (225,623 miles).\n4. Multiplied the two to find the number of hours and rounded to the nearest 100 hours.',
    'Number of steps': '4',
    'How long did this take?': '20 Minutes',
    'Tools': '1. A web browser.\n2. A search 

#### 3. Extract questions and answers one by one from the examples

In [4]:
def extract_question_and_answer(example):
    """
    Flatten one wrapped example into the fields used by the later steps.

    Args:
        example: Dictionary with "level", "index" and the raw record under "example"

    Returns:
        Dictionary with "level", "index", "task_id", "question" and
        "expected_answer". "task_id" and "expected_answer" default to "" when
        the raw record lacks them; a missing "Question" raises KeyError.
    """
    record = example["example"]
    question_text = record["Question"]
    answer_text = record.get("Final answer", "")

    return dict(
        level=example["level"],
        index=example["index"],
        task_id=record.get("task_id", ""),
        question=question_text,
        expected_answer=answer_text,
    )

# Example usage - requires `examples` from the previous step.
extracted_data = [extract_question_and_answer(item) for item in examples]

# Show the first extracted question/answer pair as a quick sanity check
if extracted_data:
    first_extracted = extracted_data[0]
    print(f"First example - Question: {first_extracted['question']}")
    print(f"First example - Expected Answer: {first_extracted['expected_answer']}")

First example - Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.
First example - Expected Answer: 17


#### 4. Helper to read the content of an attached file (some examples have a file attached)

In [5]:
def read_file(file_path):
    """
    Read a file as text, trying UTF-8 first and Latin-1 second.

    Args:
        file_path: Path to the file to read

    Returns:
        File content as string, or "" when the file cannot be opened or read
        (the error is printed). This also holds when the failure happens during
        the Latin-1 retry.

    Note:
        The file is always decoded as text. Latin-1 maps every byte to a
        character, so the fallback never fails to decode: a binary file is
        returned as its raw bytes rendered as characters, not as parsed content.
    """
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid in this encoding; try the next candidate
            continue
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return ""
    return ""

def get_file_content(example):
    """
    Get file content for an example if available.

    The original record is looked up in the module-level ``examples`` list by
    (level, index). "index" is the position of an example within its own level,
    so using it directly as a position in the flat ``examples`` list would pick
    the wrong record as soon as more than one level is loaded.

    Args:
        example: Dictionary containing example data with extracted information
            (needs "index"; "level" is used when present)

    Returns:
        Copy of ``example`` plus "file_content" (text, "" when nothing was read)
        and "has_file_content" (True when the content is non-empty)
    """
    level = example.get("level")
    index = example["index"]

    # Without a level, the first record with a matching index is used
    original_example = {}
    for candidate in examples:
        if candidate["index"] == index and (level is None or candidate["level"] == level):
            original_example = candidate["example"]
            break

    file_path = original_example.get("file_path", "")
    file_name = original_example.get("file_name", "")

    file_content = ""
    # Prefer file_path; fall back to file_name only when file_path is unusable
    for candidate_path in (file_path, file_name):
        if candidate_path and os.path.exists(candidate_path):
            try:
                file_content = read_file(candidate_path)
                print(f"Read file content from {candidate_path}: {len(file_content)} characters")
            except Exception as e:
                print(f"Error reading file {candidate_path}: {e}")
            break

    # Add file content to the result without mutating the caller's dictionary
    result = example.copy()
    result["file_content"] = file_content
    result["has_file_content"] = bool(file_content)

    return result

# Example usage - requires `extracted_data` (and `examples`) from the previous steps.
examples_with_files = [get_file_content(item) for item in extracted_data]

# Report how many examples came with non-empty file content
file_count = sum(bool(item["has_file_content"]) for item in examples_with_files)
print(f"Examples with file content: {file_count} out of {len(examples_with_files)}")

Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx: 17525 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx: 5115 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png: 63079 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png: 133565 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt: 97 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx: 388988 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx: 12370 c

#### 5. example of web search

In [6]:
from utils.search_capability import *
from utils.search_capability import search_and_parse

# Smoke-test the search helper with a simple query, capped at 3 results.
# The returned object is indexed by 'search_results' and 'parsed_content' below.
results = search_and_parse("eliud kipchoge nationality", max_results = 3)
print(f"Found {len(results['search_results'])} results.")
print(f"Content length: {len(results['parsed_content'])} characters")

Searching for: eliud kipchoge nationality
Searching with DDGS...
DDGS error: https://lite.duckduckgo.com/lite/ 202 Ratelimit
Searching with googlesearch-python...
Found 3 results with googlesearch-python.
Extracting content from search results...
Found 3 results.
Content length: 94065 characters


#### 6. Check iteration over the questions, then iterate again to identify the required tools

In [7]:
# Sanity check of the iteration: print task ID, question and expected answer
# for every extracted example.
for example in extracted_data:
    print(f"Task ID: {example['task_id']}")
    print(f"Question: {example['question']}")
    print(f"Expected Answer: {example['expected_answer']}")


Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.
Expected Answer: 17
Task ID: 8e867cd7-cff9-4e6c-867a-ff5ddc2550be
Question: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
Expected Answer: 3
Task ID: ec09fa32-d03f-4bf8-84b0-1f16922c3ae4
Question: Here's a fun riddle that I think you'll enjoy.

You have been selected to play the final round of the hit new game show "Pick That Ping-Pong". In this round, you will be competing for a large cash prize. Your job will be to pick one of several differ

##### Iterate over level 1 questions and extract tools 

In [None]:
import ollama
import json
import time
import re

# Requires `extracted_data` from step 3: a list of dictionaries,
# each with 'task_id', 'question', and 'expected_answer' keys.

# Results of this cell, keyed by task_id (one entry per question)
results = {}

# Tool-selection prompt; the only placeholder is {question}
prompt_template = """You are an AI assistant tasked with determining the tools required to answer a given question from the GAIA benchmark.

Available Tools:
- WEB SEARCH: For retrieving up-to-date or external information.
- CALCULATOR: For performing mathematical computations.
- CODE EXECUTOR: For writing and executing code snippets.

Instructions:
1. Analyze the question carefully.
2. Identify which of the above tools are necessary to answer the question.
3. Provide a Python list containing the names of the required tools. If no tools are needed, return an empty list.

Example:
Question: "What is the capital of France?"
Answer: ["WEB SEARCH"]

Question: "Calculate the sum of 123 and 456."
Answer: ["CALCULATOR"]

Question: "Write a Python function to sort a list of numbers."
Answer: ["CODE EXECUTOR"]

Question: "Explain the process of photosynthesis."
Answer: []

Now, analyze the following question:

Question: "{question}"
Answer: """

# Follow-up prompt used when no list can be found in the first answer;
# the only placeholder is {response}
extraction_prompt_template = """Extract ONLY the Python list of tools from the following response. Return just the Python list, nothing else.

Response:
{response}

Python list: """

# Function to try to extract a Python list using regex
def extract_list_with_regex(text):
    """
    Return the first bracketed, list-like substring of ``text``.

    Two forms are tried at each opening bracket, in this order:
      1. a list whose quotes are balanced; square brackets are allowed inside
         quoted elements (e.g. ``["a", "b]c"]``);
      2. any ``[...]`` run that contains no other square bracket (covers text
         with stray quotes such as ``[don't know]``).

    The alternatives inside the repeated group start with different characters
    and each consumes at least one character, so a failed attempt cannot
    backtrack exponentially on an unclosed ``[``.

    Args:
        text: Text to search (typically a model response)

    Returns:
        The matched substring including its brackets, or None when ``text`` is
        empty/None or contains no match. The substring is NOT validated as
        Python syntax.
    """
    if not text:
        return None
    pattern = (
        r'\[(?:"[^"]*"|\'[^\']*\'|[^\[\]"\'])*\]'  # balanced quotes, brackets only inside quotes
        r'|\[[^\[\]]*\]'                            # fallback: bracket-free content
    )
    match = re.search(pattern, text)
    return match.group(0) if match else None

# Iterate over each example in extracted_data: ask the model which tools the
# question needs, extract the list from its answer and record the outcome.
import ast  # imported here so this cell can parse model output without eval

for example in extracted_data:
    task_id = example['task_id']
    question = example['question']
    expected_answer = example['expected_answer']
    
    print("\n" + "="*80)
    print(f"Processing Task ID: {task_id}")
    print(f"Question: {question}")
    
    # Format the prompt with the current question
    formatted_prompt = prompt_template.format(question=question)
    
    # Send the prompt to the model
    try:
        # Get initial response
        response = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {'role': 'user', 'content': formatted_prompt}
            ]
        )
        
        # Extract the model's initial response
        initial_response = response['message']['content'].strip()
        
        # Print the initial response
        print("\nLLM Initial Output:")
        print(initial_response)
        
        # Try to extract the list with regex first
        extracted_list = extract_list_with_regex(initial_response)
        
        # If regex fails, send back to the model to extract just the list
        if not extracted_list:
            print("\nRegex extraction failed. Sending back to model for extraction...")
            
            extraction_prompt = extraction_prompt_template.format(response=initial_response)
            extraction_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': extraction_prompt}
                ]
            )
            
            extracted_list = extraction_response['message']['content'].strip()
            
            # Try regex again on the extracted response
            regex_result = extract_list_with_regex(extracted_list)
            if regex_result:
                extracted_list = regex_result
        
        # Validate if the extracted_list is actually a Python list.
        # literal_eval only accepts Python literals, so text produced by the
        # model is parsed but never executed (eval would run arbitrary code).
        is_valid = False
        try:
            parsed_list = ast.literal_eval(extracted_list)
            is_valid = isinstance(parsed_list, list)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a well-formed Python literal
            is_valid = False
        
        # Print the extracted list
        print("\nExtracted List:")
        print(extracted_list)
        print(f"Valid Python list: {is_valid}")
        
        # Store the results (extracted_list is kept as the raw string)
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'initial_response': initial_response,
            'extracted_list': extracted_list,
            'is_valid_list': is_valid
        }
        
        # Add a small delay to avoid overwhelming the API
        time.sleep(1)
        
    except Exception as e:
        print(f"\nError processing task {task_id}: {str(e)}")
        # Store the error in the results so every task has an entry
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'initial_response': f"ERROR: {str(e)}",
            'extracted_list': None,
            'is_valid_list': False
        }

# Save the results to a JSON file in the current working directory
# (mode "w" overwrites any existing file of the same name)
with open("gaia_tool_results.json", "w") as file:
    json.dump(results, file, indent=4)

# Closing summary line for the cell output
print("\n" + "="*80)
print(f"Processed {len(results)} questions and saved results to 'gaia_tool_results.json'")


Processing Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.

LLM Initial Output:
To answer this question, we need to perform a series of calculations and look up some information from external sources. Here's how I'll break it down:

1. First, we need to find the minimum perigee value of the Moon's closest approach. We can use the WEB SEARCH tool for this.
2. Next, we need to retrieve Eliud Kipchoge's marathon pace and convert it into a suitable unit for calculation (e.g., kilometers per hour). Again, we'll use WEB SEARCH for this information.
3. Then, we need to calculate the distance bet

In [17]:
import pandas as pd
# `results` is keyed by task_id, so the transpose yields one row per task.
# NOTE(review): the name is misspelled ("resuldts"); kept because other cells reference it.
resuldts_df = pd.DataFrame(results).T

In [19]:
# Inspect the extracted tool list stored for every task
resuldts_df["extracted_list"]

e1fc63a2-da7a-432f-be78-7c4a95598703                     ["WEB SEARCH", "CALCULATOR"]
8e867cd7-cff9-4e6c-867a-ff5ddc2550be                                   ["WEB SEARCH"]
ec09fa32-d03f-4bf8-84b0-1f16922c3ae4                                               []
5d0080cb-90d7-4712-bc33-848150e917d3                     ["WEB_SEARCH", "CALCULATOR"]
a1e91b78-d3d8-4675-bb8d-62741b4b68a6                                   ["WEB SEARCH"]
46719c30-f4c3-4cad-be07-d5cb21eee6bb    ["WEB SEARCH", "CALCULATOR", "CODE EXECUTOR"]
4b6bb5f7-f634-410e-815d-e673ab7f8632                  ["WEB SEARCH", "CODE EXECUTOR"]
cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb                                   ["WEB SEARCH"]
2d83110e-a098-4ebb-9987-066c06fa42d0                                ["CODE EXECUTOR"]
5cfb274c-0207-4aa7-9575-6ac0bd95d9b2                                ["CODE EXECUTOR"]
27d5d136-8563-469e-92bf-fd103c28b57c                                   ["WEB SEARCH"]
dc28cf18-6431-458b-83ef-64b3ce566c10                  

## Add code to ask for web search query suggestions if web search is required

In [20]:
import ollama
import json
import time
import re

# Requires `extracted_data` from step 3: a list of dictionaries,
# each with 'task_id', 'question', and 'expected_answer' keys.

# Results of this cell, keyed by task_id: proposed tools plus search queries
results = {}

# Primary tool-selection prompt; the only placeholder is {question}
tool_prompt_template = """You are an AI assistant tasked with determining the tools required to answer a given question from the GAIA benchmark.

Available Tools:
- WEB SEARCH: For retrieving up-to-date or external information.
- CALCULATOR: For performing mathematical computations.
- CODE EXECUTOR: For writing and executing code snippets.

Instructions:
1. Analyze the question carefully.
2. Identify which of the above tools are necessary to answer the question.
3. Provide a Python list containing the names of the required tools. If no tools are needed, return an empty list.

Example:
Question: "What is the capital of France?"
Answer: ["WEB SEARCH"]

Question: "Calculate the sum of 123 and 456."
Answer: ["CALCULATOR"]

Question: "Write a Python function to sort a list of numbers."
Answer: ["CODE EXECUTOR"]

Question: "Explain the process of photosynthesis."
Answer: []

Now, analyze the following question:

Question: "{question}"
Answer: """

# Prompt asking for web search queries; used only for questions whose tool
# list includes web search. The only placeholder is {question}
search_query_prompt_template = """You are an AI assistant helping to generate effective web search queries for a question.

The following question has been identified as requiring web search to answer:

Question: "{question}"

Instructions:
1. Analyze the question carefully.
2. Identify the key information needs.
3. Generate a list of specific search queries that would be most effective for finding the required information.
4. Return a Python list containing these search queries as strings.

Example:
Question: "What were the main causes of World War I?"
Answer: ["causes of World War I", "World War I origins", "assassination of Archduke Franz Ferdinand", "European alliances before World War I"]

Now, please generate search queries for the question above:"""

# Follow-up prompt used when no list can be found in a model answer
# (shared by the tool and search-query steps); the only placeholder is {response}
extraction_prompt_template = """Extract ONLY the Python list from the following response. Return just the Python list, nothing else.

Response:
{response}

Python list: """

# Function to try to extract a Python list using regex
def extract_list_with_regex(text):
    """
    Return the first bracketed, list-like substring of ``text``.

    Two forms are tried at each opening bracket, in this order:
      1. a list whose quotes are balanced; square brackets are allowed inside
         quoted elements (e.g. ``["a", "b]c"]``);
      2. any ``[...]`` run that contains no other square bracket (covers text
         with stray quotes such as ``[don't know]``).

    The alternatives inside the repeated group start with different characters
    and each consumes at least one character, so a failed attempt cannot
    backtrack exponentially on an unclosed ``[``.

    Args:
        text: Text to search (typically a model response)

    Returns:
        The matched substring including its brackets, or None when ``text`` is
        empty/None or contains no match. The substring is NOT validated as
        Python syntax.
    """
    if not text:
        return None
    pattern = (
        r'\[(?:"[^"]*"|\'[^\']*\'|[^\[\]"\'])*\]'  # balanced quotes, brackets only inside quotes
        r'|\[[^\[\]]*\]'                            # fallback: bracket-free content
    )
    match = re.search(pattern, text)
    return match.group(0) if match else None

# Function to normalize tool names for consistent comparison
def normalize_tool_name(tool_name):
    """
    Return ``tool_name`` upper-cased with underscores and spaces removed.

    Non-string values are converted with ``str()`` first, so a list parsed from
    model output that contains numbers, None or nested lists does not raise.

    Args:
        tool_name: Tool name as written by the model (e.g. "web_search")

    Returns:
        Normalized key such as "WEBSEARCH"
    """
    return str(tool_name).upper().replace('_', '').replace(' ', '')

# Dictionary to map normalized tool names back to standard formats.
# Every tool offered in the prompt is listed, so case variants such as
# "calculator" are standardized too.
TOOL_NAME_MAP = {
    'WEBSEARCH': 'WEB SEARCH',
    'CALCULATOR': 'CALCULATOR',
    'CODEEXECUTOR': 'CODE EXECUTOR'
}

# Function to standardize tool names in a list
def standardize_tool_names(tools_list):
    """
    Map every entry of ``tools_list`` to its standard tool name.

    Args:
        tools_list: Iterable of tool names as parsed from the model output

    Returns:
        New list of the same length; entries that match no known tool are
        returned unchanged
    """
    return [TOOL_NAME_MAP.get(normalize_tool_name(tool), tool) for tool in tools_list]

# Iterate over each example in extracted_data: ask the model which tools the
# question needs and, when web search is among them, ask for search queries.
import ast  # imported here so this cell can parse model output without eval

for example in extracted_data:
    task_id = example['task_id']
    question = example['question']
    expected_answer = example['expected_answer']
    
    print("\n" + "="*80)
    print(f"Processing Task ID: {task_id}")
    print(f"Question: {question}")
    
    # Format the tool prompt with the current question
    formatted_prompt = tool_prompt_template.format(question=question)
    
    # Send the prompt to the model
    try:
        # Get initial response for tool selection
        response = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {'role': 'user', 'content': formatted_prompt}
            ]
        )
        
        # Extract the model's initial response
        initial_response = response['message']['content'].strip()
        
        print("\nLLM Initial Output (Tool Selection):")
        print(initial_response)
        
        # Try to extract the tools list with regex first
        tools_list_str = extract_list_with_regex(initial_response)
        
        # If regex fails, send back to the model to extract just the list
        if not tools_list_str:
            print("\nRegex extraction failed. Sending back to model for extraction...")
            
            extraction_prompt = extraction_prompt_template.format(response=initial_response)
            extraction_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': extraction_prompt}
                ]
            )
            
            tools_list_str = extraction_response['message']['content'].strip()
            
            # Try regex again on the extracted response
            regex_result = extract_list_with_regex(tools_list_str)
            if regex_result:
                tools_list_str = regex_result
        
        # Parse the tools list.
        # literal_eval only accepts Python literals, so text produced by the
        # model is parsed but never executed (eval would run arbitrary code).
        raw_tools_list = []
        standardized_tools_list = []
        is_valid_tools = False
        try:
            parsed_tools = ast.literal_eval(tools_list_str)
            # Anything that is not a list is discarded, so 'raw_tools' is
            # always a list in the stored results
            if isinstance(parsed_tools, list):
                raw_tools_list = parsed_tools
                is_valid_tools = True
                # Standardize the tool names
                standardized_tools_list = standardize_tool_names(raw_tools_list)
                print("\nStandardized Tools List:")
                print(standardized_tools_list)
        except Exception as e:
            print(f"Error parsing tools list: {str(e)}")
            raw_tools_list = []
            standardized_tools_list = []
            is_valid_tools = False
        
        # Initialize search queries list
        search_queries = []
        is_valid_queries = False
        
        # Check if web search is required using normalized comparison
        web_search_required = False
        if is_valid_tools:
            for tool in raw_tools_list:
                if normalize_tool_name(tool) == 'WEBSEARCH':
                    web_search_required = True
                    break
        
        # If web search is required, generate search queries
        if web_search_required:
            print("\nWEB SEARCH required. Generating search queries...")
            
            # Format the search query prompt
            search_prompt = search_query_prompt_template.format(question=question)
            
            # Get search query response
            search_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': search_prompt}
                ]
            )
            
            # Extract the search query response
            search_response_text = search_response['message']['content'].strip()
            
            print("\nLLM Output (Search Queries):")
            print(search_response_text)
            
            # Try to extract the search queries list with regex
            queries_list_str = extract_list_with_regex(search_response_text)
            
            # If regex fails, send back to the model to extract just the list
            if not queries_list_str:
                print("\nRegex extraction failed for search queries. Sending back to model...")
                
                extraction_prompt = extraction_prompt_template.format(response=search_response_text)
                extraction_response = ollama.chat(
                    model='llama3.1:8b',
                    messages=[
                        {'role': 'user', 'content': extraction_prompt}
                    ]
                )
                
                queries_list_str = extraction_response['message']['content'].strip()
                
                # Try regex again on the extracted response
                regex_result = extract_list_with_regex(queries_list_str)
                if regex_result:
                    queries_list_str = regex_result
            
            # Parse the search queries list (same literal-only parsing as above);
            # a non-list result leaves search_queries as [] so the stored
            # results stay consistent with is_valid_list
            try:
                parsed_queries = ast.literal_eval(queries_list_str)
                if isinstance(parsed_queries, list):
                    search_queries = parsed_queries
                    is_valid_queries = True
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                print(f"Error parsing search queries list: {str(e)}")
                search_queries = []
            
            print("\nExtracted Search Queries:")
            print(search_queries)
            print(f"Valid Python list: {is_valid_queries}")
        
        # Store the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'tools': {
                'initial_response': initial_response,
                'extracted_list': tools_list_str,
                'raw_tools': raw_tools_list,
                'standardized_tools': standardized_tools_list,
                'is_valid_list': is_valid_tools
            },
            'web_search_queries': {
                'required': web_search_required,
                'queries': search_queries,
                'is_valid_list': is_valid_queries
            }
        }
        
        # Add a small delay to avoid overwhelming the API
        time.sleep(1)
        
    except Exception as e:
        print(f"\nError processing task {task_id}: {str(e)}")
        # Store the error in the results so every task has an entry
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'tools': {
                'initial_response': f"ERROR: {str(e)}",
                'extracted_list': None,
                'raw_tools': [],
                'standardized_tools': [],
                'is_valid_list': False
            },
            'web_search_queries': {
                'required': False,
                'queries': [],
                'is_valid_list': False
            }
        }

# Save the results to a JSON file in the current working directory
# (mode "w" overwrites any existing file of the same name)
with open("gaia_tool_search_results.json", "w") as file:
    json.dump(results, file, indent=4)

# Closing summary line for the cell output
print("\n" + "="*80)
print(f"Processed {len(results)} questions and saved results to 'gaia_tool_search_results.json'")


Processing Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.

LLM Initial Output (Tool Selection):
To answer this question, we need to perform a series of calculations involving units conversion (from kilometers to hours), mathematical computation (to calculate the time taken), and formatting the output (rounding to the nearest thousand hours).

Here's the step-by-step analysis:

1. **Units Conversion:** We first need to convert the distance from kilometers to meters since we know that Eliud Kipchoge's marathon pace is in meters per minute, not kilometers.
2. **Mathematical Computation:** W

In [21]:
import pandas as pd
# `results` is keyed by task_id, so the transpose yields one row per task.
# NOTE(review): the name is misspelled ("resuldts"); kept because other cells reference it.
resuldts_df = pd.DataFrame(results).T

In [22]:
# Display the per-task table: question, expected answer, tools and web search queries
resuldts_df

Unnamed: 0,question,expected_answer,tools,web_search_queries
e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,17,"{'initial_response': 'To answer this question,...","{'required': False, 'queries': [], 'is_valid_l..."
8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,3,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['""Mercedes Sosa..."
ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,3,{'initial_response': 'After analyzing the ques...,"{'required': False, 'queries': [], 'is_valid_l..."
5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,0.1777,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['University of ..."
a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,3,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['YouTube video ..."
46719c30-f4c3-4cad-be07-d5cb21eee6bb,Of the authors (First M. Last) that worked on ...,Mapping Human Oriented Information to Software...,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['""First M. Last..."
4b6bb5f7-f634-410e-815d-e673ab7f8632,"In Series 9, Episode 11 of Doctor Who, the Doc...",THE CASTLE,{'initial_response': 'A fan of Doctor Who! To...,"{'required': True, 'queries': [], 'is_valid_li..."
cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb,An office held a Secret Santa gift exchange wh...,Fred,"{'initial_response': 'To answer this question,...","{'required': False, 'queries': [], 'is_valid_l..."
2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht...",Right,{'initial_response': 'A clever question! Afte...,"{'required': False, 'queries': [], 'is_valid_l..."
5cfb274c-0207-4aa7-9575-6ac0bd95d9b2,Each cell in the attached spreadsheet represen...,No,{'initial_response': 'A challenging question! ...,"{'required': True, 'queries': ['pathfinding al..."
