### Level 1 questions: Solve step by step

### First look for types of tools required for each question

#### 1. load dataset

In [3]:
import os
from datasets import load_dataset

def load_gaia_dataset(levels=["2023_level1"], loader_path="./GAIA.py", split="validation"):
    """
    Load the GAIA dataset for the specified levels.
    
    Args:
        levels: List of dataset levels to process
        loader_path: Path to the GAIA loader script
        split: Dataset split to use
    
    Returns:
        Dictionary containing loaded datasets by level
    """
    loaded_datasets = {}
    
    print(f"Loading GAIA dataset for levels: {levels}")
    
    for level in levels:
        print(f"\nLoading level: {level}")
        
        try:
            dataset = load_dataset(loader_path, name=level, split=split)
            loaded_datasets[level] = dataset
            print(f"Successfully loaded {len(dataset)} examples from {level}")
        except Exception as e:
            print(f"Error loading dataset {level}: {e}")
    
    return loaded_datasets

# Example usage:
levels = ["2023_level1"]  # Change as needed
loader_path = "./GAIA.py"
split = "validation"

datasets = load_gaia_dataset(levels, loader_path, split)

Loading GAIA dataset for levels: ['2023_level1']

Loading level: 2023_level1
Successfully loaded 53 examples from 2023_level1


#### 2. load examples (for val)

In [4]:
from tqdm import tqdm

def get_examples_from_dataset(datasets):
    """
    Extract all examples from loaded datasets.
    
    Args:
        datasets: Dictionary containing loaded datasets by level
    
    Returns:
        List of all examples
    """
    all_examples = []
    
    for level, dataset in datasets.items():
        print(f"Extracting examples from level: {level}")
        
        for i, example in tqdm(enumerate(dataset), total=len(dataset)):
            all_examples.append({
                "level": level,
                "index": i,
                "example": example
            })
    
    print(f"Total examples extracted: {len(all_examples)}")
    return all_examples

# Example usage - requires output from previous step:
examples = get_examples_from_dataset(datasets)
examples

Extracting examples from level: 2023_level1


100%|██████████| 53/53 [00:00<00:00, 10153.84it/s]

Total examples extracted: 53





[{'level': '2023_level1',
  'index': 0,
  'example': {'task_id': 'e1fc63a2-da7a-432f-be78-7c4a95598703',
   'Question': 'If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.',
   'Level': '1',
   'Final answer': '17',
   'file_name': '',
   'file_path': '',
   'Annotator Metadata': {'Steps': '1. Googled Eliud Kipchoge marathon pace to find 4min 37sec/mile\n2. Converted into fractions of hours.\n3. Found moon periapsis in miles (225,623 miles).\n4. Multiplied the two to find the number of hours and rounded to the nearest 100 hours.',
    'Number of steps': '4',
    'How long did this take?': '20 Minutes',
    'Tools': '1. A web browser.\n2. A search 

#### 3. extract ques and answers one by one from examples

In [5]:
def extract_question_and_answer(example):
    """
    Extract question and expected answer from an example.
    
    Args:
        example: Dictionary containing the example data
    
    Returns:
        Dictionary with extracted question and expected answer
    """
    question = example["example"]["Question"]
    expected_answer = example["example"].get("Final answer", "")
    
    return {
        "level": example["level"],
        "index": example["index"],
        "task_id": example["example"].get("task_id", ""),
        "question": question,
        "expected_answer": expected_answer
    }

# Example usage - requires output from previous step:
extracted_data = []
for example in examples:
    extracted_data.append(extract_question_and_answer(example))

# Print the first extracted question and answer
if extracted_data:
    print(f"First example - Question: {extracted_data[0]['question']}")
    print(f"First example - Expected Answer: {extracted_data[0]['expected_answer']}")

First example - Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.
First example - Expected Answer: 17


#### 4. method to read associated file content (some examples have a file attached)

In [5]:
def read_file(file_path):
    """
    Read content from a file.
    
    Args:
        file_path: Path to the file to read
    
    Returns:
        File content as string
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Try with a different encoding if UTF-8 fails
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

def get_file_content(example):
    """
    Get file content for an example if available.
    
    Args:
        example: Dictionary containing example data with extracted information
    
    Returns:
        Dictionary with original data plus file content
    """
    # Get file path information from the original example
    original_example = examples[example["index"]]["example"]
    file_path = original_example.get("file_path", "")
    file_name = original_example.get("file_name", "")
    
    file_content = ""
    # Try to read from file_path
    if file_path and os.path.exists(file_path):
        try:
            file_content = read_file(file_path)
            print(f"Read file content from {file_path}: {len(file_content)} characters")
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
    # If that fails, try with file_name
    elif file_name and os.path.exists(file_name):
        try:
            file_content = read_file(file_name)
            print(f"Read file content from {file_name}: {len(file_content)} characters")
        except Exception as e:
            print(f"Error reading file {file_name}: {e}")
    
    # Add file content to the result
    result = example.copy()
    result["file_content"] = file_content
    result["has_file_content"] = bool(file_content)
    
    return result

# Example usage - requires output from previous steps:
examples_with_files = []
for example in extracted_data:
    examples_with_files.append(get_file_content(example))

# Check how many examples have file content
file_count = sum(1 for example in examples_with_files if example["has_file_content"])
print(f"Examples with file content: {file_count} out of {len(examples_with_files)}")

Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx: 17525 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx: 5115 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png: 63079 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png: 133565 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt: 97 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx: 388988 characters
Read file content from /mnt/data4/home/rrao/projects/GaiaBenchmarkHF/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx: 12370 c

#### 5. example of web search

In [6]:
from utils.search_capability import *
from utils.search_capability import search_and_parse

results = search_and_parse("eliud kipchoge nationality", max_results = 3)
print(f"Found {len(results['search_results'])} results.")
print(f"Content length: {len(results['parsed_content'])} characters")

Searching for: eliud kipchoge nationality
Searching with DDGS...
DDGS error: https://lite.duckduckgo.com/lite/ 202 Ratelimit
Searching with googlesearch-python...
Found 3 results with googlesearch-python.
Extracting content from search results...
Found 3 results.
Content length: 94065 characters


##### 6. Check iteration over questions and then try iterating to identify tools

In [7]:
# Iterate through all the example questions with proper formatting
for example in extracted_data:
    print(f"Task ID: {example['task_id']}")
    print(f"Question: {example['question']}")
    print(f"Expected Answer: {example['expected_answer']}")


Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.
Expected Answer: 17
Task ID: 8e867cd7-cff9-4e6c-867a-ff5ddc2550be
Question: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
Expected Answer: 3
Task ID: ec09fa32-d03f-4bf8-84b0-1f16922c3ae4
Question: Here's a fun riddle that I think you'll enjoy.

You have been selected to play the final round of the hit new game show "Pick That Ping-Pong". In this round, you will be competing for a large cash prize. Your job will be to pick one of several differ

##### Iterate over level 1 questions and extract tools 

In [None]:
import ollama
import json
import time
import re

# Assuming extracted_data is already defined and contains a list of dictionaries
# Each dictionary has 'task_id', 'question', and 'expected_answer' keys

# Create a new dictionary to store questions and their proposed tools
results = {}

# Define the prompt template
prompt_template = """You are an AI assistant tasked with determining the tools required to answer a given question from the GAIA benchmark.

Available Tools:
- WEB SEARCH: For retrieving up-to-date or external information.
- CALCULATOR: For performing mathematical computations.
- CODE EXECUTOR: For writing and executing code snippets.

Instructions:
1. Analyze the question carefully.
2. Identify which of the above tools are necessary to answer the question.
3. Provide a Python list containing the names of the required tools. If no tools are needed, return an empty list.

Example:
Question: "What is the capital of France?"
Answer: ["WEB SEARCH"]

Question: "Calculate the sum of 123 and 456."
Answer: ["CALCULATOR"]

Question: "Write a Python function to sort a list of numbers."
Answer: ["CODE EXECUTOR"]

Question: "Explain the process of photosynthesis."
Answer: []

Now, analyze the following question:

Question: "{question}"
Answer: """

# Define the extraction prompt template
extraction_prompt_template = """Extract ONLY the Python list of tools from the following response. Return just the Python list, nothing else.

Response:
{response}

Python list: """

# Function to try to extract a Python list using regex
def extract_list_with_regex(text):
    # Look for a Python list pattern
    pattern = r'\[(?:"[^"]*"|\'[^\']*\'|[^\[\]]*)*\]'
    matches = re.findall(pattern, text)
    if matches:
        return matches[0]
    return None

# Iterate over each example in extracted_data
for example in extracted_data:
    task_id = example['task_id']
    question = example['question']
    expected_answer = example['expected_answer']
    
    print("\n" + "="*80)
    print(f"Processing Task ID: {task_id}")
    print(f"Question: {question}")
    
    # Format the prompt with the current question
    formatted_prompt = prompt_template.format(question=question)
    
    # Send the prompt to the model
    try:
        # Get initial response
        response = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {'role': 'user', 'content': formatted_prompt}
            ]
        )
        
        # Extract the model's initial response
        initial_response = response['message']['content'].strip()
        
        # Print the initial response
        print("\nLLM Initial Output:")
        print(initial_response)
        
        # Try to extract the list with regex first
        extracted_list = extract_list_with_regex(initial_response)
        
        # If regex fails, send back to the model to extract just the list
        if not extracted_list:
            print("\nRegex extraction failed. Sending back to model for extraction...")
            
            extraction_prompt = extraction_prompt_template.format(response=initial_response)
            extraction_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': extraction_prompt}
                ]
            )
            
            extracted_list = extraction_response['message']['content'].strip()
            
            # Try regex again on the extracted response
            regex_result = extract_list_with_regex(extracted_list)
            if regex_result:
                extracted_list = regex_result
        
        # Validate if the extracted_list is actually a Python list
        is_valid = False
        try:
            # Try to evaluate as a Python list
            parsed_list = eval(extracted_list)
            if isinstance(parsed_list, list):
                is_valid = True
        except:
            is_valid = False
        
        # Print the extracted list
        print("\nExtracted List:")
        print(extracted_list)
        print(f"Valid Python list: {is_valid}")
        
        # Store the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'initial_response': initial_response,
            'extracted_list': extracted_list,
            'is_valid_list': is_valid
        }
        
        # Add a small delay to avoid overwhelming the API
        time.sleep(1)
        
    except Exception as e:
        print(f"\nError processing task {task_id}: {str(e)}")
        # Store the error in the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'initial_response': f"ERROR: {str(e)}",
            'extracted_list': None,
            'is_valid_list': False
        }

# Save the results to a JSON file
with open("gaia_tool_results.json", "w") as file:
    json.dump(results, file, indent=4)

print("\n" + "="*80)
print(f"Processed {len(results)} questions and saved results to 'gaia_tool_results.json'")


Processing Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.

LLM Initial Output:
To answer this question, we need to perform a series of calculations and look up some information from external sources. Here's how I'll break it down:

1. First, we need to find the minimum perigee value of the Moon's closest approach. We can use the WEB SEARCH tool for this.
2. Next, we need to retrieve Eliud Kipchoge's marathon pace and convert it into a suitable unit for calculation (e.g., kilometers per hour). Again, we'll use WEB SEARCH for this information.
3. Then, we need to calculate the distance bet

In [17]:
import pandas as pd
resuldts_df = pd.DataFrame(results).T

In [19]:
resuldts_df["extracted_list"]

e1fc63a2-da7a-432f-be78-7c4a95598703                     ["WEB SEARCH", "CALCULATOR"]
8e867cd7-cff9-4e6c-867a-ff5ddc2550be                                   ["WEB SEARCH"]
ec09fa32-d03f-4bf8-84b0-1f16922c3ae4                                               []
5d0080cb-90d7-4712-bc33-848150e917d3                     ["WEB_SEARCH", "CALCULATOR"]
a1e91b78-d3d8-4675-bb8d-62741b4b68a6                                   ["WEB SEARCH"]
46719c30-f4c3-4cad-be07-d5cb21eee6bb    ["WEB SEARCH", "CALCULATOR", "CODE EXECUTOR"]
4b6bb5f7-f634-410e-815d-e673ab7f8632                  ["WEB SEARCH", "CODE EXECUTOR"]
cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb                                   ["WEB SEARCH"]
2d83110e-a098-4ebb-9987-066c06fa42d0                                ["CODE EXECUTOR"]
5cfb274c-0207-4aa7-9575-6ac0bd95d9b2                                ["CODE EXECUTOR"]
27d5d136-8563-469e-92bf-fd103c28b57c                                   ["WEB SEARCH"]
dc28cf18-6431-458b-83ef-64b3ce566c10                  

## Add code to ask for web search query suggestions if web search is required

In [20]:
import ollama
import json
import time
import re

# Assuming extracted_data is already defined and contains a list of dictionaries
# Each dictionary has 'task_id', 'question', and 'expected_answer' keys

# Create a new dictionary to store questions and their proposed tools
results = {}

# Define the primary tool selection prompt template
tool_prompt_template = """You are an AI assistant tasked with determining the tools required to answer a given question from the GAIA benchmark.

Available Tools:
- WEB SEARCH: For retrieving up-to-date or external information.
- CALCULATOR: For performing mathematical computations.
- CODE EXECUTOR: For writing and executing code snippets.

Instructions:
1. Analyze the question carefully.
2. Identify which of the above tools are necessary to answer the question.
3. Provide a Python list containing the names of the required tools. If no tools are needed, return an empty list.

Example:
Question: "What is the capital of France?"
Answer: ["WEB SEARCH"]

Question: "Calculate the sum of 123 and 456."
Answer: ["CALCULATOR"]

Question: "Write a Python function to sort a list of numbers."
Answer: ["CODE EXECUTOR"]

Question: "Explain the process of photosynthesis."
Answer: []

Now, analyze the following question:

Question: "{question}"
Answer: """

# Define the web search query generation prompt template
search_query_prompt_template = """You are an AI assistant helping to generate effective web search queries for a question.

The following question has been identified as requiring web search to answer:

Question: "{question}"

Instructions:
1. Analyze the question carefully.
2. Identify the key information needs.
3. Generate a list of specific search queries that would be most effective for finding the required information.
4. Return a Python list containing these search queries as strings.

Example:
Question: "What were the main causes of World War I?"
Answer: ["causes of World War I", "World War I origins", "assassination of Archduke Franz Ferdinand", "European alliances before World War I"]

Now, please generate search queries for the question above:"""

# Define the extraction prompt template
extraction_prompt_template = """Extract ONLY the Python list from the following response. Return just the Python list, nothing else.

Response:
{response}

Python list: """

# Function to try to extract a Python list using regex
def extract_list_with_regex(text):
    # Look for a Python list pattern
    pattern = r'\[(?:"[^"]*"|\'[^\']*\'|[^\[\]]*)*\]'
    matches = re.findall(pattern, text)
    if matches:
        return matches[0]
    return None

# Function to normalize tool names for consistent comparison
def normalize_tool_name(tool_name):
    # Convert to uppercase and remove underscores and spaces
    normalized = tool_name.upper().replace('_', '').replace(' ', '')
    return normalized

# Dictionary to map normalized tool names back to standard formats
TOOL_NAME_MAP = {
    'WEBSEARCH': 'WEB SEARCH',
    'CODEEXECUTOR': 'CODE EXECUTOR'
}

# Function to standardize tool names in a list
def standardize_tool_names(tools_list):
    standardized_list = []
    for tool in tools_list:
        normalized = normalize_tool_name(tool)
        # Map back to standard format if possible, otherwise use original
        standardized_list.append(TOOL_NAME_MAP.get(normalized, tool))
    return standardized_list

# Iterate over each example in extracted_data
for example in extracted_data:
    task_id = example['task_id']
    question = example['question']
    expected_answer = example['expected_answer']
    
    print("\n" + "="*80)
    print(f"Processing Task ID: {task_id}")
    print(f"Question: {question}")
    
    # Format the tool prompt with the current question
    formatted_prompt = tool_prompt_template.format(question=question)
    
    # Send the prompt to the model
    try:
        # Get initial response for tool selection
        response = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {'role': 'user', 'content': formatted_prompt}
            ]
        )
        
        # Extract the model's initial response
        initial_response = response['message']['content'].strip()
        
        print("\nLLM Initial Output (Tool Selection):")
        print(initial_response)
        
        # Try to extract the tools list with regex first
        tools_list_str = extract_list_with_regex(initial_response)
        
        # If regex fails, send back to the model to extract just the list
        if not tools_list_str:
            print("\nRegex extraction failed. Sending back to model for extraction...")
            
            extraction_prompt = extraction_prompt_template.format(response=initial_response)
            extraction_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': extraction_prompt}
                ]
            )
            
            tools_list_str = extraction_response['message']['content'].strip()
            
            # Try regex again on the extracted response
            regex_result = extract_list_with_regex(tools_list_str)
            if regex_result:
                tools_list_str = regex_result
        
        # Parse the tools list
        raw_tools_list = []
        standardized_tools_list = []
        is_valid_tools = False
        try:
            # Try to evaluate as a Python list
            raw_tools_list = eval(tools_list_str)
            if isinstance(raw_tools_list, list):
                is_valid_tools = True
                # Standardize the tool names
                standardized_tools_list = standardize_tool_names(raw_tools_list)
                print("\nStandardized Tools List:")
                print(standardized_tools_list)
        except Exception as e:
            print(f"Error parsing tools list: {str(e)}")
            raw_tools_list = []
            standardized_tools_list = []
        
        # Initialize search queries list
        search_queries = []
        is_valid_queries = False
        
        # Check if web search is required using normalized comparison
        web_search_required = False
        if is_valid_tools:
            for tool in raw_tools_list:
                if normalize_tool_name(tool) == 'WEBSEARCH':
                    web_search_required = True
                    break
        
        # If web search is required, generate search queries
        if web_search_required:
            print("\nWEB SEARCH required. Generating search queries...")
            
            # Format the search query prompt
            search_prompt = search_query_prompt_template.format(question=question)
            
            # Get search query response
            search_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': search_prompt}
                ]
            )
            
            # Extract the search query response
            search_response_text = search_response['message']['content'].strip()
            
            print("\nLLM Output (Search Queries):")
            print(search_response_text)
            
            # Try to extract the search queries list with regex
            queries_list_str = extract_list_with_regex(search_response_text)
            
            # If regex fails, send back to the model to extract just the list
            if not queries_list_str:
                print("\nRegex extraction failed for search queries. Sending back to model...")
                
                extraction_prompt = extraction_prompt_template.format(response=search_response_text)
                extraction_response = ollama.chat(
                    model='llama3.1:8b',
                    messages=[
                        {'role': 'user', 'content': extraction_prompt}
                    ]
                )
                
                queries_list_str = extraction_response['message']['content'].strip()
                
                # Try regex again on the extracted response
                regex_result = extract_list_with_regex(queries_list_str)
                if regex_result:
                    queries_list_str = regex_result
            
            # Parse the search queries list
            try:
                # Try to evaluate as a Python list
                search_queries = eval(queries_list_str)
                if isinstance(search_queries, list):
                    is_valid_queries = True
            except Exception as e:
                print(f"Error parsing search queries list: {str(e)}")
                search_queries = []
            
            print("\nExtracted Search Queries:")
            print(search_queries)
            print(f"Valid Python list: {is_valid_queries}")
        
        # Store the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'tools': {
                'initial_response': initial_response,
                'extracted_list': tools_list_str,
                'raw_tools': raw_tools_list,
                'standardized_tools': standardized_tools_list,
                'is_valid_list': is_valid_tools
            },
            'web_search_queries': {
                'required': web_search_required,
                'queries': search_queries,
                'is_valid_list': is_valid_queries
            }
        }
        
        # Add a small delay to avoid overwhelming the API
        time.sleep(1)
        
    except Exception as e:
        print(f"\nError processing task {task_id}: {str(e)}")
        # Store the error in the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'tools': {
                'initial_response': f"ERROR: {str(e)}",
                'extracted_list': None,
                'raw_tools': [],
                'standardized_tools': [],
                'is_valid_list': False
            },
            'web_search_queries': {
                'required': False,
                'queries': [],
                'is_valid_list': False
            }
        }

# Save the results to a JSON file
with open("gaia_tool_search_results.json", "w") as file:
    json.dump(results, file, indent=4)

print("\n" + "="*80)
print(f"Processed {len(results)} questions and saved results to 'gaia_tool_search_results.json'")


Processing Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.

LLM Initial Output (Tool Selection):
To answer this question, we need to perform a series of calculations involving units conversion (from kilometers to hours), mathematical computation (to calculate the time taken), and formatting the output (rounding to the nearest thousand hours).

Here's the step-by-step analysis:

1. **Units Conversion:** We first need to convert the distance from kilometers to meters since we know that Eliud Kipchoge's marathon pace is in meters per minute, not kilometers.
2. **Mathematical Computation:** W

In [21]:
import pandas as pd
resuldts_df = pd.DataFrame(results).T

In [22]:
resuldts_df

Unnamed: 0,question,expected_answer,tools,web_search_queries
e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,17,"{'initial_response': 'To answer this question,...","{'required': False, 'queries': [], 'is_valid_l..."
8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,3,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['""Mercedes Sosa..."
ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,3,{'initial_response': 'After analyzing the ques...,"{'required': False, 'queries': [], 'is_valid_l..."
5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,0.1777,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['University of ..."
a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,3,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['YouTube video ..."
46719c30-f4c3-4cad-be07-d5cb21eee6bb,Of the authors (First M. Last) that worked on ...,Mapping Human Oriented Information to Software...,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['""First M. Last..."
4b6bb5f7-f634-410e-815d-e673ab7f8632,"In Series 9, Episode 11 of Doctor Who, the Doc...",THE CASTLE,{'initial_response': 'A fan of Doctor Who! To...,"{'required': True, 'queries': [], 'is_valid_li..."
cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb,An office held a Secret Santa gift exchange wh...,Fred,"{'initial_response': 'To answer this question,...","{'required': False, 'queries': [], 'is_valid_l..."
2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht...",Right,{'initial_response': 'A clever question! Afte...,"{'required': False, 'queries': [], 'is_valid_l..."
5cfb274c-0207-4aa7-9575-6ac0bd95d9b2,Each cell in the attached spreadsheet represen...,No,{'initial_response': 'A challenging question! ...,"{'required': True, 'queries': ['pathfinding al..."


##### add search and summarized results

In [23]:
import ollama
import json
import time
import re

# Import the necessary functions from the utils modules similar to the first code
from utils.search_capability import search_and_parse
from utils.summarize import summarize_text

# Assuming extracted_data is already defined and contains a list of dictionaries
# Each dictionary has 'task_id', 'question', and 'expected_answer' keys

# Create a new dictionary to store questions and their proposed tools
results = {}

# Define the primary tool selection prompt template
tool_prompt_template = """You are an AI assistant tasked with determining the tools required to answer a given question from the GAIA benchmark.

Available Tools:
- WEB SEARCH: For retrieving up-to-date or external information.
- CALCULATOR: For performing mathematical computations.
- CODE EXECUTOR: For writing and executing code snippets.

Instructions:
1. Analyze the question carefully.
2. Identify which of the above tools are necessary to answer the question.
3. Provide a Python list containing the names of the required tools. If no tools are needed, return an empty list.

Example:
Question: "What is the capital of France?"
Answer: ["WEB SEARCH"]

Question: "Calculate the sum of 123 and 456."
Answer: ["CALCULATOR"]

Question: "Write a Python function to sort a list of numbers."
Answer: ["CODE EXECUTOR"]

Question: "Explain the process of photosynthesis."
Answer: []

Now, analyze the following question:

Question: "{question}"
Answer: """

# Define the web search query generation prompt template
search_query_prompt_template = """You are an AI assistant helping to generate effective web search queries for a question.

The following question has been identified as requiring web search to answer:

Question: "{question}"

Instructions:
1. Analyze the question carefully.
2. Identify the key information needs.
3. Generate a list of specific search queries that would be most effective for finding the required information.
4. Return a Python list containing these search queries as strings.

Example:
Question: "What were the main causes of World War I?"
Answer: ["causes of World War I", "World War I origins", "assassination of Archduke Franz Ferdinand", "European alliances before World War I"]

Now, please generate search queries for the question above:"""

# Define the extraction prompt template
extraction_prompt_template = """Extract ONLY the Python list from the following response. Return just the Python list, nothing else.

Response:
{response}

Python list: """

# Function to try to extract a Python list using regex
def extract_list_with_regex(text):
    # Look for a Python list pattern
    pattern = r'\[(?:"[^"]*"|\'[^\']*\'|[^\[\]]*)*\]'
    matches = re.findall(pattern, text)
    if matches:
        return matches[0]
    return None

# Function to normalize tool names for consistent comparison
def normalize_tool_name(tool_name):
    # Convert to uppercase and remove underscores and spaces
    normalized = tool_name.upper().replace('_', '').replace(' ', '')
    return normalized

# Dictionary to map normalized tool names back to standard formats
TOOL_NAME_MAP = {
    'WEBSEARCH': 'WEB SEARCH',
    'CODEEXECUTOR': 'CODE EXECUTOR'
}

# Function to standardize tool names in a list
def standardize_tool_names(tools_list):
    standardized_list = []
    for tool in tools_list:
        normalized = normalize_tool_name(tool)
        # Map back to standard format if possible, otherwise use original
        standardized_list.append(TOOL_NAME_MAP.get(normalized, tool))
    return standardized_list

# Function to perform web search and summarize content
def search_and_summarize(query, max_content_length=50000, target_summary_length=20000):
    """
    Perform web search and summarize the returned content
    
    Args:
        query: Search query string
        max_content_length: Maximum length for raw content before truncation
        target_summary_length: Target length for summarized content
        
    Returns:
        Dictionary with raw and summarized content
    """
    try:
        # Perform the web search
        search_results = search_and_parse(query)
        parsed_content = search_results.get('parsed_content', '')
        
        # Check if search returned too much content and truncate if needed
        if len(parsed_content) > max_content_length:
            print(f"Search returned {len(parsed_content)} characters, truncating to {max_content_length}")
            parsed_content = parsed_content[:max_content_length]
            search_results['parsed_content'] = parsed_content
            
        print(f"Completed search with {len(parsed_content)} characters of content")
        
        # Summarize the search results
        summarized_content = ""
        if parsed_content:
            summarized_content = summarize_text(
                text=parsed_content,
                target_len=target_summary_length,
                chunk_size=10000,
                truncate=True,
                model="llama3:8b",
                temperature=0
            )
            print(f"Summarized content: {len(summarized_content)} characters")
        
        return {
            'raw_content': parsed_content,
            'summarized_content': summarized_content,
            'raw_length': len(parsed_content),
            'summary_length': len(summarized_content)
        }
    except Exception as e:
        print(f"Error during search and summarize: {e}")
        return {
            'raw_content': "",
            'summarized_content': "",
            'raw_length': 0,
            'summary_length': 0,
            'error': str(e)
        }

# Iterate over each example in extracted_data
for example in extracted_data:
    task_id = example['task_id']
    question = example['question']
    expected_answer = example['expected_answer']
    
    print("\n" + "="*80)
    print(f"Processing Task ID: {task_id}")
    print(f"Question: {question}")
    
    # Format the tool prompt with the current question
    formatted_prompt = tool_prompt_template.format(question=question)
    
    # Send the prompt to the model
    try:
        # Get initial response for tool selection
        response = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {'role': 'user', 'content': formatted_prompt}
            ]
        )
        
        # Extract the model's initial response
        initial_response = response['message']['content'].strip()
        
        print("\nLLM Initial Output (Tool Selection):")
        print(initial_response)
        
        # Try to extract the tools list with regex first
        tools_list_str = extract_list_with_regex(initial_response)
        
        # If regex fails, send back to the model to extract just the list
        if not tools_list_str:
            print("\nRegex extraction failed. Sending back to model for extraction...")
            
            extraction_prompt = extraction_prompt_template.format(response=initial_response)
            extraction_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': extraction_prompt}
                ]
            )
            
            tools_list_str = extraction_response['message']['content'].strip()
            
            # Try regex again on the extracted response
            regex_result = extract_list_with_regex(tools_list_str)
            if regex_result:
                tools_list_str = regex_result
        
        # Parse the tools list
        raw_tools_list = []
        standardized_tools_list = []
        is_valid_tools = False
        try:
            # Try to evaluate as a Python list
            raw_tools_list = eval(tools_list_str)
            if isinstance(raw_tools_list, list):
                is_valid_tools = True
                # Standardize the tool names
                standardized_tools_list = standardize_tool_names(raw_tools_list)
                print("\nStandardized Tools List:")
                print(standardized_tools_list)
        except Exception as e:
            print(f"Error parsing tools list: {str(e)}")
            raw_tools_list = []
            standardized_tools_list = []
        
        # Initialize search queries list
        search_queries = []
        is_valid_queries = False
        
        # Check if web search is required using normalized comparison
        web_search_required = False
        if is_valid_tools:
            for tool in raw_tools_list:
                if normalize_tool_name(tool) == 'WEBSEARCH':
                    web_search_required = True
                    break
        
        # Initialize web search results
        web_search_results = []
        
        # If web search is required, generate search queries
        if web_search_required:
            print("\nWEB SEARCH required. Generating search queries...")
            
            # Format the search query prompt
            search_prompt = search_query_prompt_template.format(question=question)
            
            # Get search query response
            search_response = ollama.chat(
                model='llama3.1:8b',
                messages=[
                    {'role': 'user', 'content': search_prompt}
                ]
            )
            
            # Extract the search query response
            search_response_text = search_response['message']['content'].strip()
            
            print("\nLLM Output (Search Queries):")
            print(search_response_text)
            
            # Try to extract the search queries list with regex
            queries_list_str = extract_list_with_regex(search_response_text)
            
            # If regex fails, send back to the model to extract just the list
            if not queries_list_str:
                print("\nRegex extraction failed for search queries. Sending back to model...")
                
                extraction_prompt = extraction_prompt_template.format(response=search_response_text)
                extraction_response = ollama.chat(
                    model='llama3.1:8b',
                    messages=[
                        {'role': 'user', 'content': extraction_prompt}
                    ]
                )
                
                queries_list_str = extraction_response['message']['content'].strip()
                
                # Try regex again on the extracted response
                regex_result = extract_list_with_regex(queries_list_str)
                if regex_result:
                    queries_list_str = regex_result
            
            # Parse the search queries list
            try:
                # Try to evaluate as a Python list
                search_queries = eval(queries_list_str)
                if isinstance(search_queries, list):
                    is_valid_queries = True
            except Exception as e:
                print(f"Error parsing search queries list: {str(e)}")
                search_queries = []
            
            print("\nExtracted Search Queries:")
            print(search_queries)
            print(f"Valid Python list: {is_valid_queries}")
            
            # Execute web searches for each query and store results
            if is_valid_queries and search_queries:
                print("\nExecuting web searches and summarizing content...")
                for i, query in enumerate(search_queries):
                    print(f"\nSearching for query {i+1}/{len(search_queries)}: '{query}'")
                    search_result = search_and_summarize(
                        query, 
                        max_content_length=50000, 
                        target_summary_length=20000
                    )
                    web_search_results.append({
                        'query': query,
                        'result': search_result
                    })
                    # Add a small delay between searches
                    time.sleep(2)
        
        # Store the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'tools': {
                'initial_response': initial_response,
                'extracted_list': tools_list_str,
                'raw_tools': raw_tools_list,
                'standardized_tools': standardized_tools_list,
                'is_valid_list': is_valid_tools
            },
            'web_search_queries': {
                'required': web_search_required,
                'queries': search_queries,
                'is_valid_list': is_valid_queries
            },
            'web_search_results': web_search_results
        }
        
        # Add a small delay to avoid overwhelming the API
        time.sleep(1)
        
    except Exception as e:
        print(f"\nError processing task {task_id}: {str(e)}")
        # Store the error in the results
        results[task_id] = {
            'question': question,
            'expected_answer': expected_answer,
            'tools': {
                'initial_response': f"ERROR: {str(e)}",
                'extracted_list': None,
                'raw_tools': [],
                'standardized_tools': [],
                'is_valid_list': False
            },
            'web_search_queries': {
                'required': False,
                'queries': [],
                'is_valid_list': False
            },
            'web_search_results': []
        }

# Save the results to a JSON file
with open("gaia_tool_search_results.json", "w") as file:
    json.dump(results, file, indent=4)

print("\n" + "="*80)
print(f"Processed {len(results)} questions and saved results to 'gaia_tool_search_results.json'")


Processing Task ID: e1fc63a2-da7a-432f-be78-7c4a95598703
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.

LLM Initial Output (Tool Selection):
To answer this question, we need to perform some mathematical computations to calculate the time it would take Eliud Kipchoge to run the distance between the Earth and the Moon.

Here are the steps:

1. We need to find the minimum perigee value of the Moon's closest approach on Wikipedia.
2. We can use this value to calculate the distance between the Earth and the Moon.
3. Next, we need to know Eliud Kipchoge's marathon pace and convert it to a speed in kilometers per hour.
4. Finally, we can 

In [25]:
results.keys()

dict_keys(['e1fc63a2-da7a-432f-be78-7c4a95598703', '8e867cd7-cff9-4e6c-867a-ff5ddc2550be', 'ec09fa32-d03f-4bf8-84b0-1f16922c3ae4', '5d0080cb-90d7-4712-bc33-848150e917d3', 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6', '46719c30-f4c3-4cad-be07-d5cb21eee6bb', '4b6bb5f7-f634-410e-815d-e673ab7f8632', 'cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb', '2d83110e-a098-4ebb-9987-066c06fa42d0', '5cfb274c-0207-4aa7-9575-6ac0bd95d9b2', '27d5d136-8563-469e-92bf-fd103c28b57c', 'dc28cf18-6431-458b-83ef-64b3ce566c10', 'b816bfce-3d80-4913-a07d-69b752ce6377', '72e110e7-464c-453c-a309-90a95aed6538', '42576abe-0deb-4869-8c63-225c2d75a95a', 'b415aba4-4b68-4fc6-9b89-2c812e55a3e1', 'cca530fc-4052-43b2-b130-b30968d8aa44', '935e2cff-ae78-4218-b3f5-115589b19dae', '4fc2f1ae-8625-45b5-ab34-ad4433bc21f8', '5188369a-3bbe-43d8-8b94-11558f909a08', '6f37996b-2ac7-44b0-8e68-6d28256631b4', '9318445f-fe6a-4e1b-acbf-c68228c9906a', '389793a7-ca17-4e82-81cb-2b3a2391b4b9', '4b650a35-8529-4695-89ed-8dc7a500a498', 'a3fbeb63-0e8c-4a11-bff6-0e3b

In [27]:
results.keys()

dict_keys(['e1fc63a2-da7a-432f-be78-7c4a95598703', '8e867cd7-cff9-4e6c-867a-ff5ddc2550be', 'ec09fa32-d03f-4bf8-84b0-1f16922c3ae4', '5d0080cb-90d7-4712-bc33-848150e917d3', 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6', '46719c30-f4c3-4cad-be07-d5cb21eee6bb', '4b6bb5f7-f634-410e-815d-e673ab7f8632', 'cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb', '2d83110e-a098-4ebb-9987-066c06fa42d0', '5cfb274c-0207-4aa7-9575-6ac0bd95d9b2', '27d5d136-8563-469e-92bf-fd103c28b57c', 'dc28cf18-6431-458b-83ef-64b3ce566c10', 'b816bfce-3d80-4913-a07d-69b752ce6377', '72e110e7-464c-453c-a309-90a95aed6538', '42576abe-0deb-4869-8c63-225c2d75a95a', 'b415aba4-4b68-4fc6-9b89-2c812e55a3e1', 'cca530fc-4052-43b2-b130-b30968d8aa44', '935e2cff-ae78-4218-b3f5-115589b19dae', '4fc2f1ae-8625-45b5-ab34-ad4433bc21f8', '5188369a-3bbe-43d8-8b94-11558f909a08', '6f37996b-2ac7-44b0-8e68-6d28256631b4', '9318445f-fe6a-4e1b-acbf-c68228c9906a', '389793a7-ca17-4e82-81cb-2b3a2391b4b9', '4b650a35-8529-4695-89ed-8dc7a500a498', 'a3fbeb63-0e8c-4a11-bff6-0e3b

In [30]:
results['e1fc63a2-da7a-432f-be78-7c4a95598703']

{'question': 'If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.',
 'expected_answer': '17',
 'tools': {'initial_response': 'To answer this question, we need to perform some mathematical computations to calculate the time it would take Eliud Kipchoge to run the distance between the Earth and the Moon.\n\nHere are the steps:\n\n1. We need to find the minimum perigee value of the Moon\'s closest approach on Wikipedia.\n2. We can use this value to calculate the distance between the Earth and the Moon.\n3. Next, we need to know Eliud Kipchoge\'s marathon pace and convert it to a speed in kilometers per hour.\n4. Finally, we can divide the distance by 

#### add final summary from web search content

##### add final search summary

In [41]:
from utils.summarize import summarize_text

def add_final_summary_to_results(results):
    """
    Extracts summarized_content from web_search_results, combines them,
    summarizes the combined text, and adds it to the results dictionary.
    
    Args:
        results: Dictionary containing web_search_results
        
    Returns:
        Updated results dictionary with final_summarized_web_content
    """
    # Handle the case where input isn't a dictionary
    if not isinstance(results, dict):
        print("Error: Input must be a dictionary")
        return results
    
    # Check if this is a dictionary with web_search_results directly
    # (Single results case without nested IDs)
    if 'web_search_results' in results and isinstance(results['web_search_results'], list):
        # Extract all summarized contents
        summarized_contents = []
        for search_result in results['web_search_results']:
            if 'result' in search_result and 'summarized_content' in search_result['result']:
                # Add the query as a heading before each summarized content for context
                query = search_result.get('query', 'Unknown Query')
                content = search_result['result']['summarized_content']
                summarized_contents.append(f"Search for: {query}\n\n{content}")
        
        # If no summarized contents were found, return empty string
        if not summarized_contents:
            results['final_summarized_web_content'] = ""
            return results
        
        # Combine all summarized contents with clear separation
        combined_text = "\n\n---\n\n".join(summarized_contents)
        
        # Summarize the combined text
        final_summary = summarize_text(
            text=combined_text,
            target_len=500,
            chunk_size=8000,  # Default value, adjust if needed
            truncate=False,
            model='qwen2.5:72b',  # Using the default model from your utils
            temperature=0.3,
            show_progress=True
        )
        
        # Add the final summary to the results dictionary
        results['final_summarized_web_content'] = final_summary
        return results
    
    # Otherwise, try to process as a multi-entry results dictionary
    for result_id, result_data in results.items():
        # Skip if the value is not a dictionary
        if not isinstance(result_data, dict):
            continue
            
        # Skip if the entry doesn't have web_search_results or it's empty
        if 'web_search_results' not in result_data or not result_data['web_search_results']:
            result_data['final_summarized_web_content'] = ""
            continue
        
        # Extract all summarized contents from this entry
        summarized_contents = []
        for search_result in result_data['web_search_results']:
            if 'result' in search_result and 'summarized_content' in search_result['result']:
                query = search_result.get('query', 'Unknown Query')
                content = search_result['result']['summarized_content']
                summarized_contents.append(f"Search for: {query}\n\n{content}")
        
        # If no summarized contents were found, add empty string and continue
        if not summarized_contents:
            result_data['final_summarized_web_content'] = ""
            continue
        
        # Combine all summarized contents with clear separation
        combined_text = "\n\n---\n\n".join(summarized_contents)
        
        # Summarize the combined text
        final_summary = summarize_text(
            text=combined_text,
            target_len=500,
            chunk_size=8000,
            truncate=False,
            model='llama3.1:8b',
            temperature=0.3,
            show_progress=True
        )
        
        # Add the final summary to the results dictionary entry
        result_data['final_summarized_web_content'] = final_summary
    
    return results

results1 = add_final_summary_to_results(results)

Text split into 7 chunks for processing
Processing chunk 1/7 (length: 7940 characters)
Chunk 1 summarized to 1050 characters
Processing chunk 2/7 (length: 7739 characters)
Chunk 2 summarized to 995 characters
Processing chunk 3/7 (length: 7591 characters)
Chunk 3 summarized to 941 characters
Processing chunk 4/7 (length: 7727 characters)
Chunk 4 summarized to 1083 characters
Processing chunk 5/7 (length: 7892 characters)
Chunk 5 summarized to 964 characters
Processing chunk 6/7 (length: 7948 characters)
Chunk 6 summarized to 1144 characters
Processing chunk 7/7 (length: 3215 characters)
Chunk 7 summarized to 892 characters
Summarization complete. Final summary length: 7081 characters
Text split into 5 chunks for processing
Processing chunk 1/5 (length: 7896 characters)
Chunk 1 summarized to 1153 characters
Processing chunk 2/5 (length: 7765 characters)
Chunk 2 summarized to 1002 characters
Processing chunk 3/5 (length: 7905 characters)
Chunk 3 summarized to 1125 characters
Processing c

In [44]:
# Remove the top-level 'final_summarized_web_content' key from results1
if 'final_summarized_web_content' in results1:
    del results1['final_summarized_web_content']

In [47]:
import json
import pickle

# Save the results1 dictionary to a JSON file
with open("gaia_analysis_results_summarized.json", "w", encoding="utf-8") as f:
    json.dump(results1, f, indent=4, ensure_ascii=False)

# Also save as pickle for preserving Python object structure
with open("gaia_analysis_results_summarized.pkl", "wb") as f:
    pickle.dump(results1, f)

print(f"Results saved to 'gaia_analysis_results_summarized.json' and 'gaia_analysis_results_summarized.pkl'")

Results saved to 'gaia_analysis_results_summarized.json' and 'gaia_analysis_results_summarized.pkl'


In [1]:
import pickle

# Load the previously saved results from the pickle file
with open("gaia_analysis_results_summarized.pkl", "rb") as f:
    results = pickle.load(f)

# Print the number of items loaded
print(f"Loaded {len(results)} items from 'gaia_analysis_results_summarized.pkl'")

Loaded 53 items from 'gaia_analysis_results_summarized.pkl'


In [2]:
import pandas as pd

results_df = pd.DataFrame(results).T

In [None]:
results_df.columns

Index(['question', 'expected_answer', 'tools', 'web_search_queries',
       'web_search_results', 'final_summarized_web_content'],
      dtype='object')

: 

##### add also file content

In [6]:
def add_file_content_to_results(results, examples):
    """
    Add file content from examples to results dictionary.
    
    Args:
        results: Dictionary with task_ids as keys
        examples: List of examples from GAIA dataset
    
    Returns:
        Updated results dictionary with file content
    """
    import os
    
    def read_file(file_path):
        """Read content from a file with fallback encoding."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Try with a different encoding if UTF-8 fails
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return ""
    
    print(f"Processing {len(examples)} examples to add file content to results")
    files_added = 0
    
    for example in examples:
        # Extract task_id to match with results dictionary
        task_id = example["example"].get("task_id", "")
        
        # Skip if no task_id or if task_id is not in results
        if not task_id or task_id not in results:
            continue
        
        # Get file path information
        file_path = example["example"].get("file_path", "")
        file_name = example["example"].get("file_name", "")
        
        file_content = ""
        # Try to read from file_path
        if file_path and os.path.exists(file_path):
            try:
                file_content = read_file(file_path)
                files_added += 1
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
        # If that fails, try with file_name
        elif file_name and os.path.exists(file_name):
            try:
                file_content = read_file(file_name)
                files_added += 1
            except Exception as e:
                print(f"Error reading file {file_name}: {e}")
        
        # Add file content to the results dict
        if file_content:
            results[task_id]["file_content"] = file_content
            results[task_id]["has_file_content"] = True
        else:
            # Explicitly mark as not having file content
            results[task_id]["has_file_content"] = False
    
    print(f"Added file content to {files_added} results")
    return results

# Example usage:
results = add_file_content_to_results(results, examples)

# Check how many results have file content
with_content = sum(1 for item in results.values() if item.get("has_file_content", False))
print(f"Results with file content: {with_content} out of {len(results)}")

Processing 53 examples to add file content to results
Added file content to 11 results
Results with file content: 11 out of 53


In [8]:
results_df = pd.DataFrame(results).T

#### concatenate content

In [13]:
results_df['context'] = results_df.apply(
    lambda row: (row['file_content'][:5000] if isinstance(row['file_content'], str) else '') + 
                'WEB SEARCH CONTENT:' + row['final_summarized_web_content'],
    axis=1
)

In [14]:
results_df.columns

Index(['question', 'expected_answer', 'tools', 'web_search_queries',
       'web_search_results', 'final_summarized_web_content',
       'has_file_content', 'file_content', 'context'],
      dtype='object')

In [19]:
import ollama
import pandas as pd

def query_ollama(question, context):
    # Create the prompt with the question, context, and reminder
    extraction_prompt = f"""
Question: {question}
Context: {context}
Remember, the question is: {question}
Therefore, the answer should be:
"""
    
    try:
        # Call ollama using the chat method
        response = ollama.chat(
            model='llama3.1:8b',
            messages=[
                {'role': 'user', 'content': extraction_prompt}
            ]
        )
        
        # Extract the assistant's response
        return response['message']['content']
    except Exception as e:
        return f"Error: {str(e)}"

# Function to process the entire dataframe
def process_dataframe(results_df):
    # Create a new column for answers
    answers = []
    
    for idx, row in results_df.iterrows():
        question = row['question']
        context = row['context']
        expected_answer = row['expected_answer']
        
        print(f"Processing question {idx}: {question}")
        answer = query_ollama(question, context)
        print(f"Answer: {answer}")
        print("Expected_answer:", expected_answer)
        print("-" * 50)
        
        answers.append(answer)
    
    results_df['answer'] = answers
    return results_df

results_with_answers = process_dataframe(results_df)


Processing question e1fc63a2-da7a-432f-be78-7c4a95598703: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.
Answer: To calculate the number of thousand hours it would take Eliud Kipchoge to run the distance between the Earth and the Moon's closest approach, we need to follow these steps:

1. Find the minimum perigee value on the Wikipedia page for the Moon: 356,000 km
2. Convert the marathon distance from 42.195 km to km/h: Eliud Kipchoge's average velocity was 20.9 km/h.
3. Calculate the time it would take him to run 1 km at this pace: 1 km / 20.9 km/h = 0.0477 hours
4. Convert the perigee distance from km to hours by multiplying by his speed in

In [20]:
results_df

Unnamed: 0,question,expected_answer,tools,web_search_queries,web_search_results,final_summarized_web_content,has_file_content,file_content,context,answer
e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,17,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['Eliud Kipchoge...",[{'query': 'Eliud Kipchoge marathon pace km/h'...,Here is a summary of the text in approximately...,False,,WEB SEARCH CONTENT:Here is a summary of the te...,To calculate the number of thousand hours it w...
8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,3,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['Mercedes Sosa ...",[{'query': 'Mercedes Sosa studio albums 2000-2...,I'll summarize the actual text that was provid...,False,,WEB SEARCH CONTENT:I'll summarize the actual t...,According to the Wikipedia article on Mercedes...
ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,3,{'initial_response': 'After analyzing the ques...,"{'required': False, 'queries': [], 'is_valid_l...",[],,False,,WEB SEARCH CONTENT:,## Step 1: Understand the game mechanics\nThe ...
5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,0.1777,{'initial_response': 'To determine the tools r...,"{'required': True, 'queries': ['University of ...",[{'query': 'University of Leicester Can Hiccup...,Here is a summary of the text in approximately...,False,,WEB SEARCH CONTENT:Here is a summary of the te...,"Unfortunately, based on the provided text, I w..."
a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,3,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['number of bird...",[{'query': 'number of bird species in YouTube ...,It seems that there was no original text provi...,False,,WEB SEARCH CONTENT:It seems that there was no ...,"Unfortunately, I was provided with summaries o..."
46719c30-f4c3-4cad-be07-d5cb21eee6bb,Of the authors (First M. Last) that worked on ...,Mapping Human Oriented Information to Software...,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['authors Pie Me...",[{'query': 'authors Pie Menus or Linear Menus ...,It appears that the original request was for m...,False,,WEB SEARCH CONTENT:It appears that the origina...,"Unfortunately, I have to say that none of the ..."
4b6bb5f7-f634-410e-815d-e673ab7f8632,"In Series 9, Episode 11 of Doctor Who, the Doc...",THE CASTLE,{'initial_response': 'To answer the given ques...,"{'required': True, 'queries': ['Doctor Who Ser...",[{'query': 'Doctor Who Series 9 Episode 11 scr...,Here is a summary of the text in approximately...,False,,WEB SEARCH CONTENT:Here is a summary of the te...,"According to the information provided, the cor..."
cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb,An office held a Secret Santa gift exchange wh...,Fred,"{'initial_response': 'To answer this question,...","{'required': True, 'queries': ['Secret Santa g...","[{'query': 'Secret Santa gift exchange rules',...",Here is a summary of the text in about 167 wor...,True,PK�����!�2oWf��¥���[Content_Types]....,PK�����!�2oWf��¥���[Content_Types]....,Since there were 12 employees and only 11 gift...
2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht...",Right,{'initial_response': 'The given question appea...,"{'required': False, 'queries': [], 'is_valid_l...",[],,False,,WEB SEARCH CONTENT:,It looks like you've provided a reversed Engli...
5cfb274c-0207-4aa7-9575-6ac0bd95d9b2,Each cell in the attached spreadsheet represen...,No,{'initial_response': 'After analyzing the ques...,"{'required': False, 'queries': [], 'is_valid_l...",[],,True,PK��æjÆV���������������xl/drawings/draw...,PK��æjÆV���������������xl/drawings/draw...,A classic puzzle!\n\nLet's analyze the situati...


#### extract final answer

In [27]:
import pandas as pd
import ollama
import json
import re

# Assuming results_df already exists with 'question' and 'answer' columns
# We'll add a new column 'finalanswer' to store the extracted responses

def extract_final_answer(row):
    question = row['question']
    answer = row['answer']
    
    # Create the improved extraction prompt with clearer instructions
    extraction_prompt = f"""
    CONTEXT:
    Question: {question}
    Answer: {answer}
    
    TASK:
    Given the question and answer above, provide only the precise answer in a structured JSON format.
    The answer should be exactly in the format requested in the question.
    
    IMPORTANT:
    - Return ONLY the JSON with a 'finalanswer' key
    - Do not include any explanations, notes, or other text
    - Follow EXACTLY the format requested in the question
    
    Here is the question again: {question}
    Here is the answer again: {answer}
    
    Now please provide ACCORDING TO THE REQUESTED FORMAT OF THE QUESTION the JSON with 'finalanswer' key now:
    """
    
    # Call the llama model
    response = ollama.chat(
        model='llama3.1:8b',
        messages=[
            {'role': 'user', 'content': extraction_prompt}
        ]
    )
    
    # Extract the response content
    response_content = response['message']['content']
    
    # Print the raw response for debugging
    print(f"\n--- RAW RESPONSE ---\n{response_content}\n-------------------")
    
    # Try to extract JSON from the response
    extracted_answer = "Extraction failed"
    try:
        # Try using json.loads directly
        json_data = json.loads(response_content)
        extracted_answer = json_data.get('finalanswer', '')
        print(f"Extracted using direct JSON parsing: {extracted_answer}")
    except json.JSONDecodeError:
        # If that fails, try to extract JSON using regex
        json_match = re.search(r'(\{.*\})', response_content, re.DOTALL)
        if json_match:
            try:
                json_data = json.loads(json_match.group(1))
                extracted_answer = json_data.get('finalanswer', '')
                print(f"Extracted using regex JSON match: {extracted_answer}")
            except:
                pass
                
        # If regex fails, look for a pattern with the finalanswer key
        if extracted_answer == "Extraction failed":
            finalanswer_match = re.search(r'"finalanswer"\s*:\s*"([^"]*)"', response_content)
            if finalanswer_match:
                extracted_answer = finalanswer_match.group(1)
                print(f"Extracted using finalanswer key regex: {extracted_answer}")
    
    print(f"FINAL EXTRACTED ANSWER: {extracted_answer}\n")
    return extracted_answer


# Apply the function to each row
results_df['finalanswer'] = results_df.apply(extract_final_answer, axis=1)


--- RAW RESPONSE ---
{
  "finalanswer": 17072
}
-------------------
Extracted using direct JSON parsing: 17072
FINAL EXTRACTED ANSWER: 17072


--- RAW RESPONSE ---
{
  "finalanswer": 3
}
-------------------
Extracted using direct JSON parsing: 3
FINAL EXTRACTED ANSWER: 3


--- RAW RESPONSE ---
{
  "finalanswer": 37
}
-------------------
Extracted using direct JSON parsing: 37
FINAL EXTRACTED ANSWER: 37


--- RAW RESPONSE ---
```json
{
  "finalanswer": "I'm not able to find any information about the volume in m^3 of the fish bag calculated in the University of Leicester paper 'Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?'. The original text appears to be missing or unclear."
}
```
-------------------
Extracted using regex JSON match: I'm not able to find any information about the volume in m^3 of the fish bag calculated in the University of Leicester paper 'Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?'. The original text appears to be missing or unclear.
FIN

In [28]:
results_df.columns

Index(['question', 'expected_answer', 'tools', 'web_search_queries',
       'web_search_results', 'final_summarized_web_content',
       'has_file_content', 'file_content', 'context', 'answer', 'finalanswer'],
      dtype='object')

In [29]:
results_df[['expected_answer', 'finalanswer']]

Unnamed: 0,expected_answer,finalanswer
e1fc63a2-da7a-432f-be78-7c4a95598703,17,17072
8e867cd7-cff9-4e6c-867a-ff5ddc2550be,3,3
ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,3,37
5d0080cb-90d7-4712-bc33-848150e917d3,0.1777,I'm not able to find any information about the...
a1e91b78-d3d8-4675-bb8d-62741b4b68a6,3,Not provided
46719c30-f4c3-4cad-be07-d5cb21eee6bb,Mapping Human Oriented Information to Software...,"Unfortunately, I have to say that none of the ..."
4b6bb5f7-f634-410e-815d-e673ab7f8632,THE CASTLE,Heaven Sent - Part 1
cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb,Fred,There is no information provided to determine ...
2d83110e-a098-4ebb-9987-066c06fa42d0,Right,{'text': 'There is no actual question to answe...
5cfb274c-0207-4aa7-9575-6ac0bd95d9b2,No,No


In [30]:
results_df['model_answer'] = results_df['finalanswer']
results_df['task_id'] = results_df.index

In [38]:
results_df['model_answer'] = results_df['model_answer'].astype(str)
results_df['expected_answer'] = results_df['expected_answer'].astype(str)

In [39]:
results_df[['task_id','model_answer']].to_json('gaia_val_level1_submission.json', orient='records', lines=True, force_ascii=False)

##### try their scoring function

In [36]:
import json
import re
import string
import warnings

import numpy as np


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False
        
    if model_answer is None:
        model_answer = "None"

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
    
def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False
    
    if model_answer is None:
        model_answer = "None"
    
    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)
    
    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)
        
        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False
        
        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)
    
    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)

In [40]:
results = []
for _, row in results_df.iterrows():
    score = question_scorer(
        model_answer=row['model_answer'],
        ground_truth=row['expected_answer']
    )
    results.append(score)

# Calculate accuracy
accuracy = sum(results) / len(results)
print(f"Accuracy: {accuracy:.2f}")

Evaluating 17072 as a number.
Evaluating 3 as a number.
Evaluating 37 as a number.
Evaluating I'm not able to find any information about the volume in m^3 of the fish bag calculated in the University of Leicester paper 'Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?'. The original text appears to be missing or unclear. as a number.
String I'm not able to find any information about the volume in m^3 of the fish bag calculated in the University of Leicester paper 'Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?'. The original text appears to be missing or unclear. cannot be normalized to number str.
Evaluating Not provided as a number.
String Not provided cannot be normalized to number str.
Evaluating Unfortunately, I have to say that none of the provided texts contain information about the authors of the paper "Pie Menus or Linear Menus, Which Is Better?" in 2015. as a string.
Evaluating Heaven Sent - Part 1 as a string.
Evaluating There is no information provided



#### Add other levels to make submmission complete

In [2]:
import os
from datasets import load_dataset

def load_gaia_dataset(levels=["2023_level1","2023_level2","2023_level3"], loader_path="./GAIA.py", split="validation"):
    """
    Load the GAIA dataset for the specified levels.
    
    Args:
        levels: List of dataset levels to process
        loader_path: Path to the GAIA loader script
        split: Dataset split to use
    
    Returns:
        Dictionary containing loaded datasets by level
    """
    loaded_datasets = {}
    
    print(f"Loading GAIA dataset for levels: {levels}")
    
    for level in levels:
        print(f"\nLoading level: {level}")
        
        try:
            dataset = load_dataset(loader_path, name=level, split=split)
            loaded_datasets[level] = dataset
            print(f"Successfully loaded {len(dataset)} examples from {level}")
        except Exception as e:
            print(f"Error loading dataset {level}: {e}")
    
    return loaded_datasets

# Example usage:
levels = ["2023_level1","2023_level2","2023_level3"]  # Change as needed
loader_path = "./GAIA.py"
split = "validation"

datasets = load_gaia_dataset(levels, loader_path, split)

Loading GAIA dataset for levels: ['2023_level1', '2023_level2', '2023_level3']

Loading level: 2023_level1
Successfully loaded 53 examples from 2023_level1

Loading level: 2023_level2
Successfully loaded 86 examples from 2023_level2

Loading level: 2023_level3
Successfully loaded 26 examples from 2023_level3


In [None]:
datasets['2023_level2']

KeyError: "Column features not in the dataset. Current columns in the dataset: ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']"