In [None]:

from smolagents import CodeAgent, LiteLLMModel, DuckDuckGoSearchTool, VisitWebpageTool, FinalAnswerTool, Tool, tool 

# Create a model using LiteLLMModel with Ollama
model = LiteLLMModel(
    model_id="ollama_chat/qwen2.5-coder:32b",  # Format: "ollama_chat/[model-name]"
    api_base="http://localhost:11434",   # Default Ollama API endpoint
    api_key="ollama",                    # This is just a placeholder, Ollama doesn't actually require an API key
    num_ctx=30000                        # Ollama default is 2048 which might be too small for complex tasks
)

# Create the CodeAgent with all tools
agent = CodeAgent(
    tools=[
        DuckDuckGoSearchTool(),
        VisitWebpageTool(),
        FinalAnswerTool()
    ],
    model=model,
    additional_authorized_imports=["wikipedia", "requests", "json", "re", "datetime", "os"]
)

In [None]:
result = agent.run("If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.")
print(result)

## Iterate over dataset

In [3]:
import os
from datasets import load_dataset
from tqdm import tqdm

def load_gaia_datasets(levels=None, loader_path="../GAIA.py", split="validation"):
    """
    Load GAIA datasets for specified levels as a dictionary.
    
    Args:
        levels: List of dataset levels to load (if None, loads all available levels)
        loader_path: Path to the GAIA loader script
        split: Dataset split to use
        
    Returns:
        Dictionary with level names as keys and dataset examples as values
    """
    # Default levels if none specified
    if levels is None:
        levels = ["2023_level1", "2023_level2", "2023_level3", "2023_level4", "2023_level5"]
        
    result = {}
    
    for level in levels:
        try:
            print(f"Loading {level}...")
            dataset = load_dataset(loader_path, name=level, split=split)
            
            # Convert to list of dictionaries with file content
            examples = []
            for idx, example in enumerate(dataset):
                item = dict(example)
                
                # Try to load file content if available
                file_content = ""
                file_path = item.get("file_path", "")
                file_name = item.get("file_name", "")
                
                if file_path and os.path.exists(file_path):
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        file_content = f.read()
                elif file_name and os.path.exists(file_name):
                    with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
                        file_content = f.read()
                
                item["file_content"] = file_content
                item["has_file_content"] = bool(file_content)
                examples.append(item)
                
            result[level] = examples
            print(f"Loaded {len(examples)} examples from {level}")
        except Exception as e:
            print(f"Error loading {level}: {e}")
    
    return result

datasets = load_gaia_datasets(levels=["2023_level1", "2023_level2", "2023_level3"])  

Loading 2023_level1...
Loaded 53 examples from 2023_level1
Loading 2023_level2...
Loaded 86 examples from 2023_level2
Loading 2023_level3...
Loaded 26 examples from 2023_level3


### Answer questions using the loaded datasets

In [None]:
from smolagents import CodeAgent, LiteLLMModel, DuckDuckGoSearchTool, VisitWebpageTool, FinalAnswerTool, Tool, tool
import json
import os
import time
from tqdm import tqdm

# Create a model using LiteLLMModel with Ollama
model = LiteLLMModel(
    model_id="ollama_chat/qwen2.5-coder:32b",  # Format: "ollama_chat/[model-name]"
    api_base="http://localhost:11434",   # Default Ollama API endpoint
    api_key="ollama",                    # This is just a placeholder, Ollama doesn't actually require an API key
    num_ctx=30000                        # Ollama default is 2048 which might be too small for complex tasks
)


# Create the CodeAgent with all tools
def get_agent():
    return CodeAgent(
        tools=[
            DuckDuckGoSearchTool(),
            VisitWebpageTool(),
            FinalAnswerTool()
        ],
        model=model,
        additional_authorized_imports=["wikipedia", "requests", "json", "re", "datetime", "os"]
    )

In [None]:
datasets 

In [None]:

def answer_gaia_questions(datasets, output_dir="gaia_results"):
    """
    Process and answer all questions from the provided GAIA datasets
    
    Args:
        datasets: Dictionary with level names as keys and dataset examples as values
        output_dir: Directory to save results
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Create a checkpoint file to track progress
    checkpoint_file = os.path.join(output_dir, "checkpoint.json")
    completed_tasks = {}
    
    # Load checkpoint if exists
    if os.path.exists(checkpoint_file):
        try:
            with open(checkpoint_file, 'r') as f:
                completed_tasks = json.load(f)
            print(f"Loaded checkpoint with {sum(len(tasks) for tasks in completed_tasks.values())} completed tasks")
        except Exception as e:
            print(f"Error loading checkpoint: {e}. Starting fresh.")
            completed_tasks = {}
    
    # Process each level
    for level_name, examples in datasets.items():
        print(f"\nProcessing {level_name} with {len(examples)} examples...")
        
        # Initialize level results from checkpoint or create new
        if level_name not in completed_tasks:
            completed_tasks[level_name] = {}
        
        # Load existing results if available
        level_results_file = os.path.join(output_dir, f"{level_name}_results.json")
        level_results = []
        if os.path.exists(level_results_file):
            try:
                with open(level_results_file, 'r') as f:
                    level_results = json.load(f)
                print(f"Loaded {len(level_results)} existing results for {level_name}")
            except Exception as e:
                print(f"Error loading existing results: {e}. Starting with empty results.")
                level_results = []
        
        # Create level-specific output directory
        level_dir = os.path.join(output_dir, level_name)
        os.makedirs(level_dir, exist_ok=True)
        
        # Process each example in the level
        for example in tqdm(examples):
            # Extract question information
            task_id = example.get("task_id", "unknown_id")
            
            # Skip if already completed
            if task_id in completed_tasks[level_name]:
                print(f"Skipping completed task {task_id}")
                continue
                
            question = example.get("Question", "")
            expected_answer = example.get("Final answer", "")
            
            # Handle file content if available
            file_content = ""
            file_path = example.get("file_path", "")
            file_name = example.get("file_name", "")
            
            # Try to load file content if available
            if file_path and os.path.exists(file_path):
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        file_content = f.read()
                except Exception as e:
                    print(f"Error reading file {file_path}: {e}")
            elif file_name and os.path.exists(file_name):
                try:
                    with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
                        file_content = f.read()
                except Exception as e:
                    print(f"Error reading file {file_name}: {e}")
            
            # Use file_content from example if already loaded
            if not file_content and "file_content" in example and example["file_content"]:
                file_content = example["file_content"]
            
            # Skip if question is empty
            if not question:
                print(f"Skipping example {task_id}: Question is empty")
                continue
            
            # Print detailed information about the task
            print(f"\n{'='*80}")
            print(f"Processing task {task_id}:")
            print(f"Question: {question}")
            print(f"Expected answer: {expected_answer}")
            print(f"Has file content: {bool(file_content)}")
            print(f"{'='*80}")
            
            # Construct the prompt
            prompt = ""
            if file_content:
                prompt = f"Here is the file content to use for answering the question:\n\n{file_content}\n\n"
            
            prompt += f"Question: {question}\n\n"
            # Reiterate the question at the end of the prompt
            prompt += f"Please answer the question: {question}"
            
            # Initialize a new agent for each question to avoid context contamination
            agent = get_agent()
            
            try:
                # Run the agent and get the answer
                start_time = time.time()
                result = agent.run(prompt)
                end_time = time.time()
                
                # Print the agent's answer
                print(f"Question: {question}")
                print(f"Expected answer: {expected_answer}")
                print(f"Agent's answer: {result}")
                print(f"Processing time: {end_time - start_time:.2f} seconds")
                
                # Store the result
                question_result = {
                    "task_id": task_id,
                    "question": question,
                    "level": example.get("Level", ""),
                    "has_file_content": bool(file_content),
                    "model_answer": result,
                    "expected_answer": expected_answer,
                    "processing_time": end_time - start_time
                }
                
                # Save individual result
                result_file = os.path.join(level_dir, f"{task_id}.json")
                with open(result_file, "w") as f:
                    json.dump(question_result, f, indent=2)
                
                # Add to level results
                level_results.append(question_result)
                
                # Update checkpoint
                completed_tasks[level_name][task_id] = True
                with open(checkpoint_file, "w") as f:
                    json.dump(completed_tasks, f, indent=2)
                
                # Update level results file after each completion
                with open(level_results_file, "w") as f:
                    json.dump(level_results, f, indent=2)
                
                # Pause between questions to avoid rate limiting
                time.sleep(1)
                
            except Exception as e:
                print(f"Error processing task {task_id}: {str(e)}")
                # Store the error
                question_result = {
                    "task_id": task_id,
                    "question": question,
                    "level": example.get("Level", ""),
                    "has_file_content": bool(file_content),
                    "error": str(e),
                    "expected_answer": expected_answer
                }
                
                # Save error result
                error_file = os.path.join(level_dir, f"{task_id}_error.json")
                with open(error_file, "w") as f:
                    json.dump(question_result, f, indent=2)
                
                level_results.append(question_result)
                
                # Update checkpoint and results file even for errors
                completed_tasks[level_name][task_id] = True
                with open(checkpoint_file, "w") as f:
                    json.dump(completed_tasks, f, indent=2)
                
                with open(level_results_file, "w") as f:
                    json.dump(level_results, f, indent=2)
                    
                # Pause between questions to avoid rate limiting
                time.sleep(1)

                
        print(f"Completed processing {level_name}. Results saved to {level_dir}")
    
    print(f"\nAll processing complete. Results saved to {output_dir}")
answer_gaia_questions(datasets, output_dir="gaia_results")

In [None]:
# Print a summary of results
def print_summary(output_dir="gaia_results"):
    """Print a summary of the processing results"""
    if not os.path.exists(output_dir):
        print(f"Output directory {output_dir} does not exist. No results to summarize.")
        return
        
    total_questions = 0
    total_answered = 0
    total_errors = 0
    correct_answers = 0
    
    for level_name in ["2023_level1", "2023_level2", "2023_level3"]:
        result_file = os.path.join(output_dir, f"{level_name}_results.json")
        if not os.path.exists(result_file):
            print(f"No results file found for {level_name}")
            continue
            
        with open(result_file, "r") as f:
            results = json.load(f)
            
        questions = len(results)
        errors = sum(1 for r in results if "error" in r)
        answered = questions - errors
        
        # Count correct answers
        level_correct = 0
        for r in results:
            if "error" not in r and r.get("model_answer", "").strip() == r.get("expected_answer", "").strip():
                level_correct += 1
        
        print(f"{level_name}: {answered}/{questions} questions answered ({errors} errors), {level_correct} correct")
        
        total_questions += questions
        total_answered += answered
        total_errors += errors
        correct_answers += level_correct
    
    if total_questions > 0:
        print(f"\nOverall: {total_answered}/{total_questions} questions answered ({total_errors} errors)")
        if total_answered > 0:
            print(f"Accuracy: {correct_answers}/{total_answered} correct ({correct_answers/total_answered*100:.2f}%)")
    else:
        print("\nNo questions processed yet.")

# Print summary after completion
print_summary()

## run without outputs. add logging

In [None]:
from smolagents import CodeAgent,LiteLLMModel, DuckDuckGoSearchTool, VisitWebpageTool, FinalAnswerTool, Tool, tool
import json
import os
import time
import logging
import sys
from tqdm import tqdm
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO

# Set up logging configuration
def setup_logging(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    # Create a logger for the application
    logger = logging.getLogger('gaia_agent')
    logger.setLevel(logging.INFO)
    
    # Clear any existing handlers
    if logger.handlers:
        logger.handlers.clear()
    
    # Create file handler which logs all messages
    log_file = os.path.join(output_dir, 'agent_execution.log')
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    
    # Create formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    
    # Add the handlers to the logger
    logger.addHandler(file_handler)
    
    return logger

# Create a model using LiteLLMModel with Ollama
model = LiteLLMModel(
    model_id="ollama_chat/qwen2.5-coder:32b",  # Format: "ollama_chat/[model-name]"
    api_base="http://localhost:11434",   # Default Ollama API endpoint
    api_key="ollama",                    # This is just a placeholder, Ollama doesn't actually require an API key
    num_ctx=30000                        # Ollama default is 2048 which might be too small for complex tasks
)

# Create the CodeAgent with all tools and extended max steps
def get_agent():
    return CodeAgent(
        tools=[
            DuckDuckGoSearchTool(),
            VisitWebpageTool(),
            FinalAnswerTool()
        ],
        model=model,
        additional_authorized_imports=["wikipedia", "requests", "json", "re", "datetime", "os"]  # We'll capture this verbose output to logs
    )

def answer_gaia_questions(datasets, output_dir="gaia_results"):
    """
    Process and answer all questions from the provided GAIA datasets
    
    Args:
        datasets: Dictionary with level names as keys and dataset examples as values
        output_dir: Directory to save results
    """
    # Set up logging
    logger = setup_logging(output_dir)
    logger.info("Starting GAIA question processing")
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Create a checkpoint file to track progress
    checkpoint_file = os.path.join(output_dir, "checkpoint.json")
    completed_tasks = {}
    
    # Load checkpoint if exists
    if os.path.exists(checkpoint_file):
        try:
            with open(checkpoint_file, 'r') as f:
                completed_tasks = json.load(f)
            print(f"Loaded checkpoint with {sum(len(tasks) for tasks in completed_tasks.values())} completed tasks")
            logger.info(f"Loaded checkpoint with {sum(len(tasks) for tasks in completed_tasks.values())} completed tasks")
        except Exception as e:
            print(f"Error loading checkpoint: {e}. Starting fresh.")
            logger.error(f"Error loading checkpoint: {e}. Starting fresh.")
            completed_tasks = {}
    
    # Process each level
    for level_name, examples in datasets.items():
        print(f"\nProcessing {level_name} with {len(examples)} examples...")
        logger.info(f"Processing {level_name} with {len(examples)} examples")
        
        # Initialize level results from checkpoint or create new
        if level_name not in completed_tasks:
            completed_tasks[level_name] = {}
        
        # Load existing results if available
        level_results_file = os.path.join(output_dir, f"{level_name}_results.json")
        level_results = []
        if os.path.exists(level_results_file):
            try:
                with open(level_results_file, 'r') as f:
                    level_results = json.load(f)
                print(f"Loaded {len(level_results)} existing results for {level_name}")
                logger.info(f"Loaded {len(level_results)} existing results for {level_name}")
            except Exception as e:
                print(f"Error loading existing results: {e}. Starting with empty results.")
                logger.error(f"Error loading existing results: {e}. Starting with empty results.")
                level_results = []
        
        # Create level-specific output directory
        level_dir = os.path.join(output_dir, level_name)
        os.makedirs(level_dir, exist_ok=True)
        
        # Create logs directory for agent outputs
        logs_dir = os.path.join(output_dir, "agent_logs", level_name)
        os.makedirs(logs_dir, exist_ok=True)
        
        # Process each example in the level
        for example in tqdm(examples, desc=f"Processing {level_name}"):
            # Extract question information
            task_id = example.get("task_id", "unknown_id")
            
            # Skip if already completed
            if task_id in completed_tasks.get(level_name, {}):
                print(f"Skipping completed task {task_id}")
                logger.info(f"Skipping completed task {task_id}")
                continue
                
            question = example.get("Question", "")
            expected_answer = example.get("Final answer", "")
            
            # Handle file content if available
            file_content = ""
            file_path = example.get("file_path", "")
            file_name = example.get("file_name", "")
            
            # Try to load file content if available
            if file_path and os.path.exists(file_path):
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                        file_content = f.read()
                except Exception as e:
                    error_msg = f"Error reading file {file_path}: {e}"
                    print(error_msg)
                    logger.error(error_msg)
            elif file_name and os.path.exists(file_name):
                try:
                    with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
                        file_content = f.read()
                except Exception as e:
                    error_msg = f"Error reading file {file_name}: {e}"
                    print(error_msg)
                    logger.error(error_msg)
            
            # Use file_content from example if already loaded
            if not file_content and "file_content" in example and example["file_content"]:
                file_content = example["file_content"]
            
            # Skip if question is empty
            if not question:
                print(f"Skipping example {task_id}: Question is empty")
                logger.info(f"Skipping example {task_id}: Question is empty")
                continue
            
            # Print detailed information about the task
            task_info = f"\n{'='*80}\nProcessing task {task_id}:\nQuestion: {question}\nExpected answer: {expected_answer}\nHas file content: {bool(file_content)}\n{'='*80}"
            print(task_info)
            logger.info(task_info)
            
            # Construct the prompt
            prompt = ""
            if file_content:
                prompt = f"Here is the file content to use for answering the question:\n\n{file_content}\n\n"
            
            prompt += f"Question: {question}\n\n"
            # Reiterate the question at the end of the prompt
            prompt += f"Please answer the question: {question}"
            
            # Initialize a new agent for each question to avoid context contamination
            agent = get_agent()
            
            # Setup a capture for the agent's output
            agent_log_file = os.path.join(logs_dir, f"{task_id}.log")
            
            try:
                # Capture start time
                start_time = time.time()
                
                # Redirect stdout and stderr to capture agent output
                with open(agent_log_file, 'w', encoding='utf-8') as agent_log:
                    # Use StringIO to capture any output we want to process
                    stdout_capture = StringIO()
                    
                    # Redirect both stdout and stderr to our log file and capture
                    with redirect_stdout(agent_log), redirect_stderr(agent_log):
                        # Run the agent
                        result = agent.run(prompt)
                
                # Calculate processing time
                end_time = time.time()
                processing_time = end_time - start_time
                
                # Print only the agent's final answer and processing time
                print(f"Agent's answer: {result}")
                print(f"Processing time: {processing_time:.2f} seconds")
                logger.info(f"Agent's answer: {result}")
                logger.info(f"Processing time: {processing_time:.2f} seconds")
                
                # Store the result
                question_result = {
                    "task_id": task_id,
                    "question": question,
                    "level": example.get("Level", ""),
                    "has_file_content": bool(file_content),
                    "agent_answer": result,
                    "expected_answer": expected_answer,
                    "processing_time": processing_time,
                    "correct": result.strip() == expected_answer.strip(),
                    "agent_log_file": agent_log_file
                }
                
                # Save individual result
                result_file = os.path.join(level_dir, f"{task_id}.json")
                with open(result_file, "w") as f:
                    json.dump(question_result, f, indent=2)
                
                # Add to level results
                level_results.append(question_result)
                
                # Update checkpoint
                if level_name not in completed_tasks:
                    completed_tasks[level_name] = {}
                completed_tasks[level_name][task_id] = True
                with open(checkpoint_file, "w") as f:
                    json.dump(completed_tasks, f, indent=2)
                
                # Update level results file after each completion
                with open(level_results_file, "w") as f:
                    json.dump(level_results, f, indent=2)
                
                # Pause between questions to avoid rate limiting
                time.sleep(1)
                
            except Exception as e:
                error_msg = f"Error processing task {task_id}: {str(e)}"
                print(error_msg)
                logger.error(error_msg)
                
                # Check if it's a timeout or step limit error
                error_message = str(e)
                error_type = "general_error"
                if "maximum number of steps" in error_message.lower() or "max_steps" in error_message.lower():
                    error_type = "max_steps_exceeded"
                elif "timed out" in error_message.lower() or "timeout" in error_message.lower():
                    error_type = "timeout"
                
                # Store the error
                question_result = {
                    "task_id": task_id,
                    "question": question,
                    "level": example.get("Level", ""),
                    "has_file_content": bool(file_content),
                    "error": str(e),
                    "error_type": error_type,
                    "expected_answer": expected_answer,
                    "agent_log_file": agent_log_file
                }
                
                # Save error result
                error_file = os.path.join(level_dir, f"{task_id}_error.json")
                with open(error_file, "w") as f:
                    json.dump(question_result, f, indent=2)
                
                level_results.append(question_result)
                
                # Update checkpoint and results file even for errors
                if level_name not in completed_tasks:
                    completed_tasks[level_name] = {}
                completed_tasks[level_name][task_id] = True
                with open(checkpoint_file, "w") as f:
                    json.dump(completed_tasks, f, indent=2)
                
                with open(level_results_file, "w") as f:
                    json.dump(level_results, f, indent=2)
        
        print(f"Completed processing {level_name}. Results saved to {level_dir}")
        logger.info(f"Completed processing {level_name}. Results saved to {level_dir}")

        
    print(f"\nAll processing complete. Results saved to {output_dir}")
    logger.info(f"All processing complete. Results saved to {output_dir}")

# Print a summary of results
def print_summary(output_dir="gaia_results"):
    """Print a summary of the processing results"""
    if not os.path.exists(output_dir):
        print(f"Output directory {output_dir} does not exist. No results to summarize.")
        return
        
    total_questions = 0
    total_answered = 0
    total_errors = 0
    correct_answers = 0
    error_types = {"max_steps_exceeded": 0, "timeout": 0, "general_error": 0}
    
    for level_name in ["2023_level1", "2023_level2", "2023_level3"]:
        result_file = os.path.join(output_dir, f"{level_name}_results.json")
        if not os.path.exists(result_file):
            print(f"No results file found for {level_name}")
            continue
            
        with open(result_file, "r") as f:
            results = json.load(f)
            
        questions = len(results)
        errors = sum(1 for r in results if "error" in r)
        answered = questions - errors
        
        # Count correct answers and error types
        level_correct = 0
        level_error_types = {"max_steps_exceeded": 0, "timeout": 0, "general_error": 0}
        
        for r in results:
            if "error" in r:
                error_type = r.get("error_type", "general_error")
                level_error_types[error_type] = level_error_types.get(error_type, 0) + 1
                error_types[error_type] = error_types.get(error_type, 0) + 1
            elif r.get("agent_answer", "").strip() == r.get("expected_answer", "").strip():
                level_correct += 1
        
        print(f"{level_name}: {answered}/{questions} questions answered ({errors} errors), {level_correct} correct")
        print(f"  Error breakdown: {level_error_types}")
        
        total_questions += questions
        total_answered += answered
        total_errors += errors
        correct_answers += level_correct
    
    if total_questions > 0:
        print(f"\nOverall: {total_answered}/{total_questions} questions answered ({total_errors} errors)")
        print(f"Error breakdown: {error_types}")
        if total_answered > 0:
            print(f"Accuracy: {correct_answers}/{total_answered} correct ({correct_answers/total_answered*100:.2f}%)")
    else:
        print("\nNo questions processed yet.")

# Process and answer all questions
answer_gaia_questions(datasets)

# Print summary after completion
print_summary()


Processing 2023_level1 with 53 examples...


Processing 2023_level1:   0%|          | 0/53 [00:00<?, ?it/s]


Processing task e1fc63a2-da7a-432f-be78-7c4a95598703:
Question: If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.
Expected answer: 17
Has file content: False


Processing 2023_level1:   2%|▏         | 1/53 [01:00<52:04, 60.09s/it]

Agent's answer: 17000.0
Processing time: 60.08 seconds
Error processing task e1fc63a2-da7a-432f-be78-7c4a95598703: 'float' object has no attribute 'strip'

Processing task 8e867cd7-cff9-4e6c-867a-ff5ddc2550be:
Question: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
Expected answer: 3
Has file content: False


Processing 2023_level1:   2%|▏         | 1/53 [01:59<1:43:44, 119.69s/it]


KeyboardInterrupt: 

In [1]:
import json
import os
import pandas as pd

# Path to the results directory
results_dir = "gaia_results"

# List of level files to combine
level_files = [
    "2023_level1_results.json",
    "2023_level2_results.json",
    "2023_level3_results.json"
]

# Combined results list
all_results = []

# Load and combine all result files
for file_name in level_files:
    file_path = os.path.join(results_dir, file_name)
    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f:
                results = json.load(f)
                # Add level info to each result if not already present
                level = file_name.replace("_results.json", "")
                for result in results:
                    if "level" not in result:
                        result["level"] = level
                all_results.extend(results)
            print(f"Loaded {len(results)} results from {file_name}")
        except Exception as e:
            print(f"Error loading {file_name}: {e}")
    else:
        print(f"File not found: {file_path}")

# Save the combined results to a new JSON file
combined_file = os.path.join(results_dir, "combined_results.json")
with open(combined_file, 'w') as f:
    json.dump(all_results, f, indent=2)

print(f"\nCombined {len(all_results)} results and saved to {combined_file}")

# Convert to pandas DataFrame
results_df = pd.DataFrame(all_results)

# Display basic info about the DataFrame
print(f"\nDataFrame shape: {results_df.shape}")
print(f"Columns: {results_df.columns.tolist()}")

# Display the DataFrame
results_df

Loaded 53 results from 2023_level1_results.json
Loaded 86 results from 2023_level2_results.json
Loaded 26 results from 2023_level3_results.json

Combined 165 results and saved to gaia_results/combined_results.json

DataFrame shape: (165, 10)
Columns: ['task_id', 'question', 'level', 'has_file_content', 'error', 'error_type', 'expected_answer', 'agent_log_file', 'model_answer', 'processing_time']


Unnamed: 0,task_id,question,level,has_file_content,error,error_type,expected_answer,agent_log_file,model_answer,processing_time
0,e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,1,False,'float' object has no attribute 'strip',general_error,17,gaia_results/agent_logs/2023_level1/e1fc63a2-d...,,
1,8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,1,False,,,3,,Based on the information provided and our manu...,472.560343
2,ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,1,False,,,3,,97,158.401876
3,5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,1,False,,,0.1777,,XRef Table:\nxref\n0 14\n0000000000 65535 f\n0...,266.601216
4,a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,1,False,,,3,,2,42.667010
...,...,...,...,...,...,...,...,...,...,...
160,5b2a14e8-6e59-479c-80e3-4696e8980152,The brand that makes these harnesses the dogs ...,3,True,closing tag '[/\n�uU����G.���SE���S���W*��...,,bacon,,,
161,9e1fc53b-46ff-49a1-9d05-9e6faac34cc5,"A 5-man group made up of one tank, one healer,...",3,False,,,"Death Knight, Hunter, Paladin, Priest, Warlock",,"Demon Hunter, Mage, Paladin, Priest, Warrior",39.543340
162,5f982798-16b9-4051-ab57-cfc7ebdb2a91,I read a paper about multiwavelength observati...,3,False,,,0.2,,The radiation activity of FRB 180916.J0158+65 ...,420.866357
163,0512426f-4d28-49f0-be77-06d05daec096,In the YouTube 360 VR video from March 2018 na...,3,False,,,100000000,,65,331.843917


In [4]:
import json
import re
import string
import warnings

import numpy as np


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False
        
    if model_answer is None:
        model_answer = "None"

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
    
def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False
    
    if model_answer is None:
        model_answer = "None"
    
    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)
    
    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)
        
        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False
        
        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)
    
    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)

In [5]:
results = results_df.copy() #

In [8]:
# Create a copy of the results dataframe with relevant columns
results_df = results_df[["task_id", "model_answer", "expected_answer"]]

# Convert model_answer and expected_answer to strings
results_df['model_answer'] = results_df['model_answer'].apply(lambda x: str(x) if x is not None else "")
results_df['expected_answer'] = results_df['expected_answer'].apply(lambda x: str(x) if x is not None else "")

In [None]:
results = []
for _, row in results_df.iterrows():
    score = question_scorer(
        model_answer=row['model_answer'],
        ground_truth=row['expected_answer']
    )
    results.append(score)

# Calculate accuracy
accuracy = sum(results) / len(results)
print(f"Accuracy: {accuracy:.2f}")

In [14]:
results_df[["task_id","model_answer"]].to_json("smolagents_baseline.json")