In [None]:
!pip install -e .. datasets sympy numpy matplotlib seaborn -q  # Install dev version of smolagents + some packages

In [14]:
# Benchmark date
# - set a concrete date:
DATE = "2024-12-26"
# - or use default: today
# DATE = None

# Evaluation dataset
# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents/benchmark-v1
# NOTE(review): the URL above names `smolagents/benchmark-v1`, but EVAL_DATASET below is
# `GeekAgents/GAIA-web` — confirm which dataset page users actually need to request access to.
EVAL_DATASET = "GeekAgents/GAIA-web"

## Constants and utilities/tools

In [15]:
import re
import string
import warnings
from typing import List


def normalize_number_str(number_str: str) -> float:
    """Parse a numeric string into a float, ignoring "$", "%" and "," characters.

    Parameters:
    - number_str: str, the text to convert

    Returns:
    - float, the parsed value, or ``float("inf")`` when the cleaned text is
      not a valid float literal
    """
    # Strip currency/percent signs and thousands separators in one pass.
    cleaned = number_str.translate(str.maketrans("", "", "$%,"))
    try:
        parsed = float(cleaned)
    except ValueError:
        # Sentinel for "not a number".
        parsed = float("inf")
    return parsed


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    """Split ``s`` on every occurrence of any of the delimiter characters.

    Parameters:
    - s: str, the string to split
    - char_list: list[str], delimiter characters (default: "," and ";")

    Returns:
    - list[str], the pieces of ``s`` between delimiters; ``[s]`` when
      ``char_list`` contains no characters
    """
    # The default list is only read, never mutated, so sharing it across calls is safe.
    # Escape each delimiter so that characters with a special meaning inside a
    # regex character class ("^", "-", "\\", "]") are matched literally.
    delimiters = "".join(re.escape(chars) for chars in char_list)
    if not delimiters:
        # "[]" is not a valid pattern; with nothing to split on, return s whole.
        return [s]
    return re.split(f"[{delimiters}]", s)


def is_float(element: object) -> bool:
    """Tell whether ``element`` can be converted with ``float()``.

    Parameters:
    - element: any object (typically a str or a number)

    Returns:
    - bool, True if ``float(element)`` succeeds, False otherwise
    """
    try:
        float(element)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported types such as None or list.
        # ValueError: strings that are not float literals.
        # OverflowError: ints too large to be represented as a float.
        return False
    return True


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Produce a canonical form of a string for comparison.

    The string has all whitespace removed and is lowercased; ASCII punctuation
    (``string.punctuation``) is stripped as well unless ``remove_punct`` is False.

    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Whitespace is dropped entirely so that e.g. "sea gull" equals "seagull".
    squashed = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return squashed

    punctuation_table = str.maketrans("", "", string.punctuation)
    return squashed.translate(punctuation_table)


def extract_numbers(text: str) -> List[str]:
    """Find all numbers in ``text`` and return them as strings without commas.

    A number is:
    - an optional leading minus sign,
    - digits, either plain or grouped with comma thousand separators,
    - an optional decimal point followed by digits.
    """
    number_regex = re.compile(r"-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?")

    numbers = []
    for match in number_regex.finditer(text):
        # Drop thousand separators from the matched text.
        numbers.append(match.group(0).replace(",", ""))
    return numbers


def get_question_score_gaia(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Scoring function used to score answers from the GAIA benchmark.

    The comparison strategy depends on the ground truth:
    - if it is convertible to float, the answer is parsed as a number and
      compared for exact equality;
    - if it contains "," or ";", both values are split into lists that must have
      the same length and match element-wise (numerically or as normalized strings);
    - otherwise both are compared as normalized strings (no whitespace,
      lowercase, no punctuation).

    Parameters:
    - model_answer: str, the answer to score; ``None`` and non-string values are
      converted to their string form first
    - ground_truth: str, the reference answer; non-string values are converted
      with ``str()`` first

    Returns:
    - bool, True if the answer matches the ground truth
    """
    # Answers may be None or numbers rather than strings. Coerce up front so every
    # branch (not only the numeric one) can apply string operations safely.
    model_answer = "None" if model_answer is None else str(model_answer)
    ground_truth = str(ground_truth)

    if is_float(ground_truth):
        return normalize_number_str(model_answer) == float(ground_truth)

    if any(char in ground_truth for char in [",", ";"]):  # ground truth is a list
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        if len(gt_elems) != len(ma_elems):  # lists of different length cannot match
            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
            return False

        def _elements_match(ma_elem: str, gt_elem: str) -> bool:
            # Compare one pair of list elements, numerically when possible.
            if is_float(gt_elem):
                return normalize_number_str(ma_elem) == float(gt_elem)
            # Punctuation is kept here since list elements can include punctuation.
            return normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)

        return all(_elements_match(ma_elem, gt_elem) for ma_elem, gt_elem in zip(ma_elems, gt_elems))

    # Ground truth is a plain string.
    return normalize_str(model_answer) == normalize_str(ground_truth)

## Score answers

In [16]:
import glob

import pandas as pd


# Load every JSONL results file. The paths are sorted so the row order is
# reproducible regardless of the order in which the filesystem lists them.
result_files = sorted(glob.glob("../../output/*.jsonl"))
if not result_files:
    # Fail with an explicit message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No result files found matching '../../output/*.jsonl'")
result_dataframes = [pd.read_json(file, lines=True) for file in result_files]
result_df = pd.concat(result_dataframes, ignore_index=True)
# Keep only the rows whose level is 1. Take an explicit copy so that adding
# columns to `result_df` later does not trigger pandas' SettingWithCopyWarning.
result_df = result_df.loc[result_df["level"] == 1].copy()
result_df.head()

Unnamed: 0,model_id,question,original_question,answer,true_answer,intermediate_steps,start_time,end_time,token_counts,level
0,Qwen/Qwen2.5-VL-72B-Instruct,If Eliud Kipchoge could maintain his record-ma...,If Eliud Kipchoge could maintain his record-ma...,17000,17,[TaskStep(task='If Eliud Kipchoge could mainta...,2025-04-02 16:17:38.111258984,2025-04-02 16:18:43.742221117,"{'input': 22296, 'output': 595}",1
1,Qwen/Qwen2.5-VL-72B-Instruct,How many studio albums were published by Merce...,How many studio albums were published by Merce...,3,3,[TaskStep(task='How many studio albums were pu...,2025-04-02 16:17:38.749067068,2025-04-02 16:19:04.685535908,"{'input': 29980, 'output': 848}",1
2,Qwen/Qwen2.5-VL-72B-Instruct,What was the volume in m^3 of the fish bag tha...,What was the volume in m^3 of the fish bag tha...,0.177,0.1777,[TaskStep(task='What was the volume in m^3 of ...,2025-04-02 16:17:38.574250937,2025-04-02 16:20:28.700230837,"{'input': 59301, 'output': 1616}",1
3,Qwen/Qwen2.5-VL-72B-Instruct,"In Series 9, Episode 11 of Doctor Who, the Doc...","In Series 9, Episode 11 of Doctor Who, the Doc...",EXT. THE CASTLE - DAY,THE CASTLE,"[TaskStep(task='In Series 9, Episode 11 of Doc...",2025-04-02 16:19:10.391962767,2025-04-02 16:20:41.083353043,"{'input': 30810, 'output': 753}",1
4,Qwen/Qwen2.5-VL-72B-Instruct,Of the authors (First M. Last) that worked on ...,Of the authors (First M. Last) that worked on ...,A usability evaluation of Web user interface s...,Mapping Human Oriented Information to Software...,[TaskStep(task='Of the authors (First M. Last)...,2025-04-02 16:18:49.799759150,2025-04-02 16:21:25.836360931,"{'input': 57094, 'output': 1303}",1


In [17]:
def _score_row(row):
    """Score one results row: compare its "answer" against its "true_answer"."""
    return get_question_score_gaia(row["answer"], row["true_answer"])


# Add a boolean "is_correct" column with the GAIA score of every row.
result_df["is_correct"] = result_df.apply(_score_row, axis=1)

In [18]:
# Per-model summary: first the counts of correct/incorrect answers, then the
# mean of "is_correct" (i.e. the fraction of correct answers).
scores_by_model = result_df.groupby("model_id")[["is_correct"]]
display(scores_by_model.value_counts())

scores_by_model.mean()

model_id                      is_correct
Qwen/Qwen2.5-VL-72B-Instruct  False         15
                              True           3
gpt-4o                        False         18
                              True           8
Name: count, dtype: int64

Unnamed: 0_level_0,is_correct
model_id,Unnamed: 1_level_1
Qwen/Qwen2.5-VL-72B-Instruct,0.166667
gpt-4o,0.307692
