This notebook is used for experimentation: it tracks how each change to our agent affects its performance.

### Setup

In [1]:
# Libraries
from typing import Literal
from datasets import load_dataset
import pandas as pd
import os
from random import sample
from dotenv import load_dotenv
from huggingface_hub import snapshot_download, login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import GAIA files
# Read the .env file, authenticate against the Hugging Face Hub with the token
# stored in HF_TOKEN_CHAPPIE, then download (or reuse) the GAIA dataset snapshot.
load_dotenv()
login(token=os.getenv("HF_TOKEN_CHAPPIE"))
gaia_repo_dir = snapshot_download(repo_type="dataset", repo_id="gaia-benchmark/GAIA")
print(gaia_repo_dir)

Fetching 114 files: 100%|██████████| 114/114 [00:00<00:00, 2236.05it/s]

/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316





In [3]:
# Local Modules
import sys

# Make the project's source folders importable. `sys.path` is addressed through
# the `sys` module itself (not the undocumented `os.sys` alias), and a folder is
# only appended when it is missing so that re-running this cell does not pile up
# duplicate entries. The paths are relative, so they resolve against the
# kernel's current working directory.
for local_src_dir in ("../src/", "../src/agents/", "../src/utils/"):
    if local_src_dir not in sys.path:
        sys.path.append(local_src_dir)

import react  # My AI assistant
import gaia_eval

Langfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client


In [4]:
# Paths
# Root folder that holds every experiment artifact
output_results_path = "/home/santiagoal/current-projects/chappie/data/agent_experiments/"
# Summary CSV of all experiments, and the folder with one CSV per iteration
summary_experiments_path = os.path.join(output_results_path, "summary.csv")
experiment_iterations_path = os.path.join(output_results_path, "iterations/")

In [5]:
# GAIA dataset
# Load the "2023_level1" configuration of the GAIA benchmark
gaia_dataset = load_dataset(path="gaia-benchmark/GAIA", name="2023_level1")

The docs suggest focusing on the validation set for development purposes, so let's explore it.

In [6]:
# Keep only the validation split of the loaded GAIA dataset and display it
dev_set = gaia_dataset["validation"]
dev_set

Dataset({
    features: ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata'],
    num_rows: 53
})

### Explore GAIA Lvl. 1 Questions

In [7]:
# Convert the validation split into a pandas DataFrame for easier inspection
df_dev = pd.DataFrame(dev_set)

In [8]:
# Preview the first rows of the dev questions
df_dev.head()

Unnamed: 0,task_id,Question,Level,Final answer,file_name,file_path,Annotator Metadata
0,e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,1,17.0,,,{'Steps': '1. Googled Eliud Kipchoge marathon ...
1,8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,1,3.0,,,{'Steps': '1. I did a search for Mercedes Sosa...
2,ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,1,3.0,,,{'Steps': 'Step 1: Evaluate the problem statem...
3,5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,1,0.1777,,,"{'Steps': '1. Searched '""Can Hiccup Supply Eno..."
4,a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,1,3.0,,,{'Steps': '1. Navigate to the YouTube link. 2....


To start our dev phase, let's observe how our React Agent performs on a small random sample of questions.

In [9]:
# Draw the subset of dev questions used for this experiment.
# A fixed seed keeps the subset identical across runs, so accuracies recorded
# for different experiment iterations are measured on the same questions.
n_samples = 15
sample_seed = 42
# Never ask for more rows than the frame holds (DataFrame.sample would raise).
sample_questions = df_dev.sample(n=min(n_samples, len(df_dev)), random_state=sample_seed)

In [10]:
# Evaluation table: an independent copy of the sampled questions, extended with
# two placeholder columns that are filled in once the agent has been run.
# "is_correct" is meant to hold 1 for a correct answer and 0 otherwise.
evaluation_columns = ["Question", "file_path", "Agent response", "Final answer", "is_correct"]

results_df = (
    sample_questions[["Question", "Final answer", "file_path"]]
    .copy()
    .assign(**{"Agent response": None, "is_correct": None})
    .loc[:, evaluation_columns]
)

In [11]:
# Display the evaluation table ("Agent response" and "is_correct" are still
# empty at this point when the cells are run in order)
results_df

Unnamed: 0,Question,file_path,Agent response,Final answer,is_correct
12,In Emily Midkiff's June 2014 article in a jour...,,,fluffy,
28,What is the surname of the equine veterinarian...,,,Louvrier,
41,What was the complete title of the book in whi...,,,Five Hundred Things To Eat Before It's Too Lat...,
37,Pull out the sentence in the following 5x7 blo...,,,The seagull glided peacefully to my chair.,
40,"According to Girls Who Code, how long did it t...",,,22,
13,Under DDC 633 on Bielefeld University Library'...,,,Guatemala,
34,What is the final numeric output from the atta...,/home/santiagoal/.cache/huggingface/hub/datase...,,0,
44,"Hi, I was out sick from my classes on Friday, ...",/home/santiagoal/.cache/huggingface/hub/datase...,,"132, 133, 134, 197, 245",
29,"I'm making a grocery list for my mom, but she'...",,,"broccoli, celery, fresh basil, lettuce, sweet ...",
19,What writer is quoted by Merriam-Webster for t...,,,Annie Levin,


### Experiment and Track performance on dev set

In [12]:
# Get XP history
# Load the summary CSV containing the previously recorded experiments
old_experiments_data = pd.read_csv(filepath_or_buffer=summary_experiments_path)

In [18]:
# Form: interactively collect the metadata describing the experiment to run


def _ask_non_empty(prompt: str) -> str:
    """Prompt until the user types something other than whitespace; return it stripped."""
    answer = ""
    while not answer.strip():
        answer = input(prompt + ": ")
    return answer.strip()


def _ask_yes_no(prompt: str) -> bool:
    """Ask a ``[y/N]`` question and return True only for an explicit "y".

    An empty answer selects the default advertised by the capital "N".
    Any other answer triggers a warning and a new prompt.
    """
    answer = input(prompt + ": ")
    while answer.strip().lower() not in ("y", "n", ""):
        # Build the warning from the *current* answer on every retry, so the
        # message always quotes what the user just typed.
        warning = f"Oops! '{answer}' is not a valid response, pls try again. "
        answer = input(warning + prompt + ": ")
    return answer.strip().lower() == "y"


# Get last information
latest_experiment = old_experiments_data.iloc[-1]
#latest_xp_name = latest_experiment["experiment"]
latest_agent = latest_experiment["agent"]
latest_tools = latest_experiment["tools"]
latest_iteration = latest_experiment["iteration"]
current_iteration = latest_iteration + 1

# Get XP name
xp_name = _ask_non_empty("Type the experiment name (E.g. Integrate Whisper Transcriber)")

# Get Agent Architecture: reuse the previous one unless the user says it changed
if _ask_yes_no(f"Is your agent differente from '{latest_agent}'? [y/N]"):
    agent_architecture = input("Please type the new agent architecture to track" + ": ")
else:
    agent_architecture = latest_agent

# Get Tools
if _ask_yes_no("Are there new tools to track? [y/N]"):
    new_tools = input("Please type the new tools list to track (comma separated)" + ": ")
else:
    new_tools = ""

# Merge the previous and the new tools, skipping empty parts so the stored list
# never gains dangling ", " separators. A non-string previous value (e.g. the
# NaN pandas yields for an empty CSV cell) is treated as "no previous tools".
previous_tools = latest_tools.strip(", ") if isinstance(latest_tools, str) else ""
tool_parts = [part for part in (previous_tools, new_tools.strip(", ")) if part]
new_tools_list = ", ".join(tool_parts)

# Format xp name and path: "<iteration>_<snake_case_name>.csv" in the iterations folder
xp_name_snake = str(current_iteration) + "_" + xp_name.replace(" ", "_").replace(",", "").lower()
xp_path = os.path.join(experiment_iterations_path, xp_name_snake + ".csv")

In [None]:
# DEL
# Temporary filter: keep only the rows whose attached file path ends in ".mp3"
has_mp3_attachment = results_df["file_path"].map(lambda path: path.endswith(".mp3"))
results_df = results_df[has_mp3_attachment]

In [23]:
# DEL
# Leftover debugging cell: prints the `__defaults__` attribute (defaults of the
# positional parameters) of `hf_hub_download`. No other visible cell uses it.
from huggingface_hub import hf_hub_download
print(hf_hub_download.__defaults__)

None


In [22]:
# Show the attachment path of the first row in the evaluation table
results_df["file_path"].iloc[0]

'/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3'

In [20]:
# Display the current contents of the evaluation table
results_df

Unnamed: 0,Question,file_path,Agent response,Final answer,is_correct
44,"Hi, I was out sick from my classes on Friday, ...",/home/santiagoal/.cache/huggingface/hub/datase...,,"132, 133, 134, 197, 245",
30,"Hi, I'm making a pie but I could use some help...",/home/santiagoal/.cache/huggingface/hub/datase...,,"cornstarch, freshly squeezed lemon juice, gran...",


In [31]:
# Call the agent on the first row of the evaluation table only.
# NOTE(review): the recorded run of this cell ended with
# "FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'" —
# presumably the ffmpeg binary must be installed and on PATH; confirm.
gaia_eval.get_agent_response(results_df.iloc[0])



FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [21]:
# Run the agent on every row, grade each response, then persist the table

# Row-wise: the agent's answer for each question
results_df["Agent response"] = results_df.apply(gaia_eval.get_agent_response, axis="columns")
# Row-wise: grade of each answer (the agent response must already be filled in)
results_df["is_correct"] = results_df.apply(gaia_eval.evaluate_response, axis="columns")
# Save this iteration's detailed results without the index column
results_df.to_csv(path_or_buf=xp_path, index=False)
results_df



FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [None]:
# Accuracy of this experiment: the mean of the per-question "is_correct" flags
accuracy = results_df["is_correct"].mean()
# Format with a numeric format spec instead of slicing the stringified float:
# slicing to three characters produced output such as "53." and broke on
# scientific notation. This form also avoids nesting double quotes inside the
# f-string, which is a SyntaxError on Python versions before 3.12.
print(f" Experiment Accuracy: {100 * accuracy:.1f} %")

### Save Results

In [None]:
# Append this run to the experimentation history and write it back to disk

# One summary row describing the experiment that just finished
xp_results = dict(
    iteration=current_iteration,
    experiment=xp_name,
    agent=agent_architecture,
    tools=new_tools_list,
    accuracy=round(results_df["is_correct"].mean(), 2),
)

# History plus the new row, with fully identical rows collapsed into one
updated_experiments_data = pd.concat(
    [old_experiments_data, pd.DataFrame([xp_results])],
    ignore_index=True,
).drop_duplicates()
updated_experiments_data.to_csv(summary_experiments_path, index=False)

In [None]:
# Display the updated experiment history
updated_experiments_data

### Gather Level 1 Dev Questions

### React Agent Developing

In [None]:
# TODO: move to utils
#
#def eval_answer(row: pd.Series) -> Literal[0, 1]:   
#    # DEPRECATED
#    """
#    Evaluate Agent responses of GAIA-like answers. Exact match is mandatory for good responses
#
#    Parameters
#    ----------
#    model_response: str
#        Model response to the question
#    gt_answer: str
#        Ground truth answer to the question
#    
#    Returns:
#        Literal[0, 1]: 0 if the answer is not correct, 1 otherwise 
#    
#    Example:
#        >>> eval_answer(pd.Series({"Agent response": "32.0", "Final answer": "32.1"}))
#        0
#    """
#    model_response = row["Agent response"]
#    gt_answer = row["Final answer"]
#    return 1 if (model_response == gt_answer) else 0
#
#
#

In [None]:
## TODO: move to utils
## TODO: docstring
#
#def evaluate_response(row):
#    model_res = row["Agent response"]
#    gt_ans = row["Final answer"]
#    score = gaia_scorer.question_scorer(
#        model_answer=model_res, 
#        ground_truth=gt_ans
#    )
#    return int(score)    
#
#def get_agent_response(row) -> str:
#    user_query = row["Question"]
#    agent_response = react.run_app(user_query=user_query)
#    agent_response = str(agent_response)
#    return agent_response

How to access and handle this dataset?

### Main Questions to solve

$\square$ What are the core tools needed for each question level?
  - Level 1:
  - Level 2:
  - Level 3:

