This notebook is for experimentation: it lets us track how each change to our agent affects its performance.

### Setup

In [1]:
# Libraries
from typing import Literal
from datasets import load_dataset
import pandas as pd
import os
from random import sample
from dotenv import load_dotenv
from huggingface_hub import snapshot_download, login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the files attached to the GAIA tasks.
# The Hugging Face token is read from the environment after loading the .env file.
load_dotenv()
login(token=os.environ.get("HF_TOKEN_CHAPPIE"))
gaia_repo_dir = snapshot_download(
    repo_type="dataset",
    repo_id="gaia-benchmark/GAIA",
)
print(gaia_repo_dir)

Fetching 114 files: 100%|██████████| 114/114 [00:00<00:00, 2483.91it/s]

/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316





In [3]:
# Local Modules
import sys

# Make the project's source folders importable. `sys` is used directly instead
# of the undocumented `os.sys` alias, and entries already present are skipped
# so re-running this cell does not keep growing sys.path.
for local_source_dir in ("../src/", "../src/agents/", "../src/utils/"):
    if local_source_dir not in sys.path:
        sys.path.append(local_source_dir)

import react  # My AI assistant
import gaia_eval

Langfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client


In [4]:
# Paths
# Root folder for experiment outputs. It defaults to the original local
# location; set CHAPPIE_EXPERIMENTS_DIR to run the notebook on another machine
# without editing this cell.
output_results_path = os.getenv(
    "CHAPPIE_EXPERIMENTS_DIR",
    "/home/santiagoal/current-projects/chappie/data/agent_experiments/",
)
# One CSV per experiment iteration is written under this folder.
experiment_iterations_path = os.path.join(output_results_path, "iterations/")
# Single CSV holding one summary row per experiment.
summary_experiments_path = os.path.join(output_results_path, "summary.csv")

In [5]:
# Load the "2023_level1" configuration of the GAIA benchmark dataset
gaia_dataset = load_dataset(
    path="gaia-benchmark/GAIA",
    name="2023_level1",
)

The docs suggest focusing on the validation set for development purposes, so let's explore it.

In [6]:
# Keep the validation split of the loaded dataset and display its summary
dev_set = gaia_dataset["validation"]
dev_set

Dataset({
    features: ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata'],
    num_rows: 53
})

### Explore GAIA Lvl. 1 Questions

In [7]:
# Turn the validation split into a pandas DataFrame for easier exploration
df_dev = pd.DataFrame(dev_set)

In [8]:
# Preview the first rows of the validation DataFrame
df_dev.head()

Unnamed: 0,task_id,Question,Level,Final answer,file_name,file_path,Annotator Metadata
0,e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,1,17.0,,,{'Steps': '1. Googled Eliud Kipchoge marathon ...
1,8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,1,3.0,,,{'Steps': '1. I did a search for Mercedes Sosa...
2,ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,1,3.0,,,{'Steps': 'Step 1: Evaluate the problem statem...
3,5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,1,0.1777,,,"{'Steps': '1. Searched '""Can Hiccup Supply Eno..."
4,a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,1,3.0,,,{'Steps': '1. Navigate to the YouTube link. 2....


To start our dev phase, let's observe how our React Agent performs on a single question.

In [9]:
# DEL
# Experiment with only .txt-like tasks
# Temporary cell
# Vectorized, null-safe filter: a missing file_path counts as "not a .txt task"
# (na=False) instead of raising inside a per-row lambda.
df_dev = df_dev[df_dev["file_path"].str.endswith(".txt", na=False)]

In [10]:
# Evaluate on at most MAX_SAMPLES questions. The fixed seed keeps the sampled
# subset identical across "Restart & Run All", so accuracies of successive
# experiments are measured on the same questions and stay comparable.
MAX_SAMPLES = 20
SAMPLE_SEED = 42
n_samples = min(MAX_SAMPLES, df_dev.shape[0])
sample_questions = df_dev.sample(n=n_samples, random_state=SAMPLE_SEED)

In [11]:
# Evaluation frame: a trimmed copy of the sampled questions plus two empty
# columns that are filled in once the agent has answered.
#   - "Agent response": the agent's answer
#   - "is_correct": 1 if it is correct, 0 otherwise
results_df = (
    sample_questions[["Question", "Final answer", "file_path"]]
    .copy()
    .assign(**{"Agent response": None, "is_correct": None})
    .loc[:, ["Question", "file_path", "Agent response", "Final answer", "is_correct"]]
)

### Experiment and Track performance on dev set

In [12]:
# Load the summary CSV holding one row per past experiment
old_experiments_data = pd.read_csv(filepath_or_buffer=summary_experiments_path)

In [13]:
# Form: interactively collect the metadata describing the current experiment


def _ask_yes_no(question):
    """Ask a yes/no question on stdin and return True only for "y".

    The prompt is repeated until the answer is "y", "n" or empty; an empty
    answer counts as "n", which is what the "[y/N]" hint advertises.
    """
    reply = input(question + ": ")
    while reply.strip().lower() not in ("y", "n", ""):
        # Build the warning from the reply just typed so it never echoes a
        # stale answer from an earlier prompt.
        reply = input(f"Oops! '{reply}' is not a valid response, pls try again. " + question + ": ")
    return reply.strip().lower() == "y"


# Get last information (the most recent row of the experiment history)
latest_experiment = old_experiments_data.iloc[-1]
latest_agent = latest_experiment["agent"]
latest_tools = latest_experiment["tools"]
latest_iteration = latest_experiment["iteration"]
current_iteration = latest_iteration + 1

# Get XP name (keep asking until something other than whitespace is typed)
xp_name = ""
while xp_name == "":
    xp_name = input("Type the experiment name (E.g. Integrate Whisper Transcriber): ").strip()

# Get Agent Architecture
if _ask_yes_no(f"Is your agent different from '{latest_agent}'? [y/N]"):
    agent_architecture = input("Please type the new agent architecture to track: ")
else:
    agent_architecture = latest_agent

# Get Tools
if _ask_yes_no("Are there new tools to track? [y/N]"):
    new_tools = input("Please type the new tools list to track (comma separated): ")
else:
    new_tools = ""

# Merge previous and new tools. Empty entries are dropped so that answering
# "no new tools" does not append a dangling ", " to the tracked list, and any
# dangling separators inherited from earlier rows are cleaned up as well.
# A missing previous value (NaN read from the CSV) is treated as "no tools".
previous_tools = latest_tools if isinstance(latest_tools, str) else ""
tracked_tools = [tool.strip() for tool in (previous_tools + "," + new_tools).split(",")]
new_tools_list = ", ".join(tool for tool in tracked_tools if tool)

# Format xp name and path: "<iteration>_<snake_case_name>.csv" in the iterations folder
xp_name_snake = str(current_iteration) + "_" + xp_name.replace(" ", "_").replace(",", "").lower()
xp_path = os.path.join(experiment_iterations_path, xp_name_snake + ".csv")

In [14]:
# Show the evaluation frame before the agent runs (response columns still empty)
results_df

Unnamed: 0,Question,file_path,Agent response,Final answer,is_correct
22,You are a telecommunications engineer who want...,/home/santiagoal/.cache/huggingface/hub/datase...,,3,


In [15]:
# Row by row: first store the agent's answer, then store its grade.
# Both helpers come from the local gaia_eval module and receive one row each.
results_df["Agent response"] = results_df.apply(gaia_eval.get_agent_response, axis="columns")
results_df["is_correct"] = results_df.apply(gaia_eval.evaluate_response, axis="columns")
results_df

attached_files: /home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt


Unnamed: 0,Question,file_path,Agent response,Final answer,is_correct
22,You are a telecommunications engineer who want...,/home/santiagoal/.cache/huggingface/hub/datase...,2,3,0


In [16]:
# Accuracy is the mean of the 1/0 correctness flags over the evaluated questions
accuracy = results_df.loc[:, "is_correct"].mean()
print(f" Experiment Accuracy: {(100 * accuracy):.2f} %")

 Experiment Accuracy: 0.00 %


In [17]:
# Show the evaluation frame again, now with agent responses and grades filled in
results_df

Unnamed: 0,Question,file_path,Agent response,Final answer,is_correct
22,You are a telecommunications engineer who want...,/home/santiagoal/.cache/huggingface/hub/datase...,2,3,0


### Save Results

In [18]:
# Save current experiment
# Create the target folder on first use so to_csv does not fail on a fresh checkout.
xp_dir = os.path.dirname(xp_path)
if xp_dir:
    os.makedirs(xp_dir, exist_ok=True)
results_df.to_csv(xp_path, index=False)

# Update experimentation history: one summary row for this experiment
xp_results = {
    "iteration": current_iteration,
    "experiment": xp_name,
    "agent": agent_architecture,
    "tools": new_tools_list,
    "accuracy": round(results_df["is_correct"].mean(), 2),
}

# Append the new row to the history loaded earlier, drop exact duplicate rows,
# and overwrite the summary CSV. Chained instead of inplace=True so re-running
# the cell always rebuilds the frame from the same inputs.
updated_experiments_data = pd.concat(
    [old_experiments_data, pd.DataFrame([xp_results])],
    ignore_index=True,
).drop_duplicates()
updated_experiments_data.to_csv(summary_experiments_path, index=False)

In [19]:
# Show the experiment history including the row that was just appended
updated_experiments_data

Unnamed: 0,iteration,experiment,agent,tools,accuracy
0,1,Implement Calculator Tool,React agent,Aritmetic,0.0
1,2,Implement Search and Code tools,React agent,"Aritmetic, Search, Code",0.17
2,3,Integrate Whisper Audio Transcriber,React agent,"Aritmetic, Search, Code, Audio Transcriber",0.15
3,4,Test Workflow,React agent,"Aritmetic, Search, Code, Audio Transcriber,",0.2
4,5,Integrate Text processing tool,React agent,"Aritmetic, Search, Code, Audio Transcriber, , ...",0.1
5,6,Integrate Text Handler tool,React agent,"Aritmetic, Search, Code, Audio Transcriber, , ...",0.2
6,7,Evaluate Agent Performance against tasks with ...,React agent,"Aritmetic, Search, Code, Audio Transcriber, , ...",
7,8,Test Agent performance against tasks with atta...,React agent,"Aritmetic, Search, Code, Audio Transcriber, , ...",0.0


### Evaluation Summary



In [20]:
def _file_extension_label(file_path):
    """Return the extension of file_path without its dot, or "No files".

    "No files" is returned for a missing value, an empty path, or a path whose
    file name has no extension. os.path.splitext only inspects the last path
    component, so dots in directory names (e.g. ".cache") are not mistaken
    for an extension.
    """
    if not isinstance(file_path, str):
        return "No files"
    extension = os.path.splitext(file_path)[1]
    return extension[1:] if extension else "No files"


def _unique_extensions(responses):
    """Return the distinct attached-file extensions found in responses."""
    return responses["file_path"].map(_file_extension_label).unique()


# File types among the questions the agent answered correctly
good_responses = results_df[results_df["is_correct"] == 1].copy()
good_extensions = _unique_extensions(good_responses)
good_file_management = ", ".join(sorted(good_extensions))

# File types among the questions the agent answered incorrectly
bad_responses = results_df[results_df["is_correct"] == 0].copy()
bad_extensions = _unique_extensions(bad_responses)
bad_file_management = ", ".join(sorted(bad_extensions))

# Accuracy restricted to tasks without an attached file. A missing file_path is
# treated like an empty one instead of crashing on len(NaN). The result is NaN
# when the sample contains no such task.
no_attachment_mask = results_df["file_path"].isna() | (results_df["file_path"].astype(str) == "")
performance_no_attached = results_df.loc[no_attachment_mask, "is_correct"].mean()

In [21]:
# Summarize the evaluation. When the sample has no attachment-free task the
# corresponding accuracy is NaN, so say so explicitly instead of printing "nan%".
if pd.isna(performance_no_attached):
    no_attached_insight = "\n4. The Agent was not evaluated on tasks with no attached files (N/A)"
else:
    no_attached_insight = f"\n4. The Agent has an Accuracy of {100 * performance_no_attached:.1f}% at tasks with no attached files"

print(
    "Insights\n\n",
    "-" * 50,
    # Items 1 and 2 are one argument (adjacent f-strings concatenate), which
    # keeps the printed layout of the original cell unchanged.
    f"\n\n1. The Agent has an overall accuracy of {100 * accuracy:.1f}%"
    f"\n2. The Agent succeded at questions with the following files types: {good_file_management} ({good_responses.is_correct.shape[0]}/{results_df.shape[0]})",
    f"\n3. The Agent failed at questions with the following files types: {bad_file_management} ({bad_responses.is_correct.shape[0]}/{results_df.shape[0]})",
    no_attached_insight,
)

Insights

 -------------------------------------------------- 

1. The Agent has an overall accuracy of 0.0%
2. The Agent succeded at questions with the following files types:  (0/1) 
3. The Agent failed at questions with the following files types: txt (1/1) 
4. The Agent has an Accuracy of nan% at tasks with no attached files


Conclusions

1. The Agent has 0% accuracy, showing poor performance on tasks with attached .txt files.
2. Reasoning tasks have a big margin for improvement (0% success on reasoning tasks). Some questions require watching internet videos, so it might be necessary to implement tools to address this kind of task.

Next steps

1. Build tools to deal with .png, .pptx, .py, and .xlsx files
2. Investigate improvements for pure-reasoning tasks
3. Build tools that enable the agent to watch / process YouTube videos


### Main Questions to solve

$\square$ What are the core tools for each level of questions?
  - Level 1:
  - Level 2:
  - Level 3:

