In this notebook we sketch and prototype tools for the Agent by building proofs of concept (PoCs).

### Setup

In [1]:
# Libraries

import os

In [2]:
# import Whisper

import whisper

In [3]:
# Import Agent
import sys

# Make the project sources importable from the notebook's working directory.
# `sys` is imported directly instead of being reached through `os.sys`, which
# only works because the `os` module happens to import `sys` internally.
for src_dir in ("../../src", "../../src/agents"):
    if src_dir not in sys.path:  # re-running the cell must not stack duplicates
        sys.path.append(src_dir)

import react  # My AI assistant

Langfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client


In [4]:
# DEBUG
import sys

# Expose the tools directory as well; the append is skipped when the entry is
# already present so that re-running the cell leaves `sys.path` unchanged.
if "../../src/tools" not in sys.path:
    sys.path.append("../../src/tools")

In [5]:

# Import GAIA Questions
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [6]:

import pandas as pd


In [7]:
from dotenv import load_dotenv
from huggingface_hub import snapshot_download, login

In [8]:
# Load Hugging face credentials

#load_dotenv()
#login(os.getenv(key="HF_TOKEN_CHAPPIE"))  # Replace with your hf api key name


In [9]:
#gaia_questions_path = snapshot_download(repo_id="gaia-benchmark/GAIA", repo_type="dataset")
#gaia_questions = load_dataset(path="gaia-benchmark/GAIA", name="2023_level1")

In [10]:
# Directory of the locally cached GAIA "2023_level1" dataset, expressed
# relative to the notebook's working directory.
gaia_index_dir = (
    "../../../../.cache/huggingface/datasets/gaia-benchmark___gaia/2023_level1/0.0.1/"
    "ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005"
)
#gaia_index_dir = os.path.abspath(gaia_index_dir)

# Arrow file holding the validation split inside that directory.
gaia_data_path = os.path.join(gaia_index_dir, "gaia-validation.arrow")

In [11]:
# Temporary cell: reads the locally cached Arrow file directly because of the
# Hugging Face API call limit.

import pyarrow.ipc as ipc

# The whole stream is read into memory before the file handle is closed.
with open(gaia_data_path, "rb") as arrow_file:
    table = ipc.RecordBatchStreamReader(arrow_file).read_all()

gaia_df = table.to_pandas()


In [12]:
#gaia_questions = gaia_questions["validation"]  # Filter for dev purposes
#gaia_df = pd.DataFrame(gaia_questions)

In [13]:
gaia_df.head()

Unnamed: 0,task_id,Question,Level,Final answer,file_name,file_path,Annotator Metadata
0,e1fc63a2-da7a-432f-be78-7c4a95598703,If Eliud Kipchoge could maintain his record-ma...,1,17.0,,,{'Steps': '1. Googled Eliud Kipchoge marathon ...
1,8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Merce...,1,3.0,,,{'Steps': '1. I did a search for Mercedes Sosa...
2,ec09fa32-d03f-4bf8-84b0-1f16922c3ae4,Here's a fun riddle that I think you'll enjoy....,1,3.0,,,{'Steps': 'Step 1: Evaluate the problem statem...
3,5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,1,0.1777,,,"{'Steps': '1. Searched '""Can Hiccup Supply Eno..."
4,a1e91b78-d3d8-4675-bb8d-62741b4b68a6,In the video https://www.youtube.com/watch?v=L...,1,3.0,,,{'Steps': '1. Navigate to the YouTube link. 2....


In [14]:
# DEL
# Temporary filter to locate the chess image: keep the rows whose attachment
# path ends in "44.png".
is_chess_image = gaia_df["file_path"].map(lambda path: path.endswith("44.png"))
gaia_df[is_chess_image]

Unnamed: 0,task_id,Question,Level,Final answer,file_name,file_path,Annotator Metadata
16,cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the imag...,1,Rd5,cca530fc-4052-43b2-b130-b30968d8aa44.png,/home/santiagoal/.cache/huggingface/hub/datase...,{'Steps': 'Step 1: Evaluate the position of th...


---

In [15]:
# Distinct attachment extensions across all tasks ("" means no attachment).
# The column is read directly instead of walking `iterrows()`, and
# `os.path.splitext` is used so that a path without an extension yields ""
# rather than whatever follows the last dot anywhere in the path (for example
# inside a ".cache" directory name).
filetypes = {
    os.path.splitext(file_path)[1][1:] for file_path in gaia_df["file_path"]
}
filetypes

{'', 'docx', 'mp3', 'png', 'pptx', 'py', 'txt', 'xlsx'}

### Read Historical XPs

In [None]:
# NOTE(review): `xp_paths` is not referenced by any other visible cell; the
# indexing cell reads from "../../data/agent_experiments/iterations" instead —
# confirm whether this variable (and its "../data" prefix) is still needed.
xp_paths = "../data/agent_experiments/iterations/"

In [None]:
# Index all XPs
import os
import re

import pandas as pd

xp_dir = "../../data/agent_experiments/iterations"


def _natural_sort_key(filename: str) -> list:
    """Split `filename` into text and integer chunks so "xp_10" sorts after "xp_9".

    `re.split` with a capturing group alternates text and digit chunks, always
    starting with a (possibly empty) text chunk, so two keys never compare an
    int against a str at the same position.
    """
    return [
        int(chunk) if chunk.isdecimal() else chunk.lower()
        for chunk in re.split(r"(\d+)", filename)
    ]


# Only regular files are read: a sub-directory inside `xp_dir` would make
# `pd.read_csv` fail.
xp_filenames = sorted(
    (name for name in os.listdir(xp_dir) if os.path.isfile(os.path.join(xp_dir, name))),
    key=_natural_sort_key,
)

# (position, dataframe) pairs in natural file-name order — presumably oldest to
# newest iteration, assuming the file names carry the iteration number.
xp_list = [
    (i, pd.read_csv(os.path.join(xp_dir, name))) for i, name in enumerate(xp_filenames)
]

In [None]:
# Join the best recorded XP result to each question

index = gaia_df.index.tolist()  # Index for all the questions
questions = gaia_df["Question"].tolist()
gaia_question_set = set(questions)  # hashed once so membership tests are O(1)


def filter_condition(xp_data: tuple) -> bool:
    """Tell whether an experiment addressed at least one GAIA question.

    Args:
        xp_data: A ``(position, dataframe)`` pair as stored in ``xp_list``; the
            dataframe must expose a "Question" column.

    Returns:
        True when any value of the "Question" column is one of the GAIA
        questions, False otherwise.
    """
    _, xp = xp_data
    return not gaia_question_set.isdisjoint(xp["Question"].tolist())


filtered_xps = list(filter(filter_condition, xp_list))
# Reverse of `xp_list` order. A reversed copy is built so that `filtered_xps`
# itself keeps its original order.
filtered_xps_reversed = filtered_xps[::-1]

# For every question keep the highest `is_correct` value found in any
# experiment (0 when no experiment addressed it): a question counts as solved as
# soon as one iteration solved it, so the visiting order does not change the
# outcome.
best_result_by_question = {}
for _, xp in filtered_xps_reversed:
    # Within one experiment only the first row of each question is considered.
    first_rows = xp.drop_duplicates(subset="Question", keep="first")
    for xp_question, is_correct in zip(first_rows["Question"], first_rows["is_correct"]):
        if xp_question not in gaia_question_set:
            continue
        # A missing (NaN) result never compares `>=` and is therefore ignored.
        if is_correct >= best_result_by_question.get(xp_question, 0):
            best_result_by_question[xp_question] = is_correct

answers = [best_result_by_question.get(question, 0) for question in questions]

# Join answers
historical_xp_results = gaia_df.copy()
# The plain list is assigned positionally; a `pd.Series` would be aligned on
# index labels and silently yield NaN for any row whose label is not 0..n-1.
historical_xp_results["is_correct"] = answers
del answers, questions

In [None]:
historical_xp_results

In [None]:
# Mean of the joined `is_correct` column over all questions.
historical_xp_results["is_correct"].mean()

Let's first study which tasks are most commonly answered wrong, i.e. how we can increase accuracy with a single next step (e.g. implementing a new tool, modifying the system message, etc.).

In [None]:
# Keep the rows whose `is_correct` is 0 and tag each one with the extension of
# its attachment. `.assign` builds a new frame, so no column is written into a
# filtered slice of `historical_xp_results` (which is what triggers pandas'
# SettingWithCopyWarning).
wrong_ans_df = historical_xp_results[historical_xp_results["is_correct"] == 0].assign(
    fp_extension=lambda frame: frame["file_path"].map(
        # "" for paths without an extension, even when a directory name has a dot
        lambda path: os.path.splitext(path)[1][1:]
    )
)
wrong_ans_df

In [None]:
# Summary: number of wrong answers per attachment extension
wrong_counts_by_extension = wrong_ans_df.groupby(by="fp_extension")["is_correct"].count()
wrong_counts_by_extension

The vast majority of the remaining tasks do not include files to read, so we should study those first.

In [None]:
# Wrong answers whose task has an empty `file_path`, i.e. no attachment.
has_no_attachment = wrong_ans_df["file_path"].map(lambda fp: len(fp) == 0)
wrong_ans_df_no_extension = wrong_ans_df[has_no_attachment]

In [None]:
wrong_ans_df_no_extension

### Method

We identified the following main needs to cover through web search:

1. Analyze YouTube videos (transcription and computer vision)
2. Scrape web pages (e.g. journals, PDFs, Wikipedia articles)


So we decided to modularize the web_search_tool into the following modules:

1. High-level Web results
2. Youtube Video Class: To gather both audio and video
3. Web page scraper: Given a web URL, retrieve the text as markdown
4. Object detection tool: Ideally coded into handle_image tool

So we plan to generate three main tools: `youtube_tool` (Class + Langchain runnable), `web_search` (Expanded) and `handle_images` with an initial module `object_detection` using YoloV4-tiny

### Youtube Tool

In [None]:
import re

#### 1. Filter tasks to solve with the tool

In [None]:
# Keep the tasks whose question text mentions "youtube" (case-insensitive).
pattern = r".*youtube.*"
mentions_youtube = gaia_df["Question"].map(
    lambda question: re.search(pattern, question, re.IGNORECASE) is not None
)
yt_tasks = gaia_df[mentions_youtube].copy()
yt_tasks

#### 2. Tool PoC

In [None]:
# Video used throughout this PoC: it is handed to pytubefix below and recorded
# as "video_url" in the processed detection results.
sample_yt_url = "https://www.youtube.com/watch?v=L1vXCYZAYYM"

Let's import that video.

In [None]:
import os

In [None]:
# Scratch directory that receives the downloaded media.
output_dir = "../../data/temp/"

# File names for the audio track, the downloaded video and its re-encoded copy.
yt_audio_filename = "yt_audio.mp3"
raw_yt_video_filename = "raw_yt_video.mp4"
processed_yt_video_filename = "processed_yt_video.mp4"

# Matching paths inside `output_dir`.
yt_audio_path, raw_yt_video_path, processed_yt_video_path = (
    os.path.join(output_dir, media_filename)
    for media_filename in (
        yt_audio_filename,
        raw_yt_video_filename,
        processed_yt_video_filename,
    )
)

In [None]:
from pytubefix import YouTube

In [None]:
# Build the pytubefix handle for the sample video; its `streams` are queried in
# the following cells to pick the audio and video tracks.
yt = YouTube(sample_yt_url)
#yt_audio = yt.streams.filter(only_audio=True).first()
#yt_video = yt.streams.filter(only_video=True).first()

In [None]:
# Destination of the audio track (same values as in the paths cell).
yt_audio_filename = "yt_audio.mp3"
yt_audio_path = os.path.join(output_dir, yt_audio_filename)

# First stream among the audio-only ones.
audio_only_streams = yt.streams.filter(only_audio=True)
yt_audio = audio_only_streams.first()

In [None]:
# First video-only stream at 25 fps and 144p, after ordering by ascending fps.
matching_video_streams = yt.streams.filter(only_video=True, fps=25, res="144p")
yt_video = matching_video_streams.order_by("fps").asc().first()

yt_video_filename = "yt_video.mp4"
# NOTE(review): the path is built from `raw_yt_video_filename`, not from
# `yt_video_filename` defined just above — confirm which one is intended.
yt_video_path = os.path.join(output_dir, raw_yt_video_filename)

We imported a binary representation of both YT video and audio. The idea now is to try to pass these objects to a CV model (e.g. YoloV4) and to a Transcriber model (e.g. Whisper)

YOLO

In [None]:
# Download the selected video stream into `output_dir` under
# `raw_yt_video_filename`.
yt_video.download(output_path=output_dir, filename=raw_yt_video_filename)

In [None]:
import subprocess

new_fps = 1

# The command is an argument list executed without a shell, so the paths need
# no quoting, and `check=True` raises CalledProcessError when ffmpeg fails
# instead of letting later cells run on a missing output file.
reduce_fps_cmd = [
    "ffmpeg",
    "-y",  # overwrite the output file if it already exists
    "-i", raw_yt_video_path,
    "-filter:v", f"fps={new_fps}",  # resample the video to `new_fps` frames/second
    "-an",  # drop the audio track
    processed_yt_video_path,
]
subprocess.run(reduce_fps_cmd, check=True)

In [None]:
from ultralytics import YOLO

# Load the "yolov8s.pt" weights with the detection task.
cv_model = YOLO(model="yolov8s.pt", task="detect")

In [None]:
# Run detection on the fps-reduced video.
# NOTE(review): with stream=True ultralytics is documented to return a lazy
# generator of per-frame results, so inference would only happen once the
# results are iterated — confirm for the installed version.
cv_results = cv_model.predict(source=processed_yt_video_path, stream=True)

In [None]:
import sys

# `sys` is used directly rather than through `os.sys`. `getsizeof` is shallow:
# it reports the size of the object returned by `predict` itself, not of the
# detections it holds or will produce.
sys.getsizeof(cv_results)

In [None]:
def summarize(results):
    """Yield one summary dict per item of `results`.

    Each yielded dict has two keys:
        "frame":  1-based position of the item within `results`.
        "result": class names of the item's boxes, obtained by looking up every
                  value of `item.boxes.cls` (cast to int) in `item.names`.

    The function is a generator, so `results` is consumed lazily.
    """
    for frame_number, frame_result in enumerate(results, start=1):
        class_names = []
        for class_id in frame_result.boxes.cls:
            class_names.append(frame_result.names[int(class_id)])
        yield {"frame": frame_number, "result": class_names}

In [None]:
# Materialise every per-frame summary into a list; this iterates `cv_results`
# to the end through `summarize`.
data = list(summarize(cv_results))

In [None]:
# TODO: make this a typed dict to let the agent process this data
# Bundle the source URL with the per-frame detections.
processed_data = dict(video_url=sample_yt_url, detected_objects=data)

In [None]:
processed_data

In [None]:
# delete unnecessary stuff
# The temporary videos are removed with `os.remove` rather than a shell `rm`,
# so the paths need no quoting and an already-missing file is simply skipped.
for temp_video_path in (raw_yt_video_path, processed_yt_video_path):
    if os.path.exists(temp_video_path):
        os.remove(temp_video_path)

Whisper custom tool

In [None]:
import time
# Download the selected audio stream into `output_dir` under `yt_audio_filename`.
yt_audio.download(output_path=output_dir, filename=yt_audio_filename)

In [None]:
import sys

# Directory holding the project's tools; it is added to the import path only
# once so that re-running the cell does not stack duplicate entries.
tools_dir = "../../src/tools"
if tools_dir not in sys.path:
    sys.path.append(tools_dir)

In [None]:
import transcriber

In [None]:
# Run the project's transcriber tool on the downloaded audio; the tool is called
# through `.invoke` with the audio path as its only input.
transcript = transcriber.transcriber.invoke(input={"audio_path": yt_audio_path})

In [None]:
transcript

In [None]:
# Delete the downloaded audio with `os.remove` (no shell involved); a file that
# is already gone is skipped. `subprocess.run` accepts the command only as its
# `args` parameter, so a `cmd=` keyword cannot be used for this.
if os.path.exists(yt_audio_path):
    os.remove(yt_audio_path)

#### 3. Test Tool

In [None]:
import os
import sys

# Make the agent sources and utilities importable. `sys` is used directly
# instead of `os.sys`, and entries already on the path are not appended again.
for src_dir in ("../../src/", "../../src/agents/", "../../src/utils/"):
    if src_dir not in sys.path:
        sys.path.append(src_dir)

import react

In [None]:
yt_tasks

In [None]:
# Take the first YouTube-related task and pull out its question text.
sample_yt_task = yt_tasks.iloc[0]
sample_yt_question = sample_yt_task.loc["Question"]

In [None]:
sample_yt_question

In [None]:
# Extra instructions appended to the question to steer the agent towards the
# two tools under test.
tool_hint = (
    ". Just use the pull_youtube_video and the transcriber tools. "
    "Try to figure out the number of bird species from the transcript as those are mentioned"
)
react.run_app(user_query=sample_yt_question + tool_hint)

### Web search Tool (Web scraping)

#### 1. Filter tasks to solve with the tool

In [18]:
# Task ids selected for the web-search / scraping tool PoC.
# NOTE(review): the filtered frame displayed further down shows 10 rows for
# these 14 ids, so some ids are presumably absent from the loaded split —
# confirm.
sample_task_ids = [
    "5d0080cb-90d7-4712-bc33-848150e917d3",
    "46719c30-f4c3-4cad-be07-d5cb21eee6bb",
    "b816bfce-3d80-4913-a07d-69b752ce6377",
    "b415aba4-4b68-4fc6-9b89-2c812e55a3e1",
    "935e2cff-ae78-4218-b3f5-115589b19dae",
    "5188369a-3bbe-43d8-8b94-11558f909a08",
    "7673d772-ef80-4f0f-a602-1bf4485c9b43",
    "c365c1c7-a3db-4d5e-a9a1-66f56eae7865",
    "7d4a7d1d-cac6-44a8-96e8-ea9584a70825",
    "3f57289b-8c60-48be-bd80-01f8099ca449",
    "23dd907f-1261-4488-b21c-e9185af91d5e",
    "840bfca7-4f7b-481a-8794-c560c340185d",
    "a0068077-79f4-461a-adfe-75c1a4148545",
    "a0c07678-e491-4bbc-8f0b-07405144218f"
]

In [20]:
# Scratch check: `in` on a list tests membership by value.
l = ["a", "b", "c"]
b = "a"

b in l

True

In [24]:
# Rows of `gaia_df` whose task id is one of the sampled ids. `isin` replaces the
# per-row lambda, and `reset_index()` returns a new frame (keeping the old
# labels in an "index" column) instead of mutating a filtered slice in place.
sample_tasks = gaia_df[gaia_df["task_id"].isin(sample_task_ids)].reset_index()
sample_tasks

Unnamed: 0,index,task_id,Question,Level,Final answer,file_name,file_path,Annotator Metadata
0,3,5d0080cb-90d7-4712-bc33-848150e917d3,What was the volume in m^3 of the fish bag tha...,1,0.1777,,,"{'Steps': '1. Searched '""Can Hiccup Supply Eno..."
1,5,46719c30-f4c3-4cad-be07-d5cb21eee6bb,Of the authors (First M. Last) that worked on ...,1,Mapping Human Oriented Information to Software...,,,"{'Steps': '1. Searched ""Pie Menus or Linear Me..."
2,12,b816bfce-3d80-4913-a07d-69b752ce6377,In Emily Midkiff's June 2014 article in a jour...,1,fluffy,,,"{'Steps': '1. Searched ""Hreidmar's sons"" on Go..."
3,15,b415aba4-4b68-4fc6-9b89-2c812e55a3e1,In Nature journal's Scientific Reports confere...,1,diamond,,,"{'Steps': '1. Searched ""nature scientific repo..."
4,17,935e2cff-ae78-4218-b3f5-115589b19dae,"In the year 2022, and before December, what do...",1,research,,,"{'Steps': '1. Searched ""legume wikipedia"" on G..."
5,19,5188369a-3bbe-43d8-8b94-11558f909a08,What writer is quoted by Merriam-Webster for t...,1,Annie Levin,,,"{'Steps': '1. Search ""merriam-webster word of ..."
6,38,7673d772-ef80-4f0f-a602-1bf4485c9b43,On Cornell Law School website's legal informat...,1,inference,,,"{'Steps': '1. Searched ""Cornell Law School leg..."
7,39,c365c1c7-a3db-4d5e-a9a1-66f56eae7865,Of the cities within the United States where U...,1,"Braintree, Honolulu",,,"{'Steps': '1. Searched ""cities where us presid..."
8,40,7d4a7d1d-cac6-44a8-96e8-ea9584a70825,"According to Girls Who Code, how long did it t...",1,22,,,"{'Steps': '1. Searched ""Girls Who Code"" on Goo..."
9,42,3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most ...,1,519,,,"{'Steps': '1. Search ""yankee stats"" to find th..."


#### 2. Tool PoC

In [31]:
from langchain_community.tools import playwright

In [None]:
from langgraph.prebuilt import create_react_agent
from langchain_openai import ChatOpenAI

# GPT-4o chat model with temperature 0.0.
llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
# NOTE(review): `tools` is not defined in any visible cell, so this line would
# raise NameError on a fresh kernel — define the tool list before building the
# agent.
agent = create_react_agent(model=llm, tools=tools)

#### 3. Test Tool

### Handle Images Tool (Object detection)

#### 1. Filter tasks to solve with the tool

#### 2. Tool PoC

#### 3. Test Tool