In [1]:
import os
import token
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# The access token is read from the environment so it never appears in the notebook.
hf_key = os.getenv("HUGGING_FACE_KEY")
if not hf_key:
    # Fail fast with a clear message: login(token=None) would otherwise fall back
    # to an interactive prompt, which stalls a "Restart Kernel -> Run All".
    raise RuntimeError(
        "HUGGING_FACE_KEY is not set; add it to your .env file or environment."
    )

# Authenticate this session against the Hugging Face Hub.
login(token=hf_key)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load every split of the GAIA benchmark, "2023_all" configuration.
# trust_remote_code=True allows `datasets` to run the loading script shipped with the repo.
dataset = load_dataset("gaia-benchmark/GAIA", name="2023_all", trust_remote_code=True)

# Convenience handles for the two splits used in the rest of the notebook.
validation_data = dataset["validation"]
test_data = dataset["test"]

In [3]:
# List the split names available in the loaded dataset.
print(dataset.keys())

dict_keys(['test', 'validation'])


In [4]:
# Report how many examples each split contains (validation first, then test).
print("\nDataset sizes:")
for split_name in ("validation", "test"):
    print(f"{split_name.capitalize()}: {len(dataset[split_name])} examples")



Dataset sizes:
Validation: 165 examples
Test: 301 examples


In [5]:
# Number of examples per split (validation first, then test).
print("\nDataset sizes:")
for split_name in ("validation", "test"):
    print(f"{split_name.capitalize()}: {len(dataset[split_name])} examples")

# Field names of the validation split.
print("\nColumns in validation split:")
print(validation_data.column_names)

# Feature schema (data type of every field) of the validation split.
print("\nDataset features:")
print(validation_data.features)


Dataset sizes:
Validation: 165 examples
Test: 301 examples

Columns in validation split:
['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']

Dataset features:
{'task_id': Value(dtype='string', id=None), 'Question': Value(dtype='string', id=None), 'Level': Value(dtype='string', id=None), 'Final answer': Value(dtype='string', id=None), 'file_name': Value(dtype='string', id=None), 'file_path': Value(dtype='string', id=None), 'Annotator Metadata': {'Steps': Value(dtype='string', id=None), 'Number of steps': Value(dtype='string', id=None), 'How long did this take?': Value(dtype='string', id=None), 'Tools': Value(dtype='string', id=None), 'Number of tools': Value(dtype='string', id=None)}}


In [6]:
# Inspect one validation record (index 5) field by field.
example = dataset['validation'][5]

# No special case is needed for missing values: formatting None in an
# f-string already renders the text "None".
for key, value in example.items():
    print(f"{key}: {value}")

task_id: 32102e3e-d12a-4209-9163-7b3a104efe5d
Question: The attached spreadsheet shows the inventory for a movie and video game rental store in Seattle, Washington. What is the title of the oldest Blu-Ray recorded in this spreadsheet? Return it as appearing in the spreadsheet.
Level: 2
Final answer: Time-Parking 2: Parallel Universe
file_name: 32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
file_path: /Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
Annotator Metadata: {'Steps': '1. Open the attached file.\n2. Compare the years given in the Blu-Ray section to find the oldest year, 2009.\n3. Find the title of the Blu-Ray disc that corresponds to the year 2009: Time-Parking 2: Parallel Universe.', 'Number of steps': '3', 'How long did this take?': '1 minute', 'Tools': '1. Microsoft Excel', 'Number of tools': '1'}


In [7]:
from datasets import load_dataset_builder

# Create a builder for the same repo/configuration; it is used below to read
# dataset-level metadata via ds_builder.info.
ds_builder = load_dataset_builder("gaia-benchmark/GAIA", "2023_all")


In [8]:
# Show the description stored in the builder metadata. The recorded output for
# this dataset is a single space, i.e. no useful description is provided.
ds_builder.info.description

' '

In [9]:
# Keep only the tasks whose file_name is non-empty, i.e. tasks with an attached file.
# NOTE: despite its name, filtered_test_data holds rows from the *validation* split.
filtered_test_data = validation_data.filter(
    lambda row: row["file_name"] != ""
)

In [10]:
# Number of rows that survived the filter.
filtered_test_data.num_rows

38

In [11]:
# Print the local path of the attachment for the first ten filtered rows.
first_ten_rows = filtered_test_data.select(range(10))
for example in first_ten_rows:
    print(example["file_path"])

/Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx
/Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb
/Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld
/Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/df6561b2-7ee5-4540-baab-5095f742716a.png
/Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx
/Users/sebastian/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c

In [12]:
# Narrow the validation split to tasks whose attachment is a .zip archive.
# NOTE: this rebinds filtered_test_data, replacing the subset built by the
# earlier `file_name != ""` filter.
filtered_test_data = validation_data.filter(
    lambda row: row["file_name"].endswith(".zip")
)

# Display the resulting Dataset summary (features and row count).
filtered_test_data

Filter: 100%|██████████| 165/165 [00:00<00:00, 10722.63 examples/s]


Dataset({
    features: ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata'],
    num_rows: 2
})