<div align="left">
  <img src="https://raw.githubusercontent.com/hud-evals/hud-sdk/main/docs/logo/hud_logo.svg" alt="HUD" width="100"/>
</div>

[ Running SheetBench on an Excel agent ]

```bash
export HUD_API_KEY=your_api_key_here
pip install hud-python
pip install pandas # optional for debugging
pip install openpyxl # required by pandas to read .xlsx files
```

### 1. Verbose example

In [2]:
import asyncio
import base64
from io import BytesIO  # optional for debugging

import pandas as pd  # optional for debugging
import requests

from hud import gym
from hud.taskset import load_taskset

In [71]:
# Load the SheetBench-50 taskset; the "partial" metadata flag enables partial grading
taskset = await load_taskset("SheetBench-50", metadata={"partial": True})

In [None]:
# Load a single task for testing.
# NOTE: index 1 is the *second* task in the taskset, not the first. The index is
# kept as-is because the gold solution downloaded later in this notebook is a
# hard-coded URL (presumably matching this task — confirm before changing it).
test_task = taskset[1]
prompt = test_task.prompt
print(f"Prompt: {prompt}")

# The setup function for the SheetBench tasks is a direct xlsx download link
download_link = test_task.setup.args[0]  # type: ignore

# Download the xlsx file, failing loudly on an HTTP error status
input_xlsx_req = requests.get(download_link)
input_xlsx_req.raise_for_status()
input_xlsx_file = input_xlsx_req.content

# Optional debugging: open the workbook once (instead of re-parsing the bytes
# for every access) and preview the first sheet
input_workbook = pd.ExcelFile(BytesIO(input_xlsx_file))
print(input_workbook.sheet_names)
print(input_workbook.parse(input_workbook.sheet_names[0]).head())

In [None]:
###
### Your agent loop goes here, using *prompt* and *input_xlsx_file*, returns *output_xlsx_file*
###

# For testing, we'll use the gold solution
gold_solution = requests.get(
    "https://gahludmjcsmszgyufydt.supabase.co/storage/v1/object/public/sheetbench/852a6a9e-7e9f-4563-8298-20e80ee0a66a/Copy%20of%20Derivations_CAGR_WORKBOOK.xlsx"
)
# Fail loudly on an HTTP error instead of handing an error page to pandas
# (consistent with how the input workbook is downloaded)
gold_solution.raise_for_status()
output_xlsx_file = gold_solution.content

# Optional debugging: open the workbook once and preview its second sheet (index 1)
output_workbook = pd.ExcelFile(BytesIO(output_xlsx_file))
print(output_workbook.sheet_names)
print(output_workbook.parse(output_workbook.sheet_names[1]).head())

In [None]:
# Encode the produced xlsx bytes as a base64 UTF-8 string
base64_output_xlsx_file = base64.b64encode(output_xlsx_file).decode("utf-8")

# Adapt the task to set up an evaluation environment with the output xlsx file.
# NOTE: this mutates test_task in place. The earlier download cell reads
# test_task.setup.args[0], so it may no longer work after this cell unless the
# taskset is reloaded first.
test_task.setup = ("sheets_from_bytes", base64_output_xlsx_file)
# NOTE(review): the id is cleared, presumably because the modified task no longer
# corresponds to the stored one — confirm against the hud SDK
test_task.id = None

# Use a hud environment to evaluate the agent; the result is indexed by "reward"
env = await gym.make(test_task)
result = await env.evaluate()
print(f"Reward: {result['reward']}")

# Optional debugging helpers (left commented out on purpose):
# obs, _ = await env.reset() # get obs["screenshot"] to visualize
# await env.stream() # to see the live state of the environment for debugging

In [75]:
# Close the environment created by gym.make in the evaluation cell
await env.close()

### 2. Running at scale

In [3]:
# Optionally turn off hud logging
# import logging
# logging.getLogger("hud").setLevel(logging.WARNING)

In [17]:
async def run_single_task(task, job=None):
    """Evaluate a single SheetBench task and return its reward.

    Downloads the task's input workbook, (once implemented) runs the agent on
    it, then replaces the task's setup with the produced workbook and evaluates
    it in a hud environment.

    Args:
        task: Task whose ``setup.args[0]`` is a direct xlsx download link.
            Mutated in place: ``setup`` is replaced and ``id`` is set to None.
        job: Optional job forwarded to ``gym.make``.

    Returns:
        The ``"reward"`` entry of the evaluation result.

    Raises:
        requests.HTTPError: if the workbook download returns an error status.
    """
    prompt = task.prompt  # noqa: F841 - consumed by the agent loop once implemented

    # requests is blocking: run it in a worker thread so that tasks launched
    # together with asyncio.gather download concurrently instead of stalling
    # the event loop one after another.
    response = await asyncio.to_thread(requests.get, task.setup.args[0])
    response.raise_for_status()
    input_xlsx_file = response.content

    # TODO: Implement the agent loop using *prompt* and *input_xlsx_file*
    # TODO: Return the *output_xlsx_file* as a base64 encoded string

    # The input alone will return a 0 reward
    base64_output_xlsx_file = base64.b64encode(input_xlsx_file).decode("utf-8")

    # Run evaluation
    task.setup = ("sheets_from_bytes", base64_output_xlsx_file)
    task.id = None
    env = await gym.make(task, job=job)
    try:
        result = await env.evaluate()
    finally:
        # Always close the environment, even when evaluation raises
        await env.close()

    return result["reward"]

In [None]:
# Loading and evaluating 50 tasks should take around 2 minutes, without the agent loop
import asyncio

# Optional: create_job registers the run as a job on the app.hud.so platform
# (run_single_task also accepts job=None)
from hud import create_job

# Reload the taskset so every task starts from its original setup
# (the verbose example mutates a task in place)
taskset = await load_taskset("SheetBench-50", metadata={"partial": True})
job = await create_job("SheetBench-50-Excel-Agent", evalset_id=taskset.id)

# Build one coroutine per task and await them all together;
# asyncio.gather returns the rewards in the same order as the tasks
task_runs = [run_single_task(task, job) for task in taskset]
rewards = await asyncio.gather(*task_runs)

# Mean reward across all tasks in the taskset
print(f"Average reward: {sum(rewards) / len(rewards)}")