In [None]:
from openai import OpenAI
from pathlib import Path
import json

# Read the OpenAI API key from the local secrets file. The `with` block closes
# the handle right away instead of leaving it open until garbage collection.
with open("./secrets.json") as f:
    client = OpenAI(api_key=json.load(f)["openai"])

# Sort the paths: glob order depends on the filesystem, and the order of
# `function_docs` determines the order in which environments are processed.
function_doc_paths = sorted(Path("../data/multi_turn_func_doc").glob("*.json"))

# File stem -> list of function docs. Each file is read as JSON Lines
# (one JSON object per non-blank line).
function_docs = {}
for path in function_doc_paths:
    print(f"Loading {path.stem}...")
    with open(path) as f:
        function_docs[path.stem] = [json.loads(line) for line in f if line.strip()]

### Synthesize all goals

In [None]:
from tqdm import tqdm

# System prompt template; its "${N}" placeholder is replaced by the number of
# goals requested per environment.
with open("./prompts/goal_synthesis.txt") as f:
    prompt = f.read()

# ==== variables you can change ====
num_of_goals = 10
saving_path = f"./bfcl_goals_{num_of_goals}.json"
# ==== end of variables you can change ====

# The system prompt is the same for every environment, so build it once.
system_prompt = prompt.replace("${N}", str(num_of_goals))


def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence, if it has one.

    Chat models sometimes wrap JSON answers in ```json ... ``` even when asked
    for raw JSON. Anything that is not a fenced string is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    stripped = text.strip()
    if stripped.startswith("```") and stripped.endswith("```"):
        # Drop the opening fence line (possibly "```json") and the closing fence.
        return "\n".join(stripped.splitlines()[1:-1])
    return text


goals = {} # environment -> goals
for environment, env_docs in tqdm(function_docs.items()):
    docs = "\n".join(json.dumps(doc) for doc in env_docs)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Environment: {environment}\nFunctions: {docs}"},
        ],
        temperature=1.0,
        top_p=0.95,
    )
    output = response.choices[0].message.content
    try:
        parsed_output = json.loads(_strip_code_fence(output))
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers a missing
        # (None) message content. Keep the raw output so it can be inspected
        # and repaired by hand, but let KeyboardInterrupt etc. propagate.
        print(f"Error parsing {environment}!!")
        parsed_output = output
    goals[environment] = parsed_output

with open(saving_path, "w") as f:
    json.dump(goals, f, indent=2)

### Sample `initial_state` for each environment

In [None]:
# Load the multi-turn base dataset, read as JSON Lines (one entry per line).
# Blank lines are skipped, matching how the function docs are loaded, and the
# `with` block closes the file handle once the entries are read.
with open("../data/BFCL_v3_multi_turn_base.json") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Every class name that occurs in any entry's "involved_classes", sorted.
unique_classes = sorted({cls for d in data for cls in d["involved_classes"]})
print(f"There are {len(unique_classes)} unique classes in the dataset: {unique_classes}")

In [None]:
import random
# Fixed seed so the sampled entries are the same on every run.
random.seed(42)

# class name -> index of one randomly chosen dataset entry that involves it
selected_indices = {}
for cls in unique_classes:
    indices = [pos for pos, entry in enumerate(data) if cls in entry["involved_classes"]]
    if not indices:
        print(f"Class: {cls} not found in any data entry.")
        continue
    selected_indices[cls] = random.choice(indices)

print(selected_indices)

# class name -> initial config taken from the sampled entry
initial_configs = {}
for cls, idx in selected_indices.items():
    if cls == "MathAPI":
        # MathAPI is stateless, so it gets an empty initial config.
        initial_configs[cls] = {}
        continue
    # MessageAPI is a special case: its config is read from the
    # "GorillaFileSystem" key of the sampled entry.
    config_key = "GorillaFileSystem" if cls == "MessageAPI" else cls
    initial_configs[cls] = data[idx]["initial_config"][config_key]

with open("initial_configs.json", "w") as f:
    json.dump(initial_configs, f, indent=2)

### Now we merge all synthetic data into exploration data

In [None]:
import json
# Reload the artifacts written by the earlier cells so this cell can also be
# run on its own; `with` closes each file handle after reading.
with open("initial_configs.json") as f:
    initial_configs = json.load(f)
with open("bfcl_goals_10.json") as f:
    all_goals = json.load(f)


# Maps the environment names used as keys in the goals file to the class names
# used as keys in the initial configs.
environment_to_class = {
    'travel_booking': 'TravelAPI',
    'gorilla_file_system': 'GorillaFileSystem',
    'trading_bot': 'TradingBot',
    'message_api': 'MessageAPI',
    'posting_api': 'TwitterAPI',
    'math_api': 'MathAPI',
    'ticket_api': 'TicketAPI',
    'vehicle_control': 'VehicleControlAPI'
}

processed_data = [] # json list
# Loop names are deliberately not `goals` / `data`: those are notebook-level
# variables from earlier cells and must not be clobbered here.
for environment, env_goals in all_goals.items():
    if not isinstance(env_goals, list):
        # The synthesis cell stores the raw model output when it cannot be
        # parsed; enumerating that would yield one bogus "goal" per character.
        print(f"Skipping {environment}: goals are not a list (unparsed output?).")
        continue
    classname = environment_to_class[environment]
    config = initial_configs[classname]
    for idx, goal in enumerate(env_goals):
        processed_data.append({
            "id": f"multi_turn_{environment}_{idx}",
            "question": [[{"role": "user", "content": "Goal: " + goal}]],
            "initial_config": config,
            "involved_classes": [classname],
        })

# Write the records as JSON Lines: one JSON object per line.
with open("../data/multi_turn_exploration_10.json", "w") as f:
    for record in processed_data:
        f.write(json.dumps(record))
        f.write("\n")