In [1]:
import os
import json
import random
random.seed(42)
from pprint import pprint

import pandas as pd
from dotenv import load_dotenv

import promptquality as pq
from tau_bench.envs.retail.tasks import tasks as retail_tasks
from tau_bench.envs.retail.tools import ALL_TOOLS as retail_tools
from tau_bench.envs.airline.tasks import tasks as airline_tasks
from tau_bench.envs.airline.tools import ALL_TOOLS as airline_tools
from tqdm import tqdm
tqdm.pandas()

load_dotenv()
# pq.login("console.demo.rungalileo.io")

* 'fields' has been removed


True

In [2]:
# Pull the instruction text out of every task and keep a random subset of 50
# per domain. Retail is sampled before airline so the module-level RNG
# (seeded in the setup cell) is consumed in the same order on every run.
retail_instructions = random.sample(
    [retail_task["instruction"] for retail_task in retail_tasks], 50
)
airline_instructions = random.sample(
    [airline_task["instruction"] for airline_task in airline_tasks], 50
)

In [5]:
def convert_tau_to_langchain_format(openai_tool):
    """Convert an OpenAI function-calling tool spec into a LangChain-style schema.

    Args:
        openai_tool: Dict with a "function" entry holding "name", an optional
            "description", and an optional JSON-schema "parameters" object
            (with optional "properties" and "required").

    Returns:
        A single-element list whose item is a dict with the keys
        "description", "properties", "required", "title" (the function name)
        and "type" (always "object"). The list wrapper is kept because
        callers index the result with ``[0]``.

    Raises:
        KeyError: If "function" or its "name" is missing, if a property has
            no "type", or if an array's "items" schema has no "type".
    """

    def convert_object_schema(schema):
        """Convert the "properties" / "required" pair of an object schema."""
        converted = {
            "properties": {
                name: convert_property(spec)
                for name, spec in schema.get("properties", {}).items()
            }
        }
        if "required" in schema:
            # Copy so the output never aliases the caller's schema.
            converted["required"] = list(schema["required"])
        return converted

    def convert_property(prop_dict):
        """Convert one property, recursing into nested object schemas."""
        converted = {
            "type": prop_dict["type"],
            "description": prop_dict.get("description", ""),
            "title": prop_dict.get("title", ""),
        }

        if "enum" in prop_dict:
            converted["enum"] = list(prop_dict["enum"])

        if prop_dict["type"] == "array" and "items" in prop_dict:
            items = prop_dict["items"]
            if items["type"] == "object":
                converted["items"] = {"type": "object", **convert_object_schema(items)}
            else:
                # Scalar items only carry their type, as before.
                converted["items"] = {"type": items["type"]}
        elif prop_dict["type"] == "object" and "properties" in prop_dict:
            # Previously nested object properties were silently dropped.
            converted.update(convert_object_schema(prop_dict))

        return converted

    function_data = openai_tool["function"]
    # "parameters", "properties", "required" and "description" are optional
    # in OpenAI tool specs; fall back to empty values instead of raising.
    parameters = function_data.get("parameters") or {}

    return [{
        "description": function_data.get("description", ""),
        "properties": {
            name: convert_property(spec)
            for name, spec in parameters.get("properties", {}).items()
        },
        "required": list(parameters.get("required", [])),
        "title": function_data["name"],
        "type": "object",
    }]

In [6]:
# Convert every tool spec once per domain; the converter returns a
# single-element list, so unwrap it with [0].
retail_langchain_tools = [
    convert_tau_to_langchain_format(retail_tool.get_info())[0]
    for retail_tool in retail_tools
]
airline_langchain_tools = [
    convert_tau_to_langchain_format(airline_tool.get_info())[0]
    for airline_tool in airline_tools
]

# One single-message conversation per instruction: retail rows first, then airline.
conversations = [
    [{"role": "user", "content": instruction}]
    for instruction in retail_instructions + airline_instructions
]
# Each row carries the complete tool list of its own domain, aligned with
# the row order of `conversations`.
tools = (
    [retail_langchain_tools] * len(retail_instructions)
    + [airline_langchain_tools] * len(airline_instructions)
)

df = pd.DataFrame({"conversation": conversations, "tools_langchain": tools})
# Derived columns: user-message count, character length of the last
# message, and number of tools offered in the row.
df["n_turns"] = df["conversation"].map(
    lambda messages: sum(1 for message in messages if message["role"] == "user")
)
df["len_query"] = df["conversation"].map(lambda messages: len(messages[-1]["content"]))
df["n_tools"] = df["tools_langchain"].map(len)
df.to_parquet("../data/datasets/tau_long_context.parquet", engine="fastparquet")
df.head()

Unnamed: 0,conversation,tools_langchain,n_turns,len_query,n_tools
0,"[{'role': 'user', 'content': 'You name is Jame...",[{'description': 'Calculate the result of a ma...,1,306,16
1,"[{'role': 'user', 'content': 'You are mia_garc...",[{'description': 'Calculate the result of a ma...,1,302,16
2,"[{'role': 'user', 'content': 'You are Yusuf Ro...",[{'description': 'Calculate the result of a ma...,1,296,16
3,"[{'role': 'user', 'content': 'You name is Lei ...",[{'description': 'Calculate the result of a ma...,1,239,16
4,"[{'role': 'user', 'content': 'You are aarav_sa...",[{'description': 'Calculate the result of a ma...,1,435,16
