In [114]:
import re
import json

from dotenv import load_dotenv
from datasets import load_dataset
from langchain_openai import ChatOpenAI
import pandas as pd
from tqdm import tqdm

import promptquality as pq
from tqdm import tqdm
tqdm.pandas()

load_dotenv("../.env")
# pq.login("console.demo.rungalileo.io")

True

In [115]:
import unicodedata

def normalize_string(text):
    # Normalize unicode characters and remove diacritics
    normalized = unicodedata.normalize('NFKD', text)
    # Remove non-ASCII characters
    ascii_text = normalized.encode('ASCII', 'ignore').decode('ASCII')
    return ascii_text


def extract_functions_list(text):
    # Regex pattern to match the JSON array
    pattern = r"\[\s*{[\s\S]*?\}\s*\]"

    # Find the match
    match = re.search(pattern, text)
    if match:
        # Extract and parse the JSON string
        json_str = match.group(0)
        return json.loads(json_str)
    return None


def convert_tools_to_langchain_format(tools):
    """Convert tools from the given format to OpenAI function format"""

    def convert_tool(tool):
        # Start with the base function structure
        converted = {
            "type": "function",
            "function": {
                "name": re.sub(r"[^\w]", "_", tool["name"]),
                "description": tool["description"],
                "parameters": {"type": "object", "properties": {}},
            },
        }

        # Mapping of invalid types to valid JSON Schema types
        type_mapping = {
            "float": "number",
            "int": "integer",
            "str": "string",
            "bool": "boolean",
            "dict": "object",
            "list": "array",
            "array": "array",
            "decimal": "number",
            "tuple": "array",
            "number": "number",
            "integer": "integer",
            "string": "string",
            "boolean": "boolean",
            "object": "object",
        }

        # Convert properties
        if "parameters" in tool and "properties" in tool["parameters"]:
            for prop_name, prop_data in tool["parameters"]["properties"].items():
                converted_prop = {
                    "type": type_mapping.get(prop_data["type"], prop_data["type"]),
                    "description": prop_data["description"],
                }

                # Handle enum if present
                if "enum" in prop_data:
                    converted_prop["enum"] = prop_data["enum"]

                # Handle default if present
                if "default" in prop_data:
                    converted_prop["default"] = prop_data["default"]

                converted["function"]["parameters"]["properties"][
                    prop_name
                ] = converted_prop

        # Handle required fields
        if "parameters" in tool and "required" in tool["parameters"]:
            converted["function"]["parameters"]["required"] = tool["parameters"][
                "required"
            ]

        return converted

    # Handle both single tool and list of tools
    if isinstance(tools, list):
        return [convert_tool(tool) for tool in tools]
    else:
        return convert_tool(tools)

def convert_tools_to_langchain_format_v2(tools):
    """Convert tools from the given format to OpenAI function format"""
    
    def convert_type(t):
        type_mapping = {
            "float": "number",
            "int": "integer",
            "str": "string",
            "bool": "boolean",
            "dict": "object",
            "list": "array",
            "array": "array",
            "decimal": "number",
            "tuple": "array",
            "number": "number",
            "integer": "integer",
            "string": "string",
            "boolean": "boolean",
            "object": "object"
        }
        return type_mapping.get(t.lower() if isinstance(t, str) else t, "string")

    def convert_property(prop_data):
        converted_prop = {
            "type": convert_type(prop_data["type"]),
            "description": prop_data["description"]
        }
        
        # Handle array types
        if converted_prop["type"] == "array":
            if "items" in prop_data:
                if isinstance(prop_data["items"], dict):
                    if "type" in prop_data["items"]:
                        converted_prop["items"] = {"type": convert_type(prop_data["items"]["type"])}
                    elif "properties" in prop_data["items"]:
                        converted_prop["items"] = {
                            "type": "object",
                            "properties": {
                                k: convert_property(v) 
                                for k, v in prop_data["items"]["properties"].items()
                            }
                        }
                        if "required" in prop_data["items"]:
                            converted_prop["items"]["required"] = prop_data["items"]["required"]
            else:
                converted_prop["items"] = {"type": "string"}  # Default type for array items
        
        # Handle enum and default
        if "enum" in prop_data:
            converted_prop["enum"] = prop_data["enum"]
        if "default" in prop_data:
            converted_prop["default"] = prop_data["default"]
            
        return converted_prop

    def convert_tool(tool):
        converted = {
            "type": "function",
            "function": {
                "name": normalize_string(re.sub(r"[^\w]", "_", tool["name"])),
                "description": tool["description"],
                "parameters": {
                    "type": "object",
                    "properties": {}
                }
            }
        }
        
        if "parameters" in tool and "properties" in tool["parameters"]:
            converted["function"]["parameters"]["properties"] = {
                prop_name: convert_property(prop_data)
                for prop_name, prop_data in tool["parameters"]["properties"].items()
            }
            
            if "required" in tool["parameters"]:
                converted["function"]["parameters"]["required"] = tool["parameters"]["required"]
                
        return converted

    return [convert_tool(tool) for tool in tools] if isinstance(tools, list) else convert_tool(tools)

def convert_messages_to_langchain_format(messages):
    """
    Convert message format to match OpenAI's expected structure
    with proper tool_calls handling
    """

    def parse_arguments(args_str):
        """Convert argument string to proper JSON format"""
        # Split by comma and process each key-value pair
        pairs = [pair.strip() for pair in args_str.split(",")]
        json_pairs = []

        for pair in pairs:
            if "=" in pair:
                key, value = pair.split("=")
                # Clean up the key and value
                key = key.strip()
                value = value.strip('"')
                # Create proper JSON key-value pair
                json_pairs.append(f'"{key}": "{value}"')

        # Combine into valid JSON object
        return "{" + ", ".join(json_pairs) + "}"

    converted_messages = []

    for i, message in enumerate(messages):
        if message["from"] == "tool":
            # Get the previous assistant message
            prev_message = messages[i - 1]

            # Convert the string format tool call to proper structure
            if prev_message["from"] == "assistant" and "[" in prev_message["value"]:
                # Extract tool call information
                content = prev_message["value"].strip("[]")
                tool_name = normalize_string(re.sub(r"[^\w]", "_", content.split("(")[0]))
                args_str = content.split("(")[1].strip(")")

                # Create properly formatted JSON arguments
                json_args = parse_arguments(args_str)

                # Add assistant message with tool_calls
                assistant_message = {
                    "role": "assistant",
                    "content": None,
                    "tool_calls": [
                        {
                            "id": f"call_{i}",
                            "type": "function",
                            "function": {
                                "name": tool_name,
                                "arguments": json_args,
                            },
                        }
                    ],
                }
                converted_messages.append(assistant_message)

                # Add tool response with proper format
                tool_message = {
                    "role": "tool",
                    "tool_call_id": f"call_{i}",
                    "content": message["value"],
                }
                converted_messages.append(tool_message)

        else:
            # Keep non-tool messages as is
            converted_messages.append(
                {"role": message["from"], "content": message["value"]}
            )

    return converted_messages

def keep_messages_till_last_user(messages):
    """
    Keeps all messages in the conversation up until the last message with role 'user'.
    
    Args:
        messages (list): List of dictionaries containing conversation messages.
                        Each message should have at least a 'role' key.
    
    Returns:
        list: Filtered list of messages up to and including the last user message.
    """
    # Find the index of the last user message
    last_user_index = -1
    for i, message in enumerate(messages):
        if message.get('role') == 'user':
            last_user_index = i
    
    # If no user messages found, return empty list
    if last_user_index == -1:
        return []
    
    # Return all messages up to and including the last user message
    return messages[:last_user_index + 1], last_user_index


In [116]:
df = load_dataset("Team-ACE/ToolACE", split="train").to_pandas()
df.head()

Unnamed: 0,system,conversations
0,You are an expert in composing functions. You ...,"[{'from': 'user', 'value': 'I'm considering in..."
1,You are an expert in composing functions. You ...,"[{'from': 'user', 'value': 'Could you please f..."
2,You are an expert in composing functions. You ...,"[{'from': 'user', 'value': 'Hey, can you show ..."
3,You are an expert in composing functions. You ...,"[{'from': 'user', 'value': 'Could you provide ..."
4,You are an expert in composing functions. You ...,"[{'from': 'user', 'value': 'Can you generate a..."


In [117]:
data = []

for row in tqdm(df.itertuples()):
    try:
        tools = convert_tools_to_langchain_format_v2(extract_functions_list(row.system))
        all_msgs = convert_messages_to_langchain_format(row.conversations)
        conversation, last_user_index = keep_messages_till_last_user(all_msgs)
        n_function_call = len(all_msgs[last_user_index+1]["content"].split("(")) - 1
        data.append([tools, all_msgs, conversation, n_function_call])
    except Exception as e:
        pass

df = pd.DataFrame(data, columns=["tools_langchain", "all_msgs", "conversation", "n_function_calls"])
print(len(df))

df.head()

11300it [00:00, 25356.22it/s]

9606





Unnamed: 0,tools_langchain,all_msgs,conversation,n_function_calls
0,"[{'type': 'function', 'function': {'name': 'ne...","[{'role': 'user', 'content': 'I'm considering ...","[{'role': 'user', 'content': 'I'm considering ...",2
1,"[{'type': 'function', 'function': {'name': 'Qu...","[{'role': 'user', 'content': 'Could you please...","[{'role': 'user', 'content': 'Could you please...",1
2,"[{'type': 'function', 'function': {'name': 'Ge...","[{'role': 'user', 'content': 'Hey, can you sho...","[{'role': 'user', 'content': 'Hey, can you sho...",1
3,"[{'type': 'function', 'function': {'name': 'Ge...","[{'role': 'user', 'content': 'Could you provid...","[{'role': 'user', 'content': 'Could you provid...",1
4,"[{'type': 'function', 'function': {'name': 'Ge...","[{'role': 'user', 'content': 'Can you generate...","[{'role': 'user', 'content': 'Can you generate...",1


In [118]:
df.n_function_calls.value_counts()

n_function_calls
1     4664
2     1521
0     1109
3     1109
4      869
5      205
6       81
7       27
8       12
9        3
12       3
18       1
10       1
14       1
Name: count, dtype: int64

In [119]:
df["n_turns"] = df.conversation.apply(lambda x: len([m for m in x if m["role"] == "user"]))
df["len_query"] = df.conversation.apply(lambda x: len(x[-1]["content"]))
df["n_tools"] = df.tools_langchain.apply(lambda x: len(x))
df.head()

Unnamed: 0,tools_langchain,all_msgs,conversation,n_function_calls,n_turns,len_query,n_tools
0,"[{'type': 'function', 'function': {'name': 'ne...","[{'role': 'user', 'content': 'I'm considering ...","[{'role': 'user', 'content': 'I'm considering ...",2,4,146,6
1,"[{'type': 'function', 'function': {'name': 'Qu...","[{'role': 'user', 'content': 'Could you please...","[{'role': 'user', 'content': 'Could you please...",1,3,121,6
2,"[{'type': 'function', 'function': {'name': 'Ge...","[{'role': 'user', 'content': 'Hey, can you sho...","[{'role': 'user', 'content': 'Hey, can you sho...",1,3,153,4
3,"[{'type': 'function', 'function': {'name': 'Ge...","[{'role': 'user', 'content': 'Could you provid...","[{'role': 'user', 'content': 'Could you provid...",1,3,187,6
4,"[{'type': 'function', 'function': {'name': 'Ge...","[{'role': 'user', 'content': 'Can you generate...","[{'role': 'user', 'content': 'Can you generate...",1,3,215,5


In [120]:
len(df)

9606

In [121]:
def helper_func(test_dict):
    is_empty = True
    if isinstance(test_dict, dict):
        for _, sub in test_dict.items():
            if not helper_func(sub):
                is_empty = False
                break
    elif isinstance(test_dict, list):
        for sub in test_dict:
            if not helper_func(sub):
                is_empty = False
                break
    else:
        is_empty = False
    return is_empty

In [125]:
temp = df[(df.len_query > 100) & (df.n_function_calls == 0)]
print(len(temp))
temp = temp[~temp.conversation.apply(lambda x: "1" in str(x))]
print(len(temp))
temp = temp.sample(100, random_state=42)
temp.iloc[70:90].to_parquet("../data/datasets/toolace_single_func_call_1.parquet", engine="fastparquet")
temp.iloc[90:100].to_parquet("../data/datasets/toolace_single_func_call_2.parquet", engine="fastparquet")

602
284
