In [None]:
!pip install datasets

In [1]:
# Maps each tool name to the definition text (signature line + description) that is
# injected into the system prompt. The keys must match the tool name that appears in
# the assistant message as `<tool_call NAME>`, because that name is used to look up
# the definition.
# NOTE(review): the `search_memory` description lists "expression" twice; this looks
# like a typo, but the text is part of the generated dataset, so confirm before editing.
TOOL_DEFINITIONS = {
  'send_report_to': 'send_report_to(required String recipient_email):\nSend a report generated based on observations from previous conversations. This can be sent to a health or wellbeing professional trusted by the user. This function should only be called upon the user’s request or with their explicit approval.',
  'make_phone_call': 'make_phone_call(required String phone_number):\nInitiate a phone call to the specified phone number. This function should only be called upon the user’s request or with their explicit approval. ',
  'search_memory': 'search_memory(required String query):\nSearch your memory (Waico’s memory, not the user’s) from past conversations for relevant information.\nThe query parameter is a relevant fact, expression, event, person, expression.',
  'get_health_data': 'get_health_data(required String health_data_type, required String period):\nRetrieve health-related data for a specified category and time period.\nThe health_data_type parameter must be one of the following: SLEEP, WATER, STEPS_COUNT, ACTIVE_ENERGY_BURNED, or WEIGHT.\nThe period parameter must be one of: TODAY (from midnight to now) or LAST_24_HOURS.',
  'display_user_progress': 'display_user_progress(required String health_data_type, required String period):\nDisplay a Line Chart of the user’s daily data for a given type and time period. It also shows the total for the given period.\nThe health_data_type parameter must be one of: SLEEP, WATER, STEPS_COUNT, ACTIVE_ENERGY_BURNED, or WEIGHT.\nThe period parameter must be one of: LAST_7_DAYS or LAST_30_DAYS.'
}

In [8]:
import re
import random
from typing import List, Any


def add_system_prompt(sample):
    """Append the tool-calling instructions to a sample's system prompt.

    The instructions consist of a fixed header followed by the tool definitions
    returned by `get_tool_definitions(sample)`, separated by "\\n---\\n".

    Parameters:
        sample: A dict-like dataset row. Its "system" entry is read via `.get`
            and overwritten; it is also passed to `get_tool_definitions`.

    Returns:
        The same `sample` object, modified in place. If it already had a
        non-empty "system" value, the instructions are appended to it;
        otherwise a default Waico system prompt is created with the
        instructions attached.
    """
    # Build the joined definitions outside the f-string: reusing the enclosing
    # quote character and using a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    joined_tool_definitions = "\n---\n".join(get_tool_definitions(sample))
    tool_instructions = (
        "\n\nTOOL CALLING:\n"
        "You have access to the following tools that you can call to satisfy a user query or when you think it is appropriate.\n"
        f"{joined_tool_definitions}"
    )
    if sample.get("system"):
        sample["system"] += tool_instructions
    else:
        # Missing, None or empty system prompt: fall back to the default persona.
        sample["system"] = f"You are Waico, a helpful Wellbeing Assistant.{tool_instructions}"

    return sample


def get_tool_definitions(sample):
    """Return the tool definitions to list in the system prompt for `sample`.

    Parameters:
        sample: A dict-like dataset row whose "assistant" entry is the
            assistant message, possibly containing a `<tool_call NAME>` tag.

    Returns:
        A list of definition strings taken from `TOOL_DEFINITIONS`:
        - if the assistant message contains a tool call, the definition of the
          called tool plus 1 to 3 randomly chosen other definitions, in random
          order;
        - otherwise, every definition in `TOOL_DEFINITIONS`.

    Raises:
        KeyError: If the called tool name is not a key of `TOOL_DEFINITIONS`.
    """
    assistant_message = sample["assistant"]
    tool_match = re.search(r"<tool_call\s+([^\s>]+)>", assistant_message)

    if tool_match is None:
        # This sample doesn't contain a tool call, maybe a case where the user didn't
        # provide enough context for the model to deduce the parameters or there is no
        # tool to perform the requested action. In some of these cases the model says
        # which tools it can use; to make sure the tools it lists are in the system
        # prompt, all definitions are returned.
        return list(TOOL_DEFINITIONS.values())

    tool_called = tool_match.group(1)
    if tool_called not in TOOL_DEFINITIONS:
        raise KeyError(
            f"Sample calls unknown tool {tool_called!r}; "
            f"expected one of {sorted(TOOL_DEFINITIONS)}"
        )
    selected_tool_definition = TOOL_DEFINITIONS[tool_called]

    # Choose random tool definitions to include with the tool the model must pick.
    # If we provide only the selected tool, the model won't learn to pick the right
    # tool among many available.
    additional_tool_definitions = pick_random_samples(
        data=list(TOOL_DEFINITIONS.values()),
        exclude=selected_tool_definition,
        max_samples=3,
    )

    tool_definitions = [selected_tool_definition, *additional_tool_definitions]
    # Shuffle so the correct tool is not always listed first; a fixed position would
    # let the model pick by position instead of by reading the definitions.
    random.shuffle(tool_definitions)
    return tool_definitions


def pick_random_samples(
    data: List[Any], max_samples: int, exclude: Any | None = None
) -> List[Any]:
    """
    Randomly picks between 1 and `max_elements` samples from the given list `data`,
    excluding the specified `exclude` element.

    Parameters:
        data (List[Any]): The list to sample from.
        exclude (Any): An element to exclude from sampling.
        max_samples (int): The maximum number of elements to sample (inclusive).

    Returns:
        List[Any]: A list of randomly selected elements (length between 1 and `max_elements`),
                   not including the excluded element.

    Raises:
        ValueError: If no elements remain after exclusion or if max_elements is invalid.
    """
    if exclude is not None:
        data = [item for item in data if item != exclude]

    if not data:
        raise ValueError("No elements left to sample after exclusion.")

    if max_samples < 1:
        raise ValueError("max_samples must be at least 1.")

    max_possible = min(max_samples, len(data))
    sample_size = random.randint(1, max_possible)

    return random.sample(data, sample_size)

In [3]:
from datasets import load_dataset, concatenate_datasets

# Load the "train" split of each per-language raw JSONL file (English, French,
# German, Spanish), then stack them into a single dataset in that order.
en_dataset, fr_dataset, de_dataset, es_dataset = (
    load_dataset("json", data_files=raw_file)["train"]
    for raw_file in (
        "../datasets/raw/waico-tool-calling-dataset-en.jsonl",
        "../datasets/raw/waico-tool-calling-dataset-fr.jsonl",
        "../datasets/raw/waico-tool-calling-dataset-de.jsonl",
        "../datasets/raw/waico-tool-calling-dataset-es.jsonl",
    )
)

dataset = concatenate_datasets([en_dataset, fr_dataset, de_dataset, es_dataset])

In [4]:
dataset

Dataset({
    features: ['assistant', 'user', 'system', 'tool', 'assistant_2'],
    num_rows: 1119
})

In [20]:
# Add the tool-calling system prompt to every sample, using 4 worker processes.
# NOTE(review): add_system_prompt picks the extra tool definitions with the `random`
# module and no seed is set in this notebook, so the generated prompts presumably
# differ between runs — confirm whether reproducibility matters here.
prepared_dataset = dataset.map(add_system_prompt, num_proc=4)

In [21]:
prepared_dataset

Dataset({
    features: ['assistant', 'user', 'system', 'tool', 'assistant_2'],
    num_rows: 1119
})

In [22]:
prepared_dataset[0]

{'assistant': 'Sure, one moment while I look it up.\n<tool_call get_health_data>\nhealth_data_type: SLEEP\nperiod: TODAY\n</tool_call>',
 'user': 'Can you tell me how many hours I slept Today?',
 'system': 'You are Waico, a helpful Wellbeing Assistant.\n\nTOOL CALLING:\nYou have access to the following tools that you can call to satisfy a user query or when you think it is appropriate.\nget_health_data(required String health_data_type, required String period):\nRetrieve health-related data for a specified category and time period.\nThe health_data_type parameter must be one of the following: SLEEP, WATER, STEPS_COUNT, ACTIVE_ENERGY_BURNED, or WEIGHT.\nThe period parameter must be one of: TODAY (from midnight to now) or LAST_24_HOURS.\n---\nmake_phone_call(required String phone_number):\nInitiate a phone call to the specified phone number. This function should only be called upon the user’s request or with their explicit approval. \n---\ndisplay_user_progress(required String health_data

In [23]:
prepared_dataset[200]

{'assistant': 'Preparing the document for your psychologist.\n<tool_call send_report_to>\nemail: dr.kim@therapy.net\n</tool_call>',
 'user': 'Please send a detailed report to my psychologist.',
 'system': 'You are Waico, a helpful Wellbeing Assistant.\n\nUSER DATA:\n\nPsychologist email: dr.kim@therapy.net\n\nTOOL CALLING:\nYou have access to the following tools that you can call to satisfy a user query or when you think it is appropriate.\nsend_report_to(required String recipient_email):\nSend a report generated based on observations from previous conversations. This can be sent to a health or wellbeing professional trusted by the user. This function should only be called upon the user’s request or with their explicit approval.\n---\ndisplay_user_progress(required String health_data_type, required String period):\nDisplay a Line Chart of the user’s daily data for a given type and time period. It also shows the total for the given period.\nThe health_data_type parameter must be one of:

In [24]:
# Hold out 119 rows for evaluation; the remaining 1000 rows are used for training.
# `train_test_split` shuffles again by default, so it needs its own seed for the
# split to be reproducible: the seed given to `.shuffle` does not carry over to it.
# Note: this rebinds `prepared_dataset` from a Dataset to a DatasetDict, so the cell
# is meant to be run once per kernel session.
prepared_dataset = prepared_dataset.shuffle(seed=11).train_test_split(test_size=119, seed=11)

In [25]:
prepared_dataset

DatasetDict({
    train: Dataset({
        features: ['assistant', 'user', 'system', 'tool', 'assistant_2'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['assistant', 'user', 'system', 'tool', 'assistant_2'],
        num_rows: 119
    })
})

In [26]:
# Upload the train/test splits to the Hugging Face Hub dataset repo
# "sitatech/waico-tool-use".
# NOTE(review): presumably requires being logged in with write access to that
# repo — confirm before re-running.
prepared_dataset.push_to_hub("sitatech/waico-tool-use")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sitatech/waico-tool-use/commit/6f6486d59634de4070a42aa11cf268004967b4c3', commit_message='Upload dataset', commit_description='', oid='6f6486d59634de4070a42aa11cf268004967b4c3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sitatech/waico-tool-use', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sitatech/waico-tool-use'), pr_revision=None, pr_num=None)