# Goal
The overall goal of the experiment is to evaluate the tool calling limitations of small LLMs (1B - 3B params) and find ways (through prompting, tool descriptions, etc.) to increase performance.

This first cell gives `llama3.2:3B` 32 tools and records accuracy metrics for each query. `TOOL_LIMIT` is set to 32 because any increase leads to a complete degradation in accuracy. That is one of the most surprising results of this experiment: intuitively, one would expect a decrease in accuracy, but not a complete drop.

In [None]:
import os
import json
import time
import utils
from llama_stack_client.lib.agents.client_tool import ClientTool
from typing import Dict, List, Any, Optional, Union
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from tools import tools

# Max client tools llama3.2:3B can handle: upper bound on how many client tools
# are offered to the agent per query. The notebook text above reports that any
# increase beyond 32 leads to a complete degradation in accuracy.
TOOL_LIMIT = 32
# Load environment variables (main() reads REMOTE_BASE_URL from the environment)
load_dotenv()

def load_queries(file_path: str) -> List[Dict[str, str]]:
    """Load the query objects stored in a JSON file.

    The file must contain a JSON object with a top-level ``"queries"`` key;
    the value stored under that key is returned unchanged.

    :param file_path: Path to the JSON query file.
    :returns: The value of the ``"queries"`` key, or an empty list when the
        file does not exist or is not valid JSON (a message is printed in
        both cases).
    :raises ValueError: If the JSON parses but is not an object containing a
        ``"queries"`` key.
    """
    try:
        # Decode as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Query file not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"Invalid JSON in file: {file_path}")
        return []

    # Validation happens outside the try block: this ValueError is meant to
    # reach the caller, not to be confused with the I/O and parse errors above.
    if not isinstance(data, dict) or 'queries' not in data:
        raise ValueError(f"Invalid JSON format in {file_path}")

    # Return full query objects with ID for better test identification
    return data['queries']

def get_query_id(query_obj):
    """Return a short identifier for a query object, used to label test runs.

    An explicit ``'id'`` entry wins. Otherwise the identifier is built from
    the first five whitespace-separated words of the ``'query'`` text: joined
    with underscores, lower-cased, with commas and periods removed. Any other
    input (a non-dict, or a dict with neither key) yields ``"unknown_query"``.
    """
    if not isinstance(query_obj, dict):
        return "unknown_query"

    if 'id' in query_obj:
        return query_obj['id']

    if 'query' not in query_obj:
        return "unknown_query"

    # No explicit ID: derive a slug from the leading words of the query text.
    leading_words = query_obj['query'].split()[:5]
    slug = '_'.join(leading_words).lower()
    for punctuation in (',', '.'):
        slug = slug.replace(punctuation, '')
    return slug

def execute_query(
    client: LlamaStackClient,
    prompt: str,
    model: str,
    tools: Union[List[str], List[Any]], # list of toolgroup_ids or tool objects
    instructions: Optional[str] = None,
    max_tokens: int = 4096
) -> Dict[str, Any]:
    """Send one user prompt through a newly created agent and return the turn.

    Every call builds a new ``Agent`` and a new session, so no state is
    shared between queries.

    :param client: Llama Stack client the agent is created with.
    :param prompt: Text sent as the single user message of the turn.
    :param model: Model identifier passed to the agent.
    :param tools: Toolgroup IDs (strings) or tool objects handed to the agent.
    :param instructions: Agent instructions; a default general tool-use
        prompt is used when ``None``.
    :param max_tokens: Value passed as ``max_tokens`` in the sampling params.
    :returns: The result of the non-streaming ``create_turn`` call.
        NOTE(review): the ``Dict[str, Any]`` annotation looks inaccurate,
        since the caller reads ``.steps`` from the result; confirm the real type.
    """
    # Fall back to the generic tool-use prompt when the caller supplies none.
    if instructions is None:
        instructions = """You are a helpful assistant. You have access to a number of tools.
            Whenever a tool is called, be sure return the Response in a friendly and helpful tone.
            When you are asked to search the web you must use a tool. Keep answers concise.
            """

    agent = Agent(
        client,
        model=model,
        instructions=instructions,
        tools=tools,
        tool_config={"tool_choice": "auto"},
        sampling_params={"max_tokens": max_tokens}
    )

    # Session names carry the current Unix time, truncated to whole seconds.
    session_name = f"Test_{int(time.time())}"
    session_id = agent.create_session(session_name=session_name)

    # The session is announced only when at least one entry in `tools` is not
    # a string; for a list made up solely of strings an empty line is printed.
    only_toolgroup_ids = all(isinstance(t, str) for t in tools)
    if only_toolgroup_ids:
        print("")
    else:
        print(f"Created session_id={session_id} for Agent({agent.agent_id})")

    user_message = {
        "role": "user",
        "content": prompt
    }
    return agent.create_turn(
        messages=[user_message],
        session_id=session_id,
        stream=False
    )

def run_client_tool_test(model, query_obj, client_tool_module, llama_client, logger):
    """Run one query against one model and record the outcome as a metric.

    Up to ``TOOL_LIMIT`` ``ClientTool`` instances found among the attributes
    of ``client_tool_module`` (in ``dir()`` order) are offered to the agent.
    The tool name is read from the second step of the turn and the final
    answer from the third; both reads are best-effort and only logged when
    they fail.

    :param model: Model identifier forwarded to ``execute_query``.
    :param query_obj: Query record; its ``'query'`` and ``'tool_call'``
        entries are read.
    :param client_tool_module: Object whose attributes are scanned for
        ``ClientTool`` instances.
    :param llama_client: Client forwarded to ``execute_query``.
    :param logger: Logger used for progress and error messages.
    :returns: ``True`` when the query ran and its metric was recorded without
        an exception (whether or not the expected tool was called), ``False``
        when an exception was caught and a failure metric was recorded.
    """
    query_id = get_query_id(query_obj)
    prompt, expected_tool_call = query_obj['query'], query_obj['tool_call']

    # Collect ClientTool instances from the module, capped at TOOL_LIMIT.
    tool_list = []
    for attr_name in dir(client_tool_module):
        if len(tool_list) >= TOOL_LIMIT:
            break
        candidate = getattr(client_tool_module, attr_name)
        if isinstance(candidate, ClientTool):
            tool_list.append(candidate)

    logger.info(f"Testing query '{query_id}' with model {model}")
    logger.info(f"Query: {prompt[:50]}...")

    try:
        response = execute_query(
            client=llama_client,
            prompt=prompt,
            model=model,
            tools=tool_list,
        )
        steps = response.steps

        # Tool name comes from the second step; any failure to read it is
        # logged and treated as "no tool used".
        try:
            tools_used = steps[1].tool_calls[0].tool_name
        except Exception as e:
            logger.error(f"Error extracting tool name: {e}")
            tools_used = None
        tool_call_match = tools_used == expected_tool_call
        logger.info(f"Tool used: {tools_used} Tool expected: {expected_tool_call} match: {tool_call_match} ")

        # Final answer comes from the third step; a missing or blank answer
        # is recorded as an empty inference.
        final_response = ""
        try:
            final_response = steps[2].api_model_response.content.strip()
            inference_not_empty = final_response != ''
        except Exception as e:
            logger.error(f"Error checking inference content: {e}")
            inference_not_empty = False
        logger.info(f'Inference not empty: {inference_not_empty}')
        logger.info(f"Query '{query_id}' succeeded with model {model} and the response \n {final_response}")

        # "SUCCESS" means the turn completed; tool_call_match carries the
        # actual accuracy signal.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="SUCCESS",
            tool_call_match=tool_call_match,
            inference_not_empty=inference_not_empty,
            expected_tool_call=expected_tool_call
        )
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Query '{query_id}' failed with model {model}: {error_msg}")

        # The turn itself failed: record the error text with the metric.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="FAILURE",
            tool_call_match=False,
            inference_not_empty=False,
            expected_tool_call=expected_tool_call,
            error=error_msg
        )
        return False

def main():
    """Run every query against every configured model, then summarize and plot.

    Reads the server URL from ``REMOTE_BASE_URL`` and returns early, after
    logging an error, when it is not set. Queries come from
    ``queries/client_tool_queries.json`` under the current working directory
    and are run with the tools from the ``tools`` module.
    """
    logger = utils.setup_logger()

    base_url = os.getenv('REMOTE_BASE_URL')
    print(f"base_url={base_url}")
    if not base_url:
        logger.error("REMOTE_BASE_URL environment variable not set")
        return

    llama_client = LlamaStackClient(base_url=base_url)

    # Models under test; make sure they are available in your LLS server
    models = ["meta-llama/Llama-3.2-3B-Instruct"]

    client_tool_queries = os.path.join(os.getcwd(), "queries/", "client_tool_queries.json")

    # Running totals across all models
    total_tests = 0
    successful_tests = 0

    for model in models:
        logger.info(f"\n=== Testing with model: {model} ===\n")

        if not client_tool_queries:
            continue

        queries = load_queries(client_tool_queries)
        if not queries:
            logger.info(f"No queries found in {client_tool_queries}")
            continue

        # One boolean per query: True when the query ran without raising.
        outcomes = [
            run_client_tool_test(model, query_obj, tools, llama_client, logger)
            for query_obj in queries
        ]
        total_tests += len(outcomes)
        successful_tests += sum(1 for succeeded in outcomes if succeeded)

    # Print summary
    logger.info(f"\n=== Test Summary ===")
    logger.info(f"Total tests: {total_tests}")
    logger.info(f"Successful tests: {successful_tests}")
    logger.info(f"Failed tests: {total_tests - successful_tests}")
    if total_tests > 0:
        success_rate = (successful_tests / total_tests) * 100
        logger.info(f"Success rate: {success_rate:.1f}%")

    # Generate plots from the metrics CSV
    logger.info(f"\n=== Generating plots ===")
    utils.get_analysis_plots('./results/normal_client_tool_metrics.csv')


# Run the experiment as soon as this cell is executed.
main()


# Results
![Tool Call Match Per Function Tool](results/plots/normal_tool_call_match_per_function_tool.png)
Overall, the majority of the tools have 100% accuracy; the tool with the worst accuracy is `convert_fahrenheit_to_kelvin`. One possible explanation is that the training data had very little, if any, data on this type of Q/A. `convert_celsius_to_kelvin` has 100% accuracy, which supports this theory, as that is a common conversion in science courses.

The next few cells run experiments to test what truly matters when defining a tool: the tool name, description, and the format of the docstring. First, here is a quick overview of how `llama-stack` parses a function tagged with the `client_tool` decorator:

```python
def client_tool(func: T) -> ClientTool:
    """
    Decorator to convert a function into a ClientTool.
    ...
    """

    class _WrappedTool(ClientTool):
        __name__ = func.__name__
        __doc__ = func.__doc__
        __module__ = func.__module__

        def get_name(self) -> str:
            ...

        def get_description(self) -> str:
            ...

        def get_params_definition(self) -> Dict[str, Parameter]:
            hints = get_type_hints(func)
            # Remove return annotation if present
            hints.pop("return", None)

            # Get parameter descriptions from docstring
            params = {}
            sig = inspect.signature(func)
            doc = inspect.getdoc(func) or ""

            for name, type_hint in hints.items():
                # Look for :param name: in docstring
                param_doc = ""
                for line in doc.split("\n"):
                    if line.strip().startswith(f":param {name}:"):
                        param_doc = line.split(":", 2)[2].strip()
                        break

                if param_doc == "":
                    raise ValueError(f"No parameter description found for parameter {name}")

                ...

            return params
```
Full implementation can be found [here](https://github.com/meta-llama/llama-stack-client-python/blob/645d2195c5af1c6f903cb93c293319d8f94c36cc/src/llama_stack_client/lib/agents/client_tool.py#L150-L170).
An important thing to realize is that `llama-stack` **purposefully** disregards the return information from the docstring. Also note that the docstring only **requires** one kind of annotation, `:param <name>:` (one for each parameter), and that everything _above_ those annotations will be parsed as well.

This next cell will test whether explicit `:description:` and `:use_case:` annotations help, compared to including the same text without any annotation.

Ex.
```python
@client_tool
def add_two_numbers(a: float, b: float) -> float:
    """
    :description: Adds two numbers.
    :use_case: Use when the user wants to find the sum, total, or combined value of two numbers.
    :param a: The first number.
    :param b: The second number.
    :returns: The sum of `a` and `b`.
    """
    return a + b
```

compared to

```python
@client_tool
def add_two_numbers(a: float, b: float) -> float:
    """
    Adds two numbers.
    Use when the user wants to find the sum, total, or combined value of two numbers.
    :param a: The first number.
    :param b: The second number.
    :returns: The sum of `a` and `b`.
    """
    return a + b
```

In [None]:
import os
import json
import time
import utils
from llama_stack_client.lib.agents.client_tool import ClientTool
from typing import Dict, List, Any, Optional, Union
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from tools import tools_no_extra_tags

# Max client tools llama3.2:3B can handle: upper bound on how many client tools
# are offered to the agent per query. The notebook's introduction reports that
# any increase beyond 32 leads to a complete degradation in accuracy.
TOOL_LIMIT = 32
# Load environment variables (main() reads REMOTE_BASE_URL from the environment)
load_dotenv()

def load_queries(file_path: str) -> List[Dict[str, str]]:
    """Load the query objects stored in a JSON file.

    The file must contain a JSON object with a top-level ``"queries"`` key;
    the value stored under that key is returned unchanged.

    :param file_path: Path to the JSON query file.
    :returns: The value of the ``"queries"`` key, or an empty list when the
        file does not exist or is not valid JSON (a message is printed in
        both cases).
    :raises ValueError: If the JSON parses but is not an object containing a
        ``"queries"`` key.
    """
    try:
        # Decode as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Query file not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"Invalid JSON in file: {file_path}")
        return []

    # Validation happens outside the try block: this ValueError is meant to
    # reach the caller, not to be confused with the I/O and parse errors above.
    if not isinstance(data, dict) or 'queries' not in data:
        raise ValueError(f"Invalid JSON format in {file_path}")

    # Return full query objects with ID for better test identification
    return data['queries']

def get_query_id(query_obj):
    """Return a short identifier for a query object, used to label test runs.

    An explicit ``'id'`` entry wins. Otherwise the identifier is built from
    the first five whitespace-separated words of the ``'query'`` text: joined
    with underscores, lower-cased, with commas and periods removed. Any other
    input (a non-dict, or a dict with neither key) yields ``"unknown_query"``.
    """
    if not isinstance(query_obj, dict):
        return "unknown_query"

    if 'id' in query_obj:
        return query_obj['id']

    if 'query' not in query_obj:
        return "unknown_query"

    # No explicit ID: derive a slug from the leading words of the query text.
    leading_words = query_obj['query'].split()[:5]
    slug = '_'.join(leading_words).lower()
    for punctuation in (',', '.'):
        slug = slug.replace(punctuation, '')
    return slug

def execute_query(
    client: LlamaStackClient,
    prompt: str,
    model: str,
    tools: Union[List[str], List[Any]], # list of toolgroup_ids or tool objects
    instructions: Optional[str] = None,
    max_tokens: int = 4096
) -> Dict[str, Any]:
    """Send one user prompt through a newly created agent and return the turn.

    Every call builds a new ``Agent`` and a new session, so no state is
    shared between queries.

    :param client: Llama Stack client the agent is created with.
    :param prompt: Text sent as the single user message of the turn.
    :param model: Model identifier passed to the agent.
    :param tools: Toolgroup IDs (strings) or tool objects handed to the agent.
    :param instructions: Agent instructions; a default general tool-use
        prompt is used when ``None``.
    :param max_tokens: Value passed as ``max_tokens`` in the sampling params.
    :returns: The result of the non-streaming ``create_turn`` call.
        NOTE(review): the ``Dict[str, Any]`` annotation looks inaccurate,
        since the caller reads ``.steps`` from the result; confirm the real type.
    """
    # Fall back to the generic tool-use prompt when the caller supplies none.
    if instructions is None:
        instructions = """You are a helpful assistant. You have access to a number of tools.
            Whenever a tool is called, be sure return the Response in a friendly and helpful tone.
            When you are asked to search the web you must use a tool. Keep answers concise.
            """

    agent = Agent(
        client,
        model=model,
        instructions=instructions,
        tools=tools,
        tool_config={"tool_choice": "auto"},
        sampling_params={"max_tokens": max_tokens}
    )

    # Session names carry the current Unix time, truncated to whole seconds.
    session_name = f"Test_{int(time.time())}"
    session_id = agent.create_session(session_name=session_name)

    # The session is announced only when at least one entry in `tools` is not
    # a string; for a list made up solely of strings an empty line is printed.
    only_toolgroup_ids = all(isinstance(t, str) for t in tools)
    if only_toolgroup_ids:
        print("")
    else:
        print(f"Created session_id={session_id} for Agent({agent.agent_id})")

    user_message = {
        "role": "user",
        "content": prompt
    }
    return agent.create_turn(
        messages=[user_message],
        session_id=session_id,
        stream=False
    )

def run_client_tool_test(model, query_obj, client_tool_module, llama_client, logger):
    """Run one query against one model and record the outcome as a metric.

    Up to ``TOOL_LIMIT`` ``ClientTool`` instances found among the attributes
    of ``client_tool_module`` (in ``dir()`` order) are offered to the agent.
    The tool name is read from the second step of the turn and the final
    answer from the third; both reads are best-effort and only logged when
    they fail.

    :param model: Model identifier forwarded to ``execute_query``.
    :param query_obj: Query record; its ``'query'`` and ``'tool_call'``
        entries are read.
    :param client_tool_module: Object whose attributes are scanned for
        ``ClientTool`` instances.
    :param llama_client: Client forwarded to ``execute_query``.
    :param logger: Logger used for progress and error messages.
    :returns: ``True`` when the query ran and its metric was recorded without
        an exception (whether or not the expected tool was called), ``False``
        when an exception was caught and a failure metric was recorded.
    """
    query_id = get_query_id(query_obj)
    prompt, expected_tool_call = query_obj['query'], query_obj['tool_call']

    # Collect ClientTool instances from the module, capped at TOOL_LIMIT.
    tool_list = []
    for attr_name in dir(client_tool_module):
        if len(tool_list) >= TOOL_LIMIT:
            break
        candidate = getattr(client_tool_module, attr_name)
        if isinstance(candidate, ClientTool):
            tool_list.append(candidate)

    logger.info(f"Testing query '{query_id}' with model {model}")
    logger.info(f"Query: {prompt[:50]}...")

    try:
        response = execute_query(
            client=llama_client,
            prompt=prompt,
            model=model,
            tools=tool_list,
        )
        steps = response.steps

        # Tool name comes from the second step; any failure to read it is
        # logged and treated as "no tool used".
        try:
            tools_used = steps[1].tool_calls[0].tool_name
        except Exception as e:
            logger.error(f"Error extracting tool name: {e}")
            tools_used = None
        tool_call_match = tools_used == expected_tool_call
        logger.info(f"Tool used: {tools_used} Tool expected: {expected_tool_call} match: {tool_call_match} ")

        # Final answer comes from the third step; a missing or blank answer
        # is recorded as an empty inference.
        final_response = ""
        try:
            final_response = steps[2].api_model_response.content.strip()
            inference_not_empty = final_response != ''
        except Exception as e:
            logger.error(f"Error checking inference content: {e}")
            inference_not_empty = False
        logger.info(f'Inference not empty: {inference_not_empty}')
        logger.info(f"Query '{query_id}' succeeded with model {model} and the response \n {final_response}")

        # "SUCCESS" means the turn completed; tool_call_match carries the
        # actual accuracy signal.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="SUCCESS",
            tool_call_match=tool_call_match,
            inference_not_empty=inference_not_empty,
            expected_tool_call=expected_tool_call
        )
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Query '{query_id}' failed with model {model}: {error_msg}")

        # The turn itself failed: record the error text with the metric.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="FAILURE",
            tool_call_match=False,
            inference_not_empty=False,
            expected_tool_call=expected_tool_call,
            error=error_msg
        )
        return False

def main():
    """Run every query against every configured model, then summarize and plot.

    Reads the server URL from ``REMOTE_BASE_URL`` and returns early, after
    logging an error, when it is not set. Queries come from
    ``queries/client_tool_queries.json`` under the current working directory
    and are run with the tools from the ``tools_no_extra_tags`` module.
    """
    logger = utils.setup_logger()

    base_url = os.getenv('REMOTE_BASE_URL')
    print(f"base_url={base_url}")
    if not base_url:
        logger.error("REMOTE_BASE_URL environment variable not set")
        return

    llama_client = LlamaStackClient(base_url=base_url)

    # Models under test; make sure they are available in your LLS server
    models = ["meta-llama/Llama-3.2-3B-Instruct"]

    client_tool_queries = os.path.join(os.getcwd(), "queries/", "client_tool_queries.json")

    # Running totals across all models
    total_tests = 0
    successful_tests = 0

    for model in models:
        logger.info(f"\n=== Testing with model: {model} ===\n")

        if not client_tool_queries:
            continue

        queries = load_queries(client_tool_queries)
        if not queries:
            logger.info(f"No queries found in {client_tool_queries}")
            continue

        # One boolean per query: True when the query ran without raising.
        outcomes = [
            run_client_tool_test(model, query_obj, tools_no_extra_tags, llama_client, logger)
            for query_obj in queries
        ]
        total_tests += len(outcomes)
        successful_tests += sum(1 for succeeded in outcomes if succeeded)

    # Print summary
    logger.info(f"\n=== Test Summary ===")
    logger.info(f"Total tests: {total_tests}")
    logger.info(f"Successful tests: {successful_tests}")
    logger.info(f"Failed tests: {total_tests - successful_tests}")
    if total_tests > 0:
        success_rate = (successful_tests / total_tests) * 100
        logger.info(f"Success rate: {success_rate:.1f}%")

    # Generate plots from the metrics CSV
    # NOTE(review): this file is named "no_extra_tools" while the tool module
    # and the plot shown below use "no_extra_tags" - confirm the path matches
    # the CSV that utils.add_metric actually writes.
    logger.info(f"\n=== Generating plots ===")
    utils.get_analysis_plots('./results/no_extra_tools_client_tool_metrics.csv')


# Run the experiment as soon as this cell is executed.
main()


# Results
![no_extra_tags_tool_call_match_per_function_tool.jpg](results/plots/no_extra_tags_tool_call_match_per_function_tool.jpg)

Removing the explicit `:description:` and `:use_case:` annotations produces a clear decrease in accuracy. This does **not** prove that adding them will always increase accuracy, but for our query set it helped.

This next cell will test whether the tool name matters at all. To do this test, all functions were renamed to `function_1`, `function_2`, etc. but the docstring was left unchanged.

Ex.
```python
@client_tool
def function_1(a: float, b: float) -> float:
    """
    :description: Adds two numbers.
    :use_case: Use when the user wants to find the sum, total, or combined value of two numbers.
    :param a: The first number.
    :param b: The second number.
    :returns: The sum of `a` and `b`.
    """
    return a + b
```

In [None]:
import os
import json
import time
import utils
from llama_stack_client.lib.agents.client_tool import ClientTool
from typing import Dict, List, Any, Optional, Union
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from tools import tools_bad_function_names

# Max client tools llama3.2:3B can handle: upper bound on how many client tools
# are offered to the agent per query. The notebook's introduction reports that
# any increase beyond 32 leads to a complete degradation in accuracy.
TOOL_LIMIT = 32
# Load environment variables (main() reads REMOTE_BASE_URL from the environment)
load_dotenv()

def load_queries(file_path: str) -> List[Dict[str, str]]:
    """Load the query objects stored in a JSON file.

    The file must contain a JSON object with a top-level ``"queries"`` key;
    the value stored under that key is returned unchanged.

    :param file_path: Path to the JSON query file.
    :returns: The value of the ``"queries"`` key, or an empty list when the
        file does not exist or is not valid JSON (a message is printed in
        both cases).
    :raises ValueError: If the JSON parses but is not an object containing a
        ``"queries"`` key.
    """
    try:
        # Decode as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Query file not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"Invalid JSON in file: {file_path}")
        return []

    # Validation happens outside the try block: this ValueError is meant to
    # reach the caller, not to be confused with the I/O and parse errors above.
    if not isinstance(data, dict) or 'queries' not in data:
        raise ValueError(f"Invalid JSON format in {file_path}")

    # Return full query objects with ID for better test identification
    return data['queries']

def get_query_id(query_obj):
    """Return a short identifier for a query object, used to label test runs.

    An explicit ``'id'`` entry wins. Otherwise the identifier is built from
    the first five whitespace-separated words of the ``'query'`` text: joined
    with underscores, lower-cased, with commas and periods removed. Any other
    input (a non-dict, or a dict with neither key) yields ``"unknown_query"``.
    """
    if not isinstance(query_obj, dict):
        return "unknown_query"

    if 'id' in query_obj:
        return query_obj['id']

    if 'query' not in query_obj:
        return "unknown_query"

    # No explicit ID: derive a slug from the leading words of the query text.
    leading_words = query_obj['query'].split()[:5]
    slug = '_'.join(leading_words).lower()
    for punctuation in (',', '.'):
        slug = slug.replace(punctuation, '')
    return slug

def execute_query(
    client: LlamaStackClient,
    prompt: str,
    model: str,
    tools: Union[List[str], List[Any]], # list of toolgroup_ids or tool objects
    instructions: Optional[str] = None,
    max_tokens: int = 4096
) -> Dict[str, Any]:
    """Send one user prompt through a newly created agent and return the turn.

    Every call builds a new ``Agent`` and a new session, so no state is
    shared between queries.

    :param client: Llama Stack client the agent is created with.
    :param prompt: Text sent as the single user message of the turn.
    :param model: Model identifier passed to the agent.
    :param tools: Toolgroup IDs (strings) or tool objects handed to the agent.
    :param instructions: Agent instructions; a default general tool-use
        prompt is used when ``None``.
    :param max_tokens: Value passed as ``max_tokens`` in the sampling params.
    :returns: The result of the non-streaming ``create_turn`` call.
        NOTE(review): the ``Dict[str, Any]`` annotation looks inaccurate,
        since the caller reads ``.steps`` from the result; confirm the real type.
    """
    # Fall back to the generic tool-use prompt when the caller supplies none.
    if instructions is None:
        instructions = """You are a helpful assistant. You have access to a number of tools.
            Whenever a tool is called, be sure return the Response in a friendly and helpful tone.
            When you are asked to search the web you must use a tool. Keep answers concise.
            """

    agent = Agent(
        client,
        model=model,
        instructions=instructions,
        tools=tools,
        tool_config={"tool_choice": "auto"},
        sampling_params={"max_tokens": max_tokens}
    )

    # Session names carry the current Unix time, truncated to whole seconds.
    session_name = f"Test_{int(time.time())}"
    session_id = agent.create_session(session_name=session_name)

    # The session is announced only when at least one entry in `tools` is not
    # a string; for a list made up solely of strings an empty line is printed.
    only_toolgroup_ids = all(isinstance(t, str) for t in tools)
    if only_toolgroup_ids:
        print("")
    else:
        print(f"Created session_id={session_id} for Agent({agent.agent_id})")

    user_message = {
        "role": "user",
        "content": prompt
    }
    return agent.create_turn(
        messages=[user_message],
        session_id=session_id,
        stream=False
    )

def run_client_tool_test(model, query_obj, client_tool_module, llama_client, logger):
    """Run one query against one model and record the outcome as a metric.

    Up to ``TOOL_LIMIT`` ``ClientTool`` instances found among the attributes
    of ``client_tool_module`` (in ``dir()`` order) are offered to the agent.
    The tool name is read from the second step of the turn and the final
    answer from the third; both reads are best-effort and only logged when
    they fail.

    :param model: Model identifier forwarded to ``execute_query``.
    :param query_obj: Query record; its ``'query'`` and ``'tool_call'``
        entries are read.
    :param client_tool_module: Object whose attributes are scanned for
        ``ClientTool`` instances.
    :param llama_client: Client forwarded to ``execute_query``.
    :param logger: Logger used for progress and error messages.
    :returns: ``True`` when the query ran and its metric was recorded without
        an exception (whether or not the expected tool was called), ``False``
        when an exception was caught and a failure metric was recorded.
    """
    query_id = get_query_id(query_obj)
    prompt, expected_tool_call = query_obj['query'], query_obj['tool_call']

    # Collect ClientTool instances from the module, capped at TOOL_LIMIT.
    tool_list = []
    for attr_name in dir(client_tool_module):
        if len(tool_list) >= TOOL_LIMIT:
            break
        candidate = getattr(client_tool_module, attr_name)
        if isinstance(candidate, ClientTool):
            tool_list.append(candidate)

    logger.info(f"Testing query '{query_id}' with model {model}")
    logger.info(f"Query: {prompt[:50]}...")

    try:
        response = execute_query(
            client=llama_client,
            prompt=prompt,
            model=model,
            tools=tool_list,
        )
        steps = response.steps

        # Tool name comes from the second step; any failure to read it is
        # logged and treated as "no tool used".
        try:
            tools_used = steps[1].tool_calls[0].tool_name
        except Exception as e:
            logger.error(f"Error extracting tool name: {e}")
            tools_used = None
        tool_call_match = tools_used == expected_tool_call
        logger.info(f"Tool used: {tools_used} Tool expected: {expected_tool_call} match: {tool_call_match} ")

        # Final answer comes from the third step; a missing or blank answer
        # is recorded as an empty inference.
        final_response = ""
        try:
            final_response = steps[2].api_model_response.content.strip()
            inference_not_empty = final_response != ''
        except Exception as e:
            logger.error(f"Error checking inference content: {e}")
            inference_not_empty = False
        logger.info(f'Inference not empty: {inference_not_empty}')
        logger.info(f"Query '{query_id}' succeeded with model {model} and the response \n {final_response}")

        # "SUCCESS" means the turn completed; tool_call_match carries the
        # actual accuracy signal.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="SUCCESS",
            tool_call_match=tool_call_match,
            inference_not_empty=inference_not_empty,
            expected_tool_call=expected_tool_call
        )
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Query '{query_id}' failed with model {model}: {error_msg}")

        # The turn itself failed: record the error text with the metric.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="FAILURE",
            tool_call_match=False,
            inference_not_empty=False,
            expected_tool_call=expected_tool_call,
            error=error_msg
        )
        return False

def main():
    """Run every query against every configured model, then summarize and plot.

    Reads the server URL from ``REMOTE_BASE_URL`` and returns early, after
    logging an error, when it is not set. Queries come from
    ``queries/client_tool_queries_bad_functions.json`` under the current
    working directory and are run with the tools from the
    ``tools_bad_function_names`` module.
    """
    logger = utils.setup_logger()

    base_url = os.getenv('REMOTE_BASE_URL')
    print(f"base_url={base_url}")
    if not base_url:
        logger.error("REMOTE_BASE_URL environment variable not set")
        return

    llama_client = LlamaStackClient(base_url=base_url)

    # Models under test; make sure they are available in your LLS server
    models = ["meta-llama/Llama-3.2-3B-Instruct"]

    client_tool_queries = os.path.join(os.getcwd(), "queries/", "client_tool_queries_bad_functions.json")

    # Running totals across all models
    total_tests = 0
    successful_tests = 0

    for model in models:
        logger.info(f"\n=== Testing with model: {model} ===\n")

        if not client_tool_queries:
            continue

        queries = load_queries(client_tool_queries)
        if not queries:
            logger.info(f"No queries found in {client_tool_queries}")
            continue

        # One boolean per query: True when the query ran without raising.
        outcomes = [
            run_client_tool_test(model, query_obj, tools_bad_function_names, llama_client, logger)
            for query_obj in queries
        ]
        total_tests += len(outcomes)
        successful_tests += sum(1 for succeeded in outcomes if succeeded)

    # Print summary
    logger.info(f"\n=== Test Summary ===")
    logger.info(f"Total tests: {total_tests}")
    logger.info(f"Successful tests: {successful_tests}")
    logger.info(f"Failed tests: {total_tests - successful_tests}")
    if total_tests > 0:
        success_rate = (successful_tests / total_tests) * 100
        logger.info(f"Success rate: {success_rate:.1f}%")

    # Generate plots from the metrics CSV
    logger.info(f"\n=== Generating plots ===")
    utils.get_analysis_plots('./results/bad_function_names_client_tool_metrics.csv')


# Run the experiment as soon as this cell is executed.
main()


# Results

![bad_function_names_tool_call_match_per_function_tool](results/plots/bad_function_names_tool_call_match_per_function_tool.jpg)

The results show a sharp degradation in accuracy, emphasizing the importance of good function naming practices. A follow-up experiment that could spawn from this is testing whether using unit-test-style function naming for client tools and MCP servers improves accuracy.

This next cell will test whether the tool description matters at all. To do this test, all docstrings have been reduced to contain only the required `:param <name>:` annotations, and function names have been kept the same.

Ex.
```python
@client_tool
def add_two_numbers(a: float, b: float) -> float:
    """
    :param a: The first number.
    :param b: The second number.
    """
    return a + b
```

In [None]:
import os
import json
import time
import utils
from llama_stack_client.lib.agents.client_tool import ClientTool
from typing import Dict, List, Any, Optional, Union
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from tools import tools_only_params

# Max client tools llama3.2:3B can handle: upper bound on how many client tools
# are offered to the agent per query. The notebook's introduction reports that
# any increase beyond 32 leads to a complete degradation in accuracy.
TOOL_LIMIT = 32
# Load environment variables
load_dotenv()

def load_queries(file_path: str) -> List[Dict[str, str]]:
    """Load the query objects stored in a JSON file.

    The file must contain a JSON object with a top-level ``"queries"`` key;
    the value stored under that key is returned unchanged.

    :param file_path: Path to the JSON query file.
    :returns: The value of the ``"queries"`` key, or an empty list when the
        file does not exist or is not valid JSON (a message is printed in
        both cases).
    :raises ValueError: If the JSON parses but is not an object containing a
        ``"queries"`` key.
    """
    try:
        # Decode as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Query file not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"Invalid JSON in file: {file_path}")
        return []

    # Validation happens outside the try block: this ValueError is meant to
    # reach the caller, not to be confused with the I/O and parse errors above.
    if not isinstance(data, dict) or 'queries' not in data:
        raise ValueError(f"Invalid JSON format in {file_path}")

    # Return full query objects with ID for better test identification
    return data['queries']

def get_query_id(query_obj):
    """Return a short identifier for a query object, used to label test runs.

    An explicit ``'id'`` entry wins. Otherwise the identifier is built from
    the first five whitespace-separated words of the ``'query'`` text: joined
    with underscores, lower-cased, with commas and periods removed. Any other
    input (a non-dict, or a dict with neither key) yields ``"unknown_query"``.
    """
    if not isinstance(query_obj, dict):
        return "unknown_query"

    if 'id' in query_obj:
        return query_obj['id']

    if 'query' not in query_obj:
        return "unknown_query"

    # No explicit ID: derive a slug from the leading words of the query text.
    leading_words = query_obj['query'].split()[:5]
    slug = '_'.join(leading_words).lower()
    for punctuation in (',', '.'):
        slug = slug.replace(punctuation, '')
    return slug

def execute_query(
    client: LlamaStackClient,
    prompt: str,
    model: str,
    tools: Union[List[str], List[Any]], # list of toolgroup_ids or tool objects
    instructions: Optional[str] = None,
    max_tokens: int = 4096
) -> Dict[str, Any]:
    """Send one user prompt through a newly created agent and return the turn.

    Every call builds a new ``Agent`` and a new session, so no state is
    shared between queries.

    :param client: Llama Stack client the agent is created with.
    :param prompt: Text sent as the single user message of the turn.
    :param model: Model identifier passed to the agent.
    :param tools: Toolgroup IDs (strings) or tool objects handed to the agent.
    :param instructions: Agent instructions; a default general tool-use
        prompt is used when ``None``.
    :param max_tokens: Value passed as ``max_tokens`` in the sampling params.
    :returns: The result of the non-streaming ``create_turn`` call.
        NOTE(review): the ``Dict[str, Any]`` annotation looks inaccurate,
        since the caller reads ``.steps`` from the result; confirm the real type.
    """
    # Fall back to the generic tool-use prompt when the caller supplies none.
    if instructions is None:
        instructions = """You are a helpful assistant. You have access to a number of tools.
            Whenever a tool is called, be sure return the Response in a friendly and helpful tone.
            When you are asked to search the web you must use a tool. Keep answers concise.
            """

    agent = Agent(
        client,
        model=model,
        instructions=instructions,
        tools=tools,
        tool_config={"tool_choice": "auto"},
        sampling_params={"max_tokens": max_tokens}
    )

    # Session names carry the current Unix time, truncated to whole seconds.
    session_name = f"Test_{int(time.time())}"
    session_id = agent.create_session(session_name=session_name)

    # The session is announced only when at least one entry in `tools` is not
    # a string; for a list made up solely of strings an empty line is printed.
    only_toolgroup_ids = all(isinstance(t, str) for t in tools)
    if only_toolgroup_ids:
        print("")
    else:
        print(f"Created session_id={session_id} for Agent({agent.agent_id})")

    user_message = {
        "role": "user",
        "content": prompt
    }
    return agent.create_turn(
        messages=[user_message],
        session_id=session_id,
        stream=False
    )

def run_client_tool_test(model, query_obj, client_tool_module, llama_client, logger):
    """Evaluate one query against one model and record the outcome as a metric.

    At most ``TOOL_LIMIT`` ``ClientTool`` instances found on
    ``client_tool_module`` are handed to the agent. Returns ``True`` when the
    query ran and its metric was recorded without raising, ``False`` when an
    exception occurred (a FAILURE metric is recorded in that case).
    """
    query_id = get_query_id(query_obj)
    prompt = query_obj['query']
    expected_tool_call = query_obj['tool_call']

    # Gather ClientTool attributes in dir() order, stopping once the cap is hit.
    available_tools = []
    for attr_name in dir(client_tool_module):
        if len(available_tools) >= TOOL_LIMIT:
            break
        candidate = getattr(client_tool_module, attr_name)
        if isinstance(candidate, ClientTool):
            available_tools.append(candidate)

    logger.info(f"Testing query '{query_id}' with model {model}")
    logger.info(f"Query: {prompt[:50]}...")

    try:
        turn = execute_query(
            client=llama_client,
            prompt=prompt,
            model=model,
            tools=available_tools,
        )
        steps = turn.steps

        # The tool name is read from the second step of the turn.
        try:
            tools_used = steps[1].tool_calls[0].tool_name
        except Exception as e:
            logger.error(f"Error extracting tool name: {e}")
            tools_used = None
        tool_call_match = bool(tools_used == expected_tool_call)
        logger.info(f"Tool used: {tools_used} Tool expected: {expected_tool_call} match: {tool_call_match} ")

        # The final answer is read from the third step; it must be non-blank.
        final_response = ""
        try:
            final_response = steps[2].api_model_response.content.strip()
            inference_not_empty = bool(final_response != '')
        except Exception as e:
            logger.error(f"Error checking inference content: {e}")
            inference_not_empty = False
        logger.info(f'Inference not empty: {inference_not_empty}')
        logger.info(f"Query '{query_id}' succeeded with model {model} and the response \n {final_response}")

        # The run completed: store the per-query results.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="SUCCESS",
            tool_call_match=tool_call_match,
            inference_not_empty=inference_not_empty,
            expected_tool_call=expected_tool_call
        )
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Query '{query_id}' failed with model {model}: {error_msg}")

        # The run raised: store a failure row carrying the error text.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="FAILURE",
            tool_call_match=False,
            inference_not_empty=False,
            expected_tool_call=expected_tool_call,
            error=error_msg
        )
        return False

def main():
    """Run every client-tool query for each configured model, then log a summary and plot."""
    logger = utils.setup_logger()

    # The server address comes from the environment; without it nothing can run.
    base_url = os.getenv('REMOTE_BASE_URL')
    print(f"base_url={base_url}")
    if not base_url:
        logger.error("REMOTE_BASE_URL environment variable not set")
        return

    llama_client = LlamaStackClient(base_url=base_url)

    # Models under test; make sure they are available in your LLS server
    models = ["meta-llama/Llama-3.2-3B-Instruct"]

    client_tool_queries = os.path.join(os.getcwd(), "queries/", "client_tool_queries.json")

    # One boolean per executed query, in execution order.
    outcomes = []

    for model in models:
        logger.info(f"\n=== Testing with model: {model} ===\n")

        if not client_tool_queries:
            continue

        queries = load_queries(client_tool_queries)
        if not queries:
            logger.info(f"No queries found in {client_tool_queries}")
            continue

        for query_obj in queries:
            passed = run_client_tool_test(model, query_obj, tools_only_params, llama_client, logger)
            outcomes.append(bool(passed))

    total_tests = len(outcomes)
    successful_tests = sum(outcomes)

    # Summary of the whole run
    logger.info("\n=== Test Summary ===")
    logger.info(f"Total tests: {total_tests}")
    logger.info(f"Successful tests: {successful_tests}")
    logger.info(f"Failed tests: {total_tests - successful_tests}")
    if total_tests > 0:
        success_rate = (successful_tests / total_tests) * 100
        logger.info(f"Success rate: {success_rate:.1f}%")

    # Plots are produced from the metrics CSV
    logger.info("\n=== Generating plots ===")
    utils.get_analysis_plots('./results/only_params_client_tool_metrics.csv')


# Kick off the evaluation when this cell is executed.
main()


# Results
![only_params_tool_call_match_per_function.jpg](results/plots/only_params_tool_call_match_per_function.jpg)

The results show that removing all details from the docstring other than the required `:params:` annotation does not lead to a large decrease in accuracy. This is likely why `llama-stack` only requires the `:params:` annotation and nothing else, such as `:use_case:`.

## We will now run the same evaluation set on the well-constructed tools but swap `llama3.2:3B` for `llama3.2:1B`.

In [None]:
import os
import json
import time
import utils
from llama_stack_client.lib.agents.client_tool import ClientTool
from typing import Dict, List, Any, Optional, Union
from dotenv import load_dotenv
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from tools import tools

# Maximum number of client tools handed to the agent in this cell.
# This cell evaluates llama3.2:1b (see `models` in main), not the 3B model.
TOOL_LIMIT = 23
# Load environment variables (main reads REMOTE_BASE_URL from the environment)
load_dotenv()

def load_queries(file_path: str) -> List[Dict[str, str]]:
    """Load the query objects stored in a JSON file.

    The file must hold a JSON object with a ``"queries"`` key; the value
    stored under that key is returned unchanged (full query objects, so the
    caller can use their IDs for test identification).

    Args:
        file_path: Path to the JSON query file.

    Returns:
        The value under ``"queries"``, or an empty list when the file does not
        exist or is not valid JSON (a message is printed in both cases).

    Raises:
        ValueError: If the top-level JSON value is not an object containing a
            ``"queries"`` key.
    """
    try:
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform's locale encoding, which can mangle non-ASCII queries.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        print(f"Query file not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"Invalid JSON in file: {file_path}")
        return []

    # A structurally wrong file is a setup error, so it is reported loudly.
    if not isinstance(data, dict) or 'queries' not in data:
        raise ValueError(f"Invalid JSON format in {file_path}")

    return data['queries']

def get_query_id(query_obj):
    """Derive an identifier for a query object for use in logs and metrics.

    Uses the ``'id'`` entry when present; otherwise builds a lowercase,
    underscore-joined slug from the first five words of ``'query'`` with
    commas and periods stripped. Falls back to ``"unknown_query"``.
    """
    is_mapping = isinstance(query_obj, dict)

    if is_mapping and 'id' in query_obj:
        return query_obj['id']

    if is_mapping and 'query' in query_obj:
        # No ID available: use the first few words of the query text instead.
        head_words = query_obj['query'].split()[:5]
        drop_punctuation = str.maketrans('', '', ',.')
        return '_'.join(head_words).lower().translate(drop_punctuation)

    return "unknown_query"

def execute_query(
    client: LlamaStackClient,
    prompt: str,
    model: str,
    tools: Union[List[str], List[Any]], # list of toolgroup_ids or tool objects
    instructions: Optional[str] = None,
    max_tokens: int = 4096
) -> Any:
    """Run a single user prompt through a freshly created agent.

    A new ``Agent`` and a new session are created on every call, so no
    conversation state is shared between queries.

    Args:
        client: Llama Stack client the agent is bound to.
        prompt: User message sent as the only message of the turn.
        model: Model identifier passed to the agent.
        tools: Toolgroup ids (strings) or client tool objects given to the agent.
        instructions: System instructions; a generic tool-use prompt is used
            when omitted.
        max_tokens: Value forwarded as ``sampling_params["max_tokens"]``.

    Returns:
        Whatever ``agent.create_turn(..., stream=False)`` returns. Callers in
        this notebook read its ``steps`` attribute, so it is not a plain dict.
    """

    if instructions is None:
        # Default instructions for general tool use
        instructions = """You are a helpful assistant. You have access to a number of tools.
            Whenever a tool is called, be sure return the Response in a friendly and helpful tone.
            When you are asked to search the web you must use a tool. Keep answers concise.
            """

    # Create agent with tools
    agent = Agent(
        client,
        model=model,
        instructions=instructions,
        tools=tools,
        tool_config={"tool_choice": "auto"},
        sampling_params={"max_tokens": max_tokens}
    )

    # Create session
    session_id = agent.create_session(session_name=f"Test_{int(time.time())}")

    # Announce the session only when at least one tool object (non-string) is
    # in use. Previously an empty line was printed in the toolgroup-id case.
    if not all(isinstance(t, str) for t in tools):
        print(f"Created session_id={session_id} for Agent({agent.agent_id})")

    turn_response = agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        session_id=session_id,
        stream=False
    )
    return turn_response

def run_client_tool_test(model, query_obj, client_tool_module, llama_client, logger):
    """Evaluate one query against one model and record the outcome as a metric.

    At most ``TOOL_LIMIT`` ``ClientTool`` instances found on
    ``client_tool_module`` are handed to the agent. Returns ``True`` when the
    query ran and its metric was recorded without raising, ``False`` when an
    exception occurred (a FAILURE metric is recorded in that case).
    """
    query_id = get_query_id(query_obj)
    prompt = query_obj['query']
    expected_tool_call = query_obj['tool_call']

    # Gather ClientTool attributes in dir() order, stopping once the cap is hit.
    available_tools = []
    for attr_name in dir(client_tool_module):
        if len(available_tools) >= TOOL_LIMIT:
            break
        candidate = getattr(client_tool_module, attr_name)
        if isinstance(candidate, ClientTool):
            available_tools.append(candidate)

    logger.info(f"Testing query '{query_id}' with model {model}")
    logger.info(f"Query: {prompt[:50]}...")

    try:
        turn = execute_query(
            client=llama_client,
            prompt=prompt,
            model=model,
            tools=available_tools,
        )
        steps = turn.steps

        # The tool name is read from the second step of the turn.
        try:
            tools_used = steps[1].tool_calls[0].tool_name
        except Exception as e:
            logger.error(f"Error extracting tool name: {e}")
            tools_used = None
        tool_call_match = bool(tools_used == expected_tool_call)
        logger.info(f"Tool used: {tools_used} Tool expected: {expected_tool_call} match: {tool_call_match} ")

        # The final answer is read from the third step; it must be non-blank.
        final_response = ""
        try:
            final_response = steps[2].api_model_response.content.strip()
            inference_not_empty = bool(final_response != '')
        except Exception as e:
            logger.error(f"Error checking inference content: {e}")
            inference_not_empty = False
        logger.info(f'Inference not empty: {inference_not_empty}')
        logger.info(f"Query '{query_id}' succeeded with model {model} and the response \n {final_response}")

        # The run completed: store the per-query results.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="SUCCESS",
            tool_call_match=tool_call_match,
            inference_not_empty=inference_not_empty,
            expected_tool_call=expected_tool_call
        )
        return True

    except Exception as e:
        error_msg = str(e)
        logger.error(f"Query '{query_id}' failed with model {model}: {error_msg}")

        # The run raised: store a failure row carrying the error text.
        utils.add_metric(
            model=model,
            query_id=query_id,
            status="FAILURE",
            tool_call_match=False,
            inference_not_empty=False,
            expected_tool_call=expected_tool_call,
            error=error_msg
        )
        return False

def main():
    """Run every client-tool query for each configured model, then log a summary and plot."""
    logger = utils.setup_logger()

    # The server address comes from the environment; without it nothing can run.
    base_url = os.getenv('REMOTE_BASE_URL')
    print(f"base_url={base_url}")
    if not base_url:
        logger.error("REMOTE_BASE_URL environment variable not set")
        return

    llama_client = LlamaStackClient(base_url=base_url)

    # Models under test; make sure they are available in your LLS server
    models = ["llama3.2:1b"]

    client_tool_queries = os.path.join(os.getcwd(), "queries/", "client_tool_queries.json")

    # One boolean per executed query, in execution order.
    outcomes = []

    for model in models:
        logger.info(f"\n=== Testing with model: {model} ===\n")

        if not client_tool_queries:
            continue

        queries = load_queries(client_tool_queries)
        if not queries:
            logger.info(f"No queries found in {client_tool_queries}")
            continue

        for query_obj in queries:
            passed = run_client_tool_test(model, query_obj, tools, llama_client, logger)
            outcomes.append(bool(passed))

    total_tests = len(outcomes)
    successful_tests = sum(outcomes)

    # Summary of the whole run
    logger.info("\n=== Test Summary ===")
    logger.info(f"Total tests: {total_tests}")
    logger.info(f"Successful tests: {successful_tests}")
    logger.info(f"Failed tests: {total_tests - successful_tests}")
    if total_tests > 0:
        success_rate = (successful_tests / total_tests) * 100
        logger.info(f"Success rate: {success_rate:.1f}%")

    # Plots are produced from the metrics CSV
    logger.info("\n=== Generating plots ===")
    utils.get_analysis_plots('./results/normal_client_tool_metrics_1B.csv')


# Kick off the evaluation when this cell is executed.
main()


# Results

![tool_calling_match_per_function_23_tools_1B.png.jpg](results/plots/tool_calling_match_per_function_23_tools_1B.png.jpg)
![tool_calling_match_per_function_1_tool_1B.jpg](results/plots/tool_calling_match_per_function_1_tool_1B.jpg)

The results show that 1B-parameter models, at least in the Llama family, are too small for tool calling.

Another test was run in which the model was **explicitly** told to use tool calling and was given **only** the correct tool, to check whether a 1B-parameter model would work once the tool-selection step was taken off its hands. Surprisingly, it still made 0 successful tool calls.

# Summary

The following observations have been made using our eval set of 600 queries.
- `llama3.2:3B` can handle a maximum of 32 tools before a complete degradation in accuracy.
- Well-named functions are far more important than a well-written function description.
- Explicitly adding `:description:` and `:use_case:` showed a slight improvement in accuracy for our eval set.
- LLMs need to be introduced to tool calling in their training data; otherwise they will have difficulty calling tools correctly.
    - This was seen after pulling `granite3.2` from Ollama: it was not able to call a single tool.
- `llama3.2:1B` is too small of a model and is extremely inconsistent at tool calling.
    - Even when given just a single tool and explicitly instructed to call it, the model is unable to do so consistently. An important caveat is that the quantized model was used, which could have been a big factor in the low performance.