In [46]:
from typing import Dict

import requests
from llama_stack_client.lib.agents.client_tool import client_tool


@client_tool
def knowledge_search(query: str):
    """Search for information in a database. Do not use this tool to load files.

    :param query: The query to search for. Can be a natural language sentence or keywords.
    """
    # NOTE(review): the docstring above is left verbatim on purpose; the
    # `client_tool` decorator presumably turns it into the tool description
    # and parameter help shown to the model - confirm before rewording it.
    #
    # This is a stub: nothing is looked up, one of two canned passages is
    # returned under the "content" key.
    if "nba" in query.lower():
        # Case-insensitive mention of "nba" selects the basketball passage.
        return {
            "content": f"Found context for query '{query}': The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL)."
        }

    # Every other query gets the Llama model-family passage.
    return {
        "content": f"Found context for query '{query}': The open-source AI models you can fine-tune, distill and deploy anywhere. Choose from our collection of models: Llama 3.1, Llama 3.2, Llama 3.3."
    }


In [68]:
import os

from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types import Document
from llama_stack_client.types.agent_create_params import AgentConfig
from termcolor import cprint
from uuid import uuid4

from llama_stack_client import LlamaStackClient

# Client for a Llama Stack server on localhost. The port comes from the
# LLAMA_STACK_PORT environment variable (KeyError if it is not set).
llama_stack_client = LlamaStackClient(
    base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)

# The RAG corpus for this cell: two short inline passages, built in one
# literal so there is exactly one definition of `documents`.
documents = [
    Document(
        document_id="nba_wiki",
        content="The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).",
        metadata={},
    ),
    Document(
        document_id="perplexity_wiki",
        content="""Perplexity was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:

Srinivas, the CEO, worked at OpenAI as an AI researcher.
Konwinski was among the founding team at Databricks.
Yarats, the CTO, was an AI research scientist at Meta.
Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]""",
        metadata={},
    ),
]

# A random UUID suffix gives every run of the cell its own vector DB id.
vector_db_id = f"test-vector-db-{uuid4()}"
llama_stack_client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
)

# Insert the passages into the vector DB registered above.
llama_stack_client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=128,
)

# No client-side tools are registered for this run. To route searches through
# the local `knowledge_search` stub instead, use `client_tools = (knowledge_search,)`.
client_tools = []

# System prompt used together with `system_message_behavior: "replace"` below.
# NOTE(review): `{{ function_description }}` looks like a template placeholder
# that is filled in server-side with the tool descriptions - confirm against
# the Llama Stack documentation.
instructions = """
You are a helpful assistant. You have access to functions, but you should only use them if they are required.

You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you may or may not need to make one function/tool call to achieve the purpose.
If none of the function can be used, don't return [], instead answer the question directly without using functions. If the given question lacks the parameters required by the function,
also point it out.
Do not make the same function call twice as you will get the same result.
Do not make another function call consecutively. Answer the user query after getting the result from the function call.
Only use `code_interpreter` to answer questions about the provided file.

{{ function_description }}
"""
# instructions = "You are a helpful assistant."

agent_config = {
    # Server-side toolgroups: RAG over the vector DB created above, plus the
    # code interpreter.
    "toolgroups": [
        {
            "name": "builtin::rag",
            "args": {"vector_db_ids": [vector_db_id]},
        },
        "builtin::code_interpreter",
    ],
    "instructions": instructions,
    # "model": "meta-llama/Llama-3.2-3B-Instruct",
    "model": "meta-llama/Llama-3.3-70B-Instruct",
    "tool_config": {
        "system_message_behavior": "replace",
    },
    # The loop variable is named `tool` so it does not shadow the imported
    # `client_tool` decorator.
    "client_tools": [tool.get_tool_definition() for tool in client_tools],
    "enable_session_persistence": False,
}

agent = Agent(llama_stack_client, agent_config, client_tools)
# CSV referenced by URL; attached to the turn for the code-interpreter prompt.
inflation_doc = Document(
    document_id="test_csv",
    content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv",
    mime_type="text/csv",
    metadata={},
)

# Each entry is (user prompt, documents attached to the turn, tool name that
# must show up in the turn's log output).
user_prompts = [
    (
        "Write code to load this csv file, use the `code_interpreter` tool to execute it, then tell me what's in it",
        [inflation_doc],
        "code_interpreter",
    ),
    # (
    #     "Use the code_interpreter tool to compute 20+12, then answer with the output",
    #     [],
    #     "code_interpreter",
    # ),
    # (
    #     "Use 'knowledge_search' function to answer the question: what are the versions of Llama3? do not use code_interpreter",
    #     [],
    #     "knowledge_search",
    # ),
    ("when was the perplexity created?", [], "knowledge_search"),
    ("when was the nba created?", [], "knowledge_search"),
]

for prompt, docs, tool_name in user_prompts:
    print(f"User> {prompt}")

    # Each prompt runs in its own, uniquely named session.
    session_id = agent.create_session(f"test-session-{uuid4()}")
    response = agent.create_turn(
        messages=[dict(role="user", content=prompt)],
        session_id=session_id,
        documents=docs,
    )

    # Stringify every non-None log event and keep the joined transcript so
    # the expected tool call can be checked in it.
    logs = list(
        map(str, (event for event in EventLogger().log(response) if event is not None))
    )
    logs_str = "\n".join(logs)
    print(logs_str)

    # The transcript must contain a call to the expected tool.
    assert f"Tool:{tool_name}" in logs_str


User> Write code to load this csv file, use the `code_interpreter` tool to execute it, then tell me what's in it

inference> 

import pandas as pd
# Load data
df = pd
.read_csv('/var/folders/cz/vyh7y1d11xg881lsx
sshnc5c0000gn/T/tmpom3cghqv/b3H9u7
Scinflation.csv')
# Rows
print("Number of rows and columns in the data:", df
.shape)
# Columns
print("Columns of the data are:", len(df.columns))
# Column names

print("Columns of the data are:", df.columns)
# Column dtypes
print("Dat
atype of the columns are:", df.dtypes)
# Sample of data
print
("Data sample from file:")
print(df.head())

tool_execution> Tool:code_interpreter Args:{'code': 'import pandas as pd\n# Load data\ndf = pd.read_csv(\'/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpom3cghqv/b3H9u7Scinflation.csv\')\n# Rows\nprint("Number of rows and columns in the data:", df.shape)\n# Columns\nprint("Columns of the data are:", len(df.columns))\n# Column names\nprint("Columns of the data are:", df.columns)\n# Column dtypes\nprint("

In [39]:
from typing import Dict

import requests
from llama_stack_client.lib.agents.client_tool import client_tool


@client_tool
def load_url(url: str):
    """Load the content given a URL

    :param url: The url to load
    """
    # The docstring above is kept verbatim: its first line and the `:param`
    # text appear as the tool's description in the printed tool definition.
    #
    # Stub implementation: `url` is accepted so the tool has the expected
    # signature, but it is never fetched. Every call returns the same canned
    # article under the "content" key, so no network access is needed.
    return {
        "content": """
Today Google announced that they have released the source code to PebbleOS. This is massive for Rebble, and will accelerate our efforts to produce new hardware.

Previously, we have been working on our own replacement firmware: RebbleOS. As you can see by the commit history though, progress was slow. Building a production-ready realtime OS for the Pebble is no small feat, and although we were confident we’d get there given enough time, it was never our ideal path. Thanks to the hard work of many people both within Google and not, we finally have our hands on the original source code for PebbleOS. You can read Google’s blog post on this for even more information.

This does not mean we instantly have the ability to start developing updates for PebbleOS though, we first will need to spend some concentrated time getting it to build. But before we talk about that, let’s talk about Rebble itself.
"""
    }


In [42]:
import os

from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types import Document
from llama_stack_client.types.agent_create_params import AgentConfig
from termcolor import cprint
import uuid



def create_http_client():
    """Build a client for a Llama Stack server reachable over HTTP on localhost.

    The port is read from the ``LLAMA_STACK_PORT`` environment variable; a
    ``KeyError`` is raised when it is not set.

    :return: a ``LlamaStackClient`` whose ``base_url`` is
        ``http://localhost:<LLAMA_STACK_PORT>``.
    """
    # Function-scope import: the package is only imported when this function
    # is actually called.
    from llama_stack_client import LlamaStackClient

    port = os.environ["LLAMA_STACK_PORT"]
    server_url = f"http://localhost:{port}"
    return LlamaStackClient(base_url=server_url)


def create_library_client(template="ollama"):
    """Create a Llama Stack library client and initialize it.

    :param template: passed unchanged as the only argument to
        ``LlamaStackAsLibraryClient``; defaults to ``"ollama"``.
    :return: the client instance, after ``initialize()`` has been called on it.
    """
    # Function-scope import: `llama_stack` is only needed when this client
    # flavour is requested.
    from llama_stack import LlamaStackAsLibraryClient as _LibraryClient

    library_client = _LibraryClient(template)
    library_client.initialize()
    return library_client


# Pick the client that matches your environment: the HTTP client (active) or
# the library client (commented out).
# client = create_library_client()
client = create_http_client()

# Documents to be used for RAG: the torchtune tutorial pages, referenced by
# URL, followed by one inline passage describing Llama Stack.
urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"]
documents = [
    Document(
        document_id=f"num-{i}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
        mime_type="text/plain",
        metadata={},
    )
    for i, url in enumerate(urls)
] + [
    Document(
        document_id="num-10",
        content="""Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides

Unified API layer for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.

Plugin architecture to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.

Prepackaged verified distributions which offer a one-stop solution for developers to get started quickly and reliably in any environment

Multiple developer interfaces like CLI and SDKs for Python, Node, iOS, and Android

Standalone applications as examples for how to build production-grade AI applications with Llama Stack""",
        mime_type="text/plain",
        metadata={},
    )
]

# Register a vector database under an id made unique by a random hex suffix.
vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
)

# Insert the documents into the vector database registered above.
client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=512,
)

# Client-side tools handed to the agent alongside the server toolgroups.
client_tools = (load_url,)

# System prompt for the agent; it is used with
# `system_message_behavior="replace"` in the tool config below.
instructions = """
You are a helpful assistant. You have access to functions, but you should only use them if they are required.

You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you may or may not need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, don't return [], instead answer the question directly without using functions. If the given question lacks the parameters required by the function,
also point it out. 

{{ function_description }}
"""

# Alternative prompts kept for experimentation:
# instructions = "You are a helpful assistant."

# instructions = """
# You are an expert in composing functions. You are given a question and a set of possible functions.
# Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
# If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
# also point it out.
# """

agent_config = AgentConfig(
    # The model id comes from the INFERENCE_MODEL environment variable.
    model=os.environ["INFERENCE_MODEL"],
    # Instructions for the agent (aka system prompt).
    instructions=instructions,
    enable_session_persistence=False,
    # Tools available to the agent: RAG over the vector DB, plus the code interpreter.
    toolgroups=[
        dict(name="builtin::rag", args=dict(vector_db_ids=[vector_db_id])),
        "builtin::code_interpreter",
    ],
    client_tools=[tool.get_tool_definition() for tool in client_tools],
    tool_config=dict(
        tool_choice="auto",
        tool_prompt_format="python_list",
        system_message_behavior="replace",
    ),
    max_infer_iters=10,
)
# Print the client tool definitions for inspection.
print([tool.get_tool_definition() for tool in client_tools])

# All prompts of this cell share one session.
rag_agent = Agent(client, agent_config, client_tools)
session_id = rag_agent.create_session("test-session")

# Prompts sent one after another in the shared session; the commented-out
# entries are alternatives kept for experimentation.
user_prompts = [
    # "What are the top 5 topics that were explained? Only list succinct bullet points.",
    # "search in the vector database for the term 'torchtune'",
    # "use the search tool and answer what is llama-stack?",
    # "can you summarize https://llama-stack.readthedocs.io/en/latest/introduction/index.html",
    "tell me a joke",
    "load https://llama-stack.readthedocs.io/en/latest/introduction/index.html and summarize it",
]

# Run the agent loop: one `create_turn` call per prompt, printing each log
# event as it is produced.
for prompt in user_prompts:
    cprint("User> " + prompt, "green")
    response = rag_agent.create_turn(
        session_id=session_id,
        messages=[dict(role="user", content=prompt)],
    )
    for log in EventLogger().log(response):
        log.print()


# response = rag_agent.create_turn(
#     messages=[{"role": "user", "content": prompt}],
#     session_id=session_id,
#     stream=False,
# )
# print(response)


[{'name': 'load_url', 'description': 'Load the content given a URL', 'parameters': [{'name': 'url', 'description': 'The url to load', 'parameter_type': 'str', 'default': None, 'required': True}], 'metadata': {}, 'tool_prompt_format': 'python_list'}]
[32mUser> tell me a joke[0m
[33minference> [0m[33mWhy[0m[33m don't skeletons fight each other?

[97m[0mhey don't have the guts![0m
[30m[0m[32mUser> load https://llama-stack.readthedocs.io/en/latest/introduction/index.html and summarize it[0m
[33minference> [0m[33m[[0m[33mload_url(url="https://llama-stack.readthedocs.io/en/latest[0m[33m/introduction/index.html")][0m[97m[0m
[32mCustomTool> {"content": "\nToday Google announced that they have released the source code to PebbleOS. This is massive for Rebble, and will accelerate our efforts to produce new hardware.\n\nPreviously, we have been working on our own replacement firmware: RebbleOS. As you can see by the commit history though, progress was slow. Building a prod