## This notebook uses llama-stack-client to handle ingesting documents and querying them: a simple out-of-the-box (OOTB) RAG use case

In [1]:
import asyncio
import os

from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.client_tool import client_tool
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from rich.pretty import pprint
import rich
import json
import uuid
from pydantic import BaseModel
from typing import List
#from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

from dotenv import load_dotenv

# Populate os.environ from a local .env file (python-dotenv) before reading settings.
load_dotenv()

# Required settings. Subscripting os.environ (rather than .get) makes a missing
# variable fail immediately with a KeyError naming it.
_environment = os.environ
HOST = _environment["HOST"]                    # host part of the server base URL
PORT = _environment["LLAMA_STACK_PORT"]        # port part of the server base URL
MODEL_NAME = _environment["INFERENCE_MODEL"]   # model identifier handed to the agent

# Web-search API keys are not needed by this notebook; kept for reference only.
#BRAVE_SEARCH_API_KEY = os.environ["BRAVE_SEARCH_API_KEY"]
#TAVILY_SEARCH_API_KEY=os.environ["TAVILY_API_KEY"]

In [2]:
# Connect to the Llama Stack server addressed by HOST and PORT.
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")

# Print every tool the server reports, each followed by a separator line.
for registered_tool in client.tools.list():
    print(registered_tool, '-----', sep='\n')

# Optional: inspect the providers instead, e.g. to pick a vector_io provider.
#for provider in client.providers.list() :
#    print(provider)
#    print('-----')
#vector_providers = [
#    provider for provider in client.providers.list() if provider.api == "vector_io"
#]

Tool(description='Execute code', identifier='code_interpreter', parameters=[Parameter(description='The code to execute', name='code', parameter_type='string', required=True, default=None)], provider_id='code-interpreter', provider_resource_id='code_interpreter', tool_host='distribution', toolgroup_id='builtin::code_interpreter', type='tool', metadata=None)
-----
Tool(description='Insert documents into memory', identifier='insert_into_memory', parameters=[], provider_id='rag-runtime', provider_resource_id='insert_into_memory', tool_host='distribution', toolgroup_id='builtin::rag', type='tool', metadata=None)
-----
Tool(description='Search for information in a database.', identifier='knowledge_search', parameters=[Parameter(description='The query to search for. Can be a natural language sentence or keywords.', name='query', parameter_type='string', required=True, default=None)], provider_id='rag-runtime', provider_resource_id='knowledge_search', tool_host='distribution', toolgroup_id='bu

In [3]:
from llama_stack_client.types import Document


def _tutorial_document(position, rst_name):
    """Describe one torchtune tutorial page as a RAG ``Document``.

    The document content is the raw GitHub URL of the page (not the page
    text), and its id is derived from the page's position in the list.
    """
    return Document(
        document_id=f"num-{position}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{rst_name}",
        mime_type="text/plain",
        metadata={},
    )


# File names of the tutorial pages to ingest (despite the variable name, these
# are only the last path component; the full URL is built per document).
urls = [
    "memory_optimizations.rst",
    "chat.rst",
    "llama3.rst",
    "datasets.rst",
    "qat_finetune.rst",
    "lora_finetune.rst",
]
documents = [
    _tutorial_document(position, rst_name)
    for position, rst_name in enumerate(urls)
]

# Register a vector DB under a random, per-run identifier.
# The provider is pinned to "sqlite-vec"; to select one dynamically instead:
#vector_providers = [
#    provider for provider in client.providers.list() if provider.api == "vector_io"
#]
#selected_vector_provider = vector_providers[0]
#    ...and pass provider_id=selected_vector_provider.provider_id below.
vector_db_id = f"test_vector_db_{uuid.uuid4()}"
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
    provider_id="sqlite-vec",
)

# Ingest the documents into that vector DB, chunked at 512 tokens.
client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=512,
)

In [4]:
# Question / reference-answer pairs used to exercise the agent.
# Each pair is expanded below into a dict with the keys
# "input_query" and "expected_answer".
_question_answer_pairs = (
    (
        "What precision formats does torchtune support?",
        "Torchtune supports two data types for precision: fp32 (full-precision) which uses 4 bytes per model and optimizer parameter, and bfloat16 (half-precision) which uses 2 bytes per model and optimizer parameter.",
    ),
    (
        "What does DoRA stand for in torchtune?",
        "Weight-Decomposed Low-Rank Adaptation",
    ),
    (
        "How does the CPUOffloadOptimizer reduce GPU memory usage?",
        "The CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It can also optionally offload gradients to CPU by using offload_gradients=True",
    ),
    (
        "How do I ensure only LoRA parameters are trainable when fine-tuning?",
        "You can set only LoRA parameters to trainable using torchtune's utility functions: first fetch all LoRA parameters with lora_params = get_adapter_params(lora_model), then set them as trainable with set_trainable_params(lora_model, lora_params). The LoRA recipe handles this automatically.",
    ),
)

examples = [
    {"input_query": question, "expected_answer": reference}
    for question, reference in _question_answer_pairs
]

In [5]:
# Imported in this cell so the fix is self-contained: `escape` neutralises
# anything in a string that Rich would otherwise parse as console markup.
from rich.markup import escape

# Agent that answers Torchtune questions with the built-in RAG tool group,
# restricted to the vector DB identified by `vector_db_id`.
rag_agent = Agent(
    client,
    model=MODEL_NAME,
    instructions="You are a helpful assistant that can answer questions about the Torchtune project. You should always use the RAG tool to answer questions.",
    tools=[{
        "name": "builtin::rag",
        "args": {"vector_db_ids": [vector_db_id]},
    }],
)

# Ask each example question and print the question next to the agent's answer.
for example in examples:
    # A new session is created for every question (presumably so that turns
    # do not share conversation history -- confirm against the Agent API).
    rag_session_id = rag_agent.create_session(session_name=f"rag_session_{uuid.uuid4()}")
    response = rag_agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": example["input_query"]
            }
        ],
        session_id=rag_session_id,
        stream=False
    )

    # Only the "[bold ...]" labels are meant to be markup. The question and the
    # model's answer are free-form text: bracketed fragments such as "[/INST]"
    # would be interpreted as Rich tags (dropping text or raising MarkupError),
    # so both are escaped before interpolation. str() keeps the original
    # f-string behaviour if the content is not a plain string.
    question_text = escape(str(example['input_query']))
    answer_text = escape(str(response.output_message.content))
    rich.print(f"[bold cyan]Question:[/bold cyan] {question_text}")
    rich.print(f"[bold yellow]Agent Answer:[/bold yellow] {answer_text}")