# Inferencia
## Ejemplo
The full API of this library can be found in api.md.
You may find basic client examples in our llama-stack-apps repo.

In [None]:
from llama_stack_client import LlamaStackClient
from llama_stack_client.types import UserMessage

# Connect to the remote Llama Stack server used throughout this notebook.
client = LlamaStackClient(base_url=f"http://20.72.80.241:5001")

# Request one non-streamed chat completion and print the whole response object.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    stream=False,
    messages=[
        UserMessage(
            role="user",
            content="Hola Llama, escribe un poema de 2 sentencias acerca de la luna",
        )
    ],
)
print(response)

> Formateado:

In [None]:
# Print only the completion text carried by `response`, not the full response object.
print(response.completion_message.content)

**Ej. Inferencia sin herramientas**

In [None]:
from llama_stack_client import LlamaStackClient
from llama_stack_client.types import UserMessage

# Connect to the remote Llama Stack server.
client = LlamaStackClient(base_url=f"http://20.72.80.241:5001")

# Plain inference call (no tools attached): the answer comes from the model alone.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    stream=False,
    messages=[
        UserMessage(
            role="user",
            content="¿Quien Gano el último mundial?",
        )
    ],
)
print(response.completion_message.content)

# Agentes
## Uso de la herramienta de búsqueda
tavily_search

In [None]:
import os
import sys

import fire
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
from termcolor import colored
from utils import check_model_is_available, get_any_available_model
import textwrap




In [None]:
# Build the client for the remote Llama Stack server.
# The Tavily key is read from the TAVILY_SEARCH_API_KEY environment variable so the
# secret never lives in the notebook; a missing variable fails fast with KeyError.
# NOTE(review): the key that was previously committed here should be treated as
# compromised and rotated.
client = LlamaStackClient(
    base_url="http://20.72.80.241:5001",
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)

**Verificar que el servicio de inferencia (modelo) esté levantado**

In [None]:
# Gather the identifier of every shield the server reports.
available_shields = [
    registered_shield.identifier for registered_shield in client.shields.list()
]
if available_shields:
    print(f"Available shields found: {available_shields}")
else:
    print(colored("No available shields. Disabling safety.", "yellow"))

# Leave as None to let the helper choose a model; set an identifier to pin one.
model_id = None

if model_id is not None:
    # An explicit model was requested: stop if the server does not offer it.
    if not check_model_is_available(client, model_id):
        sys.exit("El modelo no esta disponible")
else:
    model_id = get_any_available_model(client)
    if model_id is None:
        sys.exit("No hay un modelo")

print(f"Using model: {model_id}")

In [None]:
    # The system prompt is written indented for readability; textwrap.dedent
    # strips the common left margin before it is handed to the agent.
    agent_instructions = textwrap.dedent(
                """
                    Eres un asistente útil que responde a las preguntas del usuario con precisión.
                    Siempre utiliza la herramienta de búsqueda web para obtener resultados relevantes y cita las fuentes.
                    Responde de manera concisa y clara.
                """
            )

    # Web-search agent; the same shield list is applied to inputs and outputs.
    agent = Agent(
        client,
        model=model_id,
        instructions=agent_instructions,
        tools=["builtin::websearch"],
        input_shields=available_shields,
        output_shields=available_shields,
        enable_session_persistence=False,
    )

In [None]:
    # Two turns for the agent: a greeting followed by a factual question.
    user_prompts = ["Hola", "¿Quien gano el último mundial?"]

In [None]:
    # Open a single session and send every prompt through it as its own turn.
    session_id = agent.create_session("test-session")
    for prompt in user_prompts:
        print(f"User> {prompt}")
        user_message = {"role": "user", "content": prompt}
        response = agent.create_turn(session_id=session_id, messages=[user_message])

        # Print each event yielded by the logger for this turn.
        for event in AgentEventLogger().log(response):
            event.print()

## Evaluations
Creamos un agente

In [4]:
import os
import sys

import fire
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
from termcolor import colored
from utils import check_model_is_available, get_any_available_model
import textwrap

# Build the client for the remote Llama Stack server.
# The Tavily key is read from the TAVILY_SEARCH_API_KEY environment variable so the
# secret never lives in the notebook; a missing variable fails fast with KeyError.
# NOTE(review): the key that was previously committed here should be treated as
# compromised and rotated.
client = LlamaStackClient(
    base_url="http://20.72.80.241:5001",
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)

# Gather the identifier of every shield the server reports.
available_shields = [
    registered_shield.identifier for registered_shield in client.shields.list()
]
if available_shields:
    print(f"Available shields found: {available_shields}")
else:
    print(colored("No available shields. Disabling safety.", "yellow"))

# Leave as None to let the helper choose a model; set an identifier to pin one.
model_id = None

if model_id is not None:
    # An explicit model was requested: stop if the server does not offer it.
    if not check_model_is_available(client, model_id):
        sys.exit("El modelo no esta disponible")
else:
    model_id = get_any_available_model(client)
    if model_id is None:
        sys.exit("No hay un modelo")

print(f"Using model: {model_id}")

# Web-search agent for the evaluation run; no shields are passed to it here.
agent = Agent(
    client,
    tools=["builtin::websearch"],
    instructions="You are a helpful assistant. Use search tool to answer the questions. ",
    model=model_id,
)
# Evaluation questions; every one ends with the same explicit request to search the web.
user_prompts = [
    f"{question} Search the web for the answer."
    for question in (
        "Which teams played in the NBA Western Conference Finals of 2024.",
        "In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title.",
        "What is the British-American kickboxer Andrew Tate's kickboxing name?",
    )
]

# One session for the whole evaluation; each prompt becomes one turn in it.
session_id = agent.create_session("test-session")

for prompt in user_prompts:
    user_message = {"role": "user", "content": prompt}
    response = agent.create_turn(session_id=session_id, messages=[user_message])

    # Print each event yielded by the logger for this turn.
    for event in AgentEventLogger().log(response):
        event.print()

No available shields. Disabling safety.
Using model: meta-llama/Llama-3.2-3B-Instruct
inference> brave_search.call(query="NBA Western Conference Finals 2024 teams")
tool_execution> Tool:brave_search Args:{'query': 'NBA Western Conference Finals 2024 teams'}
tool_execution> Tool:brave_search Response:{"query": "NBA Western Conference Finals 2024 teams", "top_k": [{"title": "2024 NBA Western Conference Finals - Basketball-Reference.com", "url": "https://www.basketball-reference.com/playoffs/2024-nba-western-conference-finals-mavericks-vs-timberwolves.html", "content": "2024 NBA Playoffs Dallas Mavericks vs. Dallas Mavericks vs. Dallas Mavericks vs. 5 Dallas Mavericks (4-1) vs. 7   Derrick Jones Jr. 2024 NBA Playoffs Dallas Mavericks vs. Dallas Mavericks vs. Dallas Mavericks vs. College Tools: Player Season Finder, Player Game Finder, Team Season Finder, Team Game Finder Players, Teams, Seasons, Leaders, Awards ... Players, Teams, Seasons, Leaders, Awards ... Players, Teams, Seasons, Lead

In [None]:
## Review what the agents did during the session.

In [5]:

from rich.pretty import pprint

# Retrieve the stored record of this agent's session and pretty-print it.
session_response = client.agents.session.retrieve(
    agent_id=agent.agent_id,
    session_id=session_id,
)
pprint(session_response)

In [7]:
# Count the turns (one per user prompt) in which the agent executed `brave_search`.
# - Each turn is counted at most once, so the ratio printed below cannot exceed 1
#   even when a turn contains several tool-execution steps.
# - Every call in a step is inspected rather than only the first one, so a step
#   with an empty `tool_calls` list no longer raises IndexError.
num_tool_call = sum(
    any(
        step.step_type == "tool_execution"
        and any(call.tool_name == "brave_search" for call in step.tool_calls)
        for step in turn.steps
    )
    for turn in session_response.turns
)

print(
    f"{num_tool_call}/{len(session_response.turns)} user prompts are followed by a tool call to `brave_search`"
)

0/1 user prompts are followed by a tool call to `brave_search`


In [8]:
# Reference answers, listed in the same order as the prompts sent in the session.
expected_answers = [
    "Dallas Mavericks and the Minnesota Timberwolves",
    "Season 4, Episode 12",
    "King Cobra",
]

# Build one row per turn: the first input message, the agent's reply, and the
# reference answer at the same position.
eval_rows = []
for i, turn in enumerate(session_response.turns):
    row = {
        "input_query": turn.input_messages[0].content,
        "generated_answer": turn.output_message.content,
        "expected_answer": expected_answers[i],
    }
    eval_rows.append(row)

pprint(eval_rows)



In [9]:
# Score the rows with the `basic::subset_of` function; it is given None as its
# parameters (presumably meaning "use defaults" — TODO confirm against the API).
scoring_params = {"basic::subset_of": None}
scoring_response = client.scoring.score(
    scoring_functions=scoring_params,
    input_rows=eval_rows,
)
pprint(scoring_response)

# Almacenamiento, indexación y recuperación de datos
## DatasetIO
**Sin vectorización**

In [None]:
import fire
from llama_stack_client import Agent, AgentEventLogger, Document, LlamaStackClient
from termcolor import colored

from utils import check_model_is_available, get_any_available_model
from llama_stack_client import Agent, AgentEventLogger, Document, LlamaStackClient

# Connect to the remote Llama Stack server (no provider data needed in this section).
client = LlamaStackClient(base_url=f"http://20.72.80.241:5001")

# File names of the reStructuredText tutorials; each is later appended to a base URL.
urls = [
    f"{stem}.rst"
    for stem in (
        "memory_optimizations",
        "chat",
        "llama3",
        "datasets",
        "qat_finetune",
        "lora_finetune",
    )
]

In [None]:
# Wrap each tutorial in a Document that references it by URI and declares it as
# plain text. The index is not needed, so iterate over `urls` directly instead of
# unpacking and discarding it from enumerate().
documents = [
    Document(
        content={
            "uri": f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
        },
        mime_type="text/plain",
    )
    for url in urls
]


In [None]:
# Print the list of Document objects to inspect what will be attached.
print(documents)

**Verificar que el servicio de inferencia (modelo) esté levantado**

In [None]:
# Gather the identifier of every shield the server reports.
available_shields = [
    registered_shield.identifier for registered_shield in client.shields.list()
]
if available_shields:
    print(f"Available shields found: {available_shields}")
else:
    print(colored("No available shields. Disabling safety.", "yellow"))

# Leave as None to let the helper choose a model; set an identifier to pin one.
model_id = None

if model_id is not None:
    # An explicit model was requested: stop if the server does not offer it.
    if not check_model_is_available(client, model_id):
        sys.exit("El modelo no esta disponible")
else:
    model_id = get_any_available_model(client)
    if model_id is None:
        sys.exit("No hay un modelo")

print(f"Using model: {model_id}")

**Creamos el agente:**

In [None]:
# Agent without tools or shields; documents are supplied per turn further down.
agent = Agent(client, model=model_id, instructions="Tu eres un asistente muy útil")

# Open the session that all following turns will share and report its identifiers.
session_id = agent.create_session("test-session")
print(f"Created session_id={session_id} for Agent({agent.agent_id})")

In [None]:
# Each entry pairs a prompt with the documents to attach to that turn (None = no
# attachment). Only the first turn uploads the Torchtune documentation.
user_prompts = [
    (
        "I am attaching some documentation for Torchtune to ask some questions.",
        documents,
    ),
    (
        "What are the top 5 topics that were explained? Only list succinct bullet points.",
        None,
    ),
    (
        "Was anything related to 'Llama3' discussed, if so what?",
        None,
    ),
    (
        "Tell me how to use LoRA",
        None,
    ),
    (
        "What about Quantization?",
        None,
    ),
]

# The loop target is named `turn_documents` on purpose: unpacking into `documents`
# would rebind the notebook-level list to None after the last iteration, so
# re-running this cell would silently send the first prompt with no attachments.
for prompt, turn_documents in user_prompts:
    response = agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        documents=turn_documents,
        session_id=session_id,
    )
    print(colored(f"User> {prompt}", "blue"))
    # Print each event yielded by the logger for this turn.
    for log in AgentEventLogger().log(response):
        log.print()

## RAG (VectorIO)

In [None]:
from uuid import uuid4

import fire
import time
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient, RAGDocument
from termcolor import colored

# One RAGDocument per tutorial file: the id comes from the file's position in
# `urls`, and the content is the raw GitHub URL of that file.
documents = [
    RAGDocument(
        document_id=f"num-{position}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{tutorial_file}",
        mime_type="text/plain",
        metadata={},
    )
    for position, tutorial_file in enumerate(urls)
]

In [None]:
# Show which model identifier is currently held in `model_id`.
print(f"Using model: {model_id}")

In [None]:
# Keep only the providers whose API is "vector_io".
vector_providers = [
    candidate for candidate in client.providers.list() if candidate.api == "vector_io"
]

# Without a vector_io provider the RAG steps cannot run, so stop here.
if len(vector_providers) == 0:
    print(colored("No available vector_io providers. Exiting.", "red"))
    sys.exit("No hay un vector_providers disponible")

In [None]:
# Pick the first entry of `vector_providers` as the backend for the vector database.
selected_vector_provider = vector_providers[0]

### Create a vector database

In [None]:
# A random UUID suffix gives every run its own vector database id.
vector_db_id = f"test_vector_db_{uuid4()}"

# Register the database on the selected provider with the embedding model and
# the embedding dimension given here.
client.vector_dbs.register(
    provider_id=selected_vector_provider.provider_id,
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
)

### Insert documents using the RAG tool

In [None]:
# Record the wall-clock time before ingestion starts (presumably used afterwards to
# report how long the insert took — that part is not visible here).
start_time = time.time()
# Insert the documents into the vector database through the RAG tool, passing a
# chunk size of 512 tokens.
client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=512,
)