# Quickstart - Inference
Run inference via chat completions with the llama-stack Python SDK.

In [1]:
# Imports
import os
import sys

from llama_stack_client import LlamaStackClient

## Setup

In [2]:
# Create the HTTP client; the server port is read from the LLAMA_STACK_PORT
# environment variable (a KeyError is raised if it is not set)
port = os.environ["LLAMA_STACK_PORT"]
client = LlamaStackClient(base_url=f"http://localhost:{port}")

In [3]:
# Fetch the models known to the server and print one identifier per line,
# framed by a header line and a trailing blank line
models = client.models.list()
listing = ["--- Available models: ---"]
listing.extend(f"- {entry.identifier}" for entry in models)
listing.append("")
print("\n".join(listing))

--- Available models: ---
- Llama-3.2-11B-Vision-Instruct
- Llama-3.2-90B-Vision-Instruct
- Meta-Llama-3.1-405B-Instruct
- Meta-Llama-3.1-70B-Instruct
- Meta-Llama-3.1-8B-Instruct
- Meta-Llama-3.2-1B-Instruct
- Meta-Llama-3.2-3B-Instruct
- Meta-Llama-3.3-70B-Instruct
- Meta-Llama-Guard-3-8B
- all-MiniLM-L6-v2
- meta-llama/Llama-3.1-405B-Instruct-FP8
- meta-llama/Llama-3.1-70B-Instruct
- meta-llama/Llama-3.1-8B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-90B-Vision-Instruct
- meta-llama/Llama-3.3-70B-Instruct
- meta-llama/Llama-Guard-3-8B
- sambanova/Llama-3.2-11B-Vision-Instruct
- sambanova/Llama-3.2-90B-Vision-Instruct
- sambanova/Meta-Llama-3.1-405B-Instruct
- sambanova/Meta-Llama-3.1-70B-Instruct
- sambanova/Meta-Llama-3.1-8B-Instruct
- sambanova/Meta-Llama-3.2-1B-Instruct
- sambanova/Meta-Llama-3.2-3B-Instruct
- sambanova/Meta-Llama-3.3-70B-Instruct
- sambanova/Meta-Llama-Guard-3-8B

## Create a Chat Completion Request
Use the `chat_completion` function to define the conversation context. Each message you include is a dictionary with a `role` (such as `system` or `user`) and its `content`.

In [4]:
# Choose an inference model from the previous list; the chat completion
# request below passes this identifier as `model_id`
inference_model = "sambanova/Meta-Llama-3.2-3B-Instruct"

In [5]:
# The system message sets the assistant's persona; the user message is the request
conversation = [
    {"role": "system", "content": "You are a friendly assistant."},
    {"role": "user", "content": "Write a two-sentence poem about llama."},
]

# No `stream` argument is passed; the reply is read from `completion_message` below
response = client.inference.chat_completion(model_id=inference_model, messages=conversation)

# Show only the generated text
print(response.completion_message.content)


With gentle eyes and soft, fuzzy hair,
The llama roams, a gentle, peaceful air.


## Conversation Loop
To create a continuous conversation loop, where users can input multiple messages in a session, use the following structure. This example runs an asynchronous loop that sends each message to the model on its own (no history is kept) and ends when the user types `exit`, `quit`, or `bye`.

In [6]:
import asyncio
from llama_stack_client import LlamaStackClient
from termcolor import cprint

model = "sambanova/Meta-Llama-3.2-3B-Instruct"


async def chat_loop():
    """Prompt for user input in a loop and print the model's answer to each line.

    Every line is sent as a single-message request, so no earlier turn is
    included. Typing ``exit``, ``quit`` or ``bye`` (in any letter case) prints
    a goodbye message and ends the loop.
    """
    exit_words = ("exit", "quit", "bye")

    while True:
        typed = input("User> ")

        # Leave as soon as one of the exit words is entered
        if typed.lower() in exit_words:
            cprint("Ending conversation. Goodbye!", "yellow")
            return

        # Only the current line is sent; nothing is remembered between turns
        response = client.inference.chat_completion(
            messages=[{"role": "user", "content": typed}],
            model_id=model,
        )
        cprint(f"> Response: {response.completion_message.content}", "cyan")


# Run the chat loop in a Jupyter Notebook cell using top-level `await`;
# the loop keeps prompting with input() until the user types exit, quit, or bye
await chat_loop()
# To run it in a python file, use this line instead
# asyncio.run(chat_loop())

[36m> Response: I can be used in a variety of ways, so feel free to ask me anything that's on your mind. Here are some suggestions to get you started:

1. **Learning a new skill**: I can provide information and guidance on a wide range of topics, from science and history to culture and entertainment.
2. **Research assistance**: I can help you find information on a particular topic or answer specific questions you may have.
3. **Language practice**: I can practice conversing with you in a language you're learning.
4. **Writing and proofreading**: I can help you generate text or edit what you've written.
5. **Conversation and entertainment**: We can chat about your day, interests, or hobbies.
6. **Brainstorming and ideas**: I can help generate ideas or suggestions for projects, problems, or creative pursuits.
7. **Jokes and humor**: I can try to make you laugh with a joke or a funny story.
8. **Trivia and games**: We can play text-based games like Hangman, 20 Questions, or Word Jumble.


## Conversation History
Maintaining a conversation history allows the model to retain context from previous interactions. Use a list to accumulate messages, enabling continuity throughout the chat session.

In [7]:
model = "sambanova/Meta-Llama-3.2-3B-Instruct"


async def chat_loop():
    """Run an interactive chat that sends the whole conversation on every turn.

    Lines are read with ``input()`` until the user types ``exit``, ``quit`` or
    ``bye`` (case-insensitive). Each user line and each model reply is appended
    to a local history list, and that list is passed as ``messages`` on every
    request so the model receives the earlier turns as context.
    """
    conversation_history = []
    while True:
        user_input = input("User> ")
        if user_input.lower() in ["exit", "quit", "bye"]:
            cprint("Ending conversation. Goodbye!", "yellow")
            break

        user_message = {"role": "user", "content": user_input}
        conversation_history.append(user_message)

        response = client.inference.chat_completion(
            messages=conversation_history,
            model_id=model,
        )
        completion = response.completion_message
        cprint(f"> Response: {completion.content}", "cyan")

        # Store the reply under the "assistant" role so later requests can tell
        # the model's own words apart from the user's. An assistant message also
        # carries the stop reason reported with the completion.
        assistant_message = {
            "role": "assistant",
            "content": completion.content,
            "stop_reason": completion.stop_reason,
        }
        conversation_history.append(assistant_message)


# Use `await` in the Jupyter Notebook cell to call the function;
# the loop keeps prompting with input() until the user types exit, quit, or bye
await chat_loop()
# To run it in a python file, use this line instead
# asyncio.run(chat_loop())

[36m> Response: I can be used in a variety of ways, so feel free to ask me anything that's on your mind. Here are some suggestions to get you started:

1. **Learning a new skill**: I can provide information and guidance on a wide range of topics, from science and history to culture and entertainment.
2. **Research assistance**: I can help you find information on a particular topic or answer specific questions you may have.
3. **Language practice**: I can practice conversing with you in a language you're learning.
4. **Writing and proofreading**: I can help you generate text or edit what you've written.
5. **Conversation and entertainment**: We can chat about your day, interests, or hobbies.
6. **Brainstorming and ideas**: I can help generate ideas or suggestions for projects, problems, or creative pursuits.
7. **Jokes and humor**: I can try to make you laugh with a joke or a funny story.
8. **Trivia and games**: We can play text-based games like Hangman, 20 Questions, or Word Jumble.


## Streaming Responses
Llama Stack offers a stream parameter in the chat_completion function, which allows partial responses to be returned progressively as they are generated. This can enhance user experience by providing immediate feedback without waiting for the entire response to be processed.

In [8]:
from llama_stack_client.lib.inference.event_logger import EventLogger

model = "sambanova/Meta-Llama-3.2-3B-Instruct"


async def run_main(stream: bool = True):
    """Send one fixed poem request and print the model's answer.

    Args:
        stream: Passed through to ``chat_completion``. When false, the reply
            text is read from ``response.completion_message`` and printed once.
            Otherwise the response is handed to ``EventLogger().log`` and each
            item it yields is printed in turn.
    """
    message = dict(role="user", content="Please write me a 3 sentence poem about llamas.")
    cprint(f'User> {message["content"]}', "green")

    response = client.inference.chat_completion(
        model_id=model,
        messages=[message],
        stream=stream,
    )

    # Non-streaming: print the reply text in one go and finish
    if not stream:
        cprint(f"> Response: {response.completion_message.content}", "cyan")
        return

    # Streaming: print each logged event as it is produced
    for event in EventLogger().log(response):
        event.print()


# In a Jupyter Notebook cell, use `await` to call the function.
# Called without arguments, run_main() uses its default stream=True.
await run_main()
# To run it in a python file, use this line instead
# asyncio.run(run_main())

[32mUser> Please write me a 3 sentence poem about llamas.[0m
[36mAssistant> [0m[33mHere's a 3-sentence poem about llamas:

With gentle eyes and soft fur so bright,
[0m[33mThe llama roams, a gentle, peaceful sight.
Their soft humming calls, a [0m[33msoothing delight.[0m[97m[0m
