## 1. Install the llama stack client

In [None]:
%pip install --upgrade pip
%pip install llama-stack
%pip install ragas langchain-together
%pip install pandas
%pip install langchain-groq
%pip install rapidfuzz
%pip install sacrebleu
%pip install python-dotenv

## 2. List available models

In [1]:
from llama_stack_client import LlamaStackClient
# Connect to the Llama Stack server; later cells reuse this `client`.
client = LlamaStackClient(base_url="http://lsd-llama-milvus-service:8321")

# Fetch the model list once and reuse it for both the printout and the lookup,
# so only a single GET /v1/models request is issued.
models = client.models.list()
print(models)

# Identifier of the first model whose type is 'llm' (None when the server has none).
inference_llm = next((model.identifier for model in models if model.model_type == 'llm'), None)
print(inference_llm)

# Check what vector databases exist
print("=== Available Vector Databases ===")
vector_dbs = client.vector_dbs.list()
if vector_dbs:
    for vdb in vector_dbs:
        print(f"- ID: {vdb.identifier}")
        print(f"  Provider: {vdb.provider_id}")
        print(f"  Embedding Model: {vdb.embedding_model}")
        print()
else:
    print("No vector databases found!")

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/vector-dbs "HTTP/1.1 200 OK"


[Model(identifier='vllm', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='vllm', model_type='llm'), Model(identifier='ibm-granite/granite-embedding-125m-english', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]
vllm
=== Available Vector Databases ===
- ID: my_demo_image_ocr_vector_id
  Provider: milvus
  Embedding Model: ibm-granite/granite-embedding-125m-english



## 3. Import and run the Kubeflow Pipeline
Import the "[docling_convert_images_pipeline_ocr_only_compiled.yaml](./docling_convert_images_pipeline_ocr_only_compiled.yaml)" Kubeflow Pipeline into your pipeline server, then run the pipeline to insert your image documents into the vector database.

When running the pipeline, you can customize the following parameters:

- `base_url`: Base URL to fetch image files from
- `image_filenames`: Comma-separated list of PNG/JPG/JPEG/TIFF/BMP/WebP filenames to download and convert
- `num_workers`: Number of parallel workers
- `vector_db_id`: Milvus vector database ID
- `service_url`: Milvus service URL
- `embed_model_id`: Embedding model to use
- `max_tokens`: Maximum tokens per chunk
- `use_gpu`: Enable/disable GPU acceleration

Note: The compiled pipeline was generated by running `python docling_convert_images_pipeline_ocr_only.py`.

## 4. Prompt the LLM
Prompt the LLM with a question in relation to the documents inserted, and see it return accurate answers.

In [2]:
from llama_stack_client import Agent, AgentEventLogger
import uuid

# Use the LLM discovered in section 2 instead of a hardcoded identifier, and stop
# with a clear message if the server exposes no LLM at all.
if inference_llm is None:
    raise RuntimeError(
        "No model of type 'llm' is registered on the Llama Stack server; cannot create the RAG agent."
    )

# Agent that answers with the help of the built-in knowledge-search tool, which
# queries the vector database populated by the pipeline in section 3.
rag_agent = Agent(
    client,
    model=inference_llm,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": ["my_demo_image_ocr_vector_id"]},
        }
    ],
)


user_prompts = [
    "List RAG key market use cases",
    "Describe the sequence of steps of the Ingestion Flow"
]

# A fresh, uniquely named session so repeated runs do not share history.
session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

# Send each prompt as its own turn in the same session and print the streamed events.
for prompt in user_prompts:
    print("prompt>", prompt)
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": prompt}],
        session_id=session_id,
        stream=True,
    )
    for log in AgentEventLogger().log(response):
        log.print()

# Fetch the recorded session; the evaluation cells below read its turns.
session_response = client.agents.session.retrieve(
    session_id=session_id,
    agent_id=rag_agent.agent_id,
)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/7680041b-0538-4440-9ff4-1ab62fce7cf5/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/7680041b-0538-4440-9ff4-1ab62fce7cf5/session/6433533b-f845-4956-8fd7-3c052f2ebe29/turn "HTTP/1.1 200 OK"


prompt> List RAG key market use cases
[33minference> [0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'RAG key market use cases'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 4 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\nKnowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\nCode Generation: Retrieving relevant code snippets and documentation to assist in code creation.\nRecommendation Systems: Enhancing recommendations by providing relevant context.\nCustomer Service: Improving support accuracy with access to current product information.\nPersonal Assistants: Enabling more comprehensive and accurate information from Al assistants .\nMulti-hop Question Answering: Handling complex;

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/7680041b-0538-4440-9ff4-1ab62fce7cf5/session/6433533b-f845-4956-8fd7-3c052f2ebe29/turn "HTTP/1.1 200 OK"


prompt> Describe the sequence of steps of the Ingestion Flow
[33minference> [0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Ingestion Flow RAG sequence of steps'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 4 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\nKnowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\nCode Generation: Retrieving relevant code snippets and documentation to assist in code creation.\nRecommendation Systems: Enhancing recommendations by providing relevant context.\nCustomer Service: Improving support accuracy with access to current product information.\nPersonal Assistants: Enabling more comprehensive and accurate information from Al assistants .\nMulti-hop Qu

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/agents/7680041b-0538-4440-9ff4-1ab62fce7cf5/session/6433533b-f845-4956-8fd7-3c052f2ebe29 "HTTP/1.1 200 OK"


## 5. Preparation for evaluating RAG models using [RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/?h=metrics)

We will use two key metrics to evaluate the performance of the RAG server:
1. [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/) - measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.
2. [Response Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/) - measures how relevant a response is to the user input. Higher scores indicate better alignment with the user input, while lower scores are given if the response is incomplete or includes redundant information.

In [3]:
import re
from typing import List, Dict, Any, Union
from llama_stack_client.types.agents import Turn

# This function extracts the search results for the trace of each query
def extract_retrieved_contexts(turn_object: Turn) -> List[str]:
    """
    Collect the text chunks returned by tool executions during a single turn.

    Every step of type "tool_execution" is inspected. For each tool response
    whose content is a non-empty list, the text items that start with
    "Result " and contain "Content:" contribute the text found after
    "Content:", up to a following "Metadata:" line or the end of the item.

    Args:
        turn_object: A LlamaStack Turn whose steps may carry tool responses.

    Returns:
        The extracted chunk texts, whitespace-stripped, in order of appearance.
    """
    # Body after "Content:", stopping before a "Metadata:" line when one follows.
    content_pattern = re.compile(r"Content:\s*(.*?)(?=\nMetadata:|$)", re.DOTALL)
    chunks = []

    for step in turn_object.steps:
        if step.step_type != "tool_execution":
            continue

        for tool_response in step.tool_responses:
            items = tool_response.content
            # Only non-empty, list-shaped content is examined.
            if not (items and isinstance(items, list)):
                continue

            for item in items:
                is_text_item = (
                    hasattr(item, "text")
                    and hasattr(item, "type")
                    and item.type == "text"
                )
                if not is_text_item:
                    continue

                text = item.text
                # Skip the tool's header/footer items; keep only "Result N" entries.
                if not text or not text.startswith("Result ") or "Content:" not in text:
                    continue

                found = content_pattern.search(text)
                if found:
                    chunks.append(found.group(1).strip())

    return chunks

In [4]:
from ragas.dataset_schema import EvaluationDataset

# Ground-truth answer for the "RAG key market use cases" prompt.
use_cases_reference = '''
1. Knowledge Question Answering: Providing accurate answers in customer service, product manuals, or FAQs.
2. Code Generation: Retrieving relevant code snippets and documentation to assist in code creation.
3. Recommendation Systems: Enhancing recommendations by providing relevant context.
4. Customer Service: Improving support accuracy with access to current product information.
5. Personal Assistants: Enabling more comprehensive and accurate information from AI assistants.
6. Multi-hop Question Answering: Handling complex, multi-step questions through iterative retrieval.
7. Legal Applications: Retrieving legal documents and case law for reliable legal opinions.
8. General Task Assistance: Aiding users in various tasks requiring information access and decision-making.'''

# Ground-truth answer for the "Ingestion Flow" prompt.
ingestion_flow_reference = '''
Ingestion Flow:
1. Document Upload (via UI/API)
2. Docling (Chunking + Metadata)
3. Embedding (Granite Embedding via vLLM)
4. Milvus (Vector DB)
'''

# References are matched to session turns by position.
references = [use_cases_reference, ingestion_flow_reference]

# Constructing a Ragas EvaluationDataset: one record per recorded turn, pairing the
# question, the agent's answer, its reference, and the chunks the tool retrieved.
samples = [
    {
        "user_input": turn.input_messages[0].content,
        "response": turn.output_message.content,
        "reference": references[turn_index],
        "retrieved_contexts": extract_retrieved_contexts(turn),
    }
    for turn_index, turn in enumerate(session_response.turns)
]

ragas_eval_dataset = EvaluationDataset.from_list(samples)
ragas_eval_dataset.to_pandas()

  from .autonotebook import tqdm as notebook_tqdm
INFO:datasets:PyTorch version 2.7.1 available.


Unnamed: 0,user_input,retrieved_contexts,response,reference
0,List RAG key market use cases,[Market Use Cases Key\nRAG is being adopted ac...,"Based on the search results, RAG key market us...",\n1. Knowledge Question Answering: Providing a...
1,Describe the sequence of steps of the Ingestio...,[Market Use Cases Key\nRAG is being adopted ac...,"Based on the search results, the sequence of s...",\nIngestion Flow:\n1. Document Upload (via UI/...


## 6. Create a .env.dev file and paste your API key from Groq Cloud into it
1. Run this command to create the file: `vi .env.dev`
2. Add the following line, replacing the placeholder with your key: `GROQ_API_KEY="YOUR_GROQ_API_KEY"`

In [5]:
import os

from dotenv import load_dotenv

# Load the variables defined in .env.dev into the process environment.
dotenv_loaded = load_dotenv(dotenv_path=".env.dev")

# The Groq chat model created in section 7 is given no key explicitly, so it is
# expected to read GROQ_API_KEY from the environment. Stop here with an actionable
# message rather than failing later inside the evaluation cells.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Create .env.dev as described in section 6 and re-run this cell."
    )

# Display whether the file was found and loaded.
dotenv_loaded

True

## 7. Prerequisites for RAG evaluation

In [6]:
from ragas.metrics import (
    Faithfulness, 
    LLMContextPrecisionWithReference,
    LLMContextRecall,
    ResponseRelevancy,
) 
from ragas.dataset_schema import SingleTurnSample 
from langchain_groq import ChatGroq
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings

# Judge model served by Groq (alternative model id: "llama3-8b-8192").
llm = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0)

# Ragas talks to the Groq model through this wrapper.
evaluator_llm = LangchainLLMWrapper(llm)

# HuggingFace embeddings as a free alternative
# (another option: "sentence-transformers/all-MiniLM-L6-v2").
embeddings_model = HuggingFaceEmbeddings(model_name="ibm-granite/granite-embedding-125m-english")
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings_model)

# The two records built in section 5, one per prompt.
first_sample, second_sample = samples[0], samples[1]

# First prompt: reference answer, question, agent response, retrieved chunks.
reference_for_first_prompt = first_sample["reference"]
user_input_for_first_prompt = first_sample["user_input"]
response_for_first_prompt = first_sample["response"]
retrieved_contexts_for_first_prompt = first_sample["retrieved_contexts"]

# Second prompt: same four fields.
reference_for_second_prompt = second_sample["reference"]
user_input_for_second_prompt = second_sample["user_input"]
response_for_second_prompt = second_sample["response"]
retrieved_contexts_for_second_prompt = second_sample["retrieved_contexts"]

# Each reference split into its individual, whitespace-trimmed lines.
reference_list_for_first_prompt = [
    piece.strip() for piece in reference_for_first_prompt.strip().split('\n')
]
reference_list_for_second_prompt = [
    piece.strip() for piece in reference_for_second_prompt.strip().split('\n')
]

print(f"Retrieved contexts for the first prompt: {retrieved_contexts_for_first_prompt}\n")
print(f"Retrieved contexts for the second prompt: {retrieved_contexts_for_second_prompt}\n")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: ibm-granite/granite-embedding-125m-english


Retrieved contexts for the first prompt: ['Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\nKnowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\nCode Generation: Retrieving relevant code snippets and documentation to assist in code creation.\nRecommendation Systems: Enhancing recommendations by providing relevant context.\nCustomer Service: Improving support accuracy with access to current product information.\nPersonal Assistants: Enabling more comprehensive and accurate information from Al assistants .\nMulti-hop Question Answering: Handling complex; multi-step questions through iterative retrieval.\nLegal Applications: Retrieving legal documents and case law for reliable legal opinions.\nGeneral Task Assistance: Aiding users in various tasks requiring information access and decision-making:\nThe rising demand for hyper-personalized content in areas like marketing and e-c

## 8. Evaluate Faithfulness Score for both prompts

In [7]:
# Package the first prompt's question, agent response and retrieved chunks as one Ragas sample.
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
# Faithfulness scorer judged by the Groq-backed evaluator LLM; the next cell reuses it
# for the second prompt.
faithfulness_scorer = Faithfulness(llm=evaluator_llm)
# The score is awaited directly at the top level of the cell.
faithfulness_score_for_first_prompt = await faithfulness_scorer.single_turn_ascore(first_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[0]}': {faithfulness_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'List RAG key market use cases': 0.8636363636363636


In [8]:
# Package the second prompt's question, agent response and retrieved chunks as one Ragas sample.
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
# Reuses `faithfulness_scorer` created in the previous cell, so that cell must run first.
faithfulness_score_for_second_prompt = await faithfulness_scorer.single_turn_ascore(second_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[1]}': {faithfulness_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 17.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'Describe the sequence of steps of the Ingestion Flow': 0.9375


## 9. Evaluate Response Relevancy for both prompts

In [9]:
# Rebuild the first prompt's sample from the same three fields used in the Faithfulness section.
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
# Response Relevancy is given both the evaluator LLM and the embeddings wrapper;
# the next cell reuses this scorer for the second prompt.
response_relevancy_scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)
# The score is awaited directly at the top level of the cell.
response_relevancy_score_for_first_prompt = await response_relevancy_scorer.single_turn_ascore(first_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[0]}': {response_relevancy_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'List RAG key market use cases': 0.9749768611161626


In [10]:
# Rebuild the second prompt's sample from the same three fields used in the Faithfulness section.
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
# Reuses `response_relevancy_scorer` created in the previous cell, so that cell must run first.
response_relevancy_score_for_second_prompt = await response_relevancy_scorer.single_turn_ascore(second_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[1]}': {response_relevancy_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 11.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 11.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 11.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'Describe the sequence of steps of the Ingestion Flow': 0.9328236620990006
