# Demonstration of the Granite query expansion intrinsic

This notebook shows the usage of the IO processor for the Granite Query Expansion (QE) intrinsic. Given a conversation ending with a user query, QE is specifically designed to probe the retriever from multiple angles by generating a set of semantically diverse versions of that last user query.

This notebook can run its own vLLM server to perform inference, or you can host the models on your own server. To use your own server, set the `run_server` variable below to `False` and set appropriate values for the constants `openai_base_url`, `openai_api_key`, `openai_base_model_name` and `openai_lora_model_name`.

In [None]:
# Imports go here
from granite_io.io.query_expansion import QueryExpansionIOProcessor
from granite_io.io.query_rewrite import QueryRewriteIOProcessor
from granite_io.io.rag_agent_lib import obtain_lora

from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io import make_io_processor, make_backend
from granite_io.io.base import ChatCompletionInputs

In [None]:
# Notebook-wide configuration.
# When True, the notebook launches its own local vLLM server; when False it
# connects to a server that is already running.
run_server = True
# Name of the LoRA adapter to load alongside the base model.
lora_model_name = "query_rewrite"
# Base model that the server hosts.
base_model_name = "ibm-granite/granite-3.3-8b-instruct"

In [None]:
if not run_server:
    # Connect to a server that is already running.
    # Edit the connection constants below so they match that server.
    openai_base_url = "http://localhost:55555/v1"
    openai_api_key = "granite_intrinsics_1234"
    openai_base_model_name = base_model_name
    openai_lora_model_name = lora_model_name

    # Two "openai" backends against the same endpoint: one addressed by the
    # base model name, one addressed by the LoRA adapter name.
    backend = make_backend(
        "openai",
        dict(
            model_name=openai_base_model_name,
            openai_base_url=openai_base_url,
            openai_api_key=openai_api_key,
        ),
    )
    query_rewrite_lora_backend = make_backend(
        "openai",
        dict(
            model_name=openai_lora_model_name,
            openai_base_url=openai_base_url,
            openai_api_key=openai_api_key,
        ),
    )
else:  # run_server is set
    # Obtain the local path of the LoRA adapter, then launch a local vLLM
    # server that hosts the base model with that adapter registered.
    lora_model_path = obtain_lora(lora_model_name)
    print(f"Local path to LoRA adapter: {lora_model_path}")
    server = LocalVLLMServer(
        base_model_name,
        lora_adapters=[
            (lora_model_name, lora_model_path),
        ],
    )
    server.wait_for_startup(200)

    # Backends connected to the freshly started server: the LoRA one first,
    # then the one for the base model.
    query_rewrite_lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()

In [None]:
# Sample conversation as (role, content) pairs, expanded into message dicts.
# The last entry is the user query that the notebook goes on to expand.
input_messages = [
    {"role": role, "content": content}
    for role, content in (
        ("assistant", "Welcome to the California State Parks help desk."),
        ("user", "I'm a student. Do you have internships?"),
        (
            "assistant",
            "The California State Parks hires Student Assistants to perform "
            "a variety of tasks that require limited or no previous work "
            "experience.",
        ),
        ("user", "Cool, how do I sign up?"),
    )
]

# Bundle the conversation with the generation settings for the request.
chat_input_tmp = ChatCompletionInputs(
    messages=input_messages,
    generate_inputs=dict(temperature=1, max_tokens=4096),
)
print("Inputs for chat completion:", chat_input_tmp)

In [None]:
# Spin up an IO processor for the base model
io_processor = make_io_processor(base_model_name, backend=backend)
# Wrap the LoRA-adapter backend in a query rewrite IO processor. Both objects
# are handed to QueryExpansionIOProcessor in the next cell.
rewrite_io_proc = QueryRewriteIOProcessor(query_rewrite_lora_backend)

In [None]:
# Assemble the query expansion processor from the base-model backend, the
# base-model IO processor and the query rewrite IO processor.
rag_io_proc = QueryExpansionIOProcessor(backend, io_processor, rewrite_io_proc)

# Run query expansion on the sample conversation and dump the raw result.
qe_result = rag_io_proc.create_chat_completion(chat_input_tmp)
print(qe_result)

# Pull out the message text of every returned result and list them, numbered
# from 1.
qe_result_strs = [
    candidate.next_message.content for candidate in qe_result.results
]
print("\nQuery Expansion Results:")
for position, expansion in enumerate(qe_result_strs, start=1):
    print(f"Result {position}: {expansion}")

In [None]:
# Free up GPU resources
# `server` is only assigned in the `run_server` branch of the setup cell, so
# check that the name exists before shutting it down. At notebook top level
# locals() is the cell namespace; this check would not behave the same if it
# were moved inside a function.
if "server" in locals():
    server.shutdown()