# Demonstration of the Granite answerability intrinsic

This notebook shows the usage of the IO processor for the Granite answerability intrinsic, 
also known as the [LoRA Adapter for Answerability Classification](https://huggingface.co/ibm-granite/granite-3.3-8b-rag-agent-lib/blob/main/answerability_prediction_lora/README.md).

This notebook can run its own vLLM server to perform inference, or you can host the 
models on your own server. 

To use your own server, set the `run_server` variable below
to `False` and set appropriate values for the constants 
`openai_base_url`, `openai_api_key`, `openai_base_model_name` and `openai_lora_model_name`.

In [None]:
# Imports go here
from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io import UserMessage, make_backend
from granite_io.io.granite_3_3.input_processors.granite_3_3_input_processor import (
    Granite3Point3Inputs,
)
from granite_io.io.answerability import AnswerabilityIOProcessor
from granite_io.io.rag_agent_lib import obtain_lora

In [None]:
# Notebook-wide settings
# True: launch a local vLLM server from this notebook.
# False: connect to a server that is already running.
run_server = True
# Base model that the LoRA adapter is applied to.
base_model_name = "ibm-granite/granite-3.3-8b-instruct"
# Name used to fetch the LoRA adapter and to address it on the server.
lora_model_name = "answerability_prediction"

In [None]:
if not run_server:
    # Connect to a server that is already running.
    # Adjust the constants below to match that server.
    openai_base_url = "http://localhost:55555/v1"
    openai_api_key = "granite_intrinsics_1234"
    openai_base_model_name = base_model_name
    openai_lora_model_name = lora_model_name
    # Both backends use the same endpoint and key; only the model name
    # differs. The base-model backend is created first, then the LoRA one.
    backend, lora_backend = (
        make_backend(
            "openai",
            {
                "model_name": served_model_name,
                "openai_base_url": openai_base_url,
                "openai_api_key": openai_api_key,
            },
        )
        for served_model_name in (openai_base_model_name, openai_lora_model_name)
    )
else:
    # Launch a local vLLM server and attach backend instances to it.
    # Download and cache the model's LoRA adapter first.
    lora_model_path = obtain_lora(lora_model_name)
    print(f"Local path to LoRA adapter: {lora_model_path}")
    # `server` is bound only on this path; the cleanup cell at the end of the
    # notebook relies on that to decide whether there is anything to shut down.
    server = LocalVLLMServer(
        base_model_name,
        lora_adapters=[(lora_model_name, lora_model_path)],
    )
    server.wait_for_startup(200)
    lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()

In [None]:
# Example request: a short conversation that ends with the user's question...
example_messages = [
    {"role": "assistant", "content": "Welcome to pet questions!"},
    {"role": "user", "content": "Does my dog have fleas?"},
]
# ...and the two documents supplied alongside it.
example_documents = [
    {"doc_id": 1, "text": "My dog has fleas."},
    {"doc_id": 2, "text": "My cat does not have fleas."},
]
chat_input = Granite3Point3Inputs.model_validate(
    {
        "messages": example_messages,
        "documents": example_documents,
        "generate_inputs": {"temperature": 0.0},
    }
)

# Display the validated input object
chat_input

In [None]:
# Instantiate the I/O processor for the answerability LoRA adapter
io_proc = AnswerabilityIOProcessor(lora_backend)

# Pass our example input thorugh the I/O processor and retrieve the result
chat_result = await io_proc.acreate_chat_completion(chat_input)
chat_result.results[0].next_message.content

In [None]:
# Try some variations on the original question
variations = [
    "Does my cat have no fleas?",  # Answerable
    "Does my cat have green eyes?",  # Unanswerable
    "Does my elephant have fleas?",  # Unanswerable
    "Which of my pets have fleas?",  # Answerable
    "What is the population of Australia?",  # Unanswerable
]

for variation in variations:
    updated_messages = chat_input.messages.copy()
    updated_messages[-1] = UserMessage(content=variation)
    chat_result = await io_proc.acreate_chat_completion(
        chat_input.model_copy(update={"messages": updated_messages})
    )
    print(f"'{variation}' => {chat_result.results[0].next_message.content}")

In [None]:
# Free up GPU resources by stopping the local vLLM server.
# `server` is bound only when this notebook launched its own server, so look
# it up by name first (vars() with no argument acts like locals()).
if "server" in vars():
    server.shutdown()