# Demonstration of the Granite certainty intrinsic

This notebook shows the usage of the IO processor for the Granite certainty intrinsic,
also known as the [Granite 3.3 8B Instruct Uncertainty LoRA](
    https://huggingface.co/ibm-granite/granite-3.3-8b-rag-agent-lib/blob/main/certainty_lora/README.md
).

This notebook can run its own vLLM server to perform inference, or you can host the 
models on your own server. 

To use your own server, set the `run_server` variable below
to `False` and set appropriate values for the constants 
`openai_base_url`, `openai_base_model_name` and `openai_lora_model_name`.

In [None]:
# Imports go here
from granite_io import make_backend, make_io_processor
from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io.io.certainty import CertaintyCompositeIOProcessor, CertaintyIOProcessor
from granite_io.io.granite_3_3.input_processors.granite_3_3_input_processor import (
    Granite3Point3Inputs,
)
from granite_io.io.rag_agent_lib import obtain_lora
from granite_io.io.voting import MBRDMajorityVotingProcessor
from granite_io.types import AssistantMessage

In [None]:
# Constants go here

# When True, this notebook launches its own local vLLM server. Set to False to
# connect to a server you are already hosting.
run_server = True

# Identifier of the base model handed to the vLLM server (and reused as the
# model name of the base backend when connecting to an existing server).
base_model_name = "ibm-granite/granite-3.3-8b-instruct"

# Name used both to fetch the certainty LoRA adapter and to address it on the
# server.
lora_model_name = "certainty"

In [None]:
# Create two backends: `backend` targets the base model and `lora_backend`
# targets the certainty LoRA adapter.
if not run_server:
    # Use an existing server.
    # Modify the constants here as needed.
    openai_base_url = "http://localhost:55555/v1"
    # NOTE(review): presumably a placeholder key; replace it with the key your
    # server expects.
    openai_api_key = "granite_intrinsics_1234"
    openai_base_model_name = base_model_name
    openai_lora_model_name = lora_model_name

    # Both backends share the same endpoint and credentials; only the model
    # name differs between them.
    connection_config = {
        "openai_base_url": openai_base_url,
        "openai_api_key": openai_api_key,
    }
    backend = make_backend(
        "openai", {"model_name": openai_base_model_name, **connection_config}
    )
    lora_backend = make_backend(
        "openai", {"model_name": openai_lora_model_name, **connection_config}
    )
else:  # if run_server
    # Fire up a local vLLM server and connect backend instances to it.
    # First download and cache the model's LoRA adapter.
    lora_model_path = obtain_lora(lora_model_name)
    print(f"Local path to LoRA adapter: {lora_model_path}")

    # Register the adapter with the server under `lora_model_name`.
    adapters = [(lora_model_name, lora_model_path)]
    server = LocalVLLMServer(base_model_name, lora_adapters=adapters)

    # NOTE(review): 200 is presumably a startup timeout in seconds -- confirm
    # against LocalVLLMServer.wait_for_startup.
    server.wait_for_startup(200)
    lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()

In [None]:
# Build an example chat completion request: a short conversation ending in a
# user question, plus two documents for the model to draw on.
example_messages = [
    {"role": "assistant", "content": "Welcome to pet questions!"},
    {"role": "user", "content": "Which of my pets have fleas?"},
]
example_documents = [
    {"doc_id": 1, "text": "My dog has fleas."},
    {"doc_id": 2, "text": "My cat does not have fleas."},
]
chat_input = Granite3Point3Inputs.model_validate(
    {
        "messages": example_messages,
        "documents": example_documents,
        "generate_inputs": {"temperature": 0.0, "max_tokens": 4096},
    }
)
# Show the validated request object as the cell output
chat_input

In [None]:
# Pass the example input through Granite 3.3 to get an answer.
# `backend` is the base-model backend created in the server-setup cell; the
# resulting `granite_io_proc` is reused later by the composite processor.
granite_io_proc = make_io_processor("Granite 3.3", backend=backend)
# Top-level `await` works here because the Jupyter/IPython kernel runs cells
# inside an event loop.
result = await granite_io_proc.acreate_chat_completion(chat_input)
# Display the message of the first returned completion
result.results[0].next_message

In [None]:
# Extend the conversation with the assistant reply generated above, so that the
# certainty intrinsic has a response to score.
assistant_reply = result.results[0].next_message
next_chat_input = chat_input.with_next_message(assistant_reply)
# Display the extended message list
next_chat_input.messages

In [None]:
# Instantiate the I/O processor for the certainty intrinsic
io_proc = CertaintyIOProcessor(lora_backend)

# Set temperature to 0 because we are not sampling from the intrinsic's output
next_chat_input = next_chat_input.with_addl_generate_params({"temperature": 0.0})

# Pass our example input through the I/O processor and retrieve the result
chat_result = await io_proc.acreate_chat_completion(next_chat_input)

print(
    f"Certainty score for the original response is "
    f"{chat_result.results[0].next_message.content}"
)

In [None]:
# Try with an artifical poor-quality assistant response.
from granite_io.types import AssistantMessage

chat_result_2 = await io_proc.acreate_chat_completion(
    chat_input.with_next_message(
        AssistantMessage(content="Your iguana is absolutely covered in fleas.")
    ).with_addl_generate_params({"temperature": 0.0})
)
print(
    f"Certainty score for the low-quality response is "
    f"{chat_result_2.results[0].next_message.content}"
)

In [None]:
# Use majority voting to get a second opinion
from granite_io.io.voting import MBRDMajorityVotingProcessor

voting_proc = MBRDMajorityVotingProcessor(io_proc)
next_chat_input.generate_inputs.temperature = 0.1
chat_result_3 = await voting_proc.acreate_chat_completion(
    next_chat_input.with_addl_generate_params({"n": 10})
)
print(
    f"Certainty score with majority voting is "
    f"{chat_result_3.results[0].next_message.content}"
)

In [None]:
# Use the composite processor to generate multiple completions and filter those that
# are below a certainty threshold
composite_proc = CertaintyCompositeIOProcessor(
    granite_io_proc, lora_backend, threshold=0.8, include_score=True
)
composite_results = await composite_proc.acreate_chat_completion(
    chat_input.with_addl_generate_params({"n": 5, "temperature": 1.0})
)
composite_results.results

In [None]:
# Change the certainty threshold and try again
composite_proc.update_threshold(0.9)
composite_results_2 = await composite_proc.acreate_chat_completion(
    chat_input.with_addl_generate_params({"n": 5, "temperature": 1.0})
)
composite_results_2.results

In [None]:
# Free up GPU resources.
# `server` is only assigned when this notebook started its own vLLM server
# (run_server = True), so check that the name exists before shutting it down.
if "server" in locals():
    server.shutdown()