# Example of the majority voting IO processor

This notebook shows how to use the majority voting IO processor to perform simple
majority voting over multiple model responses.

This notebook can run its own vLLM server to perform inference, or you can host the 
model on your own server. 

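To use your own server, leave the `run_server` variable in the cell marked
`# Constants go here` set to `False` (its default) and edit the connection
settings (`openai_base_url`, `openai_api_key`, `openai_model_name`) in the
`else` branch of the cell that follows it. To have the notebook start its own
vLLM server instead, set `run_server` to `True`.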

In [None]:
from granite_io import make_backend
from granite_io.io import make_io_processor
from granite_io.io.base import ChatCompletionInputs
from granite_io.io.voting import MajorityVotingProcessor, integer_normalizer
from granite_io.backend.vllm_server import LocalVLLMServer

In [None]:
# Constants go here.
# run_server: True  -> this notebook launches a LocalVLLMServer itself;
#             False -> the notebook connects to a server that is already up.
run_server = False
# Model identifier handed to LocalVLLMServer when run_server is True.
model_name = "ibm-granite/granite-3.3-8b-instruct"

In [None]:
if not run_server:
    # Talk to a server that is already running, through the "openai" backend.
    # The original notebook describes these defaults as matching the server
    # started by local_vllm_server.ipynb; change them to suit your endpoint.
    # NOTE(review): port 11434 and the "granite3.3:8b" tag look like Ollama
    # defaults rather than vLLM ones -- confirm which server these target.
    openai_base_url = "http://localhost:11434/v1"
    openai_api_key = "granite_intrinsics_1234"
    openai_model_name = "granite3.3:8b"
    backend = make_backend(
        "openai",
        dict(
            model_name=openai_model_name,
            openai_base_url=openai_base_url,
            openai_api_key=openai_api_key,
        ),
    )
else:
    # Launch a local vLLM server for `model_name`, wait for it to come up,
    # then obtain a backend instance connected to it.
    server = LocalVLLMServer(model_name)
    # 200 is presumably a startup timeout in seconds -- TODO confirm.
    server.wait_for_startup(200)
    backend = server.make_backend()

In [None]:
# Build a single-turn request whose correct answer is one integer, so that
# the responses can later be compared and voted on.
completion_inputs = ChatCompletionInputs(
    messages=[
        dict(
            role="user",
            content="What is 234651 * 134?\nAnswer with just a number please.",
        )
    ],
    thinking=True,
    # Generation settings passed through with the request; "n" is presumably
    # the number of completions requested per call -- TODO confirm.
    generate_inputs=dict(n=5, temperature=0.8, max_tokens=1024),
)
# Bare expression so the notebook displays the constructed inputs.
completion_inputs

In [None]:
# Baseline: send the question through the plain "Granite 3.3" I/O processor
# (no voting) and list every returned completion, numbered from 1.
base_processor = make_io_processor("Granite 3.3", backend=backend)
results = base_processor.create_chat_completion(completion_inputs)
print("Outputs from base model:")
for position, completion in enumerate(results.results, start=1):
    print(f"{position}: {completion.next_message.content}")

In [None]:
# Layer a majority voting I/O processor on top of the base processor.
# integer_normalizer is presumably applied to each answer before votes are
# counted -- confirm against granite_io.io.voting.
voting_processor = MajorityVotingProcessor(
    base_processor,
    integer_normalizer,
    samples_per_completion=10,
)
results = voting_processor.create_chat_completion(completion_inputs)
print("Outputs from base model augmented with majority voting:")
for position, completion in enumerate(results.results, start=1):
    print(f"{position}: {completion.next_message.content}")

# Print the exact product, computed locally, for comparison.
print(f"---------\nThe actual answer is: {234651 * 134}")

In [None]:
# Free up GPU resources
# `server` is only bound when the `run_server` branch above started a local
# vLLM server, so check that the name exists before calling shutdown().
# When an existing external server was used, this cell does nothing.
if "server" in locals():
    server.shutdown()