In [None]:
import uuid
from argparse import Namespace
from typing import AsyncGenerator, Optional

import bentoml
import fastapi
from annotated_types import Ge, Le
from typing_extensions import Annotated

In [None]:
# Model identifier passed to both the vLLM engine and AutoTokenizer.from_pretrained
MODEL_ID = "Aidan777/Llama-3.2-11B-Vision-Instruct_finetuned_ecg"

# Context length handed to the vLLM engine as `max_model_len`
MAX_MODEL_LEN = 8192
# Upper bound (and default) for the `max_tokens` argument of BentoVLLM.generate
MAX_TOKENS = 1024

# System message used by BentoVLLM.generate when the caller supplies none
SYSTEM_PROMPT = """
You are a helpful and respectful assistant. Provide safe, unbiased, and accurate answers.
If a question is unclear or you don't know the answer, explain why instead of guessing.
"""

# Mimic OpenAI's API.
# Each entry is [route path, name of the handler in vLLM's api_server module, HTTP methods].
OPENAI_ENDPOINTS = [
    ["/chat/completions", "create_chat_completion", ["POST"]],
    ["/completions", "create_completion", ["POST"]],
    ["/models", "show_available_models", ["GET"]],
]

# FastAPI application that receives the routes above and is mounted on the BentoML service
openai_api_app = fastapi.FastAPI()

# Mount the OpenAI-style FastAPI app under /v1 of the BentoML service
@bentoml.mount_asgi_app(openai_api_app, path="/v1")
@bentoml.service(
    name="Llama-3.2-11B-Vision-Instruct_finetuned_ecg",
    traffic={"timeout": 1200, "concurrency": 256},
    # WARNING: gpu_type is an empty string -- presumably it must be set to a
    # concrete accelerator type before deployment; TODO confirm.
    resources={"gpu": 1, "gpu_type": ""},
)
class BentoVLLM:
    """BentoML service that serves MODEL_ID through a vLLM async engine.

    Besides the streaming ``generate`` API, construction registers the
    routes listed in OPENAI_ENDPOINTS on ``openai_api_app``.
    """

    def __init__(self) -> None:
        """Create the vLLM engine and tokenizer, then wire up the OpenAI routes."""

        import vllm.entrypoints.openai.api_server as vllm_api_server
        from transformers import AutoTokenizer
        from vllm import AsyncEngineArgs, AsyncLLMEngine

        # Engine and tokenizer are both resolved from the same MODEL_ID
        engine_args = AsyncEngineArgs(
            model=MODEL_ID,
            max_model_len=MAX_MODEL_LEN,
            enable_prefix_caching=True,
        )
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        # Attach each vLLM api_server handler to the mounted FastAPI app
        for entry in OPENAI_ENDPOINTS:
            url_path, handler_name, http_methods = entry
            openai_api_app.add_api_route(
                path=url_path,
                endpoint=getattr(vllm_api_server, handler_name),
                methods=http_methods,
            )

        # Settings that vLLM's init_app_state reads as an argparse-style namespace
        model_cfg = self.engine.engine.get_model_config()
        server_settings = {
            "model": MODEL_ID,
            "disable_log_requests": True,
            "max_log_len": 1000,
            "response_role": "assistant",
            "served_model_name": None,
            "chat_template": None,
            "lora_modules": None,
            "prompt_adapters": None,
            "request_logger": None,
            "disable_log_stats": True,
            "return_tokens_as_token_ids": False,
            "enable_tool_call_parser": True,
            "enable_auto_tool_choice": True,
            "tool_call_parser": "llama3_json",
            "enable_prompt_tokens_details": False,
        }

        # Populate the FastAPI app state so the registered handlers can use the engine
        vllm_api_server.init_app_state(
            self.engine,
            model_cfg,
            openai_api_app.state,
            Namespace(**server_settings),
        )

    @bentoml.api
    async def generate(
        self,
        prompt: str = "Describe the process of photosynthesis in simple terms",
        system_prompt: Optional[str] = SYSTEM_PROMPT,
        max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS,
    ) -> AsyncGenerator[str, None]:
        """Stream a chat-style completion for ``prompt``.

        Args:
            prompt: The user's message.
            system_prompt: System message placed before the user message;
                ``None`` falls back to SYSTEM_PROMPT.
            max_tokens: Generation limit forwarded to ``SamplingParams``.

        Yields:
            The text added since the previous yield (may be an empty string
            when an engine update carries no new text).
        """

        from vllm import SamplingParams

        sampling_params = SamplingParams(
            max_tokens=max_tokens,
            skip_special_tokens=True,
        )

        effective_system = SYSTEM_PROMPT if system_prompt is None else system_prompt
        conversation = [
            {"role": "system", "content": effective_system},
            {"role": "user", "content": prompt},
        ]

        # Render the conversation to a single string; the engine tokenizes it itself
        rendered_prompt = self.tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )

        request_id = uuid.uuid4().hex
        stream = await self.engine.add_request(
            request_id, rendered_prompt, sampling_params)

        # Each update carries the full text so far, so emit only the unseen tail
        sent = 0
        async for update in stream:
            full_text = update.outputs[0].text
            delta, sent = full_text[sent:], len(full_text)
            yield delta

In [None]:
import asyncio

# Service object shared by the helper and driver code below.
# NOTE(review): presumably this runs BentoVLLM.__init__ (vLLM engine and
# tokenizer setup) immediately -- confirm against the BentoML version in use.
bento_service = BentoVLLM()

def run_generate(prompt, system_prompt=None, max_tokens=512):
    """
    Run the asynchronous generate method in a synchronous environment.

    Args:
        prompt: The user's input prompt.
        system_prompt: Optional system prompt; None is forwarded unchanged to
            ``bento_service.generate``.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The concatenation of every chunk yielded by ``bento_service.generate``.
    """
    async def generate_wrapper():
        # Collect the streamed chunks and join once at the end
        chunks = []
        async for chunk in bento_service.generate(
                prompt=prompt,
                system_prompt=system_prompt,
                max_tokens=max_tokens):
            chunks.append(chunk)
        return "".join(chunks)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread (plain script): the usual path.
        return asyncio.run(generate_wrapper())

    # An event loop is already running in this thread (e.g. a notebook kernel),
    # where asyncio.run() raises RuntimeError. Drive the coroutine on a fresh
    # loop in a worker thread and block until it finishes.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(lambda: asyncio.run(generate_wrapper())).result()

# Smoke test: send a single prompt using run_generate's defaults
# (system_prompt=None, max_tokens=512) and print the collected reply.
prompt = "What are the benefits of renewable energy?"
response = run_generate(prompt)
print("Generated Response:", response)