In [4]:
# pip install bentoml fastapi transformers vllm

import uuid
from argparse import Namespace
from typing import AsyncGenerator, Optional

import bentoml
import fastapi
from annotated_types import Ge, Le
from typing_extensions import Annotated

In [5]:
# FastAPI application that receives the OpenAI-style routes listed in
# OPENAI_ENDPOINTS; it is mounted onto the BentoML service under "/v1".
openai_api_app = fastapi.FastAPI()

# --- Length limits -----------------------------------------------------------
# Maximum length of the model's input context (passed to vLLM as max_model_len).
MAX_MODEL_LEN = 8192
# Maximum number of tokens the model may generate for a single request.
MAX_TOKENS = 1024

# --- Default system prompt ---------------------------------------------------
# Used whenever a caller does not provide its own system prompt.
SYSTEM_PROMPT = """
You are a helpful and respectful assistant. Provide safe, unbiased, and accurate answers.
If a question is unclear or you don't know the answer, explain why instead of guessing.
"""

# --- Model -------------------------------------------------------------------
# Model to serve. This is the name of the merged 16-bit export; the Hugging Face
# source repo is "Aidan777/Llama-3.2-11B-Vision-Instruct_finetuned_ecg".
MODEL_ID = "Llama-3.2-11B-Vision-Instruct_finetuned_ecg_vllm"

# --- OpenAI-compatible routes ------------------------------------------------
# Mimic OpenAI's API. Each entry is:
#   [route path, name of the handler looked up on vLLM's api_server, HTTP methods]
OPENAI_ENDPOINTS = [
    ["/chat/completions", "create_chat_completion", ["POST"]],
    ["/completions", "create_completion", ["POST"]],
    ["/models", "show_available_models", ["GET"]],
]

# Mounting the FastAPI with BentoML
@bentoml.mount_asgi_app(openai_api_app, path="/v1")
@bentoml.service(name="Llama-3.2-11B-Vision-Instruct_finetuned_ecg",
                 traffic={"timeout": 1200,
                          "concurrency": 256},
                 resources={"gpu": 1,
                            "gpu_type": "NVIDIA_RTX_A5000"})
class BentoVLLM:
    """BentoML service wrapping a vLLM async engine.

    Exposes a streaming ``generate`` API and wires vLLM's OpenAI-style handlers
    onto ``openai_api_app``, which the decorator above mounts under ``/v1``.
    """

    def __init__(self) -> None:
        """Initialize the BentoVLLM service with VLLM engine and tokenizer."""
        # Heavy dependencies are imported here rather than at module level.
        import vllm.entrypoints.openai.api_server as vllm_api_server
        from transformers import AutoTokenizer
        from vllm import AsyncEngineArgs, AsyncLLMEngine

        engine_args = AsyncEngineArgs(model=MODEL_ID,
                                      max_model_len=MAX_MODEL_LEN,
                                      enable_prefix_caching=True,
                                      trust_remote_code=True)

        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        # Register API endpoints. openai_api_app is shared at module level, so a
        # second BentoVLLM() in the same process must not add the same routes again.
        already_registered = {getattr(existing, "path", None)
                              for existing in openai_api_app.routes}
        for route, endpoint_name, methods in OPENAI_ENDPOINTS:
            if route in already_registered:
                continue
            endpoint_func = getattr(vllm_api_server, endpoint_name)
            openai_api_app.add_api_route(
                path=route,
                endpoint=endpoint_func,
                methods=methods)

        # Configure model arguments
        model_config = self.engine.engine.get_model_config()
        args = Namespace(
            model=MODEL_ID,
            disable_log_requests=True,
            max_log_len=1000,
            response_role="assistant",
            served_model_name=None,
            chat_template=None,
            lora_modules=None,
            prompt_adapters=None,
            request_logger=None,
            disable_log_stats=True,
            return_tokens_as_token_ids=False,
            enable_tool_call_parser=True,
            enable_auto_tool_choice=True,
            tool_call_parser="llama3_json",
            enable_prompt_tokens_details=False)

        # Initialize application state
        # NOTE(review): some vLLM releases appear to define init_app_state as a
        # coroutine and to read additional fields from `args` - confirm against
        # the installed vLLM version before relying on the /v1 endpoints.
        vllm_api_server.init_app_state(self.engine, model_config, openai_api_app.state, args)

    @bentoml.api
    async def generate(self,
                       prompt: str = "Describe the process of photosynthesis in simple terms",
                       system_prompt: Optional[str] = SYSTEM_PROMPT,
                       max_tokens: Annotated[int, Ge(128), Le(MAX_TOKENS)] = MAX_TOKENS) -> AsyncGenerator[str, None]:
        """
        Generate text based on the input prompt using the VLLM engine.

        Args:
            prompt: The user's input prompt
            system_prompt: Optional system prompt to guide the model's behavior;
                ``None`` falls back to ``SYSTEM_PROMPT``
            max_tokens: Maximum number of tokens to generate

        Yields:
            The text generated since the previous chunk.
        """
        from vllm import SamplingParams

        sampling_params = SamplingParams(
            max_tokens=max_tokens,
            skip_special_tokens=True)

        if system_prompt is None:
            system_prompt = SYSTEM_PROMPT

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}]

        # Keep the templated text in its own name instead of overwriting `prompt`.
        chat_prompt = self.tokenizer.apply_chat_template(messages,
                                                         tokenize=False,
                                                         add_generation_prompt=True)

        stream = await self.engine.add_request(uuid.uuid4().hex, chat_prompt, sampling_params)

        # Each request_output's text is sliced from `cursor` so that only the
        # portion not yet yielded is emitted.
        cursor = 0
        async for request_output in stream:
            text = request_output.outputs[0].text
            yield text[cursor:]
            cursor = len(text)


In [6]:
# Instantiate the service in this kernel; this runs BentoVLLM.__init__, which
# builds the vLLM engine and loads the tokenizer.
bento_service = BentoVLLM()

INFO 03-01 00:24:05 __init__.py:207] Automatically detected platform cuda.
INFO 03-01 00:24:10 config.py:549] This model supports multiple tasks: {'reward', 'generate', 'score', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 03-01 00:24:10 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='Llama-3.2-11B-Vision-Instruct_finetuned_ecg_vllm', speculative_config=None, tokenizer='Llama-3.2-11B-Vision-Instruct_finetuned_ecg_vllm', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward

Initializing service error
Traceback (most recent call last):
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/_bentoml_sdk/service/factory.py", line 303, in __call__
    instance = self.inner()
  File "/tmp/ipykernel_2193/2896168647.py", line 40, in __init__
    self.engine = AsyncLLMEngine.from_engine_args(ENGINE_ARGS)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 644, in from_engine_args
    engine = cls(
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 594, in __init__
    self.engine = self._engine_class(*args, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 267, in __init__
    super().__init__(*args, **kwargs)
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 273, in __init__
    self.model_executor = executor_class(vllm_config=vllm_co

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 27.50 MiB is free. Process 4022426 has 23.62 GiB memory in use. Of the allocated memory 23.34 GiB is allocated by PyTorch, and 23.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import asyncio

# Reuse the service if an earlier cell already created it: every BentoVLLM()
# call builds another vLLM engine, so instantiating twice in one kernel would
# load the model a second time.
try:
    bento_service
except NameError:
    bento_service = BentoVLLM()

def run_generate(prompt, system_prompt=None, max_tokens=512):
    """
    Run the asynchronous generate method in a synchronous environment.

    Works both from plain synchronous code and from a thread whose event loop
    is already running (e.g. a Jupyter cell), where a bare ``asyncio.run``
    raises ``RuntimeError``.

    Args:
        prompt: The user's input prompt.
        system_prompt: Optional system prompt, forwarded unchanged to
            ``bento_service.generate``.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The concatenation of every chunk yielded by ``bento_service.generate``.

    Note:
        When a loop is already running, this call blocks that loop's thread
        until generation finishes. In a notebook, iterating
        ``bento_service.generate(...)`` with ``async for`` directly is an
        alternative.
    """
    from concurrent.futures import ThreadPoolExecutor

    async def collect_chunks():
        # Call the async generate function and collect the response chunks
        chunks = []
        async for chunk in bento_service.generate(
                prompt=prompt,
                system_prompt=system_prompt,
                max_tokens=max_tokens):
            chunks.append(chunk)
        return "".join(chunks)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run is safe.
        return asyncio.run(collect_chunks())

    # An event loop is already running here; asyncio.run would refuse to start.
    # Drive the coroutine on a fresh loop in a worker thread instead.
    with ThreadPoolExecutor(max_workers=1) as worker:
        return worker.submit(asyncio.run, collect_chunks()).result()

# Example call: send one prompt through run_generate with its default
# system_prompt (None) and max_tokens (512).
prompt = "What are the benefits of renewable energy?"
response = run_generate(prompt)
# Bare last expression so the notebook displays the generated text.
response

In [1]:
from unsloth import FastVisionModel

# Identifier of the fine-tuned ECG vision model to load.
model_name = "Aidan777/Llama-3.2-11B-Vision-Instruct_finetuned_ecg"

# Load the model and its tokenizer through Unsloth with 4-bit loading enabled.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name=model_name,
    load_in_4bit=True,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.15: Fast Mllama vision patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA RTX A5000. Max memory: 23.679 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Write the merged 16-bit model and tokenizer to a local directory. The
# directory name is the same string the serving cell uses as MODEL_ID.
model.save_pretrained_merged(
    "Llama-3.2-11B-Vision-Instruct_finetuned_ecg_vllm",
    tokenizer,
    save_method="merged_16bit",
)

Downloading safetensors for unsloth/llama-3.2-11b-vision-instruct...


model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 5/5 [01:32<00:00, 18.52s/it]
