In [33]:
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.

Launch the vLLM server with the following command:

(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf

(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \
    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'

(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
    --max-model-len 4096 --trust-remote-code

Run the script with:
python openai_chat_completion_client_for_multimodal.py --chat-type audio
"""

import base64

import requests
from openai import OpenAI

from vllm.utils import FlexibleArgumentParser

# Use the OpenAI SDK against vLLM's API server: set the key and the base URL.
openai_api_key = "EMPTY"
openai_api_base = "http://192.168.170.76:8000/v1"

# api_key is passed explicitly; when omitted the SDK would fall back to
# os.environ.get("OPENAI_API_KEY").
client = OpenAI(base_url=openai_api_base, api_key=openai_api_key)

# Headers sent with the `requests` downloads made by this notebook.
headers = {"User-Agent": "vLLM Example Client"}


def encode_base64_content_from_url(content_url: str, timeout: float = 60.0) -> str:
    """Download ``content_url`` and return the response body as base64 text.

    Args:
        content_url: URL of the content to fetch.
        timeout: Seconds ``requests`` waits for the server to connect or to
            send data before giving up. Without a timeout a stalled server
            would block this call indefinitely.

    Returns:
        The response body, base64-encoded and decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # `headers` is the module-level User-Agent header dict.
    with requests.get(content_url, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")

import base64
from pathlib import Path

def encode_base64_content_from_image_path(image_path: str) -> str:
    """Read a local image file and return its bytes as base64 text.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing regular file.
    """
    source = Path(image_path)
    if source.is_file():
        return base64.b64encode(source.read_bytes()).decode("utf-8")
    raise FileNotFoundError(f"No file found at {image_path}")


# Text-only inference
def run_text_only(model: str, max_completion_tokens: int) -> None:
    """Send one plain-text question to ``model`` and print the reply."""
    user_message = {"role": "user", "content": "What's the capital of France?"}
    response = client.chat.completions.create(
        messages=[user_message],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )
    print("Chat completion output:\n", response.choices[0].message.content)



# Single-image input inference from a local image file
def run_image_path(model: str, image_path: str, max_completion_tokens: int) -> None:
    """Ask ``model`` what is in a local image and print the answer.

    The file is read from disk, base64-encoded and embedded in the request
    as a ``data:`` URL.

    Args:
        model: Name of the model to query.
        image_path: Path of the image file on the local machine.
        max_completion_tokens: Maximum number of tokens to generate.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file
            (raised by ``encode_base64_content_from_image_path``).
    """
    import mimetypes

    image_base64 = encode_base64_content_from_image_path(image_path)

    # Label the payload with the media type implied by the file extension
    # (e.g. image/png) instead of always claiming JPEG. Fall back to the
    # previously hard-coded value when the extension is unknown.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
                    },
                ],
            }
        ],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)



# Single-image input inference
def run_single_image(model: str, max_completion_tokens: int) -> None:
    """Query ``model`` about one image twice: by URL, then as inline base64."""
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

    def ask(url: str):
        # Identical prompt for both requests; only the image reference differs.
        return client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this image?"},
                        {"type": "image_url", "image_url": {"url": url}},
                    ],
                }
            ],
            model=model,
            max_completion_tokens=max_completion_tokens,
        )

    # 1) Reference the image by its URL in the payload.
    url_answer = ask(image_url).choices[0].message.content
    print("Chat completion output from image url:\n", url_answer)

    # 2) Download the image here and embed it as a base64 data URL.
    image_base64 = encode_base64_content_from_url(image_url)
    b64_answer = ask(f"data:image/jpeg;base64,{image_base64}").choices[0].message.content
    print("Chat completion output from base64 encoded image:", b64_answer)


# Multi-image input inference
def run_multi_image(model: str, max_completion_tokens: int) -> None:
    """Send two image URLs in a single user turn and print the model's answer."""
    duck_url = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    lion_url = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"

    # Text prompt first, then one image_url part per picture (duck, then lion).
    content = [{"type": "text", "text": "What are the animals in these images?"}]
    for url in (duck_url, lion_url):
        content.append({"type": "image_url", "image_url": {"url": url}})

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )
    print("Chat completion output:\n", response.choices[0].message.content)


# Video input inference
def run_video(model: str, max_completion_tokens: int) -> None:
    """Query ``model`` about one video twice: by URL, then as inline base64."""
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    # The download happens up front, before either request is sent.
    video_base64 = encode_base64_content_from_url(video_url)

    # (printed label, value of the video_url field), in request order.
    variants = (
        ("Chat completion output from video url:\n", video_url),
        (
            "Chat completion output from base64 encoded video:\n",
            f"data:video/mp4;base64,{video_base64}",
        ),
    )

    for label, url in variants:
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this video?"},
                        {"type": "video_url", "video_url": {"url": url}},
                    ],
                }
            ],
            model=model,
            max_completion_tokens=max_completion_tokens,
        )
        print(label, response.choices[0].message.content)


# Audio input inference
def run_audio(model: str, max_completion_tokens: int) -> None:
    """Query ``model`` about one audio clip using three payload encodings.

    The requests are sent in this order: ``input_audio`` with base64 data,
    ``audio_url`` with the HTTP URL, and ``audio_url`` with a base64 data URL.
    Each answer is printed as soon as it arrives.
    """
    from vllm.assets.audio import AudioAsset

    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

    # (printed label, audio content part), in request order.
    # Any format supported by librosa is supported.
    variants = [
        (
            "Chat completion output from input audio:\n",
            # OpenAI-compatible schema (`input_audio`)
            {
                "type": "input_audio",
                "input_audio": {"data": audio_base64, "format": "wav"},
            },
        ),
        (
            "Chat completion output from audio url:\n",
            # HTTP URL
            {"type": "audio_url", "audio_url": {"url": audio_url}},
        ),
        (
            "Chat completion output from base64 encoded audio:\n",
            # base64 URL
            {
                "type": "audio_url",
                "audio_url": {"url": f"data:audio/ogg;base64,{audio_base64}"},
            },
        ),
    ]

    for label, audio_part in variants:
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What's in this audio?"},
                        audio_part,
                    ],
                }
            ],
            model=model,
            max_completion_tokens=max_completion_tokens,
        )
        print(label, response.choices[0].message.content)


def run_multi_audio(model: str, max_completion_tokens: int) -> None:
    """Send two audio clips in one user turn and print the model's answer."""
    from vllm.assets.audio import AudioAsset

    # Two different audios to showcase batched inference; each asset is
    # resolved and downloaded in turn.
    clips = [
        encode_base64_content_from_url(AudioAsset(asset_name).url)
        for asset_name in ("winning_call", "azacinto_foscolo")
    ]

    # OpenAI-compatible schema (`input_audio`): prompt text, then one part per clip.
    content = [{"type": "text", "text": "Are these two audios the same?"}]
    content.extend(
        {"type": "input_audio", "input_audio": {"data": clip, "format": "wav"}}
        for clip in clips
    )

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model,
        max_completion_tokens=max_completion_tokens,
    )
    print("Chat completion output from input audio:\n", response.choices[0].message.content)


# Dispatch table: value of the ``--chat-type`` option -> demo function to run.
# Insertion order is kept, and it is the order in which the CLI lists choices.
example_function_map = dict(
    (
        ("text-only", run_text_only),
        ("single-image", run_single_image),
        ("multi-image", run_multi_image),
        ("multi-audio", run_multi_audio),
        ("video", run_video),
        ("audio", run_audio),
    )
)


def parse_args():
    """Build the command-line parser for this demo and parse the arguments.

    Returns:
        The parsed namespace with ``chat_type`` and ``max_completion_tokens``.
    """
    parser = FlexibleArgumentParser(
        description=(
            "Demo on using OpenAI client for online serving with "
            "multimodal language models served with vLLM."
        )
    )

    # Which demo to run; the valid values are the keys of example_function_map.
    chat_type_options = dict(
        type=str,
        default="single-image",
        choices=list(example_function_map),
        help="Conversation type with multimodal data.",
    )
    parser.add_argument("--chat-type", "-c", **chat_type_options)

    # Generation budget forwarded to every request of the chosen demo.
    token_options = dict(
        type=int,
        default=128,
        help="Maximum number of tokens to generate for each completion.",
    )
    parser.add_argument("--max-completion-tokens", "-n", **token_options)

    return parser.parse_args()


def main(args) -> None:
    """Run the demo selected by ``args.chat_type`` with the first listed model.

    Args:
        args: Parsed arguments providing ``chat_type`` (a key of
            ``example_function_map``) and ``max_completion_tokens``.

    Raises:
        RuntimeError: If the server lists no models.
    """
    # `get_first_model` is neither defined nor imported in this notebook, so
    # calling it raised NameError. Ask the models endpoint directly instead.
    served_models = client.models.list().data
    if not served_models:
        raise RuntimeError("The server did not list any models.")
    model = served_models[0].id

    chat_type = args.chat_type
    example_function_map[chat_type](model, args.max_completion_tokens)



In [31]:
import base64
from pathlib import Path

from pydantic import BaseModel

from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

import json

# Closed set of block kinds used as the type of ContentBlock.type.
# Inherits from both str and Enum, so every member is also a plain string.
class ContentType(str, Enum):
    """Type of content block"""
    PARAGRAPH = "paragraph"
    LIST = "list"
    TABLE = "table"
    QUOTE = "quote"

# Leaf element of the schema: one typed piece of text inside a Section.
class ContentBlock(BaseModel):
    """A block of content under a section"""
    # Kind of block; limited to the ContentType members. Required.
    type: ContentType = Field(description="Type of content block")
    # Text of the block as a single string. Required.
    content: str = Field(description="The actual text content")
    
# Recursive tree node: a heading, its content blocks and nested sections.
class Section(BaseModel):
    """A section with heading and content"""
    # heading and level have no default, so both are required.
    heading: str = Field(description="Section heading/title")
    level: int = Field(description="Heading level (1 for main section, 2 for subsection, etc.)")
    # default_factory=list makes the field optional with a fresh empty list.
    content: List[ContentBlock] = Field(
        description="Content blocks under this section",
        default_factory=list
    )
    # 'Section' is a string forward reference to this same class.
    subsections: List['Section'] = Field(
        description="Nested subsections under this section",
        default_factory=list
    )

# Header fields of a document. Every field is Optional and defaults to None.
class DocumentMetadata(BaseModel):
    """Metadata found before main content starts"""
    reference_number: Optional[str] = Field(
        None,
        description="Reference/file number (e.g., RBI/2015-16/1)"
    )
    document_id: Optional[str] = Field(
        None,
        description="Document ID or code (e.g., DCBR.CO.BPD.MC.No.)"
    )
    # Kept as free text (str); no date parsing is declared here.
    date: Optional[str] = Field(
        None,
        description="Document date"
    )
    sender: Optional[str] = Field(
        None,
        description="Sender/author name and designation"
    )
    recipient: Optional[str] = Field(
        None,
        description="Recipient/addressee"
    )
    subject: Optional[str] = Field(
        None,
        description="Subject line or title of the document"
    )
    organization: Optional[str] = Field(
        None,
        description="Organization name (e.g., Reserve Bank of India)"
    )
    # Untyped dict: neither keys nor value types are constrained by this model.
    additional_info: Optional[dict] = Field(
        None,
        description="Any other metadata like website, contact info, circular references, etc."
    )

# Root model. Document.model_json_schema() is what this cell passes as the
# "schema" of the json_schema response_format in the request below.
class Document(BaseModel):
    """Complete RBI document structure"""
    # metadata and main_content have no default, so both are required.
    metadata: DocumentMetadata = Field(
        description="Document metadata - reference numbers, dates, sender/recipient info, subject that appears before main content starts"
    )
    main_content: List[Section] = Field(
        description="Main document content organized as sections with headings and subsections. Ignore page headers, page footers, watermarks, logos. Start extracting from where the actual policy/circular/notification content begins."
    )
    # The two fields below are optional and default to None.
    signature_block: Optional[str] = Field(
        None,
        description="Closing signature section if present (e.g., 'Yours faithfully, Name, Designation')"
    )
    attachments_mentioned: Optional[List[str]] = Field(
        None,
        description="List of attachments or annexures mentioned in the document"
    )

def encode_base64_content_from_image_path(image_path: str) -> str:
    """Return the base64 text of the file at ``image_path``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing regular file.
    """
    candidate = Path(image_path)
    if candidate.is_file():
        raw_bytes = candidate.read_bytes()
        return base64.b64encode(raw_bytes).decode("utf-8")
    raise FileNotFoundError(f"No file found at {image_path}")

def run_image_path(image_paths, **kwargs) -> Optional[str]:
    """Send local images to the chat endpoint and return the reply text.

    Args:
        image_paths: Iterable of local image paths. A single path given as a
            plain string is accepted and treated as a one-element list
            (previously it was iterated character by character).
        **kwargs: Forwarded unchanged to ``client.chat.completions.create``
            (for example ``model`` and ``response_format``).

    Returns:
        The ``content`` of the first choice's message. The original
        annotation said ``None`` although the text was always returned.

    Raises:
        FileNotFoundError: If any path is not an existing file (raised by
            ``encode_base64_content_from_image_path``).
    """
    import mimetypes

    if isinstance(image_paths, str):
        image_paths = [image_paths]

    def to_data_url(image_path) -> str:
        # Use the media type implied by the extension (e.g. image/png for a
        # screenshot) instead of always claiming JPEG; keep the old value as
        # the fallback for unknown extensions.
        mime_type, _ = mimetypes.guess_type(str(image_path))
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        b64 = encode_base64_content_from_image_path(image_path)
        return f"data:{mime_type};base64,{b64}"

    images = [
        {"type": "image_url", "image_url": {"url": to_data_url(path)}}
        for path in image_paths
    ]

    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    *images,
                ],
            }
        ],
        **kwargs,
    )

    return chat_completion_from_base64.choices[0].message.content

# Run the extraction on one screenshot, constraining the reply to the JSON
# schema generated from the Document model.
res = run_image_path(
    image_paths=['/home/ntlpt59/Pictures/Screenshots/Screenshot from 2025-10-03 16-27-23.png'],
    model='Qwen/Qwen2.5-VL-7B-Instruct',
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "Document", "schema": Document.model_json_schema()},
    },
)
# Parse the reply and print it again with two-space indentation.
print(json.dumps(json.loads(res), indent=2))

APIConnectionError: Connection error.

In [30]:
import base64
from pathlib import Path

from pydantic import BaseModel

from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

import json

from pydantic import BaseModel, Field
from typing import List, Optional

# Recursive tree node: optional heading, required level and text, and nested
# sections of the same type.
class Section(BaseModel):
    """A section with heading and content"""
    # Optional; defaults to None when no heading is supplied.
    heading: Optional[str] = Field(
        None,
        description="Section heading/title - ONLY if explicitly present in the document. Do NOT create or infer headings."
    )
    # Required (no default).
    level: int = Field(
        description="Heading level (1 for main section, 2 for subsection, etc.)"
    )
    # Required; a single string rather than a list of blocks.
    content: str = Field(
        description="The actual text content under this section. Combine all paragraphs as continuous text."
    )
    # 'Section' is a string forward reference to this same class;
    # default_factory=list makes the field optional with a fresh empty list.
    subsections: List['Section'] = Field(
        description="Nested subsections ONLY if they have explicit headings in the document",
        default_factory=list
    )

# Header fields of a document. Every field is Optional[str] and defaults to
# None, so an instance can be built with no arguments.
class DocumentMetadata(BaseModel):
    """Metadata found before main content starts"""
    reference_number: Optional[str] = Field(None, description="Reference number (e.g., RBI/2015-16/1)")
    document_id: Optional[str] = Field(None, description="Document ID/code")
    # Kept as free text (str); no date parsing is declared here.
    date: Optional[str] = Field(None, description="Document date")
    sender: Optional[str] = Field(None, description="Sender name and designation")
    recipient: Optional[str] = Field(None, description="Recipient/addressee")
    subject: Optional[str] = Field(None, description="Subject line")
    organization: Optional[str] = Field(None, description="Organization name")
    website: Optional[str] = Field(None, description="Website if mentioned")

# Resolve the 'Section' forward reference used by Section.subsections.
Section.model_rebuild()


# Root model; Document.model_json_schema() is sent as the response_format
# schema by the request further down in this cell.
#
# This cell used to define Document twice. The second definition silently
# replaced the first, so only the second ever took effect. It is kept here
# unchanged, and the shadowed first draft has been removed.
#
# NOTE(review): the removed draft declared only `metadata` and `main_content`,
# with these descriptions. If they were the intended wording, move them into
# the fields below:
#   metadata: "Extract metadata that appears before main content: reference
#       numbers, dates, sender, recipient, subject"
#   main_content: "Extract content as-is. Only create sections where headings
#       actually exist in the document. Do NOT invent or infer headings. If
#       there's no heading, put all content in one section with heading as
#       null. Preserve the document structure exactly as it appears."
class Document(BaseModel):
    """Complete RBI document structure"""
    # metadata and main_content have no default, so both are required.
    metadata: DocumentMetadata = Field(
        description="Document metadata - reference numbers, dates, sender/recipient info, subject that appears before main content starts"
    )
    main_content: List[Section] = Field(
        description="Main document content organized as sections with headings and subsections. Ignore page headers, page footers, watermarks, logos. Start extracting from where the actual policy/circular/notification content begins."
    )
    # The two fields below are optional and default to None.
    signature_block: Optional[str] = Field(
        None,
        description="Closing signature section if present (e.g., 'Yours faithfully, Name, Designation')"
    )
    attachments_mentioned: Optional[List[str]] = Field(
        None,
        description="List of attachments or annexures mentioned in the document"
    )

def encode_base64_content_from_image_path(image_path: str) -> str:
    """Base64-encode the contents of a local image file.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing regular file.
    """
    image_file = Path(image_path)
    if not image_file.is_file():
        raise FileNotFoundError(f"No file found at {image_path}")
    encoded = base64.b64encode(image_file.read_bytes())
    return encoded.decode("utf-8")

def run_image_path(image_paths, **kwargs) -> Optional[str]:
    """Send local images to the chat endpoint and return the reply text.

    Args:
        image_paths: Iterable of local image paths. A single path given as a
            plain string is accepted and treated as a one-element list
            (previously it was iterated character by character).
        **kwargs: Forwarded unchanged to ``client.chat.completions.create``
            (for example ``model`` and ``response_format``).

    Returns:
        The ``content`` of the first choice's message. The original
        annotation said ``None`` although the text was always returned.

    Raises:
        FileNotFoundError: If any path is not an existing file (raised by
            ``encode_base64_content_from_image_path``).
    """
    import mimetypes

    if isinstance(image_paths, str):
        image_paths = [image_paths]

    def to_data_url(image_path) -> str:
        # Use the media type implied by the extension (e.g. image/png for a
        # screenshot) instead of always claiming JPEG; keep the old value as
        # the fallback for unknown extensions.
        mime_type, _ = mimetypes.guess_type(str(image_path))
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        b64 = encode_base64_content_from_image_path(image_path)
        return f"data:{mime_type};base64,{b64}"

    images = [
        {"type": "image_url", "image_url": {"url": to_data_url(path)}}
        for path in image_paths
    ]

    chat_completion_from_base64 = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    *images,
                ],
            }
        ],
        **kwargs,
    )

    return chat_completion_from_base64.choices[0].message.content

# Run the extraction on one screenshot, constraining the reply to the JSON
# schema generated from the Document model.
res = run_image_path(
    image_paths=['/home/ntlpt59/Pictures/Screenshots/Screenshot from 2025-10-03 16-27-23.png'],
    model='Qwen/Qwen2.5-VL-7B-Instruct',
    response_format={
        "type": "json_schema",
        "json_schema": {"name": "Document", "schema": Document.model_json_schema()},
    },
)
# Parse the reply and print it again with two-space indentation.
print(json.dumps(json.loads(res), indent=2))

APIConnectionError: Connection error.

In [None]:
import base64
from pathlib import Path

from pydantic import BaseModel

from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

import json

from pydantic import BaseModel, Field
from typing import List, Optional

# Recursive tree node: optional heading, required level and text, and nested
# sections of the same type.
class Section(BaseModel):
    """A section with heading and content"""
    # Optional; defaults to None when no heading is supplied.
    heading: Optional[str] = Field(
        None,
        description="Section heading/title - ONLY if explicitly present in the document. Do NOT create or infer headings."
    )
    # Required (no default).
    level: int = Field(
        description="Heading level (1 for main section, 2 for subsection, etc.)"
    )
    # Required; a single string rather than a list of blocks.
    content: str = Field(
        description="The actual text content under this section. Combine all paragraphs as continuous text."
    )
    # 'Section' is a string forward reference to this same class;
    # default_factory=list makes the field optional with a fresh empty list.
    subsections: List['Section'] = Field(
        description="Nested subsections ONLY if they have explicit headings in the document",
        default_factory=list
    )

# Header fields of a document. Every field is Optional[str] and defaults to
# None, so an instance can be built with no arguments.
class DocumentMetadata(BaseModel):
    """Metadata found before main content starts"""
    reference_number: Optional[str] = Field(None, description="Reference number (e.g., RBI/2015-16/1)")
    document_id: Optional[str] = Field(None, description="Document ID/code")
    # Kept as free text (str); no date parsing is declared here.
    date: Optional[str] = Field(None, description="Document date")
    sender: Optional[str] = Field(None, description="Sender name and designation")
    recipient: Optional[str] = Field(None, description="Recipient/addressee")
    subject: Optional[str] = Field(None, description="Subject line")
    organization: Optional[str] = Field(None, description="Organization name")
    website: Optional[str] = Field(None, description="Website if mentioned")

# Resolve the 'Section' forward reference used by Section.subsections.
Section.model_rebuild()


# Root model of the extraction schema.
#
# This cell used to define Document twice. The second definition silently
# replaced the first, so only the second ever took effect. It is kept here
# unchanged, and the shadowed first draft has been removed.
#
# NOTE(review): the removed draft declared only `metadata` and `main_content`,
# with these descriptions. If they were the intended wording, move them into
# the fields below:
#   metadata: "Extract metadata that appears before main content: reference
#       numbers, dates, sender, recipient, subject"
#   main_content: "Extract content as-is. Only create sections where headings
#       actually exist in the document. Do NOT invent or infer headings. If
#       there's no heading, put all content in one section with heading as
#       null. Preserve the document structure exactly as it appears."
class Document(BaseModel):
    """Complete RBI document structure"""
    # metadata and main_content have no default, so both are required.
    metadata: DocumentMetadata = Field(
        description="Document metadata - reference numbers, dates, sender/recipient info, subject that appears before main content starts"
    )
    main_content: List[Section] = Field(
        description="Main document content organized as sections with headings and subsections. Ignore page headers, page footers, watermarks, logos. Start extracting from where the actual policy/circular/notification content begins."
    )
    # The two fields below are optional and default to None.
    signature_block: Optional[str] = Field(
        None,
        description="Closing signature section if present (e.g., 'Yours faithfully, Name, Designation')"
    )
    attachments_mentioned: Optional[List[str]] = Field(
        None,
        description="List of attachments or annexures mentioned in the document"
    )

In [34]:
import base64
from pathlib import Path
from typing import List, Dict
import json
from pdf2image import convert_from_path
from pydantic import BaseModel, Field
from typing import List, Optional

def pdf_to_base64_images(pdf_path: str, dpi: int = 200) -> List[str]:
    """Convert PDF pages to base64 encoded PNG images.

    Args:
        pdf_path: Path of the PDF file handed to ``convert_from_path``.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        One base64 string per image returned by ``convert_from_path``, in the
        same order. Each string holds the PNG bytes of that image.
    """
    # Imported once per call; the original re-ran this import for every page.
    from io import BytesIO

    b64_images = []
    for page_image in convert_from_path(pdf_path, dpi=dpi):
        # Serialise the page to PNG in memory; the buffer is released as soon
        # as its bytes have been encoded.
        with BytesIO() as buffered:
            page_image.save(buffered, format="PNG")
            b64_images.append(base64.b64encode(buffered.getvalue()).decode("utf-8"))

    return b64_images

In [35]:
def run_image_b64(b64_image: str, client, model: str = 'Qwen/Qwen2.5-VL-7B-Instruct') -> dict:
    """Send one base64 PNG page to ``model`` and return the parsed JSON reply.

    The request asks for output that follows the JSON schema generated from
    the ``Document`` model. The reply text is parsed with ``json.loads``.
    """
    # One user turn: the instruction text followed by the page as a data URL.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Extract the document content following the schema."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}},
        ],
    }
    schema_spec = {"name": "Document", "schema": Document.model_json_schema()}

    completion = client.chat.completions.create(
        messages=[user_turn],
        model=model,
        response_format={"type": "json_schema", "json_schema": schema_spec},
    )

    return json.loads(completion.choices[0].message.content)

In [38]:
import base64

import requests
from openai import OpenAI
from pathlib import Path

# PDF to process, and the per-page results keyed by 1-based page number.
pdf_path = "/home/ntlpt59/Documents/RBI/rbi_pdfs/mastercircular/01MCD0D97308D3AD49A3908E2F4410ED4409.pdf"
res = {}

client = OpenAI(api_key="EMPTY", base_url="http://192.168.170.76:8000/v1")

for i, b64 in enumerate(pdf_to_base64_images(pdf_path), start=1):
    # Progress indicator: page numbers are printed back to back on one line.
    print(i, end="")
    res[i] = run_image_b64(b64, client)
    # Rewrite res.json after each page with everything collected so far.
    Path('res.json').write_text(json.dumps(res, indent=2))

1234567