Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions docs/input.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Image, Audio & Document Input
# Image, Audio, Video & Document Input

Some LLMs are now capable of understanding both audio, image and document content.
Some LLMs are now capable of understanding audio, video, image and document content.

## Image Input

Expand Down Expand Up @@ -52,6 +52,13 @@ print(result.data)

You can provide audio input using either [`AudioUrl`][pydantic_ai.AudioUrl] or [`BinaryContent`][pydantic_ai.BinaryContent]. The process is analogous to the examples above.

## Video Input

!!! info
Some models do not support video input. Please check the model's documentation to confirm whether it supports video input.

You can provide video input using either [`VideoUrl`][pydantic_ai.VideoUrl] or [`BinaryContent`][pydantic_ai.BinaryContent]. The process is analogous to the examples above.

## Document Input

!!! info
Expand Down
3 changes: 2 additions & 1 deletion pydantic_ai_slim/pydantic_ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
UsageLimitExceeded,
UserError,
)
from .messages import AudioUrl, BinaryContent, DocumentUrl, ImageUrl
from .messages import AudioUrl, BinaryContent, DocumentUrl, ImageUrl, VideoUrl
from .tools import RunContext, Tool

__all__ = (
Expand All @@ -33,6 +33,7 @@
# messages
'ImageUrl',
'AudioUrl',
'VideoUrl',
'DocumentUrl',
'BinaryContent',
# tools
Expand Down
116 changes: 98 additions & 18 deletions pydantic_ai_slim/pydantic_ai/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,34 @@
from ._utils import generate_tool_call_id as _generate_tool_call_id, now_utc as _now_utc
from .exceptions import UnexpectedModelBehavior

AudioMediaType: TypeAlias = Literal['audio/wav', 'audio/mpeg']
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just moved this whole block for better readability.

# MIME types recognised for each kind of multi-modal input.
ImageMediaType: TypeAlias = Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']
DocumentMediaType: TypeAlias = Literal[
'application/pdf',
'text/plain',
'text/csv',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'text/html',
'text/markdown',
'application/vnd.ms-excel',
]
# Video MIME types; each one is mapped to a `VideoFormat` value by `_video_format`.
VideoMediaType: TypeAlias = Literal[
'video/x-matroska',
'video/quicktime',
'video/mp4',
'video/webm',
'video/x-flv',
'video/mpeg',
'video/x-ms-wmv',
'video/3gpp',
]

# Short format names returned by the `format` properties of the URL / binary content classes.
AudioFormat: TypeAlias = Literal['wav', 'mp3']
ImageFormat: TypeAlias = Literal['jpeg', 'png', 'gif', 'webp']
DocumentFormat: TypeAlias = Literal['csv', 'doc', 'docx', 'html', 'md', 'pdf', 'txt', 'xls', 'xlsx']
# NOTE(review): 'mpg' is listed here but `_video_format` only ever returns 'mpeg' for 'video/mpeg'.
VideoFormat: TypeAlias = Literal['mkv', 'mov', 'mp4', 'webm', 'flv', 'mpeg', 'mpg', 'wmv', 'three_gp']


@dataclass
class SystemPromptPart:
Expand Down Expand Up @@ -42,6 +70,47 @@ def otel_event(self) -> Event:
return Event('gen_ai.system.message', body={'content': self.content, 'role': 'system'})


@dataclass
class VideoUrl:
    """A URL to a video."""

    url: str
    """The URL of the video."""

    kind: Literal['video-url'] = 'video-url'
    """Type identifier, this is available on all parts as a discriminator."""

    @property
    def media_type(self) -> VideoMediaType:  # pragma: no cover
        """Return the media type of the video, based on the url.

        The media type is inferred purely from the file extension at the end of the URL.

        Raises:
            ValueError: If the URL does not end with a recognised video file extension.
        """
        if self.url.endswith('.mkv'):
            return 'video/x-matroska'
        elif self.url.endswith('.mov'):
            return 'video/quicktime'
        elif self.url.endswith('.mp4'):
            return 'video/mp4'
        elif self.url.endswith('.webm'):
            return 'video/webm'
        elif self.url.endswith('.flv'):
            return 'video/x-flv'
        elif self.url.endswith(('.mpeg', '.mpg')):
            return 'video/mpeg'
        elif self.url.endswith('.wmv'):
            return 'video/x-ms-wmv'
        elif self.url.endswith(('.3gp', '.three_gp')):
            # `.3gp` is the actual file extension of 3GPP videos. `three_gp` is only the
            # `VideoFormat` name; the `.three_gp` suffix is still accepted so URLs that
            # worked before keep working.
            return 'video/3gpp'
        else:
            raise ValueError(f'Unknown video file extension: {self.url}')

    @property
    def format(self) -> VideoFormat:
        """The file format of the video.

        The choice of supported formats was based on the Bedrock Converse API. Other APIs don't require a format.

        Raises:
            ValueError: If the media type cannot be determined from the URL.
        """
        return _video_format(self.media_type)


@dataclass
class AudioUrl:
"""A URL to an audio file."""
Expand Down Expand Up @@ -123,23 +192,6 @@ def format(self) -> DocumentFormat:
return _document_format(self.media_type)


AudioMediaType: TypeAlias = Literal['audio/wav', 'audio/mpeg']
ImageMediaType: TypeAlias = Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']
DocumentMediaType: TypeAlias = Literal[
'application/pdf',
'text/plain',
'text/csv',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'text/html',
'text/markdown',
'application/vnd.ms-excel',
]
AudioFormat: TypeAlias = Literal['wav', 'mp3']
ImageFormat: TypeAlias = Literal['jpeg', 'png', 'gif', 'webp']
DocumentFormat: TypeAlias = Literal['csv', 'doc', 'docx', 'html', 'md', 'pdf', 'txt', 'xls', 'xlsx']


@dataclass
class BinaryContent:
"""Binary content, e.g. an audio or image file."""
Expand All @@ -163,6 +215,11 @@ def is_image(self) -> bool:
"""Return `True` if the media type is an image type."""
return self.media_type.startswith('image/')

@property
def is_video(self) -> bool:
    """Whether this content's media type falls under the `video/` family."""
    video_prefix = 'video/'
    return self.media_type.startswith(video_prefix)

@property
def is_document(self) -> bool:
"""Return `True` if the media type is a document type."""
Expand All @@ -189,10 +246,12 @@ def format(self) -> str:
return _image_format(self.media_type)
elif self.is_document:
return _document_format(self.media_type)
elif self.is_video:
return _video_format(self.media_type)
raise ValueError(f'Unknown media type: {self.media_type}')


UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | BinaryContent'
UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent'


def _document_format(media_type: str) -> DocumentFormat:
Expand Down Expand Up @@ -229,6 +288,27 @@ def _image_format(media_type: str) -> ImageFormat:
raise ValueError(f'Unknown image media type: {media_type}')


def _video_format(media_type: str) -> VideoFormat:
if media_type == 'video/x-matroska':
return 'mkv'
elif media_type == 'video/quicktime':
return 'mov'
elif media_type == 'video/mp4':
return 'mp4'
elif media_type == 'video/webm':
return 'webm'
elif media_type == 'video/x-flv':
return 'flv'
elif media_type == 'video/mpeg':
return 'mpeg'
elif media_type == 'video/x-ms-wmv':
return 'wmv'
elif media_type == 'video/3gpp':
return 'three_gp'
else: # pragma: no cover
raise ValueError(f'Unknown video media type: {media_type}')


@dataclass
class UserPromptPart:
"""A user prompt, generally written by the end user.
Expand Down
16 changes: 15 additions & 1 deletion pydantic_ai_slim/pydantic_ai/models/bedrock.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
ToolCallPart,
ToolReturnPart,
UserPromptPart,
VideoUrl,
)
from pydantic_ai.models import Model, ModelRequestParameters, StreamedResponse, cached_async_http_client
from pydantic_ai.providers import Provider, infer_provider
Expand All @@ -52,6 +53,7 @@
SystemContentBlockTypeDef,
ToolChoiceTypeDef,
ToolTypeDef,
VideoBlockTypeDef,
)


Expand Down Expand Up @@ -381,21 +383,33 @@ async def _map_user_prompt(part: UserPromptPart) -> list[MessageUnionTypeDef]:
elif item.is_image:
assert format in ('jpeg', 'png', 'gif', 'webp')
content.append({'image': {'format': format, 'source': {'bytes': item.data}}})
elif item.is_video:
assert format in ('mkv', 'mov', 'mp4', 'webm', 'flv', 'mpeg', 'mpg', 'wmv', 'three_gp')
content.append({'video': {'format': format, 'source': {'bytes': item.data}}})
else:
raise NotImplementedError('Binary content is not supported yet.')
elif isinstance(item, (ImageUrl, DocumentUrl)):
elif isinstance(item, (ImageUrl, DocumentUrl, VideoUrl)):
response = await cached_async_http_client().get(item.url)
response.raise_for_status()
if item.kind == 'image-url':
format = item.media_type.split('/')[1]
assert format in ('jpeg', 'png', 'gif', 'webp'), f'Unsupported image format: {format}'
image: ImageBlockTypeDef = {'format': format, 'source': {'bytes': response.content}}
content.append({'image': image})

elif item.kind == 'document-url':
document_count += 1
name = f'Document {document_count}'
data = response.content
content.append({'document': {'name': name, 'format': item.format, 'source': {'bytes': data}}})

elif item.kind == 'video-url':
format = item.media_type.split('/')[1]
assert format in ('mkv', 'mov', 'mp4', 'webm', 'flv', 'mpeg', 'mpg', 'wmv', 'three_gp'), (
f'Unsupported video format: {format}'
)
video: VideoBlockTypeDef = {'format': format, 'source': {'bytes': response.content}}
content.append({'video': video})
elif isinstance(item, AudioUrl): # pragma: no cover
raise NotImplementedError('Audio is not supported yet.')
else:
Expand Down
3 changes: 3 additions & 0 deletions pydantic_ai_slim/pydantic_ai/models/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
ToolCallPart,
ToolReturnPart,
UserPromptPart,
VideoUrl,
)
from ..settings import ModelSettings
from ..tools import ToolDefinition
Expand Down Expand Up @@ -335,6 +336,8 @@ async def _map_user_prompt(part: UserPromptPart) -> list[_GeminiPartUnion]:
inline_data={'data': base64.b64encode(response.content).decode('utf-8'), 'mime_type': mime_type}
)
content.append(inline_data)
elif isinstance(item, VideoUrl): # pragma: no cover
raise NotImplementedError('VideoUrl is not supported for Gemini.')
else:
assert_never(item)
return content
Expand Down
3 changes: 3 additions & 0 deletions pydantic_ai_slim/pydantic_ai/models/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
ToolCallPart,
ToolReturnPart,
UserPromptPart,
VideoUrl,
)
from ..providers import Provider, infer_provider
from ..result import Usage
Expand Down Expand Up @@ -503,6 +504,8 @@ def _map_user_prompt(part: UserPromptPart) -> MistralUserMessage:
raise RuntimeError('Only image binary content is supported for Mistral.')
elif isinstance(item, DocumentUrl):
raise RuntimeError('DocumentUrl is not supported in Mistral.')
elif isinstance(item, VideoUrl):
raise RuntimeError('VideoUrl is not supported in Mistral.')
else: # pragma: no cover
raise RuntimeError(f'Unsupported content type: {type(item)}')
return MistralUserMessage(content=content)
Expand Down
5 changes: 5 additions & 0 deletions pydantic_ai_slim/pydantic_ai/models/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ToolCallPart,
ToolReturnPart,
UserPromptPart,
VideoUrl,
)
from ..settings import ModelSettings
from ..tools import ToolDefinition
Expand Down Expand Up @@ -448,6 +449,8 @@ async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessa
# file_data = f'data:{media_type};base64,{base64_encoded}'
# file = File(file={'file_data': file_data, 'file_name': item.url, 'file_id': item.url}, type='file')
# content.append(file)
elif isinstance(item, VideoUrl): # pragma: no cover
raise NotImplementedError('VideoUrl is not supported for OpenAI')
else:
assert_never(item)
return chat.ChatCompletionUserMessageParam(role='user', content=content)
Expand Down Expand Up @@ -765,6 +768,8 @@ async def _map_user_prompt(part: UserPromptPart) -> responses.EasyInputMessagePa
filename=f'filename.{item.format}',
)
)
elif isinstance(item, VideoUrl): # pragma: no cover
raise NotImplementedError('VideoUrl is not supported for OpenAI.')
else:
assert_never(item)
return responses.EasyInputMessageParam(role='user', content=content)
Expand Down
2 changes: 1 addition & 1 deletion pydantic_ai_slim/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ vertexai = ["google-auth>=2.36.0", "requests>=2.32.3"]
anthropic = ["anthropic>=0.49.0"]
groq = ["groq>=0.15.0"]
mistral = ["mistralai>=1.2.5"]
bedrock = ["boto3>=1.34.116"]
bedrock = ["boto3>=1.35.74"]
# Tools
duckduckgo = ["duckduckgo-search>=7.0.0"]
tavily = ["tavily-python>=0.5.0"]
Expand Down
Binary file added tests/assets/small_video.mp4
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ def image_content(assets_path: Path) -> BinaryContent:
return BinaryContent(data=image_bytes, media_type='image/png')


@pytest.fixture(scope='session')
def video_content(assets_path: Path) -> BinaryContent:
    """Load the `small_video.mp4` test asset and wrap it as `video/mp4` binary content."""
    video_path = assets_path / 'small_video.mp4'
    return BinaryContent(data=video_path.read_bytes(), media_type='video/mp4')


@pytest.fixture(scope='session')
def document_content(assets_path: Path) -> BinaryContent:
pdf_bytes = assets_path.joinpath('dummy.pdf').read_bytes()
Expand Down

Large diffs are not rendered by default.

Loading