Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 128 additions & 21 deletions dspy/adapters/types/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,54 @@ def _normalize_audio_format(audio_format: str) -> str:


class Audio(Type):
"""A type for representing audio data in DSPy.

The Audio class provides a standardized way to handle audio inputs for language models
that support audio processing. Audio data is stored as base64-encoded strings along
with format metadata.

Attributes:
data: Base64-encoded audio data.
audio_format: The audio format (e.g., "wav", "mp3", "flac").

Example:
Basic usage with a local file:

```python
import dspy

dspy.configure(lm=dspy.LM("openai/gpt-4o-audio-preview"))

class TranscribeAudio(dspy.Signature):
audio: dspy.Audio = dspy.InputField()
transcription: str = dspy.OutputField()

# Create Audio from a local file
audio = dspy.Audio.from_file("speech.wav")

predict = dspy.Predict(TranscribeAudio)
result = predict(audio=audio)
```

Example:
Creating Audio from different sources:

```python
import dspy

# From a URL
audio = dspy.Audio.from_url("https://example.com/audio.mp3")

# From a local file path (auto-detected)
audio = dspy.Audio("path/to/audio.wav")

# From a numpy array (requires soundfile)
import numpy as np
samples = np.random.randn(16000) # 1 second of audio at 16kHz
audio = dspy.Audio.from_array(samples, sampling_rate=16000)
```
"""

data: str
audio_format: str

Expand All @@ -32,18 +80,20 @@ class Audio(Type):
)

def format(self) -> list[dict[str, Any]]:
    """Format the audio data for consumption by language models.

    Returns:
        A single-element list holding an ``input_audio`` content block with
        the stored ``data`` string and ``audio_format``.

    Raises:
        ValueError: If reading the audio data fails; the original exception
            is attached as ``__cause__``.
    """
    try:
        data = self.data
    except Exception as e:
        # Chain the cause so the underlying failure stays visible in tracebacks.
        raise ValueError(f"Failed to format audio for DSPy: {e}") from e

    # Exactly one return: the diff residue had two back-to-back returns,
    # which made the second one unreachable.
    return [
        {
            "type": "input_audio",
            "input_audio": {"data": data, "format": self.audio_format},
        }
    ]

@pydantic.model_validator(mode="before")
@classmethod
Expand All @@ -57,8 +107,20 @@ def validate_input(cls, values: Any) -> Any:

@classmethod
def from_url(cls, url: str) -> "Audio":
"""
Download an audio file from URL and encode it as base64.
"""Create an Audio instance by downloading from a URL.

Downloads the audio file from the specified URL, determines the format
from the Content-Type header, and encodes the content as base64.

Args:
url: The URL of the audio file to download.

Returns:
An Audio instance containing the base64-encoded audio data.

Raises:
ValueError: If the Content-Type is not an audio MIME type.
requests.HTTPError: If the HTTP request fails.
"""
response = requests.get(url)
response.raise_for_status()
Expand All @@ -74,8 +136,19 @@ def from_url(cls, url: str) -> "Audio":

@classmethod
def from_file(cls, file_path: str) -> "Audio":
"""
Read local audio file and encode it as base64.
"""Create an Audio instance from a local file.

Reads the audio file from disk, determines the format from the file
extension, and encodes the content as base64.

Args:
file_path: The path to the local audio file.

Returns:
An Audio instance containing the base64-encoded audio data.

Raises:
ValueError: If the file does not exist or has an unsupported MIME type.
"""
if not os.path.isfile(file_path):
raise ValueError(f"File not found: {file_path}")
Expand All @@ -95,11 +168,26 @@ def from_file(cls, file_path: str) -> "Audio":
return cls(data=encoded_data, audio_format=audio_format)

@classmethod
def from_array(
cls, array: Any, sampling_rate: int, format: str = "wav"
) -> "Audio":
"""
Process numpy-like array and encode it as base64. Uses sampling rate and audio format for encoding.
def from_array(cls, array: Any, sampling_rate: int, format: str = "wav") -> "Audio":
"""Create an Audio instance from a numpy array.

Converts a numpy-like array of audio samples into an Audio instance
by encoding it with the specified format and sampling rate.

Args:
array: A numpy-like array containing audio samples.
sampling_rate: The sampling rate in Hz (e.g., 16000 for 16kHz).
format: The output audio format. Defaults to "wav".

Returns:
An Audio instance containing the base64-encoded audio data.

Raises:
ImportError: If the soundfile library is not installed.

Note:
This method requires the ``soundfile`` library to be installed.
Install it with ``pip install soundfile``.
"""
if not SF_AVAILABLE:
raise ImportError("soundfile is required to process audio arrays.")
Expand All @@ -122,11 +210,30 @@ def __repr__(self) -> str:
length = len(self.data)
return f"Audio(data=<AUDIO_BASE_64_ENCODED({length})>, audio_format='{self.audio_format}')"


def encode_audio(audio: Union[str, bytes, dict, "Audio", Any], sampling_rate: int = 16000, format: str = "wav") -> dict:
"""
Encode audio to a dict with 'data' and 'audio_format'.

Accepts: local file path, URL, data URI, dict, Audio instance, numpy array, or bytes (with known format).
"""Encode audio from various sources into a standardized dictionary format.

This function accepts multiple input types and normalizes them into a dictionary
containing base64-encoded audio data and format information.

Args:
audio: The audio input. Supported types include:

- ``str``: Local file path, HTTP(S) URL, or data URI
- ``bytes``: Raw audio bytes
- ``dict``: Dictionary with "data" and "audio_format" keys
- ``Audio``: An existing Audio instance
- ``numpy.ndarray``: Audio samples as a numpy array (requires soundfile)

sampling_rate: The sampling rate in Hz for numpy array inputs. Defaults to 16000.
format: The audio format for numpy array or bytes inputs. Defaults to "wav".

Returns:
A dictionary with "data" (base64-encoded string) and "audio_format" keys.

Raises:
ValueError: If the input type is unsupported or the data URI is malformed.
"""
if isinstance(audio, dict) and "data" in audio and "audio_format" in audio:
return audio
Expand Down