From ae4b555b58ae485947e4489a27239e8d5faa670e Mon Sep 17 00:00:00 2001 From: Mehdi Nazari Date: Mon, 24 Nov 2025 12:30:10 -0800 Subject: [PATCH] [docs] Add Google-style docstrings for dspy/adapters/types/audio.py Audio class #9086 --- dspy/adapters/types/audio.py | 149 ++++++++++++++++++++++++++++++----- 1 file changed, 128 insertions(+), 21 deletions(-) diff --git a/dspy/adapters/types/audio.py b/dspy/adapters/types/audio.py index 0ceb734b73..89738e7031 100644 --- a/dspy/adapters/types/audio.py +++ b/dspy/adapters/types/audio.py @@ -23,6 +23,54 @@ def _normalize_audio_format(audio_format: str) -> str: class Audio(Type): + """A type for representing audio data in DSPy. + + The Audio class provides a standardized way to handle audio inputs for language models + that support audio processing. Audio data is stored as base64-encoded strings along + with format metadata. + + Attributes: + data: Base64-encoded audio data. + audio_format: The audio format (e.g., "wav", "mp3", "flac"). + + Example: + Basic usage with a local file: + + ```python + import dspy + + dspy.configure(lm=dspy.LM("openai/gpt-4o-audio-preview")) + + class TranscribeAudio(dspy.Signature): + audio: dspy.Audio = dspy.InputField() + transcription: str = dspy.OutputField() + + # Create Audio from a local file + audio = dspy.Audio.from_file("speech.wav") + + predict = dspy.Predict(TranscribeAudio) + result = predict(audio=audio) + ``` + + Example: + Creating Audio from different sources: + + ```python + import dspy + + # From a URL + audio = dspy.Audio.from_url("https://example.com/audio.mp3") + + # From a local file path (auto-detected) + audio = dspy.Audio("path/to/audio.wav") + + # From a numpy array (requires soundfile) + import numpy as np + samples = np.random.randn(16000) # 1 second of audio at 16kHz + audio = dspy.Audio.from_array(samples, sampling_rate=16000) + ``` + """ + data: str audio_format: str @@ -32,18 +80,20 @@ class Audio(Type): ) def format(self) -> list[dict[str, Any]]: + """Format the audio data for consumption by language models. + + Returns: + A list containing the audio block in the format expected by + audio-enabled language models. + + Raises: + ValueError: If the audio data cannot be formatted. + """ try: data = self.data except Exception as e: raise ValueError(f"Failed to format audio for DSPy: {e}") - return [{ - "type": "input_audio", - "input_audio": { - "data": data, - "format": self.audio_format - } - }] - + return [{"type": "input_audio", "input_audio": {"data": data, "format": self.audio_format}}] @pydantic.model_validator(mode="before") @classmethod @@ -57,8 +107,20 @@ def validate_input(cls, values: Any) -> Any: @classmethod def from_url(cls, url: str) -> "Audio": - """ - Download an audio file from URL and encode it as base64. + """Create an Audio instance by downloading from a URL. + + Downloads the audio file from the specified URL, determines the format + from the Content-Type header, and encodes the content as base64. + + Args: + url: The URL of the audio file to download. + + Returns: + An Audio instance containing the base64-encoded audio data. + + Raises: + ValueError: If the Content-Type is not an audio MIME type. + requests.HTTPError: If the HTTP request fails. """ response = requests.get(url) response.raise_for_status() @@ -74,8 +136,19 @@ def from_url(cls, url: str) -> "Audio": @classmethod def from_file(cls, file_path: str) -> "Audio": - """ - Read local audio file and encode it as base64. + """Create an Audio instance from a local file. + + Reads the audio file from disk, determines the format from the file + extension, and encodes the content as base64. + + Args: + file_path: The path to the local audio file. + + Returns: + An Audio instance containing the base64-encoded audio data. + + Raises: + ValueError: If the file does not exist or has an unsupported MIME type. """ if not os.path.isfile(file_path): raise ValueError(f"File not found: {file_path}") @@ -95,11 +168,26 @@ def from_file(cls, file_path: str) -> "Audio": return cls(data=encoded_data, audio_format=audio_format) @classmethod - def from_array( - cls, array: Any, sampling_rate: int, format: str = "wav" - ) -> "Audio": - """ - Process numpy-like array and encode it as base64. Uses sampling rate and audio format for encoding. + def from_array(cls, array: Any, sampling_rate: int, format: str = "wav") -> "Audio": + """Create an Audio instance from a numpy array. + + Converts a numpy-like array of audio samples into an Audio instance + by encoding it with the specified format and sampling rate. + + Args: + array: A numpy-like array containing audio samples. + sampling_rate: The sampling rate in Hz (e.g., 16000 for 16kHz). + format: The output audio format. Defaults to "wav". + + Returns: + An Audio instance containing the base64-encoded audio data. + + Raises: + ImportError: If the soundfile library is not installed. + + Note: + This method requires the ``soundfile`` library to be installed. + Install it with ``pip install soundfile``. """ if not SF_AVAILABLE: raise ImportError("soundfile is required to process audio arrays.") @@ -122,11 +210,30 @@ def __repr__(self) -> str: length = len(self.data) return f"Audio(data=, audio_format='{self.audio_format}')" + def encode_audio(audio: Union[str, bytes, dict, "Audio", Any], sampling_rate: int = 16000, format: str = "wav") -> dict: - """ - Encode audio to a dict with 'data' and 'audio_format'. - - Accepts: local file path, URL, data URI, dict, Audio instance, numpy array, or bytes (with known format). + """Encode audio from various sources into a standardized dictionary format. + + This function accepts multiple input types and normalizes them into a dictionary + containing base64-encoded audio data and format information. + + Args: + audio: The audio input. Supported types include: + + - ``str``: Local file path, HTTP(S) URL, or data URI + - ``bytes``: Raw audio bytes + - ``dict``: Dictionary with "data" and "audio_format" keys + - ``Audio``: An existing Audio instance + - ``numpy.ndarray``: Audio samples as a numpy array (requires soundfile) + + sampling_rate: The sampling rate in Hz for numpy array inputs. Defaults to 16000. + format: The audio format for numpy array or bytes inputs. Defaults to "wav". + + Returns: + A dictionary with "data" (base64-encoded string) and "audio_format" keys. + + Raises: + ValueError: If the input type is unsupported or the data URI is malformed. """ if isinstance(audio, dict) and "data" in audio and "audio_format" in audio: return audio