diff --git a/.gitignore b/.gitignore index 27b6686..d69fb17 100644 --- a/.gitignore +++ b/.gitignore @@ -169,4 +169,5 @@ tmp/ .claude # Examples +output/ **/output.wav diff --git a/examples/voice/cli/.gitignore b/examples/voice/cli/.gitignore new file mode 100644 index 0000000..ea1472e --- /dev/null +++ b/examples/voice/cli/.gitignore @@ -0,0 +1 @@ +output/ diff --git a/examples/voice/cli/README.md b/examples/voice/cli/README.md index 0a036a2..ffc89f4 100644 --- a/examples/voice/cli/README.md +++ b/examples/voice/cli/README.md @@ -5,15 +5,25 @@ Real-time transcription tool using the Speechmatics Voice SDK. Supports micropho ## Quick Start **Microphone:** + ```bash -python cli.py -p -k YOUR_API_KEY +# Quick example +python cli.py -k YOUR_API_KEY -p + +# Example that saves the output in verbose mode using a preset +python cli.py -k YOUR_API_KEY -vvvvvpDSr -P conversation_smart_turn ``` +Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl` + **Audio file:** + ```bash -python cli.py -p -k YOUR_API_KEY -i audio.wav +python cli.py -k YOUR_API_KEY -i audio.wav -p ``` +Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl` + Press `CTRL+C` to stop. ## Requirements @@ -23,17 +33,32 @@ Press `CTRL+C` to stop. ## Options +### Quick Reference + +Common short codes: + +- `-k` API key | `-i` input file | `-o` output dir | `-p` pretty print | `-v` verbose +- `-r` record | `-S` save slices | `-P` preset | `-W` show config +- `-l` language | `-m` mode | `-d` max delay | `-t` silence trigger +- `-f` focus speakers | `-s` known speakers | `-E` enrol + ### Core - `-k, --api-key` - API key (defaults to `SPEECHMATICS_API_KEY` env var) - `-u, --url` - Server URL (defaults to `SPEECHMATICS_RT_URL` env var) - `-i, --input-file` - Audio file path (WAV, mono 16-bit). 
Uses microphone if not specified -- `-c, --config` - JSON config string or file path (overrides other Voice Agent options) ### Output +- `-o, --output-dir` - Base output directory (default: ./output) + - Creates a session subdirectory with timestamp (YYYYMMDD_HHMMSS) + - Inside session directory: + - `log.jsonl` - All events with timestamps + - `recording.wav` - Microphone recording (if `-r` is used) + - `slice_*.wav` and `slice_*.json` - Audio slices (if `-S` is used) +- `-r, --record` - Record microphone audio to recording.wav (microphone input only) +- `-S, --save-slices` - Save audio slices on SPEAKER_ENDED events (SMART_TURN mode only) - `-p, --pretty` - Formatted console output with colors -- `-o, --output-file` - Save output to JSONL file - `-v, --verbose` - Increase verbosity (can repeat: `-v`, `-vv`, `-vvv`, `-vvvv`, `-vvvvv`) - `-v` - Add speaker VAD events - `-vv` - Add turn predictions @@ -41,23 +66,38 @@ Press `CTRL+C` to stop. - `-vvvv` - Add metrics - `-vvvvv` - Add STT events - `-L, --legacy` - Show only legacy transcript messages -- `--results` - Include word-level results in segments +- `-D, --default-device` - Use default audio device (skip selection) +- `-w, --results` - Include word-level results in segments ### Audio -- `--sample-rate` - Sample rate in Hz (default: 16000) -- `--chunk-size` - Chunk size in bytes (default: 320) +- `-R, --sample-rate` - Sample rate in Hz (default: 16000) +- `-C, --chunk-size` - Chunk size in bytes (default: 320) - `-M, --mute` - Mute audio playback for file input -- `-D, --default-device` - Use default audio device (skip selection) ### Voice Agent Config -- `-l, --language` - Language code (default: en) -- `-d, --max-delay` - Max transcription delay in seconds (default: 0.7) -- `-t, --end-of-utterance-silence-trigger` - Silence duration for turn end (default: 0.5) -- `-m, --end-of-utterance-mode` - Turn detection mode: `FIXED`, `ADAPTIVE`, `SMART_TURN`, or `EXTERNAL` -- `-e, --emit-sentences` - Emit 
sentence-level segments -- `--forced-eou` - Enable forced end of utterance +**Configuration Priority:** + +1. Use `--preset` to start with a preset configuration (recommended) +2. Use `-c/--config` to provide a complete JSON configuration +3. Use individual parameters (`-l`, `-d`, `-t`, `-m`) to override preset settings or create custom config + +**Preset Options:** + +- `-P, --preset` - Use preset configuration: `external`, `scribe`, `low_latency`, `conversation_adaptive`, `conversation_smart_turn`, or `captions` +- `--list-presets` - List available presets and exit +- `-W, --show` - Display the final configuration as JSON and exit (after applying preset/config and overrides) + +**Configuration Options:** + +- `-c, --config` - JSON config string or file path (complete configuration) +- `-l, --language` - Language code (overrides preset if used together) +- `-d, --max-delay` - Max transcription delay in seconds (overrides preset if used together) +- `-t, --end-of-utterance-silence-trigger` - Silence duration for turn end in seconds (overrides preset if used together) +- `-m, --end-of-utterance-mode` - Turn detection mode: `FIXED`, `ADAPTIVE`, `SMART_TURN`, or `EXTERNAL` (overrides preset if used together) + +**Note:** When using `-c/--config`, you cannot also use `-P`, `-l`, `-d`, `-t`, `-m`, `-f`, `-I`, `-x`, or `-s`, because the config JSON must contain all settings. ### Speaker Management @@ -72,62 +112,142 @@ Press `CTRL+C` to stop. 
## Examples +**List presets:** + +```bash +python cli.py --list-presets +``` + +**Show config (from preset):** + +```bash +python cli.py -P scribe -W +``` + +**Show config (with overrides):** + +```bash +python cli.py -P scribe -l fr -d 1.0 -W +``` + +**Use preset:** + +```bash +python cli.py -k YOUR_KEY -P scribe -p +``` + +**Use preset with overrides:** + +```bash +python cli.py -k YOUR_KEY -P scribe -l fr -d 1.0 -p +``` + **Basic microphone:** + ```bash python cli.py -k YOUR_KEY -p ``` +Output saved to `./output/YYYYMMDD_HHMMSS/log.jsonl` + +**Record microphone audio:** + +```bash +python cli.py -k YOUR_KEY -r -p +``` + +Recording saved to `./output/YYYYMMDD_HHMMSS/recording.wav` + +**Custom output directory:** + +```bash +python cli.py -k YOUR_KEY -o ./my_sessions -p +``` + +Output saved to `./my_sessions/YYYYMMDD_HHMMSS/log.jsonl` + +**EXTERNAL mode with manual turn control:** + +```bash +python cli.py -k YOUR_KEY -m EXTERNAL -p +``` + +Press 't' or 'T' to manually signal end of turn. + +**Save audio slices (SMART_TURN mode):** + +```bash +python cli.py -k YOUR_KEY -P conversation_smart_turn -S -p +``` + +Audio slices (~8 seconds) saved to `./output/YYYYMMDD_HHMMSS/slice_*.wav` with matching `.json` metadata files on each SPEAKER_ENDED event. + **Audio file:** + ```bash python cli.py -k YOUR_KEY -i audio.wav -p ``` **Audio file (muted):** -```bash -python cli.py -k YOUR_KEY -i audio.wav -Mp -``` -**Save output:** ```bash -python cli.py -k YOUR_KEY -o output.jsonl -p +python cli.py -k YOUR_KEY -i audio.wav -Mp ``` **Verbose logging:** + ```bash python cli.py -k YOUR_KEY -vv -p ``` +Shows additional events (speaker VAD, turn predictions, etc.) + **Focus on speakers:** + ```bash python cli.py -k YOUR_KEY -f S1 S2 -p ``` **Enrol speakers:** + ```bash python cli.py -k YOUR_KEY -Ep ``` + Press `CTRL+C` when done to see speaker identifiers. 
**Use known speakers:** + ```bash python cli.py -k YOUR_KEY -s speakers.json -p ``` Example `speakers.json`: + ```json [ - {"label": "Alice", "speaker_identifiers": ["XX...XX"]}, - {"label": "Bob", "speaker_identifiers": ["YY...YY"]} + { "label": "Alice", "speaker_identifiers": ["XX...XX"] }, + { "label": "Bob", "speaker_identifiers": ["YY...YY"] } ] ``` **Custom config:** + ```bash python cli.py -k YOUR_KEY -c config.json -p ``` ## Notes +- Output directory (`-o`) defaults to `./output` +- Each session creates a timestamped subdirectory (YYYYMMDD_HHMMSS format) +- Session directory contains: + - `log.jsonl` - All events with timestamps + - `recording.wav` - Microphone recording (if `-r` is used) + - `slice_*.wav` and `slice_*.json` - Audio slices (if `--save-slices` is used in SMART_TURN mode) +- Session subdirectories prevent accidental data loss from multiple runs +- Audio slices are ~8 seconds and saved on each SPEAKER_ENDED event +- JSON metadata includes event details, speaker ID, timing, and slice duration - Speaker identifiers are encrypted and unique to your API key - Allow speakers to say at least 20 words before enrolling - Avoid labels `S1`, `S2` (reserved by engine) diff --git a/examples/voice/cli/cli.py b/examples/voice/cli/cli.py index a581c9c..67d7958 100644 --- a/examples/voice/cli/cli.py +++ b/examples/voice/cli/cli.py @@ -13,7 +13,9 @@ from pathlib import Path from typing import Any +from utils import AudioFileWriter from utils import AudioPlayer +from utils import load_json from utils import select_audio_device from utils import select_audio_output_device @@ -25,15 +27,25 @@ from speechmatics.voice import SpeakerFocusConfig from speechmatics.voice import SpeakerFocusMode from speechmatics.voice import SpeakerIdentifier -from speechmatics.voice import SpeechSegmentConfig from speechmatics.voice import VoiceAgentClient from speechmatics.voice import VoiceAgentConfig -from speechmatics.voice._models import TranscriptionUpdatePreset +from 
speechmatics.voice import VoiceAgentConfigPreset # ============================================================================== # CONSTANTS # ============================================================================== +# Audio slice duration (seconds of audio to capture before speaker ends) +AUDIO_SLICE_DURATION = 8.0 + +# Default output directory +DEFAULT_OUTPUT_DIR = "./output" + +# Output filenames +LOG_FILENAME = "log.jsonl" +RECORDING_FILENAME = "recording.wav" + +# Console colors for message types COLORS = { # Segments "AddPartialSegment": "\033[93m", @@ -60,22 +72,65 @@ async def main() -> None: - """Run the transcription CLI.""" + """Run the transcription CLI. + + Main entry point for the CLI application. Handles: + - Command-line argument parsing + - Audio source setup (microphone or file) + - Output directory management + - Configuration setup (preset, custom, or default) + - Event handler registration + - Audio streaming and transcription + """ # Parse the command line arguments args = parse_args() - # Setup audio source (microphone or file) - audio_source = setup_audio_source(args) - if not audio_source: + # Handle preset listing + if args.list_presets: + print("Available presets:") + for preset in VoiceAgentConfigPreset.list_presets(): + print(f" - {preset}") return - # Setup audio output (for file playback) - audio_player = setup_audio_output(audio_source, args) + # Setup audio source (microphone or file) - skip if just showing config + if not args.show: + audio_source = setup_audio_source(args) + if not audio_source: + return + + # Warn if trying to record from file input + if args.record and audio_source["type"] == "file": + print("Warning: --record is only supported for microphone input, not file playback. 
Recording disabled.") + args.record = None - # Remove JSONL output file if it already exists - if args.output_file and os.path.exists(args.output_file): - os.remove(args.output_file) + # Setup audio output (for file playback) + audio_player = setup_audio_output(audio_source, args) + else: + # Dummy audio source for config display + audio_source = {"sample_rate": 16000} + audio_player = None + + # Setup output directory with session subdirectory + base_output_dir = Path(args.output_dir) + + # Create session reference (YYYYMMDD_HHMMSS format for better sorting) + session_ref = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = base_output_dir / session_ref + + # Create session directory + output_dir.mkdir(parents=True, exist_ok=True) + print(f"Session output directory: {output_dir}") + + # Setup file paths + log_file = output_dir / LOG_FILENAME + record_file = output_dir / RECORDING_FILENAME if args.record else None + slices_dir = output_dir if args.save_slices else None + + # Store in args for easy access + args.log_file = str(log_file) + args.record_file = str(record_file) if record_file else None + args.slices_dir = str(slices_dir) if slices_dir else None # Create speaker configuration speaker_config = create_speaker_config(args) @@ -91,38 +146,60 @@ async def main() -> None: print(f"Error validating config: {e}") return - # Create Voice Agent configuration + # Use a preset + elif args.preset: + try: + config = VoiceAgentConfigPreset.load(args.preset) + except ValueError as e: + print(f"Error loading preset {args.preset}: {e}") + return + + # Default config else: config = VoiceAgentConfig( - language=args.language or "en", - end_of_utterance_silence_trigger=args.end_of_utterance_silence_trigger or 0.5, - max_delay=args.max_delay or 0.7, - end_of_utterance_mode=( - args.end_of_utterance_mode.lower() if args.end_of_utterance_mode else EndOfUtteranceMode.ADAPTIVE - ), - speaker_config=speaker_config, - use_forced_eou_message=args.forced_eou, 
additional_vocab=[ AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), - ], - known_speakers=known_speakers, - speech_segment_config=SpeechSegmentConfig( - emit_sentences=args.emit_sentences, - ), - transcription_update_preset=TranscriptionUpdatePreset.COMPLETE_PLUS_TIMING, - include_results=args.results, + ] ) - # Display instructions - if audio_source["type"] == "file": - print("\nStreaming audio file... (Press CTRL+C to stop)\n") - else: - print("\nMicrophone ready - speak now... (Press CTRL+C to stop)\n") + # Copy in overrides + if args.language: + config.language = args.language + if args.end_of_utterance_silence_trigger: + config.end_of_utterance_silence_trigger = args.end_of_utterance_silence_trigger + if args.max_delay: + config.max_delay = args.max_delay + if args.end_of_utterance_mode: + config.end_of_utterance_mode = args.end_of_utterance_mode + + # Copy speaker settings + config.speaker_config = speaker_config + config.known_speakers = known_speakers + config.include_results = args.results # Set common items config.enable_diarization = True + + # Handle config display + if args.show: + print(config.model_dump_json(indent=2, exclude_unset=True, exclude_none=True)) + return + + # Set the audio sample rate config.sample_rate = audio_source["sample_rate"] + # Display instructions + if audio_source["type"] == "file": + print("\nStreaming audio file... (Press CTRL+C to stop)") + else: + print("\nMicrophone ready - speak now... 
(Press CTRL+C to stop)") + + # Show press 't' to trigger end of turn + if config.end_of_utterance_mode == EndOfUtteranceMode.EXTERNAL: + print("EXTERNAL end of utterance mode enabled (Press 't' to trigger end of turn)\n") + else: + print(f"{config.end_of_utterance_mode.value.upper()} end of utterance mode enabled\n") + # Create Voice Agent client client = VoiceAgentClient(api_key=args.api_key, url=args.url, config=config) @@ -143,7 +220,7 @@ async def main() -> None: # Stream audio try: - await stream_audio(audio_source, audio_player, client, args.chunk_size) + await stream_audio(audio_source, audio_player, client, args.chunk_size, config, args.record_file) except asyncio.CancelledError: pass finally: @@ -324,6 +401,70 @@ def register_event_handlers(client: VoiceAgentClient, args, start_time: datetime start_time: Start time for timestamp calculation """ + # Audio slice counter + slice_counter = {"count": 0} + + async def async_save_audio_slice(message: dict) -> None: + """Save audio slice when speaker ends (SMART_TURN mode only).""" + if not args.slices_dir: + return + + # Only save slices in SMART_TURN mode + if client._config.end_of_utterance_mode != "smart_turn": + return + + # Get time from message + event_time = message.get("time") + if not event_time: + return + + speaker_id = message.get("speaker_id", "unknown") + + # Get audio slice from buffer + # Capture audio leading up to the speaker ending + start_time = event_time - AUDIO_SLICE_DURATION + end_time = event_time + + try: + audio_data = await client._audio_buffer.get_frames( + start_time=start_time, + end_time=end_time, + ) + + if audio_data: + # Generate filenames + slice_counter["count"] += 1 + base_filename = f"slice_{slice_counter['count']:04d}_{speaker_id}_{event_time:.2f}" + wav_filepath = Path(args.slices_dir) / f"{base_filename}.wav" + json_filepath = Path(args.slices_dir) / f"{base_filename}.json" + + # Save audio file + async with AudioFileWriter( + str(wav_filepath), client._audio_sample_rate, 
client._audio_sample_width + ) as writer: + await writer.write(audio_data) + + # Save JSON metadata + metadata = { + "message": message, + "speaker_id": speaker_id, + "is_active": message.get("is_active"), + "time": event_time, + "slice_start_time": start_time, + "slice_end_time": end_time, + "slice_duration": end_time - start_time, + "audio_file": f"{base_filename}.wav", + } + with open(json_filepath, "w") as f: + json.dump(metadata, f, indent=2) + + except Exception as e: + print(f"Error saving audio slice: {e}") + + def save_audio_slice(message: dict) -> None: + """Save audio slice when speaker ends (SMART_TURN mode only).""" + asyncio.create_task(async_save_audio_slice(message)) + def console_print(ts: datetime.datetime, message: dict) -> None: """Print message to console with optional formatting.""" if not args.pretty: @@ -354,9 +495,9 @@ def log_message(message: dict[str, Any]) -> None: """Log message to console and optional JSONL file.""" now = datetime.datetime.now() console_print(now, message) - if args.output_file: + if args.log_file: ts_str = now.strftime("%Y-%m-%d %H:%M:%S") + f".{now.microsecond // 1000:03d}" - with open(args.output_file, "a") as f: + with open(args.log_file, "a") as f: f.write(json.dumps({"ts": ts_str, **message}) + "\n") # Register standard handlers @@ -378,6 +519,10 @@ def log_message(message: dict[str, Any]) -> None: client.on(AgentServerMessageType.SPEAKER_STARTED, log_message) client.on(AgentServerMessageType.SPEAKER_ENDED, log_message) + # Save audio slices on SPEAKER_ENDED (SMART_TURN mode only) + if args.slices_dir: + client.on(AgentServerMessageType.SPEAKER_ENDED, save_audio_slice) + # Verbose turn prediction if args.verbose >= 2: client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_message) @@ -420,6 +565,8 @@ async def stream_audio( audio_player: AudioPlayer | None, client: VoiceAgentClient, chunk_size: int, + config: VoiceAgentConfig, + record_path: str | None = None, ) -> None: """Stream audio from source to 
client. @@ -428,11 +575,13 @@ async def stream_audio( audio_player: Audio player for file playback (optional) client: Voice Agent client chunk_size: Audio chunk size in bytes + config: Voice agent configuration (for EXTERNAL mode detection) + record_path: Path to save recorded audio (microphone only) """ if audio_source["type"] == "file": await stream_file(audio_source, audio_player, client, chunk_size) else: - await stream_microphone(audio_source, client, chunk_size) + await stream_microphone(audio_source, client, chunk_size, config, record_path) async def stream_file( @@ -489,6 +638,8 @@ async def stream_microphone( audio_source: dict, client: VoiceAgentClient, chunk_size: int, + config: VoiceAgentConfig, + record_path: str | None = None, ) -> None: """Stream microphone audio to client. @@ -496,46 +647,72 @@ async def stream_microphone( audio_source: Audio source information client: Voice Agent client chunk_size: Audio chunk size in bytes + config: Voice agent configuration (for EXTERNAL mode detection) + record_path: Path to save recorded audio (optional) """ - mic = audio_source["mic"] - while True: - frame = await mic.read(chunk_size) - await client.send_audio(frame) - - -# ============================================================================== -# COMMAND-LINE ARGUMENT PARSING -# ============================================================================== + import select + import sys + import termios + import tty + mic = audio_source["mic"] + sample_rate = audio_source["sample_rate"] -def load_json(value: str): - """Load JSON string or file path. 
+ # Check if EXTERNAL mode for keyboard input + is_external_mode = config.end_of_utterance_mode == "external" + + # Setup keyboard input for EXTERNAL mode + old_settings = None + if is_external_mode: + # print("EXTERNAL mode: Press 't' or 'T' to send end of turn") + old_settings = termios.tcgetattr(sys.stdin) + tty.setcbreak(sys.stdin.fileno()) + + # Setup WAV file recording if requested + if record_path: + async with AudioFileWriter(record_path, sample_rate) as writer: + try: + while True: + # Read audio frame + frame = await mic.read(chunk_size) + await client.send_audio(frame) + + # Write to WAV file + await writer.write(frame) + + # Check for keyboard input in EXTERNAL mode + if is_external_mode and select.select([sys.stdin], [], [], 0.0)[0]: + char = sys.stdin.read(1) + if char.lower() == "t": + client.finalize(end_of_turn=True) + + finally: + # Restore terminal settings + if old_settings: + termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings) + else: + # No recording - simpler logic + try: + while True: + # Read audio frame + frame = await mic.read(chunk_size) + await client.send_audio(frame) - Args: - value: Either a JSON string or path to a JSON file + # Check for keyboard input in EXTERNAL mode + if is_external_mode and select.select([sys.stdin], [], [], 0.0)[0]: + char = sys.stdin.read(1) + if char.lower() == "t": + client.finalize(end_of_turn=True) - Returns: - Parsed json object + finally: + # Restore terminal settings + if old_settings: + termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings) - Raises: - argparse.ArgumentTypeError: If the value cannot be parsed - """ - # First, try to parse as JSON string - try: - return json.loads(value) - except json.JSONDecodeError: - pass - # If that fails, try to load as a file path - try: - file_path = Path(value) - if file_path.exists() and file_path.is_file(): - with open(file_path) as f: - return json.load(f) - else: - raise argparse.ArgumentTypeError(f"File not found: {value}") - except 
Exception as e: - raise argparse.ArgumentTypeError(f"Could not parse as JSON or load from file: {value}. Error: {e}") +# ============================================================================== +# COMMAND-LINE ARGUMENT PARSING +# ============================================================================== def parse_args(): @@ -550,7 +727,7 @@ def parse_args(): ) # ============================================================================== - # Core parameters + # Core parameters (authentication) # ============================================================================== parser.add_argument( @@ -567,7 +744,35 @@ def parse_args(): ) # ============================================================================== - # Audio source + # Configuration (preset or custom) + # ============================================================================== + + parser.add_argument( + "-P", + "--preset", + type=str, + help="Preset configuration name (e.g., scribe, low_latency, conversation_adaptive)", + ) + parser.add_argument( + "--list-presets", + action="store_true", + help="List available preset configurations and exit", + ) + parser.add_argument( + "-W", + "--show", + action="store_true", + help="Display the final configuration as JSON and exit (after applying preset/config and overrides)", + ) + parser.add_argument( + "-c", + "--config", + type=load_json, + help="Config JSON string or path to JSON file (default: None)", + ) + + # ============================================================================== + # Input/Output # ============================================================================== parser.add_argument( @@ -576,18 +781,33 @@ def parse_args(): type=str, help="Path to input audio file (WAV format, mono 16-bit). 
If not provided, uses microphone", ) + parser.add_argument( + "-o", + "--output-dir", + type=str, + default=DEFAULT_OUTPUT_DIR, + help=f"Output directory for {LOG_FILENAME}, {RECORDING_FILENAME}, and audio slices (default: {DEFAULT_OUTPUT_DIR})", + ) + parser.add_argument( + "-r", + "--record", + action="store_true", + help=f"Record microphone audio to {RECORDING_FILENAME} in output directory (microphone input only)", + ) # ============================================================================== - # Audio configuration + # Audio settings # ============================================================================== parser.add_argument( + "-R", "--sample-rate", type=int, default=16000, help="Audio sample rate in Hz (default: 16000)", ) parser.add_argument( + "-C", "--chunk-size", type=int, default=320, @@ -601,14 +821,14 @@ def parse_args(): ) # ============================================================================== - # Output configuration + # Output options # ============================================================================== parser.add_argument( - "-o", - "--output-file", - type=str, - help="Output to a JSONL file", + "-S", + "--save-slices", + action="store_true", + help="Save audio slices to output directory on SPEAKER_ENDED events (SMART_TURN mode only)", ) parser.add_argument( "-p", @@ -636,21 +856,16 @@ def parse_args(): help="Use default device (default: False)", ) parser.add_argument( + "-w", "--results", action="store_true", - help="Include word transcription payload results in output (default: False)", + help="Include word-level transcription results in output (default: False)", ) # ============================================================================== - # Voice Agent configuration + # Voice Agent configuration overrides # ============================================================================== - parser.add_argument( - "-c", - "--config", - type=load_json, - help="Config JSON string or path to JSON file (default: 
None)", - ) parser.add_argument( "-l", "--language", @@ -676,15 +891,9 @@ def parse_args(): choices=["FIXED", "ADAPTIVE", "EXTERNAL", "SMART_TURN"], help="End of utterance detection mode (default: ADAPTIVE)", ) - parser.add_argument( - "-e", - "--emit-sentences", - action="store_true", - help="Emit sentences (default: False)", - ) # ============================================================================== - # Speaker configuration + # Speaker management # ============================================================================== parser.add_argument( @@ -722,11 +931,6 @@ def parse_args(): type=load_json, help="Known speakers as JSON string or path to JSON file (default: None)", ) - parser.add_argument( - "--forced-eou", - action="store_true", - help="Use forced end of utterance (default: False)", - ) # ============================================================================== # Check for mutually exclusive options @@ -735,7 +939,7 @@ def parse_args(): args = parser.parse_args() mutually_excludive = [ - "emit-sentences", + "preset", "end-of-utterance-mode", "end-of-utterance-silence-trigger", "focus-speakers", @@ -743,7 +947,6 @@ def parse_args(): "ignore-speakers", "language", "max-delay", - "forced-eou", "speakers", ] diff --git a/examples/voice/cli/utils.py b/examples/voice/cli/utils.py index a22a61d..304d5b5 100644 --- a/examples/voice/cli/utils.py +++ b/examples/voice/cli/utils.py @@ -5,11 +5,15 @@ - Audio playback functionality - Custom logging with colour support - Helper functions for async operations +- File system utilities """ +import argparse import asyncio +import json import logging import sys +from pathlib import Path import pyaudio @@ -23,6 +27,112 @@ import tty +# ============================================================================== +# FILE SYSTEM UTILITIES +# ============================================================================== + + +def ensure_directory_exists(file_path: str) -> None: + """Ensure the directory for a 
file path exists. + + Creates all parent directories if they don't exist. + + Args: + file_path: Path to a file (can include filename) + """ + path = Path(file_path) + directory = path.parent if path.suffix else path + + if directory and not directory.exists(): + directory.mkdir(parents=True, exist_ok=True) + + +def load_json(value: str): + """Load JSON string or file path. + + Args: + value: Either a JSON string or path to a JSON file + + Returns: + Parsed json object + + Raises: + argparse.ArgumentTypeError: If the value cannot be parsed + """ + # First, try to parse as JSON string + try: + return json.loads(value) + except json.JSONDecodeError: + pass + + # If that fails, try to load as a file path + try: + file_path = Path(value) + if file_path.exists() and file_path.is_file(): + with open(file_path) as f: + return json.load(f) + else: + raise argparse.ArgumentTypeError(f"File not found: {value}") + except Exception as e: + raise argparse.ArgumentTypeError(f"Could not parse as JSON or load from file: {value}. Error: {e}") + + +# ============================================================================== +# AUDIO FILE UTILITIES +# ============================================================================== + + +class AudioFileWriter: + """Context manager for writing audio to WAV files. + + Usage: + async with AudioFileWriter(filepath, sample_rate, sample_width) as writer: + async for audio_chunk in source: + await writer.write(audio_chunk) + """ + + def __init__(self, filepath: str, sample_rate: int, sample_width: int = 2, channels: int = 1): + """Initialize audio file writer. 
+ + Args: + filepath: Path to output WAV file + sample_rate: Audio sample rate in Hz + sample_width: Sample width in bytes (default: 2 for 16-bit) + channels: Number of audio channels (default: 1 for mono) + """ + self.filepath = filepath + self.sample_rate = sample_rate + self.sample_width = sample_width + self.channels = channels + self._wav_file = None + + async def __aenter__(self): + """Open WAV file for writing.""" + import wave + + ensure_directory_exists(self.filepath) + self._wav_file = wave.open(self.filepath, "wb") # noqa: SIM115 + self._wav_file.setnchannels(self.channels) + self._wav_file.setsampwidth(self.sample_width) + self._wav_file.setframerate(self.sample_rate) + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Close WAV file.""" + if self._wav_file: + self._wav_file.close() + return False + + async def write(self, audio_data: bytes) -> None: + """Write audio data to file. + + Args: + audio_data: Raw audio bytes to write + """ + if self._wav_file: + self._wav_file.writeframes(audio_data) + + # ============================================================================== # ASYNC UTILITIES # ============================================================================== diff --git a/sdk/voice/README.md b/sdk/voice/README.md index 4c8085e..9ff5d7d 100644 --- a/sdk/voice/README.md +++ b/sdk/voice/README.md @@ -1,7 +1,8 @@ # Speechmatics Voice SDK +[![License](https://img.shields.io/badge/license-MIT-yellow.svg)](https://github.com/speechmatics/speechmatics-python-sdk/blob/main/LICENSE) [![PyPI](https://img.shields.io/pypi/v/speechmatics-voice)](https://pypi.org/project/speechmatics-voice/) -![PythonSupport](https://img.shields.io/badge/Python-3.9%2B-green) +[![PythonSupport](https://img.shields.io/badge/Python-3.9%2B-green)](https://www.python.org/) Python SDK for building voice-enabled applications with the Speechmatics Real-Time API. 
Optimized for conversational AI, voice agents, transcription services, and real-time captioning. @@ -95,6 +96,9 @@ if __name__ == "__main__": Presets provide optimized configurations for common use cases: ```python +# External end of turn preset - endpointing handled by the client +client = VoiceAgentClient(api_key=api_key, preset="external") + # Scribe preset - for note-taking client = VoiceAgentClient(api_key=api_key, preset="scribe") diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py index e1575b3..3277090 100644 --- a/sdk/voice/speechmatics/voice/_client.py +++ b/sdk/voice/speechmatics/voice/_client.py @@ -71,6 +71,9 @@ class VoiceAgentClient(AsyncClient): and provides additional functionality for processing partial and final transcription from the STT engine into accumulated transcriptions with flags to indicate changes between messages, etc. + + If no config or preset is provided, the client will default to the EXTERNAL + preset. """ # ============================================================================ @@ -157,8 +160,12 @@ def __init__( # Client Configuration # ------------------------------------- + # Default to EXTERNAL if no config or preset string provided + if config is None and not preset: + config = VoiceAgentConfigPreset.EXTERNAL() + # Check for preset - if preset: + elif preset: preset_config = VoiceAgentConfigPreset.load(preset) config = VoiceAgentConfigPreset._merge_configs(preset_config, config) @@ -238,6 +245,7 @@ def __init__( # Handlers self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize) self._smart_turn_detector: Optional[SmartTurnDetector] = None + self._eot_calculation_task: Optional[asyncio.Task] = None # Current turn self._turn_start_time: Optional[float] = None @@ -1134,9 +1142,13 @@ async def _process_speech_fragments(self, change_filter: Optional[list[Annotatio # Turn prediction if self._uses_forced_eou: - ttl = await 
self._calculate_finalize_delay() - if ttl: - self._turn_handler.update_timer(ttl) + + async def fn() -> None: + ttl = await self._calculate_finalize_delay() + if ttl: + self._turn_handler.update_timer(ttl) + + self._run_background_eot_calculation(fn) # Check for gaps # FragmentUtils.find_segment_pauses(self._client_session, self._current_view) @@ -1351,6 +1363,16 @@ async def _emit_end_of_turn(self) -> None: # TURN DETECTION & FINALIZATION # ============================================================================ + def _run_background_eot_calculation(self, fn: Callable) -> None: + """Run the calculation async.""" + + # Existing task takes precedence + if self._eot_calculation_task and not self._eot_calculation_task.done(): + return + + # Create new task + self._eot_calculation_task = asyncio.create_task(fn()) + async def _calculate_finalize_delay( self, smart_turn_prediction: Optional[SmartTurnPredictionResult] = None, @@ -1470,8 +1492,6 @@ async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartT end_time=end_time + self._config.smart_turn_config.slice_margin, ) - # TODO - Output audio (for client to use) - # Evaluate prediction = await self._smart_turn_detector.predict( segment_audio, @@ -1650,8 +1670,12 @@ async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: floa # Turn prediction if self._uses_forced_eou: - ttl = await self._eot_prediction(event_time) - self._turn_handler.update_timer(ttl) + + async def fn() -> None: + ttl = await self._eot_prediction(event_time) + self._turn_handler.update_timer(ttl) + + self._run_background_eot_calculation(fn) # Emit the event self._emit_message( diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py index 2b00230..fa9c1ea 100644 --- a/sdk/voice/speechmatics/voice/_models.py +++ b/sdk/voice/speechmatics/voice/_models.py @@ -364,7 +364,7 @@ class EndOfTurnConfig(BaseConfigModel): """ base_multiplier: float = 1.0 - min_end_of_turn_delay: 
@staticmethod
def EXTERNAL(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
    """Preset for external turn control.

    Partial and final segments are emitted as they become available. The end
    of utterance mode is set to external, so an end of turn is not required
    from the engine for this mode of operation.

    The base configuration uses the enhanced operating point, enables
    diarization, sets ``max_delay`` to 1.0, sets
    ``end_of_utterance_silence_trigger`` to 1.2 and turns on sentence
    emission for speech segments.

    Args:
        overlay: Optional configuration passed to ``_merge_configs`` together
            with the base configuration.

    Returns:
        The configuration produced by ``_merge_configs``.
    """
    # Base settings for this preset
    external_base = VoiceAgentConfig(
        operating_point=OperatingPoint.ENHANCED,
        enable_diarization=True,
        max_delay=1.0,
        end_of_utterance_silence_trigger=1.2,
        end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
        speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
    )

    # Combine with the caller-supplied overlay (may be None)
    return VoiceAgentConfigPreset._merge_configs(external_base, overlay)