diff --git a/specs/swml/Methods/ai/ai_languages.tsp b/specs/swml/Methods/ai/ai_languages.tsp index 9278f3ce2..6036e96ad 100644 --- a/specs/swml/Methods/ai/ai_languages.tsp +++ b/specs/swml/Methods/ai/ai_languages.tsp @@ -1,4 +1,5 @@ import "@typespec/json-schema"; +import "../../Shared/Types/main.tsp"; using TypeSpec.JsonSchema; @@ -33,6 +34,22 @@ model LanguagesBase { #deprecated "The `engine` property is deprecated. Please include the engine in the voice field." @doc("The engine to use for the language. For example, 'elevenlabs'.") engine?: string; + + @doc("TTS engine-specific parameters for this language.") + params?: LanguageParams; +} + +@summary("LanguageParams") +model LanguageParams { + @doc("The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice. IMPORTANT: Only works with ElevenLabs TTS engine.") + @minValue(0.0) + @maxValue(1.0) + stability?: float | SWMLVar = 0.50; + + @doc("The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice. IMPORTANT: Only works with ElevenLabs TTS engine.") + @minValue(0.0) + @maxValue(1.0) + similarity?: float | SWMLVar = 0.75; } @summary("LanguagesWithSoloFillers") diff --git a/specs/swml/Methods/ai/ai_params.tsp b/specs/swml/Methods/ai/ai_params.tsp index 1e2f2e7ba..192d11bc8 100644 --- a/specs/swml/Methods/ai/ai_params.tsp +++ b/specs/swml/Methods/ai/ai_params.tsp @@ -1,4 +1,5 @@ import "@typespec/json-schema"; +import "../../Shared/Types/main.tsp"; using TypeSpec.JsonSchema; @@ -190,16 +191,6 @@ model AIParams { @maxValue(10000) end_of_speech_timeout?: integer | SWMLVar; - @doc("The stability slider determines how stable the voice is and the randomness between each generation. 
Lowering this slider introduces a broader emotional range for the voice.") - @minValue(0.01) - @maxValue(1.0) - eleven_labs_stability?: float | SWMLVar; - - @doc("The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice.") - @minValue(0.01) - @maxValue(1.0) - eleven_labs_similarity?: float | SWMLVar; - @doc("If `true`, enables usage accounting. The default is `false`.") enable_accounting?: boolean | SWMLVar; @@ -466,5 +457,17 @@ model AIParams { """) wake_prefix?: string; + #deprecated "The `eleven_labs_stability` property is deprecated. Please use `languages[].params.stability` instead." + @doc("The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice.") + @minValue(0.0) + @maxValue(1.0) + eleven_labs_stability?: float | SWMLVar = 0.50; + + #deprecated "The `eleven_labs_similarity` property is deprecated. Please use `languages[].params.similarity` instead." + @doc("The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice.") + @minValue(0.0) + @maxValue(1.0) + eleven_labs_similarity?: float | SWMLVar = 0.75; + ...TypeSpec.Record; } diff --git a/specs/swml/tsp-output/@typespec/json-schema/SWMLObject.json b/specs/swml/tsp-output/@typespec/json-schema/SWMLObject.json index 35c7869d0..529718a41 100644 --- a/specs/swml/tsp-output/@typespec/json-schema/SWMLObject.json +++ b/specs/swml/tsp-output/@typespec/json-schema/SWMLObject.json @@ -3917,32 +3917,6 @@ "maximum": 10000, "description": "Amount of silence, in ms, at the end of an utterance to detect end of speech. Allowed values from `250` - `10,000`. **Default:** `700` ms (Note: Documentation incorrectly lists 2000ms)." 
}, - "eleven_labs_stability": { - "anyOf": [ - { - "type": "number" - }, - { - "$ref": "#/$defs/SWMLVar" - } - ], - "minimum": 0.01, - "maximum": 1, - "description": "The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice." - }, - "eleven_labs_similarity": { - "anyOf": [ - { - "type": "number" - }, - { - "$ref": "#/$defs/SWMLVar" - } - ], - "minimum": 0.01, - "maximum": 1, - "description": "The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice." - }, "enable_accounting": { "anyOf": [ { @@ -4554,6 +4528,36 @@ "wake_prefix": { "type": "string", "description": "Specifies an additional prefix that must be spoken along with the agent's name (`ai_name`)\nto wake the agent from a paused state. For example, if `ai_name` is \"computer\" and\n`wake_prefix` is \"hey\", the user would need to say \"hey computer\" to activate the agent." + }, + "eleven_labs_stability": { + "anyOf": [ + { + "type": "number" + }, + { + "$ref": "#/$defs/SWMLVar" + } + ], + "default": 0.5, + "minimum": 0, + "maximum": 1, + "description": "The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice.", + "deprecated": true + }, + "eleven_labs_similarity": { + "anyOf": [ + { + "type": "number" + }, + { + "$ref": "#/$defs/SWMLVar" + } + ], + "default": 0.75, + "minimum": 0, + "maximum": 1, + "description": "The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice.", + "deprecated": true } }, "unevaluatedProperties": {}, @@ -5284,6 +5288,10 @@ "description": "The engine to use for the language. 
For example, 'elevenlabs'.", "deprecated": true }, + "params": { + "$ref": "#/$defs/LanguageParams", + "description": "TTS engine-specific parameters for this language." + }, "fillers": { "type": "array", "items": { @@ -5337,6 +5345,10 @@ "description": "The engine to use for the language. For example, 'elevenlabs'.", "deprecated": true }, + "params": { + "$ref": "#/$defs/LanguageParams", + "description": "TTS engine-specific parameters for this language." + }, "function_fillers": { "type": "array", "items": { @@ -6423,6 +6435,43 @@ "not": {} } }, + "LanguageParams": { + "type": "object", + "properties": { + "stability": { + "anyOf": [ + { + "type": "number" + }, + { + "$ref": "#/$defs/SWMLVar" + } + ], + "default": 0.5, + "minimum": 0, + "maximum": 1, + "description": "The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice. IMPORTANT: Only works with ElevenLabs TTS engine." + }, + "similarity": { + "anyOf": [ + { + "type": "number" + }, + { + "$ref": "#/$defs/SWMLVar" + } + ], + "default": 0.75, + "minimum": 0, + "maximum": 1, + "description": "The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice. IMPORTANT: Only works with ElevenLabs TTS engine." 
+ } + }, + "unevaluatedProperties": { + "not": {} + }, + "title": "LanguageParams" + }, "ConversationRole": { "type": "string", "enum": [ diff --git a/website/docs/main/swml/guides/AI/holiday-special-santa-ai/index.mdx b/website/docs/main/swml/guides/AI/holiday-special-santa-ai/index.mdx index 38904d8cb..fae3654a4 100644 --- a/website/docs/main/swml/guides/AI/holiday-special-santa-ai/index.mdx +++ b/website/docs/main/swml/guides/AI/holiday-special-santa-ai/index.mdx @@ -144,20 +144,23 @@ languages: --- -### Params +### ElevenLabs Voice Parameters -The `params` parameter is used to define the AI's `eleven_labs_stability` and `eleven_labs_similarity` parameters. +We use ElevenLabs TTS engine-specific parameters to fine-tune Santa's voice. These parameters are configured per-language using `languages[].params`. -The `eleven_labs_stability` parameter is used to define the stability of the AI's voice, while the `eleven_labs_similarity` -parameter is used to define the similarity of the AI's voice to the voice that is defined in the `voice` parameter. +The `stability` parameter controls the stability of the AI's voice, while the `similarity` parameter defines how closely the voice adheres to the original voice characteristics. This allows us to control the AI's voice and make it more realistic and as close to Santa's voice as possible. You can learn more about these settings here: [Eleven Labs Documentation](https://elevenlabs.io/docs/speech-synthesis/voice-settings#stability). ```yaml andJson -params: - eleven_labs_stability: 0.1 - eleven_labs_similarity: 0.25 +languages: + - name: English + code: en-US + voice: elevenlabs.rachel + params: + stability: 0.1 + similarity: 0.25 ``` --- @@ -509,15 +512,14 @@ sections: ### Step 6 Continue the conversation, keeping it playful and entertaining. If another present is requested, gently remind them that only one gift can be chosen. 
post_prompt_url: Post Prompt Webhook Here - params: - eleven_labs_stability: 0.1 - eleven_labs_similarity: 0.25 languages: - name: English code: en-US - voice: gvU4yEv29ZpMc9IXoZcd - engine: elevenlabs - fillers: + voice: elevenlabs.gvU4yEv29ZpMc9IXoZcd + params: + stability: 0.1 + similarity: 0.25 + speech_fillers: - one moment please, - uhh ha, - aah ha, diff --git a/website/docs/main/swml/reference/methods/ai/ai_params/index.mdx b/website/docs/main/swml/reference/methods/ai/ai_params/index.mdx index fbb9c87ca..558d491d2 100644 --- a/website/docs/main/swml/reference/methods/ai/ai_params/index.mdx +++ b/website/docs/main/swml/reference/methods/ai/ai_params/index.mdx @@ -16,7 +16,8 @@ import APIField from "@site/src/components/APIField"; [conscience]: ./conscience.mdx [hold-music]: ./hold_music.mdx [interrupt-prompt]: ./interrupt_prompt.mdx -[ai-languages]: ../ai_languages.mdx +[ai-languages]: /swml/methods/ai/languages +[ai-languages-params]: /swml/methods/ai/languages#params [ai-params]: ./index.mdx [post-prompt-url]: /swml/methods/ai/post_prompt_url [get-visual-input]: /swml/methods/ai/swaig/internal_fillers#internal_fillers-parameters @@ -329,20 +330,6 @@ Customize the AI agent's voice output, including volume control, voice character Adjust the volume of the AI. Allowed values from `-50`-`50`. - - The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice. Valid values range from `0.01` to `1.0`.

**Important**: This will only works when `elevenlabs` is set in the [`ai.languages.voice`][ai-languages] as the engine id. -
- - - The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice. Valid values range from `0.01` to `1.0`.

**Important**: This will only works when `elevenlabs` is set in the [`ai.languages.voice`][ai-languages] as the engine id. -
- + + The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. The higher the similarity, the closer the AI will sound to the original voice. Valid values range from `0.0` to `1.0`.**Deprecated**: Use [`languages[].params.similarity`][ai-languages-params] instead. + + + + The stability slider determines how stable the voice is and the randomness between each generation. Lowering this slider introduces a broader emotional range for the voice. Valid values range from `0.0` to `1.0`.**Deprecated**: Use [`languages[].params.stability`][ai-languages-params] instead. + + ### Interruption & Barge Control Manage how the AI agent handles interruptions when users speak over it, including when to stop speaking, acknowledge interruptions, or continue regardless. diff --git a/website/docs/main/swml/reference/methods/ai/index.mdx b/website/docs/main/swml/reference/methods/ai/index.mdx index 075c6a7ab..8de1b8a8f 100644 --- a/website/docs/main/swml/reference/methods/ai/index.mdx +++ b/website/docs/main/swml/reference/methods/ai/index.mdx @@ -9,7 +9,7 @@ tags: ['swml'] --- [hints]: /swml/methods/ai/hints -[languages]: ./ai_languages.mdx +[languages]: ./languages [params]: ./ai_params/index.mdx [post_prompt]: /swml/methods/ai/post_prompt [post_prompt_url]: /swml/methods/ai/post_prompt_url diff --git a/website/docs/main/swml/reference/methods/ai/ai_languages.mdx b/website/docs/main/swml/reference/methods/ai/languages/index.mdx similarity index 83% rename from website/docs/main/swml/reference/methods/ai/ai_languages.mdx rename to website/docs/main/swml/reference/methods/ai/languages/index.mdx index cfae0686a..083a6b8fb 100644 --- a/website/docs/main/swml/reference/methods/ai/ai_languages.mdx +++ b/website/docs/main/swml/reference/methods/ai/languages/index.mdx @@ -15,6 +15,7 @@ import APIField from "@site/src/components/APIField"; [voices-and-languages]: /voice/getting-started/voice-and-languages [swaig-functions]: 
/swml/methods/ai/swaig/functions [deepgram-codes]: https://developers.deepgram.com/docs/models-languages-overview#nova-3 +[ai-params]: /swml/methods/ai/params # ai.languages @@ -103,6 +104,15 @@ Use `ai.languages` to configure the spoken language of your AI Agent, as well as The speed to use for the specified TTS engine. This allows the AI to speak at a different speed at different points in the conversation. The speed behavior can be defined in the prompt of the AI.
**Valid values:** `auto`
**IMPORTANT:** Only works with [`Cartesia`](/voice/tts/cartesia) TTS engine.
+ + TTS engine-specific parameters for this language. + Accepts the [`languages.params` parameters](/swml/methods/ai/languages/params). + + The engine to use for the language. For example, `"elevenlabs"`.**Deprecated.** Set the engine with the [`voice`](#use-voice-strings) parameter. +--- + ### Use `voice` strings Compose the `voice` string using the `.` syntax. @@ -169,6 +181,64 @@ languages: voice: elevenlabs.rachel ``` +### Configure global ElevenLabs parameters + +Configure stability and similarity globally for all ElevenLabs voices using `ai.params`: + +```yaml andJson +ai: + params: + eleven_labs_stability: 0.6 + eleven_labs_similarity: 0.8 + languages: + - name: English + code: en-US + voice: elevenlabs.josh +``` + +### Configure per-language ElevenLabs parameters + +Configure different stability and similarity values for each language using `languages[].params`: + +```yaml andJson +ai: + languages: + - name: English + code: en-US + voice: elevenlabs.josh + params: + stability: 0.6 + similarity: 0.8 + - name: Spanish + code: es-ES + voice: elevenlabs.maria + params: + stability: 0.4 + similarity: 0.9 +``` + +### Mixed configuration + +Per-language params override global params: + +```yaml andJson +ai: + params: + eleven_labs_stability: 0.5 + eleven_labs_similarity: 0.75 + languages: + - name: English + code: en-US + voice: elevenlabs.josh + # Uses global defaults: stability=0.5, similarity=0.75 + - name: Spanish + code: es-ES + voice: elevenlabs.maria + params: + stability: 0.3 + # Overrides only stability, similarity still uses global 0.75 +``` + {/* This example commented out as the language-switching behavior is a bit inconsistent. 
diff --git a/website/docs/main/swml/reference/methods/ai/languages/params.mdx b/website/docs/main/swml/reference/methods/ai/languages/params.mdx new file mode 100644 index 000000000..cc0418c79 --- /dev/null +++ b/website/docs/main/swml/reference/methods/ai/languages/params.mdx @@ -0,0 +1,48 @@ +--- +sidebar_label: languages.params +hide_title: false +slug: /swml/methods/ai/languages/params +title: languages.params +description: Engine-specific voice and language configuration. +tags: ['swml'] +--- + +[tts-providers]: /voice/getting-started/voice-and-languages#providers +[voices-and-languages]: /voice/getting-started/voice-and-languages +[swaig-functions]: /swml/methods/ai/swaig/functions +[deepgram-codes]: https://developers.deepgram.com/docs/models-languages-overview#nova-3 +[ai-params]: /swml/methods/ai/params + +# languages.params + +Use `languages[].params` to configure TTS engine-specific parameters for individual languages. + + + The similarity slider dictates how closely the AI should adhere to the original voice when attempting to replicate it. + The higher the similarity, the closer the AI will sound to the original voice. + Valid values range from `0.0` to `1.0`. + :::info + + Only works with the ElevenLabs TTS engine. + + ::: + + + + The stability slider determines how stable the voice is and the randomness between each generation. + Lowering this slider introduces a broader emotional range for the voice. + Valid values range from `0.0` to `1.0`. + :::info + + Only works with the ElevenLabs TTS engine. 
+ ::: + diff --git a/website/src/theme/MDXComponents/index.js b/website/src/theme/MDXComponents/index.js index f099ce3db..44a36fe1a 100644 --- a/website/src/theme/MDXComponents/index.js +++ b/website/src/theme/MDXComponents/index.js @@ -66,6 +66,6 @@ export default { ReleaseCard, Tables, APIBadge, - APIField, - InstallHero + InstallHero, + APIField };