From 9da1ab7e526d55e6185547bb73b4dae355faa2b3 Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Wed, 15 Oct 2025 13:25:06 +0100 Subject: [PATCH 01/11] add batch speaker id docs --- docs/speech-to-text/batch/sidebar.ts | 4 + .../batch/speaker-identification.mdx | 153 ++++++++++++++++++ .../features/speaker-identification.mdx | 2 +- docs/speech-to-text/realtime/sidebar.ts | 2 +- ...ication.mdx => speaker-identification.mdx} | 21 ++- 5 files changed, 179 insertions(+), 3 deletions(-) create mode 100644 docs/speech-to-text/batch/speaker-identification.mdx rename docs/speech-to-text/realtime/{realtime-speaker-identification.mdx => speaker-identification.mdx} (83%) diff --git a/docs/speech-to-text/batch/sidebar.ts b/docs/speech-to-text/batch/sidebar.ts index 550dc85..09c87cf 100644 --- a/docs/speech-to-text/batch/sidebar.ts +++ b/docs/speech-to-text/batch/sidebar.ts @@ -22,6 +22,10 @@ export default { type: "doc", id: "speech-to-text/batch/batch-diarization", }, + { + type: "doc", + id: "speech-to-text/batch/speaker-identification", + }, { type: "category", label: "Speech intelligence", diff --git a/docs/speech-to-text/batch/speaker-identification.mdx b/docs/speech-to-text/batch/speaker-identification.mdx new file mode 100644 index 0000000..f91a720 --- /dev/null +++ b/docs/speech-to-text/batch/speaker-identification.mdx @@ -0,0 +1,153 @@ +--- +sidebar_label: 'Speaker identification' +description: "Learn how to use the Speechmatics API to identify speakers in Batch" +keywords: + [ + speechmatics, + batch, + diarization, + transcription, + speech recognition, + automatic speech recognition, + asr, + ] +sidebar_position: 2 +--- + +import DocCardList from '@theme/DocCardList'; +import { Card, DataList, Text } from '@radix-ui/themes'; + +# Batch speaker identification + +:::tip +For an overview of the feature, see the [speaker identification](../features/speaker-identification.mdx) page. 
+::: + +## Enrollment + +To generate identifiers for a desired speaker, run a [speaker diarization](../features/diarization.mdx#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone. +You can request the identifiers back from the engine by setting the `get_speakers` flag in the transcription config: + +```json +{ + "type": "transcription", + "transcription_config": { + "language": "en", + "diarization": "speaker" + "speaker_diarization_config": { + // highlight-start + "get_speakers": True + // highlight-end + } + } +} +``` + +When the transcription is done, the speakers identifiers will be attached to the returned transcript: + +```json +{ + "results": [ + { + "alternatives": [ + { + "confidence": 0.93, + "content": "Hello", + "language": "en", + "speaker": "S1" + } + ], + ... + }, + { + "alternatives": [ + { + "confidence": 1.0, + "content": "Hi", + "language": "en", + "speaker": "S2" + } + ], + ... + }], + // highlight-start + "speakers": [ + { + "label": "S1", + "speaker_identifiers": [""] + }, + { + "label": "S2", + "speaker_identifiers": [""] + }] + // highlight-end +} +``` + +## Identification + +Once you have generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. All other [speaker diarization options](batch_diarization.mdx#configuration) remain supported. Notably, the `speakers_sensitivity` parameter can be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones, where lower values make it more likely to match existing enrolled speakers. 
+ +An example configuration is shown below: + +```json +{ + "type": "transcription", + "transcription_config": { + "language": "en", + "diarization": "speaker", + "speaker_diarization_config": { + // highlight-start + "speakers": [ + {"label": "Alice", "speaker_identifiers": ["", ""]}, + {"label": "Bob", "speaker_identifiers": [""]} + ] + // highlight-end + } + } +} +``` + +With the config above, transcript segments should be tagged with `"Alice"` and `"Bob"` whenever these speakers are detected, whereas any other speakers should be tagged with the internal labels: + +```json +{ + "results": [ + { + "alternatives": [ + { + "confidence": 0.93, + "content": "Morning", + "language": "en", + // highlight-start + "speaker": "Alice" + // highlight-end + } + ], + ... + }, + { + "alternatives": [ + { + "confidence": 0.93, + "content": "Hi", + "language": "en", + "speaker": "S1" + } + ], + ... + }, + { + "alternatives": [ + { + "confidence": 1.0, + "content": "Morning", + "language": "en", + // highlight-start + "speaker": "Bob" + // highlight-end + } + ], + }] +} +``` diff --git a/docs/speech-to-text/features/speaker-identification.mdx b/docs/speech-to-text/features/speaker-identification.mdx index dc311ba..970727e 100644 --- a/docs/speech-to-text/features/speaker-identification.mdx +++ b/docs/speech-to-text/features/speaker-identification.mdx @@ -55,4 +55,4 @@ In all of the above cases — including model mismatches or attempts to use iden ## Supported modes -Speaker identification is currently only supported in **[Realtime speaker identification](../realtime/realtime-speaker-identification.mdx)**
mode with the batch support coming soon. +Speaker identification is supported in both **[Realtime speaker identification](../realtime/speaker-identification.mdx)**
and **[Batch speaker identification](../batch/speaker-identification.mdx)**
modes. diff --git a/docs/speech-to-text/realtime/sidebar.ts b/docs/speech-to-text/realtime/sidebar.ts index c938394..3a12739 100644 --- a/docs/speech-to-text/realtime/sidebar.ts +++ b/docs/speech-to-text/realtime/sidebar.ts @@ -24,7 +24,7 @@ export default { }, { type: "doc", - id: "speech-to-text/realtime/realtime-speaker-identification", + id: "speech-to-text/realtime/speaker-identification", }, { type: "doc", diff --git a/docs/speech-to-text/realtime/realtime-speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx similarity index 83% rename from docs/speech-to-text/realtime/realtime-speaker-identification.mdx rename to docs/speech-to-text/realtime/speaker-identification.mdx index 77fb9e9..5aa5321 100644 --- a/docs/speech-to-text/realtime/realtime-speaker-identification.mdx +++ b/docs/speech-to-text/realtime/speaker-identification.mdx @@ -34,7 +34,7 @@ For an overview of the feature, see the [speaker identification](../features/spe To generate identifiers for a desired speaker, run a [speaker diarization](../features/diarization.mdx#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone. You can request the identifiers back from the engine by sending a `GetSpeakers` request. -By default, the engine returns identifiers created up to the time of the request, but you can also wait until the end of the stream by setting the optional `final` flag in the `GetSpeakers` request (recommended for enrollment): +By default, the engine returns identifiers created up to the time of the request, but you can also wait until the end of the stream by setting the optional `final` flag in the `GetSpeakers` request: ```json { @@ -46,6 +46,25 @@ By default, the engine returns identifiers created up to the time of the request - final: false (default) — returns identifiers generated up to the point of the request. 
To avoid empty results, wait until the server has issued at least one `AddTranscript` message before sending the request. - final: true — waits until the end of the stream and returns identifiers based on all audio. +Alternatively, you can enable automatic speaker retrieval by setting the `get_speakers` option to true in the diarization configuration (recommended for enrollment). This ensures the engine automatically returns speaker identifiers when the transcription completes — the same behavior as issuing the `GetSpeakers(final=true)` request manually. If the `get_speakers` option is not present in the configuration or is set to false, you can still request speakers explicitly by sending the `GetSpeakers(final=true)` message. In this case, the request takes precedence, and the engine will return the speaker identifiers at the end of the transcription. + +Example speaker diarization config: + +```json +{ + "type": "transcription", + "transcription_config": { + "language": "en", + "diarization": "speaker" + "speaker_diarization_config": { + // highlight-start + "get_speakers": True + // highlight-end + } + } +} +``` + When the request is processed, the server replies with a `SpeakersResult` message that contains the identifiers for each diarized speaker: ```json From 893fbbe27f3a01ae5c9c448b121fe76bf08138f6 Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Tue, 21 Oct 2025 12:19:51 +0100 Subject: [PATCH 02/11] add a recommended limit on the number of speaker ids --- docs/speech-to-text/features/speaker-identification.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/speech-to-text/features/speaker-identification.mdx b/docs/speech-to-text/features/speaker-identification.mdx index 970727e..3d87b00 100644 --- a/docs/speech-to-text/features/speaker-identification.mdx +++ b/docs/speech-to-text/features/speaker-identification.mdx @@ -31,13 +31,13 @@ By tagging known speakers with consistent labels, speaker identification makes t To 
use speaker identification you must enable diarization in the [speaker mode](../features/diarization.mdx#diarization-modes) and then follow the two steps below: -- **Enrollment** — For each speaker you want to recognize, generate identifiers from short audio clips (5–30s) where they ideally speak alone. +- **Enrollment** - For each speaker you want to recognize, generate identifiers from short audio clips (5–30s) where they ideally speak alone. To improve robustness, you can enroll the same speaker with multiple clips recorded under different acoustic conditions, selected to represent the degree of variety and quality that could be expected in the target audio. -- **Identification** — Use the enrolled identifiers in transcription jobs to label known speakers with meaningful names (for example, `Alice` or `John`). The system matches voices to identifiers and tags the output with the desired labels. +- **Identification** - Use the enrolled identifiers in transcription jobs to label known speakers with meaningful names (for example, `Alice` or `John`). The system matches voices to identifiers and tags the output with the desired labels. :::info -Labels for identified speakers must not use reserved internal labels (UU, S1, S2, etc.) and must not include leading or trailing spaces. +It is recommended to keep the number of speaker ID's to a minimum for best accuracy. The number of speaker ID's should not go above 50. Furthermore, labels for identified speakers must not use reserved internal labels (UU, S1, S2, etc.) and must not include leading or trailing spaces. ::: ## Known caveats @@ -55,4 +55,4 @@ In all of the above cases — including model mismatches or attempts to use iden ## Supported modes -Speaker identification is supported in both **[Realtime speaker identification](../realtime/speaker-identification.mdx)**
and **[Batch speaker identification](../batch/speaker-identification.mdx)**
modes. +Speaker identification is supported in both **[Realtime speaker identification](../realtime/speaker-identification.mdx)** and **[Batch speaker identification](../batch/speaker-identification.mdx)** modes. From 25f5d0e9ef9a501feb1ac1d0264dec20dc7e44f7 Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Mon, 10 Nov 2025 10:35:52 +0000 Subject: [PATCH 03/11] update max identifiers comment --- docs/speech-to-text/features/speaker-identification.mdx | 2 +- docs/speech-to-text/realtime/speaker-identification.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/speech-to-text/features/speaker-identification.mdx b/docs/speech-to-text/features/speaker-identification.mdx index 3d87b00..69aad20 100644 --- a/docs/speech-to-text/features/speaker-identification.mdx +++ b/docs/speech-to-text/features/speaker-identification.mdx @@ -37,7 +37,7 @@ To use speaker identification you must enable diarization in the [speaker mode]( - **Identification** - Use the enrolled identifiers in transcription jobs to label known speakers with meaningful names (for example, `Alice` or `John`). The system matches voices to identifiers and tags the output with the desired labels. :::info -It is recommended to keep the number of speaker ID's to a minimum for best accuracy. The number of speaker ID's should not go above 50. Furthermore, labels for identified speakers must not use reserved internal labels (UU, S1, S2, etc.) and must not include leading or trailing spaces. +It is recommended to minimize the number of speaker IDs to achieve optimal accuracy. A maximum of 50 speaker identifiers across all speakers can be configured per session. Additionally, labels for identified speakers must not use reserved internal labels (e.g., UU, S1, S2) and should not contain leading or trailing spaces. 
::: ## Known caveats diff --git a/docs/speech-to-text/realtime/speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx index 5aa5321..0703ad7 100644 --- a/docs/speech-to-text/realtime/speaker-identification.mdx +++ b/docs/speech-to-text/realtime/speaker-identification.mdx @@ -79,7 +79,7 @@ When the request is processed, the server replies with a `SpeakersResult` messag ## Identification -Once you've generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. A maximum of 50 speaker identifiers across all speakers can be configured per session. +Once you've generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. All [speaker diarization options](realtime-diarization.mdx#configuration) work with speaker identification. The `max_speakers` parameter only applies to generic (non-enrolled) speakers. For example, if it’s set to 10 and 10 speakers are enrolled, the system can still add up to 10 additional generic speakers. The `speakers_sensitivity` parameter can also be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones. Lower values make it more likely to match existing enrolled speakers. 
From fce8b6da292ecaab0aeb837e86dccc98abf0f1f8 Mon Sep 17 00:00:00 2001
From: Daniel Nurkowski
Date: Mon, 10 Nov 2025 10:57:59 +0000
Subject: [PATCH 04/11] update the public specs

---
 spec/batch.yaml    | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 spec/realtime.yaml |  4 ++++
 2 files changed, 61 insertions(+)

diff --git a/spec/batch.yaml b/spec/batch.yaml
index 824cae3..c0f9af0 100644
--- a/spec/batch.yaml
+++ b/spec/batch.yaml
@@ -1103,6 +1103,21 @@ definitions:
         maximum: 1
         description: >-
           Controls how sensitive the algorithm is in terms of keeping similar speakers separate, as opposed to combining them into a single speaker. Higher values will typically lead to more speakers, as the degree of difference between speakers in order to allow them to remain distinct will be lower. A lower value for this parameter will conversely guide the algorithm towards being less sensitive in terms of retaining similar speakers, and as such may lead to fewer speakers overall. The default is 0.5.
+      get_speakers:
+        type: boolean
+        description: >-
+          If true, speaker identifiers will be returned at the end of the transcript.
+      speakers:
+        type: array
+        x-omitempty: true
+        description: >-
+          Use this option to provide speaker labels linked to their speaker identifiers.
+          When passed, the transcription system will tag spoken words in the transcript
+          with the provided speaker labels whenever any of the specified speakers
+          is detected in the audio. A maximum of 50 speaker identifiers across all speakers
+          can be provided.
+        items:
+          $ref: '#/definitions/SpeakersInputItem'
     example:
       language: en
       output_locale: en-GB
@@ -1871,6 +1886,12 @@ definitions:
         type: array
         items:
           $ref: "#/definitions/RecognitionResult"
+      speakers:
+        type: array
+        x-omitempty: true
+        items:
+          $ref: '#/definitions/SpeakersResultItem'
+        description: List of unique speakers detected in the transcript, together with their speaker identifiers.
       translations:
         type: object
         description: >-
@@ -2315,3 +2336,39 @@ definitions:
     type: string
   transcript:
     type: string
+  SpeakersInputItem:
+    type: object
+    properties:
+      label:
+        type: string
+        minLength: 1
+        description: Speaker label, which must not match the format used internally (e.g. S1, S2, etc.)
+      speaker_identifiers:
+        type: array
+        minItems: 1
+        uniqueItems: true
+        items:
+          type: string
+          format: bytes
+        description: Speaker identifiers.
+    required:
+      - label
+      - speaker_identifiers
+  SpeakersResultItem:
+    type: object
+    properties:
+      label:
+        type: string
+        minLength: 1
+        description: Speaker label.
+      speaker_identifiers:
+        type: array
+        minItems: 1
+        uniqueItems: true
+        items:
+          type: string
+          format: bytes
+        description: Speaker identifiers.
+    required:
+      - label
+      - speaker_identifiers
\ No newline at end of file
diff --git a/spec/realtime.yaml b/spec/realtime.yaml
index d8aaec0..d229c3b 100644
--- a/spec/realtime.yaml
+++ b/spec/realtime.yaml
@@ -932,6 +932,10 @@ components:
         format: float
         minimum: 0
         maximum: 1
+      get_speakers:
+        type: boolean
+        description: >-
+          If true, speaker identifiers will be returned at the end of the transcript.
       speakers:
         type: array
         description: >-

From 94b3b72898c2372f6800a4fc1ac2ba87d1ba184e Mon Sep 17 00:00:00 2001
From: Daniel Nurkowski
Date: Tue, 11 Nov 2025 16:17:52 +0000
Subject: [PATCH 05/11] fix JSON boolean flag

---
 docs/speech-to-text/batch/speaker-identification.mdx    | 2 +-
 docs/speech-to-text/realtime/speaker-identification.mdx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/speech-to-text/batch/speaker-identification.mdx b/docs/speech-to-text/batch/speaker-identification.mdx
index f91a720..54c0457 100644
--- a/docs/speech-to-text/batch/speaker-identification.mdx
+++ b/docs/speech-to-text/batch/speaker-identification.mdx
@@ -36,7 +36,7 @@ You can request the identifiers back from the engine by setting the `get_speaker
     "diarization": "speaker"
     "speaker_diarization_config": {
       // highlight-start
-      "get_speakers": True
+      "get_speakers": true
       // highlight-end
     }
   }
diff --git a/docs/speech-to-text/realtime/speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx
index 0703ad7..24e4d2e 100644
--- a/docs/speech-to-text/realtime/speaker-identification.mdx
+++ b/docs/speech-to-text/realtime/speaker-identification.mdx
@@ -58,7 +58,7 @@
     "diarization": "speaker"
     "speaker_diarization_config": {
       // highlight-start
-      "get_speakers": True
+      "get_speakers": true
       // highlight-end
     }
   }

From 1e594f71eed0cedf9793a026819ceee16585f652 Mon Sep 17 00:00:00 2001
From: Daniel Nurkowski
Date: Tue, 11 Nov 2025 16:32:28 +0000
Subject: [PATCH 06/11] fix hyperlinks

---
 docs/speech-to-text/batch/speaker-identification.mdx    | 8 ++++----
 docs/speech-to-text/features/speaker-identification.mdx | 4 ++--
 docs/speech-to-text/realtime/speaker-identification.mdx | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/speech-to-text/batch/speaker-identification.mdx b/docs/speech-to-text/batch/speaker-identification.mdx
index 54c0457..0406f41 100644
---
a/docs/speech-to-text/batch/speaker-identification.mdx +++ b/docs/speech-to-text/batch/speaker-identification.mdx @@ -20,12 +20,12 @@ import { Card, DataList, Text } from '@radix-ui/themes'; # Batch speaker identification :::tip -For an overview of the feature, see the [speaker identification](../features/speaker-identification.mdx) page. +For an overview of the feature, see the [speaker identification](/speech-to-text/features/speaker-identification) page. ::: ## Enrollment -To generate identifiers for a desired speaker, run a [speaker diarization](../features/diarization.mdx#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone. +To generate identifiers for a desired speaker, run a [speaker diarization](/speech-to-text/features/diarization#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone. You can request the identifiers back from the engine by setting the `get_speakers` flag in the transcription config: ```json @@ -78,7 +78,7 @@ When the transcription is done, the speakers identifiers will be attached to the }, { "label": "S2", - "speaker_identifiers": [""] + "speaker_identifiers": [""] }] // highlight-end } @@ -86,7 +86,7 @@ When the transcription is done, the speakers identifiers will be attached to the ## Identification -Once you have generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. All other [speaker diarization options](batch_diarization.mdx#configuration) remain supported. Notably, the `speakers_sensitivity` parameter can be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones, where lower values make it more likely to match existing enrolled speakers. 
+Once you have generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. All other [speaker diarization options](/speech-to-text/batch/batch-diarization#configuration) remain supported. Notably, the `speakers_sensitivity` parameter can be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones, where lower values make it more likely to match existing enrolled speakers. An example configuration is shown below: diff --git a/docs/speech-to-text/features/speaker-identification.mdx b/docs/speech-to-text/features/speaker-identification.mdx index 69aad20..e76cbb8 100644 --- a/docs/speech-to-text/features/speaker-identification.mdx +++ b/docs/speech-to-text/features/speaker-identification.mdx @@ -29,7 +29,7 @@ By tagging known speakers with consistent labels, speaker identification makes t ## How it works -To use speaker identification you must enable diarization in the [speaker mode](../features/diarization.mdx#diarization-modes) and then follow the two steps below: +To use speaker identification you must enable diarization in the [speaker mode](/speech-to-text/features/diarization#diarization-modes) and then follow the two steps below: - **Enrollment** - For each speaker you want to recognize, generate identifiers from short audio clips (5–30s) where they ideally speak alone. To improve robustness, you can enroll the same speaker with multiple clips recorded under different acoustic conditions, selected to represent the degree of variety and quality that could be expected in the target audio. 
@@ -55,4 +55,4 @@ In all of the above cases — including model mismatches or attempts to use iden ## Supported modes -Speaker identification is supported in both **[Realtime speaker identification](../realtime/speaker-identification.mdx)** and **[Batch speaker identification](../batch/speaker-identification.mdx)** modes. +Speaker identification is supported in both **[Realtime speaker identification](/speech-to-text/realtime/speaker-identification)** and **[Batch speaker identification](/speech-to-text/batch/speaker-identification)** modes. diff --git a/docs/speech-to-text/realtime/speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx index 24e4d2e..fb4cbfb 100644 --- a/docs/speech-to-text/realtime/speaker-identification.mdx +++ b/docs/speech-to-text/realtime/speaker-identification.mdx @@ -26,12 +26,12 @@ import speakerIdIdentificationPythonExample from "./assets/speaker-id-identifica # Realtime speaker identification :::tip -For an overview of the feature, see the [speaker identification](../features/speaker-identification.mdx) page. +For an overview of the feature, see the [speaker identification](/speech-to-text/features/speaker-identification) page. ::: ## Enrollment -To generate identifiers for a desired speaker, run a [speaker diarization](../features/diarization.mdx#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone. +To generate identifiers for a desired speaker, run a [speaker diarization](/speech-to-text/features/diarization#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone. You can request the identifiers back from the engine by sending a `GetSpeakers` request. 
By default, the engine returns identifiers created up to the time of the request, but you can also wait until the end of the stream by setting the optional `final` flag in the `GetSpeakers` request: @@ -81,7 +81,7 @@ When the request is processed, the server replies with a `SpeakersResult` messag Once you've generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. -All [speaker diarization options](realtime-diarization.mdx#configuration) work with speaker identification. The `max_speakers` parameter only applies to generic (non-enrolled) speakers. For example, if it’s set to 10 and 10 speakers are enrolled, the system can still add up to 10 additional generic speakers. The `speakers_sensitivity` parameter can also be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones. Lower values make it more likely to match existing enrolled speakers. +All [speaker diarization options](/speech-to-text/realtime/realtime-diarization#configuration) work with speaker identification. The `max_speakers` parameter only applies to generic (non-enrolled) speakers. For example, if it’s set to 10 and 10 speakers are enrolled, the system can still add up to 10 additional generic speakers. The `speakers_sensitivity` parameter can also be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones. Lower values make it more likely to match existing enrolled speakers. An example configuration is shown below: From b0fa881f7be7083731ca332f6bf655f6c1612230 Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Tue, 11 Nov 2025 16:33:25 +0000 Subject: [PATCH 07/11] Rephrase one sentence about get_speakers option. 
--- docs/speech-to-text/realtime/speaker-identification.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/speech-to-text/realtime/speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx index fb4cbfb..d5f574c 100644 --- a/docs/speech-to-text/realtime/speaker-identification.mdx +++ b/docs/speech-to-text/realtime/speaker-identification.mdx @@ -46,7 +46,7 @@ By default, the engine returns identifiers created up to the time of the request - final: false (default) — returns identifiers generated up to the point of the request. To avoid empty results, wait until the server has issued at least one `AddTranscript` message before sending the request. - final: true — waits until the end of the stream and returns identifiers based on all audio. -Alternatively, you can enable automatic speaker retrieval by setting the `get_speakers` option to true in the diarization configuration (recommended for enrollment). This ensures the engine automatically returns speaker identifiers when the transcription completes — the same behavior as issuing the `GetSpeakers(final=true)` request manually. If the `get_speakers` option is not present in the configuration or is set to false, you can still request speakers explicitly by sending the `GetSpeakers(final=true)` message. In this case, the request takes precedence, and the engine will return the speaker identifiers at the end of the transcription. +Alternatively, you can enable automatic speaker retrieval by setting the `get_speakers` option to true in the diarization configuration (recommended for enrollment). This guarantees that the engine automatically provides speaker identifiers once the transcription is complete, equivalent to manually calling GetSpeakers(final=true). If the `get_speakers` option is not present in the configuration or is set to false, you can still request speakers explicitly by sending the `GetSpeakers(final=true)` message. 
In this case, the request takes precedence, and the engine will return the speaker identifiers at the end of the transcription. Example speaker diarization config: From 1108c9874966e3891f239731d9f782a794116f03 Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Tue, 11 Nov 2025 16:40:17 +0000 Subject: [PATCH 08/11] fix redirects --- scripts/redirects/redirects.json | 4 ++++ vercel.json | 15 +++++---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/scripts/redirects/redirects.json b/scripts/redirects/redirects.json index d556bf0..4780099 100644 --- a/scripts/redirects/redirects.json +++ b/scripts/redirects/redirects.json @@ -18,5 +18,9 @@ { "source": "/speech-to-text/realtime/end-of-utterance", "destination": "/speech-to-text/realtime/turn-detection" + }, + { + "source": "/speech-to-text/realtime/realtime-speaker-identification", + "target": "/speech-to-text/realtime/speaker-identification" } ] diff --git a/vercel.json b/vercel.json index 4e19d73..e36398d 100644 --- a/vercel.json +++ b/vercel.json @@ -1,16 +1,6 @@ { "trailingSlash": false, "redirects": [ - { - "source": "/speech-to-text/realtime/realtime_diarization", - "destination": "/speech-to-text/realtime/realtime-diarization", - "permanent": true - }, - { - "source": "/speech-to-text/batch/batch_diarization", - "destination": "/speech-to-text/batch/batch-diarization", - "permanent": true - }, { "source": "/speech-to-text/batch/tracking", "destination": "/speech-to-text/batch/output#tracking-metadata", @@ -26,6 +16,11 @@ "destination": "/speech-to-text/realtime/turn-detection", "permanent": true }, + { + "source": "/speech-to-text/realtime/realtime-speaker-identification", + "target": "/speech-to-text/realtime/speaker-identification", + "permanent": true + }, { "source": "/jobsapi", "destination": "/api-ref/batch/create-a-new-job", From ac0825e9c8ff3c86e4b1c729308b358bb370a5dc Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Tue, 11 Nov 2025 16:44:03 +0000 Subject: [PATCH 09/11] fix 
redirects --- scripts/redirects/redirects.json | 2 +- vercel.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/redirects/redirects.json b/scripts/redirects/redirects.json index 4780099..9481173 100644 --- a/scripts/redirects/redirects.json +++ b/scripts/redirects/redirects.json @@ -21,6 +21,6 @@ }, { "source": "/speech-to-text/realtime/realtime-speaker-identification", - "target": "/speech-to-text/realtime/speaker-identification" + "destination": "/speech-to-text/realtime/speaker-identification" } ] diff --git a/vercel.json b/vercel.json index e36398d..2e024a4 100644 --- a/vercel.json +++ b/vercel.json @@ -18,7 +18,7 @@ }, { "source": "/speech-to-text/realtime/realtime-speaker-identification", - "target": "/speech-to-text/realtime/speaker-identification", + "destination": "/speech-to-text/realtime/speaker-identification", "permanent": true }, { From f195ba8480276dfff263dcd55a0c61da820da9e8 Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Tue, 11 Nov 2025 17:04:40 +0000 Subject: [PATCH 10/11] add missing backticks --- docs/speech-to-text/realtime/speaker-identification.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/speech-to-text/realtime/speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx index d5f574c..9c0c660 100644 --- a/docs/speech-to-text/realtime/speaker-identification.mdx +++ b/docs/speech-to-text/realtime/speaker-identification.mdx @@ -46,7 +46,7 @@ By default, the engine returns identifiers created up to the time of the request - final: false (default) — returns identifiers generated up to the point of the request. To avoid empty results, wait until the server has issued at least one `AddTranscript` message before sending the request. - final: true — waits until the end of the stream and returns identifiers based on all audio. 
-Alternatively, you can enable automatic speaker retrieval by setting the `get_speakers` option to true in the diarization configuration (recommended for enrollment). This guarantees that the engine automatically provides speaker identifiers once the transcription is complete, equivalent to manually calling GetSpeakers(final=true). If the `get_speakers` option is not present in the configuration or is set to false, you can still request speakers explicitly by sending the `GetSpeakers(final=true)` message. In this case, the request takes precedence, and the engine will return the speaker identifiers at the end of the transcription. +Alternatively, you can enable automatic speaker retrieval by setting the `get_speakers` option to true in the diarization configuration (recommended for enrollment). This guarantees that the engine automatically provides speaker identifiers once the transcription is complete, equivalent to manually calling `GetSpeakers(final=true)`. If the `get_speakers` option is not present in the configuration or is set to false, you can still request speakers explicitly by sending the `GetSpeakers(final=true)` message. In this case, the request takes precedence, and the engine will return the speaker identifiers at the end of the transcription. 
Example speaker diarization config: From 73e4e5391e031a5ded6f2340e2d2afe85f6b83cf Mon Sep 17 00:00:00 2001 From: Daniel Nurkowski Date: Tue, 11 Nov 2025 17:15:41 +0000 Subject: [PATCH 11/11] post rebase fixes --- vercel.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vercel.json b/vercel.json index 2e024a4..9786e04 100644 --- a/vercel.json +++ b/vercel.json @@ -1,6 +1,16 @@ { "trailingSlash": false, "redirects": [ + { + "source": "/speech-to-text/realtime/realtime_diarization", + "destination": "/speech-to-text/realtime/realtime-diarization", + "permanent": true + }, + { + "source": "/speech-to-text/batch/batch_diarization", + "destination": "/speech-to-text/batch/batch-diarization", + "permanent": true + }, { "source": "/speech-to-text/batch/tracking", "destination": "/speech-to-text/batch/output#tracking-metadata",