diff --git a/docs/speech-to-text/batch/sidebar.ts b/docs/speech-to-text/batch/sidebar.ts
index 550dc85..09c87cf 100644
--- a/docs/speech-to-text/batch/sidebar.ts
+++ b/docs/speech-to-text/batch/sidebar.ts
@@ -22,6 +22,10 @@ export default {
       type: "doc",
       id: "speech-to-text/batch/batch-diarization",
     },
+    {
+      type: "doc",
+      id: "speech-to-text/batch/speaker-identification",
+    },
     {
       type: "category",
       label: "Speech intelligence",
diff --git a/docs/speech-to-text/batch/speaker-identification.mdx b/docs/speech-to-text/batch/speaker-identification.mdx
new file mode 100644
index 0000000..0406f41
--- /dev/null
+++ b/docs/speech-to-text/batch/speaker-identification.mdx
@@ -0,0 +1,153 @@
+---
+sidebar_label: 'Speaker identification'
+description: "Learn how to use the Speechmatics API to identify speakers in Batch"
+keywords:
+  [
+    speechmatics,
+    batch,
+    diarization,
+    transcription,
+    speech recognition,
+    automatic speech recognition,
+    asr,
+  ]
+sidebar_position: 2
+---
+
+import DocCardList from '@theme/DocCardList';
+import { Card, DataList, Text } from '@radix-ui/themes';
+
+# Batch speaker identification
+
+:::tip
+For an overview of the feature, see the [speaker identification](/speech-to-text/features/speaker-identification) page.
+:::
+
+## Enrollment
+
+To generate identifiers for a desired speaker, run a transcription with [speaker diarization](/speech-to-text/features/diarization#diarization-modes) enabled on an audio sample where the speaker is ideally speaking alone.
+You can request the identifiers back from the engine by setting the `get_speakers` flag in the transcription config:
+
+```json
+{
+  "type": "transcription",
+  "transcription_config": {
+    "language": "en",
+    "diarization": "speaker",
+    "speaker_diarization_config": {
+      // highlight-start
+      "get_speakers": true
+      // highlight-end
+    }
+  }
+}
+```
+
+When the transcription is done, the speaker identifiers will be attached to the returned transcript:
+
+```json
+{
+  "results": [
+    {
+      "alternatives": [
+        {
+          "confidence": 0.93,
+          "content": "Hello",
+          "language": "en",
+          "speaker": "S1"
+        }
+      ],
+      ...
+    },
+    {
+      "alternatives": [
+        {
+          "confidence": 1.0,
+          "content": "Hi",
+          "language": "en",
+          "speaker": "S2"
+        }
+      ],
+      ...
+    }],
+  // highlight-start
+  "speakers": [
+    {
+      "label": "S1",
+      "speaker_identifiers": [""]
+    },
+    {
+      "label": "S2",
+      "speaker_identifiers": [""]
+    }]
+  // highlight-end
+}
+```
+
+## Identification
+
+Once you have generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. All other [speaker diarization options](/speech-to-text/batch/batch-diarization#configuration) remain supported. Notably, the `speakers_sensitivity` parameter adjusts how strongly the system prefers enrolled speakers over detecting new generic ones; lower values make it more likely to match existing enrolled speakers.
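+
+For illustration, the two steps can be chained with a few REST calls. The following Python sketch is a minimal example under stated assumptions rather than a definitive implementation: the endpoint paths, response shapes, polling interval, and file names are assumptions for this sketch, so adapt them to your own client code:
+
+```python
+import json
+import time
+
+import requests  # third-party HTTP client, assumed available
+
+API_KEY = "YOUR_API_KEY"  # placeholder
+BASE_URL = "https://asr.api.speechmatics.com/v2"  # assumed Batch jobs API base URL
+HEADERS = {"Authorization": f"Bearer {API_KEY}"}
+
+# Step 1 - enrollment: transcribe a clean sample with get_speakers enabled.
+enrollment_config = {
+    "type": "transcription",
+    "transcription_config": {
+        "language": "en",
+        "diarization": "speaker",
+        "speaker_diarization_config": {"get_speakers": True},
+    },
+}
+
+with open("alice_sample.wav", "rb") as audio:  # hypothetical sample file
+    job = requests.post(
+        f"{BASE_URL}/jobs",
+        headers=HEADERS,
+        files={"data_file": audio},
+        data={"config": json.dumps(enrollment_config)},
+    ).json()
+
+# Poll until the job finishes, then fetch the transcript with the speakers list.
+while True:
+    details = requests.get(f"{BASE_URL}/jobs/{job['id']}", headers=HEADERS).json()
+    if details["job"]["status"] == "done":
+        break
+    time.sleep(5)
+
+transcript = requests.get(
+    f"{BASE_URL}/jobs/{job['id']}/transcript", headers=HEADERS
+).json()
+
+# Step 2 - identification: reuse the returned identifiers under a friendly label.
+enrolled = next(s for s in transcript["speakers"] if s["label"] == "S1")
+identification_config = {
+    "type": "transcription",
+    "transcription_config": {
+        "language": "en",
+        "diarization": "speaker",
+        "speaker_diarization_config": {
+            "speakers": [
+                {
+                    "label": "Alice",
+                    "speaker_identifiers": enrolled["speaker_identifiers"],
+                }
+            ]
+        },
+    },
+}
+# Submitting the next job with identification_config tags matching segments "Alice".
+```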
+
+An example configuration is shown below:
+
+```json
+{
+  "type": "transcription",
+  "transcription_config": {
+    "language": "en",
+    "diarization": "speaker",
+    "speaker_diarization_config": {
+      // highlight-start
+      "speakers": [
+        {"label": "Alice", "speaker_identifiers": ["", ""]},
+        {"label": "Bob", "speaker_identifiers": [""]}
+      ]
+      // highlight-end
+    }
+  }
+}
+```
+
+With the config above, transcript segments should be tagged with `"Alice"` and `"Bob"` whenever these speakers are detected, whereas any other speakers should be tagged with the internal labels:
+
+```json
+{
+  "results": [
+    {
+      "alternatives": [
+        {
+          "confidence": 0.93,
+          "content": "Morning",
+          "language": "en",
+          // highlight-start
+          "speaker": "Alice"
+          // highlight-end
+        }
+      ],
+      ...
+    },
+    {
+      "alternatives": [
+        {
+          "confidence": 0.93,
+          "content": "Hi",
+          "language": "en",
+          "speaker": "S1"
+        }
+      ],
+      ...
+    },
+    {
+      "alternatives": [
+        {
+          "confidence": 1.0,
+          "content": "Morning",
+          "language": "en",
+          // highlight-start
+          "speaker": "Bob"
+          // highlight-end
+        }
+      ]
+    }]
+}
+```
diff --git a/docs/speech-to-text/features/speaker-identification.mdx b/docs/speech-to-text/features/speaker-identification.mdx
index dc311ba..e76cbb8 100644
--- a/docs/speech-to-text/features/speaker-identification.mdx
+++ b/docs/speech-to-text/features/speaker-identification.mdx
@@ -29,15 +29,15 @@ By tagging known speakers with consistent labels, speaker identification makes t
 
 ## How it works
 
-To use speaker identification you must enable diarization in the [speaker mode](../features/diarization.mdx#diarization-modes) and then follow the two steps below:
+To use speaker identification you must enable diarization in the [speaker mode](/speech-to-text/features/diarization#diarization-modes) and then follow the two steps below:
 
-- **Enrollment** — For each speaker you want to recognize, generate identifiers from short audio clips (5–30s) where they ideally speak alone.
+- **Enrollment** - For each speaker you want to recognize, generate identifiers from short audio clips (5–30s) where they ideally speak alone.
 To improve robustness, you can enroll the same speaker with multiple clips recorded under different acoustic conditions, selected to represent the degree of variety and quality that could be expected in the target audio.
-- **Identification** — Use the enrolled identifiers in transcription jobs to label known speakers with meaningful names (for example, `Alice` or `John`). The system matches voices to identifiers and tags the output with the desired labels.
+- **Identification** - Use the enrolled identifiers in transcription jobs to label known speakers with meaningful names (for example, `Alice` or `John`). The system matches voices to identifiers and tags the output with the desired labels.
 
 :::info
-Labels for identified speakers must not use reserved internal labels (UU, S1, S2, etc.) and must not include leading or trailing spaces.
+For optimal accuracy, keep the number of enrolled speaker identifiers to a minimum. A maximum of 50 speaker identifiers across all speakers can be configured per session. Additionally, labels for identified speakers must not use reserved internal labels (e.g., UU, S1, S2) and should not contain leading or trailing spaces.
 :::
 
 ## Known caveats
 
@@ -55,4 +55,4 @@ In all of the above cases — including model mismatches or attempts to use iden
 
 ## Supported modes
 
-Speaker identification is currently only supported in **[Realtime speaker identification](../realtime/realtime-speaker-identification.mdx)** mode with the batch support coming soon.
+Speaker identification is supported in both **[Realtime speaker identification](/speech-to-text/realtime/speaker-identification)** and **[Batch speaker identification](/speech-to-text/batch/speaker-identification)** modes.
diff --git a/docs/speech-to-text/realtime/sidebar.ts b/docs/speech-to-text/realtime/sidebar.ts
index c938394..3a12739 100644
--- a/docs/speech-to-text/realtime/sidebar.ts
+++ b/docs/speech-to-text/realtime/sidebar.ts
@@ -24,7 +24,7 @@ export default {
     },
     {
       type: "doc",
-      id: "speech-to-text/realtime/realtime-speaker-identification",
+      id: "speech-to-text/realtime/speaker-identification",
     },
     {
       type: "doc",
diff --git a/docs/speech-to-text/realtime/realtime-speaker-identification.mdx b/docs/speech-to-text/realtime/speaker-identification.mdx
similarity index 70%
rename from docs/speech-to-text/realtime/realtime-speaker-identification.mdx
rename to docs/speech-to-text/realtime/speaker-identification.mdx
index 77fb9e9..9c0c660 100644
--- a/docs/speech-to-text/realtime/realtime-speaker-identification.mdx
+++ b/docs/speech-to-text/realtime/speaker-identification.mdx
@@ -26,15 +26,15 @@ import speakerIdIdentificationPythonExample from "./assets/speaker-id-identifica
 
 # Realtime speaker identification
 
 :::tip
-For an overview of the feature, see the [speaker identification](../features/speaker-identification.mdx) page.
+For an overview of the feature, see the [speaker identification](/speech-to-text/features/speaker-identification) page.
 :::
 
 ## Enrollment
 
-To generate identifiers for a desired speaker, run a [speaker diarization](../features/diarization.mdx#diarization-modes) enabled transcription on an audio sample where the speaker is ideally speaking alone.
+To generate identifiers for a desired speaker, run a transcription with [speaker diarization](/speech-to-text/features/diarization#diarization-modes) enabled on an audio sample where the speaker is ideally speaking alone.
 You can request the identifiers back from the engine by sending a `GetSpeakers` request.
 
-By default, the engine returns identifiers created up to the time of the request, but you can also wait until the end of the stream by setting the optional `final` flag in the `GetSpeakers` request (recommended for enrollment):
+By default, the engine returns identifiers created up to the time of the request, but you can also wait until the end of the stream by setting the optional `final` flag in the `GetSpeakers` request:
 
 ```json
 {
@@ -46,6 +46,25 @@ By default, the engine returns identifiers created up to the time of the request
 
 - final: false (default) — returns identifiers generated up to the point of the request. To avoid empty results, wait until the server has issued at least one `AddTranscript` message before sending the request.
 - final: true — waits until the end of the stream and returns identifiers based on all audio.
 
+Alternatively, you can enable automatic speaker retrieval by setting the `get_speakers` option to `true` in the diarization configuration (recommended for enrollment). This ensures the engine automatically returns speaker identifiers once the transcription is complete, equivalent to sending a `GetSpeakers` request with `final: true`. If `get_speakers` is absent from the configuration or set to `false`, you can still request speakers explicitly by sending the `GetSpeakers` message with `final: true`; in that case the request takes precedence, and the engine returns the speaker identifiers at the end of the transcription.
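+
+For illustration, the explicit request path might look like the following Python sketch. This is a hedged example: the message shape follows the `GetSpeakers` request described above, and the websocket setup, `StartRecognition` handshake, and audio streaming are assumed to already be in place:
+
+```python
+import json
+
+import websockets  # third-party websocket client, assumed for this sketch
+
+
+async def request_speakers(ws) -> dict:
+    """Ask the engine for speaker identifiers covering the whole stream.
+
+    Setting "final" to true makes the engine wait until the end of the
+    stream, so the reply accounts for all audio, as described above.
+    """
+    await ws.send(json.dumps({"message": "GetSpeakers", "final": True}))
+    # Per the docs above, the engine answers with a SpeakersResult message.
+    while True:
+        reply = json.loads(await ws.recv())
+        if reply.get("message") == "SpeakersResult":
+            return reply  # carries the per-speaker identifiers
+```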
+
+Example speaker diarization config:
+
+```json
+{
+  "type": "transcription",
+  "transcription_config": {
+    "language": "en",
+    "diarization": "speaker",
+    "speaker_diarization_config": {
+      // highlight-start
+      "get_speakers": true
+      // highlight-end
+    }
+  }
+}
+```
+
 When the request is processed, the server replies with a `SpeakersResult` message that contains the identifiers for each diarized speaker:
 
 ```json
 {
@@ -60,9 +79,9 @@ When the request is processed, the server replies with a `SpeakersResult` messag
 
 ## Identification
 
-Once you've generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration. A maximum of 50 speaker identifiers across all speakers can be configured per session.
+Once you've generated speaker identifiers, you can provide them in your next transcription job to identify and tag known speakers. This is done through the `speakers` option in the speaker diarization configuration.
 
-All [speaker diarization options](realtime-diarization.mdx#configuration) work with speaker identification. The `max_speakers` parameter only applies to generic (non-enrolled) speakers. For example, if it’s set to 10 and 10 speakers are enrolled, the system can still add up to 10 additional generic speakers. The `speakers_sensitivity` parameter can also be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones. Lower values make it more likely to match existing enrolled speakers.
+All [speaker diarization options](/speech-to-text/realtime/realtime-diarization#configuration) work with speaker identification. The `max_speakers` parameter only applies to generic (non-enrolled) speakers. For example, if it’s set to 10 and 10 speakers are enrolled, the system can still add up to 10 additional generic speakers. The `speakers_sensitivity` parameter can also be used to adjust how strongly the system prefers enrolled speakers over detecting new generic ones. Lower values make it more likely to match existing enrolled speakers.
 
 An example configuration is shown below:
 
diff --git a/scripts/redirects/redirects.json b/scripts/redirects/redirects.json
index d556bf0..9481173 100644
--- a/scripts/redirects/redirects.json
+++ b/scripts/redirects/redirects.json
@@ -18,5 +18,9 @@
   {
     "source": "/speech-to-text/realtime/end-of-utterance",
     "destination": "/speech-to-text/realtime/turn-detection"
+  },
+  {
+    "source": "/speech-to-text/realtime/realtime-speaker-identification",
+    "destination": "/speech-to-text/realtime/speaker-identification"
   }
 ]
diff --git a/spec/batch.yaml b/spec/batch.yaml
index 824cae3..c0f9af0 100644
--- a/spec/batch.yaml
+++ b/spec/batch.yaml
@@ -1103,6 +1103,21 @@ definitions:
         maximum: 1
         description: >-
           Controls how sensitive the algorithm is in terms of keeping similar speakers separate, as opposed to combining them into a single speaker. Higher values will typically lead to more speakers, as the degree of difference between speakers in order to allow them to remain distinct will be lower. A lower value for this parameter will conversely guide the algorithm towards being less sensitive in terms of retaining similar speakers, and as such may lead to fewer speakers overall. The default is 0.5.
+      get_speakers:
+        type: boolean
+        description: >-
+          If true, speaker identifiers will be returned at the end of the transcript.
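+        # Example: with get_speakers enabled, the returned transcript JSON
+        # contains a top-level "speakers" array alongside "results"
+        # (see the transcript response definition further below).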
+      speakers:
+        type: array
+        x-omitempty: true
+        description: >-
+          Use this option to provide speaker labels linked to their speaker identifiers.
+          When passed, the transcription system will tag spoken words in the transcript
+          with the provided speaker labels whenever any of the specified speakers
+          is detected in the audio. A maximum of 50 speaker identifiers across all speakers
+          can be provided.
+        items:
+          $ref: '#/definitions/SpeakersInputItem'
     example:
       language: en
       output_locale: en-GB
@@ -1871,6 +1886,12 @@ definitions:
         type: array
         items:
           $ref: "#/definitions/RecognitionResult"
+      speakers:
+        type: array
+        x-omitempty: true
+        items:
+          $ref: '#/definitions/SpeakersResultItem'
+        description: List of the unique speakers detected in the transcript, together with their speaker identifiers.
       translations:
         type: object
         description: >-
@@ -2315,3 +2336,39 @@
       type: string
     transcript:
       type: string
+  SpeakersInputItem:
+    type: object
+    properties:
+      label:
+        type: string
+        minLength: 1
+        description: Speaker label, which must not match the format used internally (e.g. S1, S2, etc.)
+      speaker_identifiers:
+        type: array
+        minItems: 1
+        uniqueItems: true
+        items:
+          type: string
+          format: bytes
+        description: Speaker identifiers.
+    required:
+      - label
+      - speaker_identifiers
+  SpeakersResultItem:
+    type: object
+    properties:
+      label:
+        type: string
+        minLength: 1
+        description: Speaker label.
+      speaker_identifiers:
+        type: array
+        minItems: 1
+        uniqueItems: true
+        items:
+          type: string
+          format: bytes
+        description: Speaker identifiers.
+    required:
+      - label
+      - speaker_identifiers
\ No newline at end of file
diff --git a/spec/realtime.yaml b/spec/realtime.yaml
index d8aaec0..d229c3b 100644
--- a/spec/realtime.yaml
+++ b/spec/realtime.yaml
@@ -932,6 +932,10 @@ components:
         format: float
         minimum: 0
         maximum: 1
+      get_speakers:
+        type: boolean
+        description: >-
+          If true, speaker identifiers will be returned at the end of the transcript.
       speakers:
         type: array
         description: >-
diff --git a/vercel.json b/vercel.json
index 4e19d73..9786e04 100644
--- a/vercel.json
+++ b/vercel.json
@@ -26,6 +26,11 @@
       "destination": "/speech-to-text/realtime/turn-detection",
       "permanent": true
     },
+    {
+      "source": "/speech-to-text/realtime/realtime-speaker-identification",
+      "destination": "/speech-to-text/realtime/speaker-identification",
+      "permanent": true
+    },
     {
       "source": "/jobsapi",
       "destination": "/api-ref/batch/create-a-new-job",
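The rename is covered by matching rules in both `scripts/redirects/redirects.json` and `vercel.json`. As a quick post-deploy sanity check of the redirect (a hedged sketch: the docs host is assumed, and Vercel typically serves `"permanent": true` redirects as HTTP 308):

```python
import requests

# Old URL from the redirect rules above; host assumed for this sketch.
OLD_URL = "https://docs.speechmatics.com/speech-to-text/realtime/realtime-speaker-identification"

resp = requests.head(OLD_URL, allow_redirects=False)
# A permanent redirect should point at the renamed page.
print(resp.status_code, resp.headers.get("Location"))
# Expected: 308 and /speech-to-text/realtime/speaker-identification
```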