In [1]:
import demo

# **pyannoteAI** STT Orchestration Release Webinar

> Enhance your own transcription with the most accurate speaker diarization

In [2]:
from ipyannote.apps.transcription import ITranscription

# Display the demo audio together with demo.GOLD_TRANSCRIPTION
# (presumably the reference transcript — see the `demo` module).
# The widget is the cell's last expression, so the notebook renders it.
ITranscription(
    audio=demo.AUDIO,
    transcription=demo.GOLD_TRANSCRIPTION,
)

ITranscription(children=(<ipyannote.blocks.waveform.Waveform object at 0x17e7aa900>, <ipyannote.blocks.htransc…

## STT with Parakeet (`parakeet-tdt-0.6b-v3`) 

In [None]:
# Run the `demo.parakeet` helper on the demo audio and keep its result for the
# cells below (presumably a Parakeet STT transcription — confirm in `demo`).
parakeet = demo.parakeet(
    demo.AUDIO,
)

In [None]:
# Import ITranscription from `ipyannote.apps.transcription`, the path already
# used by the gold-transcription cell earlier in this notebook, so the
# notebook relies on a single import path for this widget.
from ipyannote.apps.transcription import ITranscription

# Display the Parakeet output alongside the demo audio.
ITranscription(audio=demo.AUDIO, transcription=parakeet)

## Speaker diarization with pyannoteAI (`precision-2`) 

* Create an account on [dashboard.pyannote.ai](https://dashboard.pyannote.ai)
* Create a pyannoteAI API key and store it in the `PYANNOTEAI_API_KEY` environment variable (read by the cell below)

In [None]:
# initialize Python SDK client
import os
from pyannoteai.sdk import Client

# Read the API key from the environment (never hard-code it in the notebook).
# Fail early with an actionable message instead of a bare KeyError when the
# variable is missing, and also reject an empty value.
api_key = os.environ.get("PYANNOTEAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "PYANNOTEAI_API_KEY is not set: create an API key on "
        "https://dashboard.pyannote.ai and export it before running this cell."
    )

client = Client(api_key)

In this demo, the audio file is available locally and has to be uploaded to `pyannoteAI` cloud servers to get an `audio_url` back.  
However, when using our API in production, we recommend using your own [signed URLs](https://docs.pyannote.ai/tutorials/use-s3-private-files) instead.

In [None]:
# 1. upload the local audio file to pyannoteAI cloud and get its URL back
audio_url = client.upload(demo.AUDIO)

# 2. create a diarization job; the model is pinned to the one named in the
#    section title (and used by the orchestration call further down) rather
#    than left to the SDK default
diarization_job = client.diarize(audio_url, model="precision-2")

# 3. retrieve the job output and keep only its diarization part
diarization_output = client.retrieve(diarization_job)
diarization = diarization_output['output']['diarization']

In [None]:
from ipyannote.apps import IAnnotation

# Visualize the diarization output alongside the demo audio.
IAnnotation(
    audio=demo.AUDIO,
    annotation=diarization,
)

## Reconciliation?

In [None]:
# Hand both the standalone STT output and the diarization output to the
# demo's Reconciliation helper; as the last expression of the cell, its
# result is what the notebook displays.
demo.Reconciliation(
    audio=demo.AUDIO,
    transcript=parakeet,
    diarization=diarization,
)

## STT orchestration job

To benefit from STT orchestration, simply add `transcription=True` (and, optionally, a `transcription_config` selecting the STT model) to the `client.diarize` call used above.  
It will automatically orchestrate `pyannoteAI` diarization with `parakeet` STT.  
Support for additional STTs will be released progressively in 2026.

In [None]:
# 1. create an STT orchestration job: a diarization request with
#    transcription switched on and the STT model selected explicitly
orchestration_job = client.diarize(
    audio_url,
    model="precision-2",
    transcription=True,
    transcription_config={"model": "parakeet-tdt-0.6b-v3"},
)

In [None]:
# 2. retrieve the job output, then show which keys its "output" entry holds
orchestration = client.retrieve(orchestration_job)
orchestration["output"].keys()

Two new keys have been added to the job output!
* `wordLevelTranscription` provides timestamps for each word;
* `turnLevelTranscription` is aligned with the speaker turns returned by our diarization.

## Turn-level transcription

In [None]:
# Display the turn-level transcription from the orchestration output.
ITranscription(
    audio=demo.AUDIO,
    transcription=orchestration["output"]["turnLevelTranscription"],
)

## Word-level transcription

In [None]:
# Display the word-level transcription from the orchestration output.
ITranscription(
    audio=demo.AUDIO,
    transcription=orchestration["output"]["wordLevelTranscription"],
)

# Part 2

In [None]:
# precision-2 + Parakeet
# The diarization model is pinned explicitly so the call matches the label
# above instead of depending on the SDK default.
parakeet_job = client.diarize(
    audio_url,
    model="precision-2",
    transcription=True,
    transcription_config={"model": "parakeet-tdt-0.6b-v3"},
)

# Keep only the word-level transcription from the job output.
stt_parakeet = client.retrieve(parakeet_job)['output']['wordLevelTranscription']

In [None]:
# precision-2 + Whisper
# The diarization model is pinned explicitly so the call matches the label
# above instead of depending on the SDK default; only the STT model differs
# from the Parakeet job.
whisper_job = client.diarize(
    audio_url,
    model="precision-2",
    transcription=True,
    transcription_config={"model": "faster-whisper-large-v3-turbo"},
)

# Keep only the word-level transcription from the job output.
stt_whisper = client.retrieve(whisper_job)['output']['wordLevelTranscription']

In [None]:
# Display the Whisper word-level transcription alongside the demo audio.
ITranscription(
    audio=demo.AUDIO,
    transcription=stt_whisper,
)

In [None]:
from ipyannote.apps.wer import IWordErrorRate

# Difference between Parakeet and Whisper: the Parakeet output is passed as
# the reference and the Whisper output as the hypothesis.
# NOTE(review): the meaning of variant "tcp" is defined by ipyannote — confirm.
IWordErrorRate(
    demo.AUDIO,
    reference=stt_parakeet,
    hypothesis=stt_whisper,
    variant="tcp",
)