In [1]:
import librosa

from speechless.onnx import PyannoteONNX
from speechless.transcribe_audio import transcribe_audio
from speechless.speaker_diarization import merge_transcript

In [2]:
# --- Input files -----------------------------------------------------------
# Recording that is both diarized and transcribed further down.
AUDIO_PATH = "../data/audio/sk_slsp_chalupa.mp3"
# ONNX segmentation model file handed to PyannoteONNX.
PATH_TO_ONNX_MODEL = "../pyannote-onnx/segmentation-3.0.onnx"

# --- Tunable parameters ----------------------------------------------------
# Third argument of merge_transcript.
# NOTE(review): presumably an overlap threshold/ratio -- confirm in merge_transcript.
OVERLAP = 0.7
# Forwarded as `num_speakers` to perform_diarization.
NUM_SPEAKERS = 2

In [3]:
def perform_diarization(
    audio_path: str,
    num_speakers: int = 2,
    sample_rate: int = 16000,
    model_path: "str | None" = None,
) -> list:
    """
    Perform speaker diarization on an audio file using PyannoteONNX.

    Parameters
    ----------
    audio_path : str
        Path to the input audio file.
    num_speakers : int, default=2
        Number of speakers, forwarded as the first argument to ``PyannoteONNX``.
    sample_rate : int, default=16000
        The sample rate to load the audio with; the same value is passed
        to ``PyannoteONNX``.
    model_path : str or None, default=None
        Path to the ONNX segmentation model. When ``None``, the notebook-level
        ``PATH_TO_ONNX_MODEL`` constant is used.

    Returns
    -------
    list
        Every item yielded by ``PyannoteONNX.itertracks`` for the loaded
        waveform, in iteration order.
    """
    if model_path is None:
        # Fall back to the notebook configuration; resolved at call time so
        # calls that omit `model_path` behave exactly as before.
        model_path = PATH_TO_ONNX_MODEL

    # Initialize the PyannoteONNX model
    pyannote = PyannoteONNX(
        num_speakers,
        sample_rate,
        model_path,
    )
    print("Model loaded with PyannoteONNX.")

    # Load the audio file as a waveform at the requested sample rate
    wav, sr = librosa.load(audio_path, sr=sample_rate)
    print(f"Audio loaded with sample rate: {sr}")

    # Run diarization and collect every yielded track
    print("Performing diarization...")
    return list(pyannote.itertracks(wav))

In [None]:
# Run diarization on the configured recording with the configured speaker count.
diarization_result = perform_diarization(audio_path=AUDIO_PATH, num_speakers=NUM_SPEAKERS)

In [None]:
# Transcribe the same recording that was diarized above.
transcript = transcribe_audio(AUDIO_PATH)

In [None]:
# Combine the transcript with the diarization output; OVERLAP is passed as the
# third positional argument of merge_transcript.
merged_transcript = merge_transcript(transcript, diarization_result, OVERLAP)

# Bare last expression so the notebook renders the merged result.
merged_transcript