In [None]:
import os

import torch
import torchaudio
from pyannote.audio import Pipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook

# Hugging Face access token passed to the pretrained pipeline loader. It is read
# from the HF_TOKEN environment variable so the secret does not have to be saved
# in the notebook; the placeholder is only a fallback and will not authenticate.
HF_TOKEN = os.environ.get("HF_TOKEN", "TOKEN_HERE")

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-community-1", token=HF_TOKEN)

# NOTE(review): presumably from_pretrained returns None when the model cannot be
# fetched (e.g. bad token) -- confirm against the installed pyannote version.
assert pipeline is not None, "Something happened"

# Run on the GPU when one is available; otherwise the pipeline stays where it is.
if torch.cuda.is_available():
    pipeline.to(torch.device("cuda"))

with ProgressHook() as hook:
    # The audio is decoded up front and handed to the pipeline as an in-memory
    # waveform together with its sample rate.
    waveform, sample_rate = torchaudio.load("./Aufzeichnung.m4a")
    output = pipeline({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
    # Alternative kept from the original: pass a file path instead of a waveform.
    # output = pipeline("test.wav", hook=hook)

In [11]:
# Introspection aid: list the attribute names available on every diarization turn.
for turn, speaker in output.speaker_diarization:
    attribute_names = turn.__dir__()
    print(attribute_names)

['start', 'end', '__module__', '__annotations__', '__doc__', 'set_precision', '__bool__', '__post_init__', 'duration', 'middle', '__iter__', 'copy', '__contains__', '__and__', 'intersects', 'overlaps', '__or__', '__xor__', '_str_helper', '__str__', '__repr__', '_repr_png_', '__dict__', '__weakref__', '__dataclass_params__', '__dataclass_fields__', '__init__', '__eq__', '__lt__', '__le__', '__gt__', '__ge__', '__setattr__', '__delattr__', '__hash__', '__match_args__', '__new__', '__getattribute__', '__ne__', '__reduce_ex__', '__reduce__', '__subclasshook__', '__init_subclass__', '__format__', '__sizeof__', '__dir__', '__class__']
['start', 'end', '__module__', '__annotations__', '__doc__', 'set_precision', '__bool__', '__post_init__', 'duration', 'middle', '__iter__', 'copy', '__contains__', '__and__', 'intersects', 'overlaps', '__or__', '__xor__', '_str_helper', '__str__', '__repr__', '_repr_png_', '__dict__', '__weakref__', '__dataclass_params__', '__dataclass_fields__', '__init__', '

In [14]:
import whisper

# Checkpoint name and recording to transcribe. Loading the checkpoint may trigger
# a download first (2.88G in the recorded run of this cell).
whisper_model_name = "large-v3"
recording_path = "Aufzeichnung.m4a"

model = whisper.load_model(whisper_model_name)
result = model.transcribe(recording_path)

100%|█████████████████████████████████████| 2.88G/2.88G [01:47<00:00, 28.6MiB/s]


In [22]:
# Inspect the segment list of the Whisper result. In the recorded output each
# entry carries 'start'/'end' values and the recognised 'text', which the merge
# step further down reads.
result["segments"]

[{'id': 0,
  'seek': 0,
  'start': 0.0,
  'end': 6.140000000000001,
  'text': ' Okay, test one, two, three.',
  'tokens': [50365, 1033, 11, 1500, 472, 11, 732, 11, 1045, 13, 50672],
  'temperature': 0.0,
  'avg_logprob': -0.48067413676868787,
  'compression_ratio': 1.0,
  'no_speech_prob': 0.0120825981721282},
 {'id': 1,
  'seek': 0,
  'start': 6.140000000000001,
  'end': 8.5,
  'text': ' And test four, five, six.',
  'tokens': [50672, 400, 1500, 1451, 11, 1732, 11, 2309, 13, 50790],
  'temperature': 0.0,
  'avg_logprob': -0.48067413676868787,
  'compression_ratio': 1.0,
  'no_speech_prob': 0.0120825981721282}]

In [23]:
# Show every diarization turn next to the speaker label assigned to it.
for turn, speaker in output.speaker_diarization:
    print(f"{turn!s} {speaker!s}")

[ 00:00:01.785 -->  00:00:04.367] SPEAKER_01
[ 00:00:05.819 -->  00:00:08.451] SPEAKER_00


In [30]:
# Merge the diarization turns and the Whisper segments into one speaker-labelled
# transcript. Both sequences are walked front to back: every turn opens with a
# "[SPEAKER]" header and then consumes Whisper segments until one reaches past
# the end of the turn. Only the turn's end is compared; its start is not used.
# Segments still unconsumed after the last turn are not emitted.
transcription = ""

whisper_index = 0
segments = result["segments"]

for turn, speaker in output.speaker_diarization:
    transcription += f"[{speaker}]\n"
    p_end = turn.end

    while True:
        if whisper_index >= len(segments):
            # No Whisper segments left (or none at all): close the turn instead
            # of indexing past the end of the list, which raised IndexError
            # whenever the final segment ended before the final turn did.
            transcription += "\n"
            break

        current_segment = segments[whisper_index]

        w_start = current_segment["start"]
        w_end = current_segment["end"]

        if w_end < p_end:
            # Segment ends inside the turn: it belongs here, keep consuming.
            transcription += f"{current_segment['text']} "
            whisper_index += 1
            continue

        if w_end - p_end < p_end - w_start:
            # Segment straddles the turn end, but the overhang past the end is
            # smaller than the part before it: still assign it to this turn.
            transcription += f"{current_segment['text']}\n\n"
            whisper_index += 1
        else:
            # Most of the segment lies past the turn end: leave it for the
            # next turn and just close this one.
            transcription += "\n"
        break

In [31]:
# Print the merged transcript so the embedded newlines render as line breaks.
print(transcription)

[SPEAKER_01]
 Okay, test one, two, three.

[SPEAKER_00]
 And test four, five, six.


