In [1]:
!pip install pyannote.audio

Collecting pyannote.audio
  Downloading pyannote.audio-3.2.0-py2.py3-none-any.whl (873 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m873.5/873.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)
Collecting einops>=0.6.0 (from pyannote.audio)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.2.5-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting omegaconf<3.0,>=2.1 (from pyannote.audio)
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m13.6 MB/s[0

In [1]:
# Mount Google Drive at /content/drive; the project data used below is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Use the absolute path under the Drive mount point (/content/drive) rather than a
# path relative to the current directory: a relative chdir only works the first
# time and raises FileNotFoundError when the cell is re-run, because the working
# directory has already moved.
os.chdir('/content/drive/MyDrive/CS224S_Final_Project/data')

In [3]:
from pyannote.database import registry, FileFinder
# "database.yml" is a relative path, so this depends on the earlier os.chdir into the project's data directory.
registry.load_database("database.yml")



In [5]:
def _load_protocol(protocol_name):
    """Look up `protocol_name` in the registry, passing a fresh FileFinder under the "audio" key."""
    return registry.get_protocol(protocol_name, {"audio": FileFinder()})


# One protocol object per data subset defined for the "classbank" database.
multilingual_data = _load_protocol("classbank.SpeakerDiarization.multilingual")
aus_data = _load_protocol("classbank.SpeakerDiarization.aus_only")
west_data = _load_protocol("classbank.SpeakerDiarization.us-aus-ned")
east_data = _load_protocol("classbank.SpeakerDiarization.jap-hk")
hk_data = _load_protocol("classbank.SpeakerDiarization.hk_only")

In [None]:
from pyannote.audio import Model

# Checkpoint files of the fine-tuned models, one per data subset.
# Paths are relative to the current working directory.
_CHECKPOINTS = {
    "aus": "outputs/fine_tuned_models/pyannote_ausonly_finetune_1epoch.ckpt",
    "east": "outputs/fine_tuned_models/east_finetune_1epoch.ckpt",
    "hk": "outputs/fine_tuned_models/hk_finetune_2epoch.ckpt",
    "multilingual": "outputs/fine_tuned_models/multilingual_finetune_1epoch.ckpt",
    "west": "outputs/fine_tuned_models/west_finetune_2epoch.ckpt",
}

# These models are later passed as the `segmentation` argument of each pipeline.
aus_seg = Model.from_pretrained(_CHECKPOINTS["aus"])
east_seg = Model.from_pretrained(_CHECKPOINTS["east"])
hk_seg = Model.from_pretrained(_CHECKPOINTS["hk"])
multilingual_seg = Model.from_pretrained(_CHECKPOINTS["multilingual"])
west_seg = Model.from_pretrained(_CHECKPOINTS["west"])

In [None]:
# Interactive Hugging Face login; presumably needed so the pretrained pipeline
# loaded in the next cell can be downloaded from the Hub — confirm the model's access requirements.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Now, running inference using the full pipeline using default hyperparams
from pyannote.audio import Pipeline

# Do not hard-code an access token in the notebook: anyone who can read the file
# (or its git history) can use it. use_auth_token=True reuses the credential
# stored by the notebook_login() call in the previous cell.
# NOTE(review): the token that used to be written here has been exposed and should be revoked.
pretrained_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=True)

In [None]:
# Hyper-parameters handed to every pipeline's instantiate() call below.
# NOTE(review): these values are hard-coded rather than read from
# `pretrained_pipeline`; confirm they are the intended defaults for the release in use.
_segmentation_params = {
    'min_duration_off': 0.5817029604921046,
    'threshold': 0.4442333667381752,
}
_clustering_params = {
    'method': 'centroid',
    'min_cluster_size': 15,
    'threshold': 0.7153814381597874,
}
default_params = {
    'segmentation': _segmentation_params,
    'clustering': _clustering_params,
}

In [None]:
from pyannote.audio.pipelines import SpeakerDiarization


def _assemble_pipeline(segmentation_model):
    """Build a SpeakerDiarization pipeline around `segmentation_model`.

    The embedding, embedding_exclude_overlap and clustering settings are copied
    from `pretrained_pipeline`, so the pipelines built here differ only in
    their segmentation model.
    """
    return SpeakerDiarization(
        segmentation=segmentation_model,
        embedding=pretrained_pipeline.embedding,
        embedding_exclude_overlap=pretrained_pipeline.embedding_exclude_overlap,
        # The attribute read here is spelled `klustering`; it is forwarded as `clustering`.
        clustering=pretrained_pipeline.klustering,
    )


# One pipeline per fine-tuned segmentation model.
aus_pipeline = _assemble_pipeline(aus_seg)
east_pipeline = _assemble_pipeline(east_seg)
hk_pipeline = _assemble_pipeline(hk_seg)
multilingual_pipeline = _assemble_pipeline(multilingual_seg)
west_pipeline = _assemble_pipeline(west_seg)

In [None]:
# Apply the same hyper-parameter dict to every fine-tuned pipeline, in the original order.
for _pipeline in (aus_pipeline, east_pipeline, hk_pipeline, multilingual_pipeline):
    _pipeline.instantiate(default_params)
# Kept as the cell's trailing expression so its return value is still displayed.
west_pipeline.instantiate(default_params)

In [None]:
import torch

# Use the GPU when one is attached, otherwise fall back to CPU so the cell does
# not raise on a runtime without CUDA (inference will simply be slower there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

aus_pipeline.to(device)
east_pipeline.to(device)
hk_pipeline.to(device)
multilingual_pipeline.to(device)
west_pipeline.to(device)

In [None]:
from pyannote.audio.pipelines.utils.hook import ProgressHook
from pyannote.audio import Audio
from pyannote.metrics.diarization import DiarizationErrorRate

# A separate DiarizationErrorRate instance for each of the five pipelines.
(
    metric_aus,
    metric_east,
    metric_hk,
    metric_multilingual,
    metric_west,
) = (DiarizationErrorRate() for _ in range(5))

In [None]:
# Evaluate the Australia fine-tuned pipeline on the aus_only test subset,
# printing the diarization error rate of each file as it finishes.

# The loader (mono downmix, 16 kHz) does not depend on the file, so build it once
# instead of on every iteration.
io = Audio(mono='downmix', sample_rate=16000)
counter = 0  # stays 0 when the test subset is empty
with ProgressHook() as hook:
  for counter, file in enumerate(aus_data.test(), start=1):
      waveform, sample_rate = io(file)

      # Run the pipeline on the in-memory waveform and keep the result on the file dict.
      file["finetuned pipeline"] = aus_pipeline({"waveform":waveform, "sample_rate":sample_rate}, hook=hook)
      # file["annotated"] is passed as the UEM for scoring; metric_aus is also
      # reused across files rather than recreated.
      der = metric_aus(file["annotation"], file["finetuned pipeline"], uem=file["annotated"])
      print(f"Finished running inference on example #{counter}, on filename {file['uri']} for Australia. Got a DER of {der}.")