В данной части кода модель дообучается (файн-тюнинг) на тренировочном датасете. Затем гиперпараметры оптимизируются на валидационном датасете. Считаются метрики DER и JER. Код адаптирован из туториала https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/adapting_pretrained_pipeline.ipynb

In [1]:
# Load the protocol whose data was prepared in creatingprotocol.ipynb.
# The protocol itself is declared in the "database.yml" file.
from pyannote.database import FileFinder, get_protocol

# FileFinder is registered under the "audio" key of every protocol file.
preprocessors = {"audio": FileFinder()}
ami = get_protocol(
    "MyDatabase.SpeakerDiarization.MyProtocol",
    preprocessors=preprocessors,
)

In [2]:
# Log in to the Hugging Face Hub so that the pretrained speaker-diarization
# pipeline can be downloaded from it.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Download the pretrained speaker-diarization pipeline.
# Authentication relies on the credentials stored by notebook_login() instead
# of a literal access token: a token written into the notebook is exposed to
# everyone who can read the file.
# NOTE(review): the token that used to be hard-coded on this line must be
# revoked in the Hugging Face account settings — it has already been published.
from pyannote.audio import Pipeline
pretrained_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=True)

In [5]:
# Apply the pretrained pipeline to our test set and score it with the
# Diarization Error Rate (DER) from pyannote.metrics.
from pyannote.metrics.diarization import DiarizationErrorRate
metric = DiarizationErrorRate()

for file in ami.test():
    # Keep the hypothesis on the file under its own key, next to the reference.
    file["pretrained pipeline"] = pretrained_pipeline(file)
    # Each call adds this file to the metric's running total.
    # presumably uem=file["annotated"] limits scoring to the annotated regions — verify
    metric(file["annotation"], file["pretrained pipeline"], uem=file["annotated"])

# abs(metric) is printed as the DER over all files scored above.
# A space was missing before "test set", which glued it to the protocol name.
print(f"The pretrained pipeline reaches a Diarization Error Rate (DER) of {100 * abs(metric):.1f}% on {ami.name} test set.")

The pretrained pipeline reaches a Diarization Error Rate (DER) of 29.1% on MyDatabase.SpeakerDiarization.MyProtocoltest set.


Файнтюнинг сегментационной модели

In [None]:
# Load the pretrained segmentation model that will be fine-tuned below.
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/segmentation", use_auth_token=True)

In [6]:
# Prepare the model for fine-tuning on our training dataset.
from pyannote.audio.tasks import Segmentation
# The task reuses the pretrained model's own chunk duration and number of
# speaker classes, so it stays compatible with the model's specifications.
task = Segmentation(
    ami, 
    duration=model.specifications.duration, 
    max_num_speakers=len(model.specifications.classes), 
    batch_size=32,
    num_workers=2, 
    loss="bce", 
    vad_loss="bce")
# Attach the task to the model and run its setup for the "fit" stage.
model.task = task
model.setup(stage="fit")

Protocol MyDatabase.SpeakerDiarization.MyProtocol does not precompute the output of torchaudio.info(): adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. See pyannote.database documentation on how to do that yourself.


In [7]:
# Fine-tune the model with pytorch-lightning for at most 20 epochs.
from types import MethodType

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    RichProgressBar,
)
from torch.optim import Adam


def configure_optimizers(self):
    """Return an Adam optimizer over all of the model's parameters (lr=1e-4)."""
    return Adam(self.parameters(), lr=1e-4)


# Bind the function to this particular model instance as a method.
model.configure_optimizers = MethodType(configure_optimizers, model)

# The task supplies the validation quantity to watch and its direction;
# both callbacks below receive them as `monitor` and `mode`.
monitor, direction = task.val_monitor

# Stop training when the monitored value has not improved for 10 checks.
early_stopping = EarlyStopping(
    monitor=monitor,
    mode=direction,
    min_delta=0.0,
    patience=10,
    strict=True,
    verbose=False,
)

# Keep only the single best checkpoint; its file name is the epoch number.
checkpoint = ModelCheckpoint(
    monitor=monitor,
    mode=direction,
    save_top_k=1,
    every_n_epochs=1,
    save_last=False,
    save_weights_only=False,
    filename="{epoch}",
    verbose=False,
)

callbacks = [RichProgressBar(), checkpoint, early_stopping]

trainer = Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=20,
    gradient_clip_val=0.5,
    callbacks=callbacks,
)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

In [8]:
# Remember the path to the best checkpoint written during training.
# Note that this is a file path, not a model object.
finetuned_model = checkpoint.best_model_path


In [109]:
# Save the fine-tuned model under a stable file name.
# finetuned_model is the *path* of the best checkpoint (a string), so passing
# it to torch.save() would pickle the path text rather than the model.
# Copy the checkpoint file itself instead; this raises if the checkpoint
# is missing, instead of silently writing a useless file.
import shutil
import torch
import pickle
shutil.copyfile(finetuned_model, 'model.pth')

In [9]:
# Hyperparameters of the pretrained pipeline, i.e. before fine-tuning.
pretrained_hyperparameters = pretrained_pipeline.parameters(instantiated=True)
# Bare expression: the notebook displays its value as the cell output.
pretrained_hyperparameters   

{'segmentation': {'min_duration_off': 0.5817029604921046,
  'threshold': 0.4442333667381752},
 'clustering': {'method': 'centroid',
  'min_cluster_size': 15,
  'threshold': 0.7153814381597874}}

In [10]:
# Tune segmentation.threshold on the development set for 20 optimizer iterations.
from pyannote.audio.pipelines import SpeakerDiarization
from pyannote.pipeline import Optimizer

# NOTE(review): "OracleClustering" presumably takes speaker labels from the
# reference, so that only the segmentation step is being tuned — confirm.
pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    clustering="OracleClustering",
)
# min_duration_off is fixed at 0.0, which takes it out of the search.
pipeline.freeze({"segmentation": {"min_duration_off": 0.0}})

optimizer = Optimizer(pipeline)
# Materialize the development set once as a list.
dev_set = list(ami.development())

iterations = optimizer.tune_iter(dev_set, show_progress=False)
best_loss = 1.0
# Count from 1 so the loop stops after exactly 20 iterations; the previous
# 0-based `i > 20` check only stopped after 22.
for i, iteration in enumerate(iterations, start=1):
    print(f"Best segmentation threshold so far: {iteration['params']['segmentation']['threshold']}")
    if i >= 20:
        break
     

Best segmentation threshold so far: 0.3714352248774171
Best segmentation threshold so far: 0.822177212538127
Best segmentation threshold so far: 0.822177212538127
Best segmentation threshold so far: 0.822177212538127
Best segmentation threshold so far: 0.5326851199720662
Best segmentation threshold so far: 0.5326851199720662
Best segmentation threshold so far: 0.6438912779899371
Best segmentation threshold so far: 0.6438912779899371
Best segmentation threshold so far: 0.6327076853708551
Best segmentation threshold so far: 0.6327076853708551
Best segmentation threshold so far: 0.6327076853708551
Best segmentation threshold so far: 0.6327076853708551
Best segmentation threshold so far: 0.6554349132912024
Best segmentation threshold so far: 0.6554349132912024
Best segmentation threshold so far: 0.6554349132912024
Best segmentation threshold so far: 0.6554349132912024
Best segmentation threshold so far: 0.6554349132912024
Best segmentation threshold so far: 0.6554349132912024
Best segmenta

In [11]:
# Keep the best segmentation threshold found by the optimizer.
best_segmentation_threshold = optimizer.best_params["segmentation"]["threshold"]  # NOTE(review): presumably the value from a recorded run was 0.6411790142535215

In [12]:
# Tune clustering.threshold for 20 optimizer iterations, with
# segmentation.threshold fixed at the value found in the previous step.
# NOTE(review): `klustering` looks like a typo but is presumably the attribute
# under which the pretrained pipeline stores its clustering setting — confirm
# against the installed pyannote.audio before renaming it.
pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    embedding=pretrained_pipeline.embedding,
    embedding_exclude_overlap=pretrained_pipeline.embedding_exclude_overlap,
    clustering=pretrained_pipeline.klustering,
)

# Freeze everything except clustering.threshold so that it is the only
# parameter the optimizer prints and tunes below.
pipeline.freeze({
    "segmentation": {
        "threshold": best_segmentation_threshold,
        "min_duration_off": 0.0,
    },
    "clustering": {
        "method": "centroid",
        "min_cluster_size": 15,
    },
})

optimizer = Optimizer(pipeline)
iterations = optimizer.tune_iter(dev_set, show_progress=False)
best_loss = 1.0
# Count from 1 so the loop stops after exactly 20 iterations; the previous
# 0-based `i > 20` check only stopped after 22.
for i, iteration in enumerate(iterations, start=1):
    print(f"Best clustering threshold so far: {iteration['params']['clustering']['threshold']}")
    if i >= 20:
        break  # 50 iterations should give slightly better results

Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 1.0178581606313155
Best clustering threshold so far: 0.7368852266062508
Best clustering threshold so far: 0.7368852266062508
Best clustering threshold so far: 0.6240540134103624
Best clustering threshold so far: 0.6240540134103624
Best clustering threshold so far: 0.6240540134103624
Best clustering threshold so far: 0.6240540134

In [13]:
# Keep the best clustering threshold found by the optimizer.
best_clustering_threshold = optimizer.best_params['clustering']['threshold']  # NOTE(review): presumably the value from a recorded run was 0.593157817904834

In [14]:
# Score the fine-tuned pipeline (DER) on the test set, using the tuned
# segmentation.threshold and clustering.threshold values.
finetuned_pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    embedding=pretrained_pipeline.embedding,
    embedding_exclude_overlap=pretrained_pipeline.embedding_exclude_overlap,
    clustering=pretrained_pipeline.klustering,
)

# Unlike the tuning steps, every hyperparameter gets a concrete value here.
finetuned_pipeline.instantiate(
    {
        "segmentation": {
            "threshold": best_segmentation_threshold,
            "min_duration_off": 0.0,
        },
        "clustering": {
            "method": "centroid",
            "min_cluster_size": 15,
            "threshold": best_clustering_threshold,
        },
    }
)

# A fresh metric, so results of the pretrained pipeline are not mixed in.
metric = DiarizationErrorRate()

for file in ami.test():
    hypothesis = finetuned_pipeline(file)
    file["finetuned pipeline"] = hypothesis
    metric(file["annotation"], hypothesis, uem=file["annotated"])

print(f"The finetuned pipeline reaches a Diarization Error Rate (DER) of {100 * abs(metric):.1f}% on {ami.name} test set.")
     

The finetuned pipeline reaches a Diarization Error Rate (DER) of 23.3% on MyDatabase.SpeakerDiarization.MyProtocol test set.


In [97]:
# Compute the Jaccard Error Rate (JER) of the fine-tuned pipeline on the test set.
from pyannote.metrics.diarization import JaccardErrorRate

jaccardmetric = JaccardErrorRate()

for file in ami.test():
    # The pipeline is applied again to each test file before scoring.
    hypothesis = finetuned_pipeline(file)
    file["finetuned pipeline"] = hypothesis
    jaccardmetric(file["annotation"], hypothesis, uem=file["annotated"])

print(f"The finetuned pipeline reaches a Jaccard Error Rate (JER) of {100 * abs(jaccardmetric):.1f}% on {ami.name} test set.")

The finetuned pipeline reaches a Jaccard Error Rate (JER) of 29.8% on MyDatabase.SpeakerDiarization.MyProtocol test set.
