In [None]:
# Shell call: run `pyannote-database info` on the AMI-SDM "mini" speaker-diarization
# protocol, with PYANNOTE_DATABASE_CONFIG set to the project's database.yml for this command only.
!PYANNOTE_DATABASE_CONFIG="/work/proy/AMI-diarization-setup/pyannote/database.yml" pyannote-database info AMI-SDM.SpeakerDiarization.mini

In [None]:
from pyannote.database import registry, FileFinder

# Register the AMI database description with pyannote's registry, then fetch the
# "mini" SDM speaker-diarization protocol; the "audio" key is filled in by FileFinder.
registry.load_database("/work/proy/AMI-diarization-setup/pyannote/database.yml")
audio_preprocessors = {"audio": FileFinder()}
dataset = registry.get_protocol(
    "AMI-SDM.SpeakerDiarization.mini",
    audio_preprocessors,
)

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login. Presumably required by the later
# from_pretrained(..., use_auth_token=True) calls in this notebook — confirm.
notebook_login()

GPU availability check

In [None]:
import torch

# Query CUDA availability once and reuse the answer for the device-name line.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
print(torch.cuda.device_count())
if cuda_ready:
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU found")
print(torch.version.cuda)

Dedicating GPU

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

Overlapped Speech Detection

In [None]:
# #Create OSD task & fine-tune "pyannote/segmentation-3.0" model
# from pyannote.audio.tasks import OverlappedSpeechDetection
# from pyannote.audio.core.model import Model
# import pytorch_lightning as pl

# osd_task = OverlappedSpeechDetection(
#     dataset,      # same AMI mini dataset
#     duration=4.0, # 4s chunks
#     batch_size=16
# )

# osd_model = Model.from_pretrained("pyannote/segmentation-3.0", use_auth_token=True)
# osd_model.to(device)
# osd_model.task = osd_task

# osd_trainer = pl.Trainer(
#     max_epochs=60,   # quick example (increase for better performance)
#     accelerator="gpu",
#     devices=1
# )
# osd_trainer.fit(osd_model)

In [None]:
# from pyannote.audio.pipelines import OverlappedSpeechDetection as OSDPipeline
# osd_pipeline = OSDPipeline(osd_model)  

In [None]:
# from pyannote.pipeline import Optimizer
# validation_files = list(dataset.development())
# optimizer = Optimizer(osd_pipeline)

# best = optimizer.tune(validation_files, n_iterations=150, show_progress=True)
# print("Best parameters:", best)

# # Re-instantiate pipeline with best parameters
# best_params = optimizer.best_params
# osd_pipeline = OSDPipeline(osd_model).instantiate(best_params)


In [None]:
# from pyannote.core import Annotation, Timeline

# def timeline_to_annotation(timeline: Timeline, label="OVERLAP") -> Annotation:
#     annotation = Annotation()
#     for segment in timeline:
#         annotation[segment] = label
#     return annotation

# from pyannote.metrics.detection import DetectionErrorRate

# osd_metric = DetectionErrorRate()

# for file in dataset.test():
#     # Hypothesis: OverlappedSpeechDetection pipeline returns an *Annotation* already
#     overlap_hyp_annot = osd_pipeline(file)
#     print("Hypothesis labels:", overlap_hyp_annot.labels())


#     # Reference: get_overlap() returns a *Timeline* that you must convert
#     overlap_ref_tl = file["annotation"].get_overlap()
#     overlap_ref_annot = timeline_to_annotation(overlap_ref_tl, label="OVERLAP")
#     print("Reference labels:", overlap_ref_annot.labels())

#     # Pass them both as Annotations
#     osd_metric(
#         reference=overlap_ref_annot,
#         hypothesis=overlap_hyp_annot,
#         uem=file.get("annotated", None),
#         uri=file["uri"]
#     )

# print(osd_metric.report(display=False))


Pretrained Pipeline and its components

In [None]:
# NOTE(review): `pipeline` from transformers is not referenced in this cell, and the
# name `pipeline` is later rebound to a SpeakerDiarization instance.
from transformers import pipeline
from pyannote.audio import Pipeline

# Load the "pyannote/speaker-diarization" pipeline (use_auth_token=True) and move
# it to the device selected earlier.
pretrained_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=True) 
pretrained_pipeline.to(device)

In [None]:
# Print the three building blocks of the pretrained pipeline, one per line,
# as "<attribute name>: <value>".
for component in ("segmentation_model", "embedding", "klustering"):
    print(f"{component}: {getattr(pretrained_pipeline, component)}")


Baseline (default pipeline run, with per-file latency and number of segments detected)

In [None]:
import time
import torch
from pyannote.metrics.diarization import DiarizationErrorRate
from pyannote.core import Annotation

# Baseline evaluation: run the unmodified pretrained pipeline on every test file
# and record, per file, the latency, the number of output tracks and the detailed
# DER components returned by the metric.

# Metric object; called once per test file below.
metric = DiarizationErrorRate()

# Per-file results, keyed by file URI.
latency_dict = {}
segment_count_dict = {}

# perf_counter() is monotonic, so measured intervals are not affected by
# system clock adjustments (unlike time.time()).
overall_start_time = time.perf_counter()

for file in dataset.test():
    # Start timer for the individual file
    start_time = time.perf_counter()

    # Applying the pretrained pipeline.
    diarization_result: Annotation = pretrained_pipeline(file["audio"])

    # Wait for pending GPU work so the timer covers the whole inference.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # End timer for the individual file
    end_time = time.perf_counter()
    latency = end_time - start_time
    latency_dict[file['uri']] = latency

    # Count every track yielded by the hypothesis annotation.
    number_of_segments = sum(1 for _ in diarization_result.itertracks())
    segment_count_dict[file['uri']] = number_of_segments

    # DER with detailed metrics, restricted to the file's "annotated" regions.
    detailed_der = metric(
        file["annotation"],
        diarization_result,
        uem=file["annotated"],
        detailed=True
    )

    print(f"File: {file['uri']} - DER: {detailed_der['diarization error rate']:.2f}")
    print(f"  False Alarm: {detailed_der['false alarm']:.2f}")
    print(f"  Miss: {detailed_der['missed detection']:.2f}")
    print(f"  Confusion: {detailed_der['confusion']:.2f}")
    print(f"  Processed in {latency:.2f} seconds, producing {number_of_segments} segments.\n")

# Overall elapsed time for the whole loop (inference + scoring + printing).
overall_end_time = time.perf_counter()
overall_elapsed = overall_end_time - overall_start_time

# Guard against an empty test set to avoid dividing by zero.
if latency_dict:
    mean_latency = sum(latency_dict.values()) / len(latency_dict)
else:
    mean_latency = 0

print(f"Mean processing time for all files: {mean_latency:.2f} seconds.")
print(f"Overall elapsed time for the test set: {overall_elapsed:.2f} seconds.")

#optional
# print("Latency per file:", latency_dict)
# print("Segment count per file:", segment_count_dict)


Visualizations for Baseline

In [None]:
# Display the reference annotation of `file`. This relies on the loop variable left
# bound by the baseline loop (the last test file), assuming cells are run in order.
file["annotation"]

In [None]:
# Display the pretrained pipeline's output for the last file processed by the
# baseline loop (variable left bound by that loop; assumes cells are run in order).
diarization_result

In [None]:
import matplotlib.pyplot as plt

def overlay_annotations(ground_truth, hypothesis, file_id=""):
    """Plot reference and predicted segments as two horizontal rows.

    Args:
        ground_truth: object whose ``itertracks(yield_label=True)`` yields
            ``(segment, track, label)`` triples, where ``segment`` exposes
            ``start`` and ``end``. Drawn in blue at y=1.2.
        hypothesis: same interface as ``ground_truth``. Drawn in red at y=0.8.
        file_id: optional identifier used as the figure title when non-empty.

    Speaker labels are not used for colouring: each row has a single colour.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 3))

    # Plot ground truth segments in blue
    for segment, _, label in ground_truth.itertracks(yield_label=True):
        ax.hlines(
            y=1.2,
            xmin=segment.start,
            xmax=segment.end,
            colors='b',
            linewidth=10,
            label='Ground Truth'
        )

    # Plot hypothesis segments in red
    for segment, _, label in hypothesis.itertracks(yield_label=True):
        ax.hlines(
            y=0.8,
            xmin=segment.start,
            xmax=segment.end,
            colors='r',
            linewidth=10,
            label='Prediction'
        )

    ax.set_yticks([0.8, 1.2])
    ax.set_yticklabels(['Prediction', 'Ground Truth'])
    ax.set_xlabel("Time (s)")
    if file_id:
        ax.set_title(file_id)

    # Every segment registers the same legend label, so collapse the handles to
    # one per label before drawing the legend.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    if by_label:
        ax.legend(by_label.values(), by_label.keys(), loc="upper right")

    plt.tight_layout()
    plt.show()

# Find the test file with uri "ES2004a", run the pretrained pipeline on it once,
# plot reference vs. prediction, then stop scanning.
for file in dataset.test():
    if file["uri"] != "ES2004a":
        continue

    diarization_result = pretrained_pipeline(file["audio"])
    overlay_annotations(file["annotation"], diarization_result, file_id=file["uri"])
    break


# DER, total number of segments, per-file latency and overall latency for the NOTSOFAR dataset

In [None]:
# import time
# import torch
# from datasets import load_dataset

# # Load the dataset in streaming mode.
# dataset = load_dataset("microsoft/NOTSOFAR", streaming=True)

# # Define the number of files to process.
# num_files_to_process = 30

# total_audio_duration = 0.0  # in seconds
# file_counter = 0
# latency_list = []  # to store per-file latencies

# overall_start_time = time.time()

# for file in dataset["test"]:
#     if file_counter >= num_files_to_process:
#         break

#     # Convert audio array to a torch tensor and cast to float.
#     waveform = torch.tensor(file["audio"]["array"]).float()

#     # Adjust dimensions: if mono, add a channel dimension.
#     if waveform.ndim == 1:
#         waveform = waveform.unsqueeze(0)
#     elif waveform.shape[0] < waveform.shape[1]:
#         waveform = waveform.transpose(0, 1)

#     sample_rate = file["audio"]["sampling_rate"]
#     # Calculate the duration of this audio file in seconds.
#     duration = waveform.shape[1] / sample_rate
#     total_audio_duration += duration

#     # Prepare the input dictionary for the pipeline.
#     audio_input = {
#         "waveform": waveform,
#         "sample_rate": sample_rate
#     }

#     # Measure processing time for this file.
#     file_start = time.time()
#     _ = pretrained_pipeline(audio_input)
#     if torch.cuda.is_available():
#         torch.cuda.synchronize()
#     file_end = time.time()

#     file_latency = file_end - file_start
#     latency_list.append(file_latency)
    
#     #print(f"Processed file {file_counter+1} in {file_latency:.2f} seconds (duration: {duration:.2f} seconds).")
    
#     file_counter += 1

# overall_end_time = time.time()
# total_processing_time = overall_end_time - overall_start_time

# average_latency_per_file = sum(latency_list) / len(latency_list)
# latency_per_audio_second = total_processing_time / total_audio_duration

# print(f"\nTotal audio duration processed: {total_audio_duration:.2f} seconds")
# print(f"Total processing time for {num_files_to_process} files: {total_processing_time:.2f} seconds")
# print(f"Average latency per file: {average_latency_per_file:.2f} seconds")
# print(f"Processing latency per second of audio: {latency_per_audio_second:.4f} seconds/second")


Segment-wise latency (initial pipeline)

In [None]:
# import time
# from pyannote.core import Annotation

# segment_latencies = {}

# for file in dataset.test():
    
#     file["pretrained pipeline"] = pretrained_pipeline(file)
#     diarization_result: Annotation = file["pretrained pipeline"]

#     # Storing latency information for the current file
#     segment_latencies[file["uri"]] = []

#     for segment in diarization_result.get_timeline():
#         # Get start time for latency measurement
#         start_time = time.time()

#         _ = pretrained_pipeline({"audio": file["audio"], "segment": segment})

#         # Get end time and calculate latency
#         end_time = time.time()
#         segment_latency = end_time - start_time

#         # Save segment and its latency
#         segment_latencies[file["uri"]].append((segment, segment_latency))

#     # Print segment-wise latency for the file
#     print(f"File: {file['uri']}")
#     for seg, latency in segment_latencies[file["uri"]]:
#         print(f"  Segment: [{seg.start:.2f} -> {seg.end:.2f}] - Latency: {latency:.2f} seconds")

# # To access latencies later
# # print(segment_latencies)


Finetuning the Pipeline

In [None]:
from pyannote.audio import Model
# Load the pretrained "pyannote/segmentation" model (use_auth_token=True) and
# move it to the device selected earlier; this is the model that gets fine-tuned.
model_train = Model.from_pretrained("pyannote/segmentation", use_auth_token=True)
model_train.to(device)


In [None]:
# Sanity check: print the device of the model's first parameter.
print(next(model_train.parameters()).device)


In [None]:
from pyannote.audio.tasks import Segmentation

# Build the fine-tuning task from the pretrained model's own specifications:
# chunk duration and the number of classes are read from the model itself.
pretrained_specs = model_train.specifications
task = Segmentation(
    dataset,
    duration=pretrained_specs.duration,
    max_num_speakers=len(pretrained_specs.classes),
    batch_size=32,
    num_workers=2,
    loss="bce",
    vad_loss="bce",
)

# Attach the task to the model, then call its data-preparation and setup hooks
# explicitly.
model_train.task = task
model_train.prepare_data()
model_train.setup()

In [None]:
from types import MethodType
from torch.optim import Adam
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    RichProgressBar,
)

# we use Adam optimizer with 1e-4 learning rate
def configure_optimizers(self):
    """Return an Adam optimizer over all of the model's parameters (lr=1e-4)."""
    return Adam(self.parameters(), lr=1e-4)

# Bind the function above to this model instance, replacing its configure_optimizers.
model_train.configure_optimizers = MethodType(configure_optimizers, model_train)


# we monitor DER on the validation set
# and use it to keep the best checkpoint and stop early
# (the monitored quantity and its direction both come from task.val_monitor)
monitor, direction = task.val_monitor
# Keep only the single best checkpoint, evaluated every epoch.
checkpoint = ModelCheckpoint(
    monitor=monitor,
    mode=direction,
    save_top_k=1,
    every_n_epochs=1,
    save_last=False,
    save_weights_only=False,
    filename="{epoch}",
    verbose=False,
)
# Stop training after 10 checks without improvement of the monitored value.
early_stopping = EarlyStopping(
    monitor=monitor,
    mode=direction,
    min_delta=0.0,
    patience=10,
    strict=True,
    verbose=False,
)

callbacks = [RichProgressBar(), checkpoint, early_stopping]

from pytorch_lightning import Trainer
# NOTE(review): devices=[1] presumably selects GPU index 1, whereas the rest of this
# notebook uses "cuda:0" — confirm this is intended (it needs a second GPU).
# NOTE(review): strategy="dp" may not be available in recent Lightning releases — verify
# against the installed version.
trainer = Trainer(accelerator="gpu", 
                  devices = [1],
                  callbacks=callbacks, 
                  max_epochs=20,
                  strategy = "dp",
                  gradient_clip_val=0.5)


# Run fine-tuning; the best checkpoint path is afterwards read from `checkpoint`.
trainer.fit(model_train)

In [None]:
# Path of the best checkpoint kept by the ModelCheckpoint callback (save_top_k=1).
# NOTE(review): later cells read checkpoint.best_model_path directly rather than this variable.
best_model_path = checkpoint.best_model_path

In [None]:
# Fetch the pretrained pipeline's hyper-parameters (instantiated=True) and show
# them as the cell output.
pretrained_hyperparameters = pretrained_pipeline.parameters(instantiated=True)
pretrained_hyperparameters

In [None]:
# Reload the fine-tuned segmentation model from the best checkpoint saved during training.
finetuned_model = Model.from_pretrained(checkpoint.best_model_path) 

In [None]:
# Move the fine-tuned model to the device selected earlier in the notebook
# (cuda:0 when available, otherwise CPU) instead of hard-coding cuda:0, which
# fails on a CPU-only machine.
finetuned_model.to(device)

In [None]:
# List every named parameter of the fine-tuned model with the device it lives on.
for param_name, tensor in finetuned_model.named_parameters():
    print(f"{param_name} is on device {tensor.device}")

Best Segmentation Threshold

In [None]:
from pyannote.audio.pipelines import SpeakerDiarization
from pyannote.metrics.diarization import DiarizationErrorRate
from pyannote.pipeline import Optimizer


# Pipeline used only to tune the segmentation threshold: fine-tuned segmentation
# model combined with "OracleClustering".
pipeline = SpeakerDiarization(segmentation=finetuned_model, clustering="OracleClustering")

# min_duration_off is fixed to zero so it is excluded from the search.
pipeline.freeze({"segmentation": {"min_duration_off": 0.0}})

optimizer = Optimizer(pipeline)
dev_set = list(dataset.development())  # Full development set

# Stop after this many iterations, or earlier once the loss drops below the threshold.
max_iterations = 30
loss_threshold = 0.1
best_loss = 1.0

for step, trial in enumerate(optimizer.tune_iter(dev_set, show_progress=True), start=1):
    trial_loss = trial['loss']
    seg_threshold = trial['params']['segmentation']['threshold']
    print(f"Iteration {step}, Segmentation threshold: {seg_threshold}, Loss: {trial_loss}")

    # Track the lowest loss seen so far.
    best_loss = min(best_loss, trial_loss)

    if best_loss < loss_threshold or step >= max_iterations:
        break

In [None]:
# Keep the best segmentation threshold found by the optimizer; it is frozen into
# the pipeline when the clustering threshold is tuned.
best_segmentation_threshold = optimizer.best_params["segmentation"]["threshold"]

Best Clustering Threshold

In [None]:
# Full pipeline: fine-tuned segmentation model plus the pretrained pipeline's
# embedding and clustering components.
pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    embedding=pretrained_pipeline.embedding,
    embedding_exclude_overlap=False,  # pretrained_pipeline.embedding_exclude_overlap
    clustering=pretrained_pipeline.klustering,
).to(device)

# Freeze everything except the clustering threshold, which is what gets tuned here.
frozen_segmentation = {
    "threshold": best_segmentation_threshold,
    "min_duration_off": 0.0,
}
frozen_clustering = {
    "method": "centroid",
    "min_cluster_size": 15,
}
pipeline.freeze({"segmentation": frozen_segmentation, "clustering": frozen_clustering})

optimizer = Optimizer(pipeline)
iterations = optimizer.tune_iter(dev_set, show_progress=False)
best_loss = 1.0
for trial_index, trial in enumerate(iterations):
    clustering_threshold = trial['params']['clustering']['threshold']
    print(f"Best clustering threshold so far: {clustering_threshold}")
    # Stop once the index exceeds 70.
    if trial_index > 70:
        break

In [None]:
# Best clustering threshold found by the optimizer; reused when the final
# fine-tuned pipeline is instantiated.
best_clustering_threshold = optimizer.best_params["clustering"]["threshold"]
print(f"Best Clustering Threshold: {best_clustering_threshold}")

In [None]:
#final DER on the test set
import time
import torch

# Final pipeline: fine-tuned segmentation model plus the pretrained pipeline's
# embedding and clustering components, instantiated with the tuned thresholds.
finetuned_pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    embedding=pretrained_pipeline.embedding,
    embedding_exclude_overlap= False, #pretrained_pipeline.embedding_exclude_overlap,
    clustering=pretrained_pipeline.klustering,
).to(device)

finetuned_pipeline.instantiate({
    "segmentation": {
        "threshold": best_segmentation_threshold,
        "min_duration_off": 0.0,
    },
    "clustering": {
        "method": "centroid",
        "min_cluster_size": 15,
        "threshold": best_clustering_threshold,
    },
})

metric = DiarizationErrorRate()
latencies = []
# Number of output tracks per file, keyed by URI (counted the same way as in the
# baseline cell so the two runs can be compared).
segment_count_dict = {}


for file in dataset.test():
    # perf_counter() is monotonic, so the interval is not affected by clock adjustments.
    start_time = time.perf_counter()
    file["finetuned pipeline"] = finetuned_pipeline(file)

    # Wait for pending GPU work before stopping the timer, as the baseline cell
    # does, so both latencies are measured the same way.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    latency = time.perf_counter() - start_time
    latencies.append(latency)

    segment_count_dict[file['uri']] = sum(
        1 for _ in file["finetuned pipeline"].itertracks()
    )

    file_der=metric(file["annotation"], file["finetuned pipeline"], uem=file["annotated"])

    print(f"File: {file['uri']} - DER: {file_der:.2f} - Latency: {latency:.2f} s")


# Guard against an empty test set to avoid dividing by zero.
average_latency = sum(latencies) / len(latencies) if latencies else 0.0
print(f"Average pipeline latency per file: {average_latency:.2f} s")

# abs(metric) is the value accumulated over every file scored above.
print(f"The finetuned pipeline reaches a Diarization Error Rate (DER) of {100 * abs(metric):.1f}% on {dataset.name} test set.")
final_report_df = metric.report(display=False)
print(final_report_df)

In [None]:
# Re-run the fine-tuned pipeline on each test file and report how many segments
# its output timeline contains.
for file in dataset.test():
    file["finetuned pipeline"] = finetuned_pipeline(file)
    segment_total = len(file["finetuned pipeline"].get_timeline())
    print(f"File: {file['uri']} - Segments detected: {segment_total}")


Visualizations for Finetuned Pipeline

In [None]:
import matplotlib.pyplot as plt

def overlaying_annotations(ground_truth, hypothesis, file_id=""):
    """Plot reference and predicted segments as two horizontal rows.

    Args:
        ground_truth: object whose ``itertracks(yield_label=True)`` yields
            ``(segment, track, label)`` triples, where ``segment`` exposes
            ``start`` and ``end``. Drawn in blue at y=1.2.
        hypothesis: same interface as ``ground_truth``. Drawn in red at y=0.8.
        file_id: optional identifier used as the figure title when non-empty.

    Speaker labels are not used for colouring: each row has a single colour.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 3))

    # (annotation, row height, colour, legend label) for each row of the plot.
    rows = (
        (ground_truth, 1.2, 'b', 'Ground Truth'),
        (hypothesis, 0.8, 'r', 'Prediction'),
    )
    for annotation, y_pos, colour, legend_label in rows:
        for idx, (segment, _, label) in enumerate(annotation.itertracks(yield_label=True)):
            ax.hlines(
                y=y_pos,
                xmin=segment.start,
                xmax=segment.end,
                colors=colour,
                linewidth=10,
                # Only the first segment of a row carries the legend label, so
                # each row contributes a single legend entry.
                label=legend_label if idx == 0 else '',
            )

    ax.set_yticks([0.8, 1.2])
    ax.set_yticklabels(['Prediction', 'Ground Truth'])
    ax.set_xlabel("Time (s)")
    if file_id:
        ax.set_title(file_id)

    # Draw the legend from the collected handles (one per label).
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    if by_label:
        ax.legend(by_label.values(), by_label.keys(), loc="upper right")

    plt.tight_layout()
    plt.show()

# Example usage with the finetuned pipeline on a specific test file (the one with uri "ES2004a"):
# run the pipeline on it, plot reference vs. prediction, then stop scanning the test set.
for file in dataset.test():
    if file["uri"] == "ES2004a":
        
        file["finetuned pipeline"] = finetuned_pipeline(file)
        overlaying_annotations(file["annotation"], file["finetuned pipeline"], file_id=file["uri"])
        break  


Profiling

In [None]:
import cProfile
import pstats
import io

def run_pipeline(file):
    """Run the fine-tuned pipeline on ``file`` and score it.

    The hypothesis is stored under ``file["finetuned pipeline"]``; the value
    returned by ``metric`` for this file is returned to the caller.
    """
    file["finetuned pipeline"] = finetuned_pipeline(file)
    return metric(
        file["annotation"],
        file["finetuned pipeline"],
        uem=file["annotated"],
    )


# Profile one full pass over the test set.
profiler = cProfile.Profile()
profiler.enable()
for file in dataset.test():
    run_pipeline(file)
profiler.disable()

# Render the statistics, sorted by cumulative time, into a text buffer and print it.
s = io.StringIO()
ps = pstats.Stats(profiler, stream=s)
ps.sort_stats('cumulative').print_stats()
print(s.getvalue())

In [None]:
import cProfile
import pstats
import os

# The stats file is written with a relative path, so show the working directory first.
print("Current working directory:", os.getcwd())

profile_path = "profile.prof"
profiler = cProfile.Profile()

# Profile one full pass (inference + scoring) over the test set.
profiler.enable()
for file in dataset.test():
    file["finetuned pipeline"] = finetuned_pipeline(file)
    _ = metric(file["annotation"], file["finetuned pipeline"], uem=file["annotated"])
profiler.disable()

# Persist the raw statistics for later inspection.
profiler.dump_stats(profile_path)
print(f"Profile stats dumped to {profile_path}")
