In [1]:
import os
from pyannote.audio import Pipeline
import random
from pathlib import Path
from hashlib import md5
import gzip
import shutil
import tarfile
import pandas as pd
import torchaudio
import torch
from torchmetrics.text import WordErrorRate
from IPython.display import Audio
import torchaudio.functional as audio_func
from pyannote.core import notebook, Annotation, Segment, Timeline
from pyannote.metrics.segmentation import SegmentationCoverage, SegmentationPurity, SegmentationPrecision, SegmentationRecall
from pyannote.metrics.identification import IdentificationPrecision, IdentificationRecall, IdentificationErrorRate
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from nltk import word_tokenize
import json
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed Python's built-in `random` module so its draws repeat across runs.
random.seed(a=1337)

In [3]:
# Hugging Face access token taken from the environment; a missing
# HUGGING_FACE_TOKEN variable raises KeyError here.
hf_token = os.environ["HUGGING_FACE_TOKEN"]

In [4]:
# Sample rate that clips are resampled to and that outputs are saved with.
default_sample_rate = 16_000
# Not referenced by any other cell visible in this notebook.
min_upvote_count = 2
# Directory that receives the generated clips and labels.json.
output_path = Path("data") / "dev" / "concat-cv"
# Number of concatenated recordings to generate.
output_count = 10
# Number of client_id draws made for each recording.
clients_per_output = 5
# Upper bound on the number of source clips merged into one recording.
output_sample_count = 100

In [5]:
# Make sure the output directory (and any missing parents) exists.
os.makedirs(output_path, exist_ok=True)

In [6]:
# Dataset root: the `ru` folder of the cv-corpus-21.0-2025-03-14 release.
dataset_base_path = Path("data", "cv-corpus-21.0-2025-03-14", "ru")
# Folder holding the individual audio clips.
clips_path = dataset_base_path.joinpath("clips")
# NOTE: despite its name, test_csv_path points at the dev split (dev.tsv).
test_csv_path = dataset_base_path.joinpath("dev.tsv")
invalidated_csv_path = dataset_base_path.joinpath("invalidated.tsv")

In [7]:
# Metadata table of the split, one row per clip (tab-separated file).
files = pd.read_table(test_csv_path.absolute())

In [8]:
# Drop every clip whose client_id also occurs in invalidated.tsv.
invalidated = pd.read_table(invalidated_csv_path)
invalidated_ids = set(invalidated["client_id"])
files = files.loc[~files["client_id"].isin(invalidated_ids)]

In [9]:
# Use the first CUDA device when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

GPU: NVIDIA GeForce GTX 1050 Ti is available.


In [10]:
# Load the pretrained voice-activity-detection pipeline and move it to `device`.
vad = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=hf_token,
)
vad = vad.to(device)
# Hyper-parameters handed to the pipeline: onset/offset are presumably the
# activation thresholds and min_duration_* durations in seconds — see the
# pyannote model card to confirm.
initial_params = dict(onset=0.4, offset=0.3, min_duration_on=0.0, min_duration_off=0.1)
vad.instantiate(initial_params);

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
/home/lemon/Documents/workspace/ai_meeting_notes/.venv/lib64/python3.12/site-packages/pytorch_lightning/utilities/migration/migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
Lightning automatically upgraded your loaded checkpoint from v1.1.3 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/059e96f964841d40f1a5e7

Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.7.1, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


In [11]:
def load_audio(file, target_sample_rate=default_sample_rate, **kwargs):
    """Load an audio file, peak-normalise it and resample it.

    Args:
        file: Path (or file-like object) handed to ``torchaudio.load``.
        target_sample_rate: Sample rate of the returned waveform.
        **kwargs: Accepted for call compatibility; currently ignored.

    Returns:
        The waveform tensor from ``torchaudio.load``, scaled so that its
        largest absolute sample is 1 and resampled to ``target_sample_rate``.
        An all-zero recording is returned unscaled.
    """
    tensor, sample_rate = torchaudio.load(file)
    peak = tensor.abs().max()
    # Dividing by a zero peak would turn a silent clip into NaNs.
    if peak > 0:
        tensor = tensor / peak
    return audio_func.resample(tensor, sample_rate, target_sample_rate)

In [12]:
def measure_sample(tensor, sample_rate=default_sample_rate):
    """Find the overall span of detected voice activity in a waveform.

    Runs the module-level ``vad`` pipeline on the waveform and inspects the
    timeline of the resulting annotation.

    Args:
        tensor: Waveform passed to ``vad`` under the ``"waveform"`` key.
        sample_rate: Sample rate passed alongside the waveform.

    Returns:
        ``(start, end)``, where ``start`` is the start of the first timeline
        segment and ``end`` is the end of the last one, or ``None`` when the
        timeline is empty.
    """
    timeline = vad({"waveform": tensor, "sample_rate": sample_rate}).get_timeline()
    if len(timeline) > 0:
        return (timeline[0].start, timeline[-1].end)
    return None

In [13]:
def concat_and_save(index, samples):
    """Concatenate the clips listed in ``samples`` into one WAV file.

    Each row's clip is loaded, checked for voice activity and appended to the
    output; clips without detected voice are skipped. The voice span of every
    kept clip is shifted by the duration of the audio placed before it, so the
    returned label times refer to positions in the concatenated file.

    Args:
        index: Identifier of the output; the file is written to
            ``output_path / "clips/{index}.wav"``.
        samples: DataFrame whose rows provide ``path``, ``client_id`` and
            ``sentence``.

    Returns:
        A dict with ``"audio"`` (path relative to ``output_path``),
        ``"labels"`` (one ``{"start", "end", "labels"}`` entry per kept clip,
        labelled with the first six characters of the client id) and
        ``"transcription"`` (the kept sentences, in order).

    Raises:
        RuntimeError: If no clip in ``samples`` contains detected voice, so
            there is nothing to concatenate.
    """
    audios = []
    metas = []
    # Total number of audio samples appended so far.
    prev_end = 0

    for _, row in samples.iterrows():
        audio = load_audio(clips_path / row["path"])
        timings = measure_sample(audio.to(device))
        if timings is None:
            # Name the clip that was dropped, not just the output being built.
            print(f"{row['path']} (output {index}) has no voice detected. Skipping")
            continue
        # Position, in seconds, at which this clip starts in the output.
        offset = prev_end / default_sample_rate
        audios.append(audio)
        metas.append((row["client_id"][:6], row["sentence"], timings[0] + offset, timings[1] + offset))
        prev_end += audio.size(1)

    if not audios:
        # Fail with a clear message instead of an opaque torch.concat error.
        raise RuntimeError(f"Output {index}: no clip with detected voice, nothing to concatenate")

    concat_audio = torch.concat(audios, 1)
    rel_path = f"clips/{index}.wav"
    abs_path = output_path / rel_path
    abs_path.parent.mkdir(parents=True, exist_ok=True)
    torchaudio.save(abs_path, concat_audio, default_sample_rate)

    labels = [{"start": start, "end": end, "labels": [f"speaker-{speaker}"]} for speaker, _, start, end in metas]
    transcription = [sentence for _, sentence, _, _ in metas]

    return {"audio": rel_path, "labels": labels, "transcription": transcription}

In [14]:
metas = []

# Distinct speakers available for sampling. Drawing from this list rather
# than from the per-clip rows keeps one client from being picked twice.
unique_clients = files["client_id"].drop_duplicates()

for i in range(output_count):
    print(f"Creating file number {i}")

    # pandas does not use Python's `random` state, so derive an explicit
    # random_state from the seeded `random` module to make runs repeatable.
    clients = unique_clients.sample(
        min(len(unique_clients), clients_per_output),
        random_state=random.randrange(2**32),
    )

    rows = files[files["client_id"].isin(clients)]
    rows = rows.sample(
        min(len(rows), output_sample_count),
        random_state=random.randrange(2**32),
    )

    metas.append(concat_and_save(i, rows))

It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



Creating file number 0
Creating file number 1
Creating file number 2
Creating file number 3
Creating file number 4
Creating file number 5
Creating file number 6
Creating file number 7
Creating file number 8
Creating file number 9


In [15]:
# Write the per-output metadata next to the clips. The context manager closes
# the file, and UTF-8 is set explicitly because ensure_ascii=False writes
# non-ASCII text as-is.
with open(output_path / "labels.json", "w", encoding="utf-8") as labels_file:
    json.dump(metas, labels_file, ensure_ascii=False)