In [None]:
# Download voxconverse dataset
!wget --load-cookies /tmp/cookies.txt --no-verbose "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate --no-verbose 'https://docs.google.com/uc?export=download&id=1jkmsypHYrljIlDuuCfe2vABez1Own5r9' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1jkmsypHYrljIlDuuCfe2vABez1Own5r9" -O voxconverse_dev_wav.zip && rm -rf /tmp/cookies.txt

# Unzip data files
!unzip -o -q voxconverse_dev_wav.zip -d ./

# Remove zip file
!rm voxconverse_dev_wav.zip

# Pull labels from github
!git clone https://github.com/joonson/voxconverse.git

!pip install torchaudio -q --no-deps
!pip install speechbrain -q
!pip install spectralcluster -q
!pip install pyannote.metrics -q

2021-04-03 02:42:56 URL:https://doc-00-4o-docs.googleusercontent.com/docs/securesc/8l3emp0v66elkaobmj3gg2n9t2f5a4bl/9p1mnh3phi0ipapa1a1sbsvshh1tqp14/1617417750000/03361959783973937060/00606084443186867714Z/1jkmsypHYrljIlDuuCfe2vABez1Own5r9?e=download&nonce=psuvgj7n34v9o&user=00606084443186867714Z&hash=jv3dju09mhcgf20ecoa98oo054nn1j9s [1988647478] -> "voxconverse_dev_wav.zip" [1]
Cloning into 'voxconverse'...
remote: Enumerating objects: 224, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 224 (delta 140), reused 224 (delta 140), pack-reused 0[K
Receiving objects: 100% (224/224), 97.46 KiB | 891.00 KiB/s, done.
Resolving deltas: 100% (140/140), done.


In [None]:
from __future__ import print_function, division

import os
import torch
import torchaudio
import numpy as np
import pandas as pd
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from speechbrain.pretrained import SpeakerRecognition
import torch.nn.functional as F
import matplotlib.pyplot as plt

from pyannote.database.util import load_rttm
from pyannote.metrics.diarization import DiarizationErrorRate

import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pre-trained ECAPA-TDNN speaker-verification model, used here to embed audio segments
# Paper: https://arxiv.org/pdf/2005.07143.pdf
ECAPA = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": device},
)

# Silero VAD model plus its helper utilities, fetched through torch.hub
# https://github.com/snakers4/silero-vad
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=True,
)

# utils is unpacked as a 6-tuple; only the first and third helpers are kept
get_speech_ts, _, read_audio, _, _, _ = utils

# Data pipeline
class DiarizationDataSet(Dataset):
    """Dataset mapping each audio file to a stack of ECAPA-TDNN segment embeddings.

    Indexing the dataset loads one file with the module-level ``read_audio``
    helper, cuts it into fixed-length overlapping windows and encodes every
    window with the module-level ``ECAPA`` model.
    """

    def __init__(self,
                 root_dir='../content/audio/',
                 label_dir='../content/voxconverse/dev/',
                 sr=16000,
                 window_len=240,
                 window_step=120,
                 transform=None,
                 batch_size_for_ecapa=512,
                 vad_step=4):
        """
        Args:
        - root_dir (string): Local directory of the audio files
        - label_dir (string): Local directory of the rttm label files
        - sr (int): Sample rate for audio signal, default 16kHz
        - window_len (int): Length of each segment of audio signal in milliseconds
        - window_step (int): Length between two window_len in milliseconds
        - transform (callable, optional): Applied to the waveform (passed in as a
          numpy array) before windowing. None signifies no transform
        - batch_size_for_ecapa (int): Size of batches used while applying pretrained speechbrain ECAPA model
        - vad_step (int): Stored on the instance; not used by this class itself
        """

        self.root_dir = root_dir
        # Every entry of root_dir is treated as one item, in sorted order
        self.filelist = sorted(os.listdir(root_dir))
        self.label_dir = label_dir
        self.sr = sr
        self.win_len = window_len
        self.win_step = window_step
        self.transform = transform
        self.batch_size_for_ecapa = batch_size_for_ecapa
        self.vad_step = vad_step

    def __len__(self):
        """Number of entries found in ``root_dir``."""
        return len(self.filelist)

    def __getitem__(self, idx):
        """Embed the ``idx``-th audio file.

        Returns:
        - embeddings (torch.Tensor): one row per audio window
          (assumes ``ECAPA.encode_batch`` returns (batch, 1, emb_dim) - TODO confirm)
        - label_path (string): path of the matching ``.rttm`` file inside ``label_dir``
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_name = self.filelist[idx]
        audio_path = os.path.join(self.root_dir, file_name)
        # splitext strips the extension whatever its length (".wav", ".flac", ...)
        label_path = os.path.join(self.label_dir, os.path.splitext(file_name)[0] + '.rttm')

        # Torch array of audio signal
        audio = read_audio(audio_path, target_sr=self.sr)

        if self.transform:
            # The transform receives a numpy array; convert its result back to a
            # tensor because the padding below only accepts tensors
            audio = torch.as_tensor(self.transform(audio.detach().cpu().numpy()))

        # Window len and Window step in samples. Multiply before dividing so that
        # sample rates that are not a multiple of 1000 (e.g. 22050) are not truncated
        win_len = self.win_len * self.sr // 1000
        win_step = self.win_step * self.sr // 1000
        half_win = win_len // 2

        # Zero-pad half a window on both sides so that windows can start at the
        # first sample and still be full length near the end of the signal
        audio_vec = audio.reshape(1, audio.shape[0])
        audio_vec = F.pad(input=audio_vec, pad=(half_win, half_win, 0, 0), mode='constant', value=0)

        # One row per window; every window spans 2 * half_win samples
        window_starts = range(0, audio_vec.shape[1] - 2 * half_win, win_step)
        audio_segments = torch.vstack(
            [audio_vec[:, start:start + 2 * half_win] for start in window_starts]
        )

        # Compute ECAPA-TDNN x-vectors for the audio signal, batch by batch
        # (the last batch may be smaller). Inference only, so no autograd tracking.
        batch_size = self.batch_size_for_ecapa
        with torch.no_grad():
            Xt = [
                ECAPA.encode_batch(audio_segments[start:start + batch_size])[:, 0, :]
                for start in range(0, audio_segments.shape[0], batch_size)
            ]

        return torch.vstack(Xt), label_path

  '"sox" backend is being deprecated. '


Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading: "https://github.com/snakers4/silero-vad/archive/master.zip" to /root/.cache/torch/hub/master.zip


In [None]:
# Dataset used for embedding extraction: 1500 ms windows taken every 750 ms
audio_dataset = DiarizationDataSet(
    root_dir='audio/',
    label_dir='voxconverse/dev/',
    sr=16000,
    window_len=1500,
    window_step=750,
    transform=None,
    batch_size_for_ecapa=512,
)

In [None]:
# Output directory for the per-recording embedding files
# (-p: no error when it is left over from an earlier, interrupted run)
!mkdir -p VoxConverse_Xvectors

In [None]:
from tqdm.auto import tqdm

# Directory receiving one .npy embedding matrix per recording; created here so
# this cell does not depend on a separate shell cell having been run
output_dir = "VoxConverse_Xvectors"
os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(len(audio_dataset)), desc="Extracting x-vectors"):
    audio_segments, rttm_path = audio_dataset[i]
    # Recording name = label file name without its directory and extension
    name = os.path.splitext(os.path.basename(rttm_path))[0]
    np.save(os.path.join(output_dir, name + ".npy"), audio_segments.detach().cpu().numpy())

  0%|          | 0/216 [00:00<?, ?it/s]

In [None]:
# Bundle the saved embeddings into a single archive (-r: recurse, -q: quiet).
# NOTE(review): "750_1500" presumably records the window_step / window_len (ms)
# used when building the dataset - keep the name in sync if those change.
!zip -r -q VoxConverse_Xvectors_750_1500.zip VoxConverse_Xvectors
# Remove the uncompressed directory now that it has been archived
!rm -r VoxConverse_Xvectors

In [None]:
# Clean up working files once the embeddings have been archived.
# Label repository cloned from GitHub
!rm -r ./voxconverse
# NOTE(review): presumably the directory where the pretrained ECAPA checkpoint
# was cached when the model was loaded - confirm
!rm -r ./pretrained_checkpoints
# Audio directory the dataset was reading from (root_dir='audio/')
!rm -r ./audio