# Download files and install libraries

In [1]:
!pip install gdown -q

In [2]:
# Download voxconverse dataset
!gdown --id 1jkmsypHYrljIlDuuCfe2vABez1Own5r9

# Unzip data files
!unzip -o -q voxconverse_dev_wav.zip -d ./

# Remove zip file
!rm voxconverse_dev_wav.zip

# Download Estimated VAD
!gdown --id 18oXqn9Zyt5tJpoEwKKztpTag-AJMQ2Sz

# Unzip vad files
!unzip -o -q vad.zip -d ./

# Remove zip file
!rm vad.zip

# Pull labels from github
!git clone https://github.com/joonson/voxconverse.git
    
    
!pip install torchaudio -q --no-deps
!pip install speechbrain -q
!pip install spectralcluster -q
!pip install pyannote.metrics -q
!pip install pyamg -q

Downloading...
From: https://drive.google.com/uc?id=1jkmsypHYrljIlDuuCfe2vABez1Own5r9
To: /content/voxconverse_dev_wav.zip
1.99GB [00:21, 92.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=18oXqn9Zyt5tJpoEwKKztpTag-AJMQ2Sz
To: /content/vad.zip
100% 128k/128k [00:00<00:00, 4.12MB/s]
Cloning into 'voxconverse'...
remote: Enumerating objects: 224, done.
remote: Counting objects: 100% (224/224), done.
remote: Compressing objects: 100% (84/84), done.
remote: Total 224 (delta 140), reused 224 (delta 140), pack-reused 0
Receiving objects: 100% (224/224), 97.46 KiB | 13.92 MiB/s, done.
Resolving deltas: 100% (140/140), done.
[K     |████████████████████████████████| 1.9MB 22.6MB/s 
[K     |████████████████████████████████| 358kB 18.3MB/s 
[K     |████████████████████████████████| 1.2MB 19.3MB/s 
[K     |████████████████████████████████| 645kB 55.7MB/s 
[K     |████████████████████████████████| 102kB 13.7MB/s 
[K     |████████████████████████████████| 552kB 54.3MB/s

In [1]:
# Download util module
!gdown --id 1h9iwENF1XthJ0Z1AcMDm42iSe05Uny_c

# Download DEC module
!gdown --id 1FWIRF2Kq6JB0fFHwjkZmVfEuGq0-SSC5

# Download optimalSpeaker.py
!gdown --id 1hbepzjAxHTQS5QoAlKNP9zK0nMxdjauJ

Downloading...
From: https://drive.google.com/uc?id=1h9iwENF1XthJ0Z1AcMDm42iSe05Uny_c
To: /content/utils.py
100% 12.3k/12.3k [00:00<00:00, 11.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FWIRF2Kq6JB0fFHwjkZmVfEuGq0-SSC5
To: /content/DEC.py
100% 9.48k/9.48k [00:00<00:00, 8.34MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hbepzjAxHTQS5QoAlKNP9zK0nMxdjauJ
To: /content/optimumSpeaker.py
100% 7.07k/7.07k [00:00<00:00, 10.8MB/s]


# Import libraries

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from __future__ import print_function, division
import numpy as np
import matplotlib.pyplot as plt
import torchaudio
import os
import torch
from torch.utils.data import Dataset, DataLoader
from speechbrain.pretrained import SpeakerRecognition
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category or module filter this also hides
# deprecation and runtime warnings raised by any imported library; consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans, SpectralClustering
from sklearn import decomposition
from tqdm.auto import tqdm

from utils import DiarizationDataSet, make_rttm, get_metrics
import optimumSpeaker
from DEC import diarizationDEC

import shutil
import pandas as pd

# Diarization on test data

In [4]:
# Build the dataset object for the 'test' split. It is pointed at the
# extracted audio folder, the dev labels inside the cloned voxconverse
# repository, and the downloaded VAD folder.
# NOTE(review): sr is presumably the sampling rate in Hz and window_len /
# window_step are presumably in milliseconds -- confirm in
# utils.DiarizationDataSet.
audio_dataset = DiarizationDataSet(
    root_dir='./audio/',
    label_dir='./voxconverse/dev/',
    sr=16000,
    window_len=1500,
    window_step=750,
    transform=None,
    batch_size_for_ecapa=512,
    vad_dir="./vad/",
    split='test',
)

# Report the number of items exposed by the dataset.
print("\nData size:", len(audio_dataset))

Precomputed X-vectors exists!
Will use precomputed features...

Downloading precomputed features...
Download and Extraction Complete

Data size: 50


In [5]:
# Run the DEC diarization over the current dataset with its default settings.
# The returned value is kept as hypothesis_dir (presumably the folder holding
# the generated RTTM hypotheses -- confirm in DEC.py) and is scored against
# the reference labels found under audio_dataset.label_dir.
hypothesis_dir = diarizationDEC(audio_dataset)
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
# Plain print is kept on purpose: the type returned by get_metrics is not
# visible from this notebook, so a rich display cannot be assumed to work.
print(metric)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Downloading pre-trained weights for auto encoder...
Downloading Complete!




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
ahnss                  10.51   723.08   650.89   90.02        3.80        0.53            63.50             8.78      8.68      1.20
akthc                  13.39   105.16    99.01   94.15        7.93        7.54             3.21             3.06      2.93      2.79
aufkn                  11.86   187.68   168.27   89.66        2.85        1.52            14.66             7.81      4.76      2.54
bkwns                  15.39    48.60    45.98   94.61        4.86       10.00             0.58             1.20      2.04      4.19
blwmj                   5.07   256.80   248.92   96.93        5.14  

In [6]:
# Repeat the diarization with num_spkr="oracle" (presumably the reference
# speaker count is supplied instead of being estimated -- confirm in DEC.py)
# and write the hypotheses to a separate output folder.
hypothesis_dir = diarizationDEC(
    audio_dataset,
    num_spkr="oracle",
    hypothesis_dir="./rttm_output_oracle/",
)

# Score the hypotheses against the reference labels and print the report.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
ahnss                  10.59   723.08   650.27   89.93        3.80        0.53            63.50             8.78      9.30      1.29
akthc                  13.39   105.16    99.01   94.15        7.93        7.54             3.21             3.06      2.93      2.79
aufkn                  42.04   187.68   111.63   59.48        2.85        1.52            14.66             7.81     61.40     32.71
bkwns                  15.03    48.60    46.15   94.97        4.86       10.00             0.58             1.20      1.86      3.83
blwmj                   5.07   256.80   248.92   96.93        5.14  

# Diarization on full data

In [7]:
# Rebuild the dataset object for the 'full' split, using the same folders and
# windowing parameters as the 'test' split above. This rebinds audio_dataset,
# so every later cell operates on the full split.
# NOTE(review): sr is presumably the sampling rate in Hz and window_len /
# window_step are presumably in milliseconds -- confirm in
# utils.DiarizationDataSet.
audio_dataset = DiarizationDataSet(
    root_dir='./audio/',
    label_dir='./voxconverse/dev/',
    sr=16000,
    window_len=1500,
    window_step=750,
    transform=None,
    batch_size_for_ecapa=512,
    vad_dir="./vad/",
    split='full',
)

# Report the number of items exposed by the dataset.
print("\nData size:", len(audio_dataset))

Precomputed X-vectors exists!
Will use precomputed features...

Downloading precomputed features...
Download and Extraction Complete

Data size: 216


In [8]:
# Run the DEC diarization with default settings on the dataset currently
# bound to audio_dataset (the 'full' split when cells are run in order).
# The returned value is kept as hypothesis_dir (presumably the folder holding
# the generated RTTM hypotheses -- confirm in DEC.py) and is scored against
# the reference labels found under audio_dataset.label_dir.
hypothesis_dir = diarizationDEC(audio_dataset)
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
# Plain print is kept on purpose: the type returned by get_metrics is not
# visible from this notebook, so a rich display cannot be assumed to work.
print(metric)

HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
abjxc                   3.81    62.60    61.42   98.12        1.20        1.92             1.18             1.88      0.00      0.00
afjiv                  34.17   123.64    97.93   79.21       16.54       13.38             3.68             2.98     22.03     17.82
ahnss                  10.59   723.08   650.27   89.93        3.80        0.53            63.50             8.78      9.30      1.29
aisvi                  17.25   441.88   389.24   88.09       23.56        5.33             2.32             0.53     50.32     11.39
akthc                  13.39   105.16    99.01   94.15        7.93  

In [9]:
# Oracle run on the dataset currently bound to audio_dataset (the 'full'
# split when cells are run in order). num_spkr="oracle" presumably supplies
# the reference speaker count instead of estimating it -- confirm in DEC.py.
# NOTE(review): this writes to the same "./rttm_output_oracle/" folder as the
# earlier oracle run on the 'test' split.
hypothesis_dir = diarizationDEC(
    audio_dataset,
    num_spkr="oracle",
    hypothesis_dir="./rttm_output_oracle/",
)

# Score the hypotheses against the reference labels and print the report.
metric = get_metrics(audio_dataset.label_dir, hypothesis_dir)
print(metric)

HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=216.0), HTML(value='')))


      diarization error rate    total  correct correct false alarm false alarm missed detection missed detection confusion confusion
                           %                         %                       %                                 %                   %
item                                                                                                                                
abjxc                   3.81    62.60    61.42   98.12        1.20        1.92             1.18             1.88      0.00      0.00
afjiv                  34.17   123.64    97.93   79.21       16.54       13.38             3.68             2.98     22.03     17.82
ahnss                  10.51   723.08   650.89   90.02        3.80        0.53            63.50             8.78      8.68      1.20
aisvi                   6.58   441.88   436.39   98.76       23.56        5.33             2.32             0.53      3.17      0.72
akthc                  13.39   105.16    99.01   94.15        7.93  