In [1]:
import torch
from pyannote.database import get_protocol, FileFinder

# Load embedding model
emb = torch.hub.load('pyannote/pyannote-audio', 'emb')
print(f'Embedding has dimension {emb.dimension:d}.')

Downloading: "https://github.com/pyannote/pyannote-audio/archive/master.zip" to /home/jovyan/.cache/torch/hub/master.zip


Downloading list of pretrained models and pipelines to "/home/jovyan/.pyannote/hub/pretrained.yml".


  0%|          | 0.00/901 [00:00<?, ?B/s]

Downloading pretrained model "emb_voxceleb" to "/home/jovyan/.pyannote/hub/models/emb_voxceleb.zip".


  0%|          | 0.00/16.3M [00:00<?, ?B/s]



Embedding has dimension 512.


#### Load audio files protocol

In [2]:
preprocessors = {'audio': FileFinder()}
protocol = get_protocol('VOXCON.SpeakerDiarization.Sample', preprocessors=preprocessors)

#### Get single file for testing

In [3]:
from pyannote.audio.features.utils import get_audio_duration

train_file = next(protocol.train())
duration = get_audio_duration(train_file)

#### Create SlidingWindow for embedding extraction from utterances

In [4]:
from pyannote.core import SlidingWindow, Segment

sw = SlidingWindow(duration=0.240, step=0.120, start=0.0, end=duration)

# Test SlidingWindow
# for chunk in sw(Segment(3, 7.5)):
#     print(tuple(chunk))

### D-Vector Create

In [5]:
def concat_segs(times, segs):
    #Concatenate continuous voiced segments
    concat_seg = []
    seg_concat = segs[0]
    for i in range(0, len(times)-1):
        if times[i][1] == times[i+1][0]:
            seg_concat = np.concatenate((seg_concat, segs[i+1]))
        else:
            concat_seg.append(seg_concat)
            seg_concat = segs[i+1]
    else:
        concat_seg.append(seg_concat)
    return concat_seg

def align_embeddings(embeddings):
    partitions = []
    start = 0
    end = 0
    j = 1
    for i, embedding in enumerate(embeddings):
        if (i*.12)+.24 < j*.401:
            end = end + 1
        else:
            partitions.append((start,end))
            start = end
            end = end + 1
            j += 1
    else:
        partitions.append((start,end))
    avg_embeddings = np.zeros((len(partitions),512))
    for i, partition in enumerate(partitions):
        avg_embeddings[i] = np.average(embeddings[partition[0]:partition[1]],axis=0) 
    return avg_embeddings

In [6]:
import glob
import librosa
import numpy as np
import os
import torch

from hparam import hparam as hp
from VAD_segments import VAD_chunk

In [None]:
# Split test and train set from the same folder

# audio_path = glob.glob(os.path.dirname(hp.unprocessed_data))  

# total_speaker_num = len(audio_path)
# train_speaker_num= (total_speaker_num//10)*9            # split total data 90% train and 10% test

# audio_path

In [7]:
next(protocol.train())['audio'].as_posix()

'/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/abjxc.wav'

In [8]:
train_sequence = []
train_cluster_id = []
label = 0
count = 0
train_saved = False
listLength = len(list(protocol.train()))
for i, file in enumerate(protocol.train()):
    filePath = file['audio'].as_posix()
    times, segs = VAD_chunk(2, filePath)
    if segs == []:
        print('No voice activity detected')
        continue
    concat_seg = concat_segs(times, segs)
#     STFT_frames = get_STFTs(concat_seg)
#     STFT_frames = np.stack(STFT_frames, axis=2)
#     STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2,1,0)))
#     print(STFT_frames)

486

In [21]:
for idx, time in enumerate(times):
    if (time[1]!=times[idx+1][0]):
        print(times[idx:idx+2])
        

[(30.16, 30.52), (30.54, 30.94)]
[(57.34, 57.38), (57.48, 57.88)]
[(81.08, 81.28), (81.42, 81.82)]
[(87.42, 87.68), (88.02, 88.42)]
[(165.2, 165.46), (165.5, 165.9)]
[(168.7, 168.86), (168.92, 169.32)]
[(177.32, 177.58), (177.9, 178.3)]


IndexError: list index out of range

In [None]:
    embeddings = emb(file)
    aligned_embeddings = align_embeddings(embeddings.data)
    train_sequence.append(aligned_embeddings)
    for embedding in aligned_embeddings:
        train_cluster_id.append(str(label))
    count = count + 1
    if count % 100 == 0:
        print('Processed {0}/{1} files'.format(count, listLength))
    label = label + 1
    
#     if not train_saved and i > train_speaker_num:
#         train_sequence = np.concatenate(train_sequence,axis=0)
#         train_cluster_id = np.asarray(train_cluster_id)
#         np.save('train_sequence',train_sequence)
#         np.save('train_cluster_id',train_cluster_id)
#         train_saved = True
#         train_sequence = []
#         train_cluster_id = []
        
train_sequence = np.concatenate(train_sequence,axis=0)
train_cluster_id = np.asarray(train_cluster_id)
np.save('test_sequence',train_sequence)
np.save('test_cluster_id',train_cluster_id)

In [13]:
lines = ["1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00008.wav",
         "0 id10270/x6uYqmx31kE/00001.wav id10300/ize_eiCFEg0/00003.wav",
         "1 id10270/x6uYqmx31kE/00001.wav id10270/GWXujl-xAVM/00017.wav",
         "0 id10270/x6uYqmx31kE/00001.wav id10273/0OCW1HUxZyg/00001.wav",
         "1 id10270/x6uYqmx31kE/00001.wav id10270/8jEAjG6SegY/00022.wav",
         "0 id10270/x6uYqmx31kE/00001.wav id10284/Uzxv7Axh3Z8/00001.wav",
         "1 id10270/x6uYqmx31kE/00001.wav id10270/GWXujl-xAVM/00033.wav",
         "0 id10270/x6uYqmx31kE/00001.wav id10284/7yx9A0yzLYk/00029.wav",
         "1 id10270/x6uYqmx31kE/00002.wav id10270/5r0dWxy17C8/00026.wav",
         "0 id10270/x6uYqmx31kE/00002.wav id10285/m-uILToQ9ss/00009.wav",
         "1 id10270/x6uYqmx31kE/00002.wav id10270/GWXujl-xAVM/00035.wav",
         "0 id10270/x6uYqmx31kE/00002.wav id10306/uzt36PBzT2w/00001.wav",
         "1 id10270/x6uYqmx31kE/00002.wav id10270/GWXujl-xAVM/00038.wav",
         "0 id10270/x6uYqmx31kE/00002.wav id10307/kp_GCjLq4qA/00004.wav",
         "1 id10270/x6uYqmx31kE/00002.wav id10270/GWXujl-xAVM/00033.wav",
         "0 id10270/x6uYqmx31kE/00002.wav id10275/Mdk1SXywHck/00024.wav",
         "1 id10270/x6uYqmx31kE/00003.wav id10270/GWXujl-xAVM/00038.wav",
         "0 id10270/x6uYqmx31kE/00003.wav id10293/TwfthltapLg/00004.wav",
         "1 id10270/x6uYqmx31kE/00003.wav id10270/5r0dWxy17C8/00004.wav",
         "0 id10270/x6uYqmx31kE/00003.wav id10273/8cfyJEV7hP8/00004.wav",
         "1 id10270/x6uYqmx31kE/00003.wav id10270/8jEAjG6SegY/00038.wav",
         "0 id10270/x6uYqmx31kE/00003.wav id10300/SQzWyPhRqmk/00012.wav",
         "1 id10270/x6uYqmx31kE/00003.wav id10270/5r0dWxy17C8/00010.wav",
         "0 id10270/x6uYqmx31kE/00003.wav id10305/G50_Ix7IVjU/00001.wav",
         "1 id10270/x6uYqmx31kE/00004.wav id10270/GWXujl-xAVM/00010.wav",
         "0 id10270/x6uYqmx31kE/00004.wav id10306/2SaEbN8hYz4/00011.wav",
         "1 id10270/x6uYqmx31kE/00004.wav id10270/GWXujl-xAVM/00045.wav",
         "0 id10270/x6uYqmx31kE/00004.wav id10280/NXjT3732Ekg/00001.wav",
         "1 id10270/x6uYqmx31kE/00004.wav id10270/OhfKF8FSq3Y/00003.wav",
         "0 id10270/x6uYqmx31kE/00004.wav id10271/djK87iSSKQo/00001.wav",
         "1 id10270/x6uYqmx31kE/00004.wav id10270/GWXujl-xAVM/00007.wav",
         "0 id10270/x6uYqmx31kE/00004.wav id10286/isKyMAYUOgg/00012.wav",
         "1 id10270/x6uYqmx31kE/00005.wav id10270/8jEAjG6SegY/00026.wav",
         "0 id10270/x6uYqmx31kE/00005.wav id10278/LVXvQzNatXI/00001.wav",
         "1 id10270/x6uYqmx31kE/00005.wav id10270/5r0dWxy17C8/00024.wav",
         "0 id10270/x6uYqmx31kE/00005.wav id10295/3tvnlmkCiTw/00008.wav",
         "1 id10270/x6uYqmx31kE/00005.wav id10270/8jEAjG6SegY/00035.wav",
         "0 id10270/x6uYqmx31kE/00005.wav id10303/R5JGtwc4o1M/00005.wav",
         "1 id10270/x6uYqmx31kE/00005.wav id10270/OXdd7Gmluts/00003.wav",
         "0 id10270/x6uYqmx31kE/00005.wav id10281/NHARUN9OhSo/00002.wav",
         "1 id10270/5r0dWxy17C8/00001.wav id10270/8jEAjG6SegY/00027.wav",
         "0 id10270/5r0dWxy17C8/00001.wav id10292/gm6PJowclv0/00027.wav",
         "1 id10270/5r0dWxy17C8/00001.wav id10270/5r0dWxy17C8/00022.wav",
         "0 id10270/5r0dWxy17C8/00001.wav id10272/wb6ligRbbZ4/00001.wav",
         "1 id10270/5r0dWxy17C8/00001.wav id10270/8jEAjG6SegY/00012.wav",
         "0 id10270/5r0dWxy17C8/00001.wav id10302/WAbHmvQ9zME/00006.wav",
         "1 id10270/5r0dWxy17C8/00001.wav id10270/8jEAjG6SegY/00030.wav",
         "0 id10270/5r0dWxy17C8/00001.wav id10309/0cYFdtyWVds/00002.wav",
         "1 id10270/5r0dWxy17C8/00002.wav id10270/8jEAjG6SegY/00018.wav",
         "0 id10270/5r0dWxy17C8/00002.wav id10307/yUv37vQWmzE/00014.wav",
         "1 id10270/5r0dWxy17C8/00002.wav id10270/GWXujl-xAVM/00044.wav",
         "0 id10270/5r0dWxy17C8/00002.wav id10309/e-IdJ8a4gy4/00009.wav",
         "1 id10270/5r0dWxy17C8/00002.wav id10270/8jEAjG6SegY/00035.wav",
         "0 id10270/5r0dWxy17C8/00002.wav id10273/hW0Q6eHqN-U/00003.wav"]

In [15]:
import itertools

## Get a list of unique file names
files = list(itertools.chain(*[x.strip().split()[-2:] for x in lines]))
setfiles = list(set(files))
setfiles.sort()

print(setfiles)

['id10270/5r0dWxy17C8/00001.wav', 'id10270/5r0dWxy17C8/00002.wav', 'id10270/5r0dWxy17C8/00004.wav', 'id10270/5r0dWxy17C8/00010.wav', 'id10270/5r0dWxy17C8/00022.wav', 'id10270/5r0dWxy17C8/00024.wav', 'id10270/5r0dWxy17C8/00026.wav', 'id10270/8jEAjG6SegY/00008.wav', 'id10270/8jEAjG6SegY/00012.wav', 'id10270/8jEAjG6SegY/00018.wav', 'id10270/8jEAjG6SegY/00022.wav', 'id10270/8jEAjG6SegY/00026.wav', 'id10270/8jEAjG6SegY/00027.wav', 'id10270/8jEAjG6SegY/00030.wav', 'id10270/8jEAjG6SegY/00035.wav', 'id10270/8jEAjG6SegY/00038.wav', 'id10270/GWXujl-xAVM/00007.wav', 'id10270/GWXujl-xAVM/00010.wav', 'id10270/GWXujl-xAVM/00017.wav', 'id10270/GWXujl-xAVM/00033.wav', 'id10270/GWXujl-xAVM/00035.wav', 'id10270/GWXujl-xAVM/00038.wav', 'id10270/GWXujl-xAVM/00044.wav', 'id10270/GWXujl-xAVM/00045.wav', 'id10270/OXdd7Gmluts/00003.wav', 'id10270/OhfKF8FSq3Y/00003.wav', 'id10270/x6uYqmx31kE/00001.wav', 'id10270/x6uYqmx31kE/00002.wav', 'id10270/x6uYqmx31kE/00003.wav', 'id10270/x6uYqmx31kE/00004.wav', 'id10270/

In [17]:
for idx, line in enumerate(lines):
    data = line.split()

    # Append random label if missing
    if len(data) == 2:
        data = [random.randint(0, 1)] + data

    print(data)

['1', 'id10270/x6uYqmx31kE/00001.wav', 'id10270/8jEAjG6SegY/00008.wav']
['0', 'id10270/x6uYqmx31kE/00001.wav', 'id10300/ize_eiCFEg0/00003.wav']
['1', 'id10270/x6uYqmx31kE/00001.wav', 'id10270/GWXujl-xAVM/00017.wav']
['0', 'id10270/x6uYqmx31kE/00001.wav', 'id10273/0OCW1HUxZyg/00001.wav']
['1', 'id10270/x6uYqmx31kE/00001.wav', 'id10270/8jEAjG6SegY/00022.wav']
['0', 'id10270/x6uYqmx31kE/00001.wav', 'id10284/Uzxv7Axh3Z8/00001.wav']
['1', 'id10270/x6uYqmx31kE/00001.wav', 'id10270/GWXujl-xAVM/00033.wav']
['0', 'id10270/x6uYqmx31kE/00001.wav', 'id10284/7yx9A0yzLYk/00029.wav']
['1', 'id10270/x6uYqmx31kE/00002.wav', 'id10270/5r0dWxy17C8/00026.wav']
['0', 'id10270/x6uYqmx31kE/00002.wav', 'id10285/m-uILToQ9ss/00009.wav']
['1', 'id10270/x6uYqmx31kE/00002.wav', 'id10270/GWXujl-xAVM/00035.wav']
['0', 'id10270/x6uYqmx31kE/00002.wav', 'id10306/uzt36PBzT2w/00001.wav']
['1', 'id10270/x6uYqmx31kE/00002.wav', 'id10270/GWXujl-xAVM/00038.wav']
['0', 'id10270/x6uYqmx31kE/00002.wav', 'id10307/kp_GCjLq4qA/0000

In [19]:
x = torch.tensor([1, 2, 3, 4])
print(torch.unsqueeze(x, -1).dim)

<built-in method dim of Tensor object at 0x7f158d541580>
