In [1]:
import torch
from pyannote.database import get_protocol, FileFinder

# Fetch the 'emb' entry point of the pyannote/pyannote-audio repository through torch.hub.
# On a cold cache this downloads the repository archive and the pretrained
# "emb_voxceleb" model (see the recorded output below), so the cell needs network access.
emb = torch.hub.load('pyannote/pyannote-audio', 'emb')
# Sanity check: report the embedding size exposed by the loaded model
# (512 in the recorded run).
print(f'Embedding has dimension {emb.dimension:d}.')

Downloading: "https://github.com/pyannote/pyannote-audio/archive/master.zip" to /home/jovyan/.cache/torch/hub/master.zip


Downloading list of pretrained models and pipelines to "/home/jovyan/.pyannote/hub/pretrained.yml".


  0%|          | 0.00/901 [00:00<?, ?B/s]

Downloading pretrained model "emb_voxceleb" to "/home/jovyan/.pyannote/hub/models/emb_voxceleb.zip".


  0%|          | 0.00/16.3M [00:00<?, ?B/s]



Embedding has dimension 512.


In [2]:
# Preprocessors are passed as a dict; here the 'audio' key is handled by FileFinder
# (presumably it resolves each file's audio path from the pyannote.database
# configuration — confirm against the pyannote.database docs).
preprocessors = {'audio': FileFinder()}
# Load the 'VOXCON.SpeakerDiarization.Sample' protocol with that preprocessor attached.
protocol = get_protocol('VOXCON.SpeakerDiarization.Sample', preprocessors=preprocessors)

In [3]:
from pyannote.audio.features.utils import get_audio_duration

# Take the first file yielded by the protocol's test() iterator ...
test_file = next(protocol.test())
# ... and measure its duration; the value is used as the `end` of the sliding
# window built in a later cell (presumably seconds — confirm).
duration = get_audio_duration(test_file)

In [7]:
from pyannote.core import SlidingWindow, Segment

# Windows of length 0.025 advanced in steps of 0.01, covering 0.0 up to the
# duration of the test file (values presumably in seconds — confirm).
sw = SlidingWindow(duration=0.025, step=0.01, start=0.0, end=duration)

# Print every window position inside the 3 – 7.5 excerpt as a (start, end) tuple.
# NOTE(review): this emits one line per 0.01 step, i.e. several hundred lines;
# the recorded output below is cut off mid-line.
for chunk in sw(Segment(3, 7.5)):
    print(tuple(chunk))

(3.0, 3.025)
(3.01, 3.0349999999999997)
(3.02, 3.045)
(3.03, 3.0549999999999997)
(3.04, 3.065)
(3.05, 3.0749999999999997)
(3.06, 3.085)
(3.07, 3.0949999999999998)
(3.08, 3.105)
(3.09, 3.1149999999999998)
(3.1, 3.125)
(3.11, 3.135)
(3.12, 3.145)
(3.13, 3.155)
(3.14, 3.165)
(3.15, 3.175)
(3.16, 3.185)
(3.17, 3.195)
(3.18, 3.205)
(3.19, 3.215)
(3.2, 3.225)
(3.21, 3.235)
(3.22, 3.245)
(3.23, 3.255)
(3.24, 3.265)
(3.25, 3.275)
(3.26, 3.2849999999999997)
(3.27, 3.295)
(3.2800000000000002, 3.305)
(3.29, 3.315)
(3.3, 3.3249999999999997)
(3.31, 3.335)
(3.32, 3.3449999999999998)
(3.33, 3.355)
(3.34, 3.3649999999999998)
(3.35, 3.375)
(3.36, 3.385)
(3.37, 3.395)
(3.38, 3.405)
(3.39, 3.415)
(3.4, 3.425)
(3.41, 3.435)
(3.42, 3.445)
(3.43, 3.455)
(3.44, 3.465)
(3.45, 3.475)
(3.46, 3.485)
(3.47, 3.495)
(3.48, 3.505)
(3.49, 3.515)
(3.5, 3.525)
(3.51, 3.5349999999999997)
(3.52, 3.545)
(3.5300000000000002, 3.555)
(3.54, 3.565)
(3.55, 3.5749999999999997)
(3.56, 3.585)
(3.5700000000000003, 3.595)
(3.58, 3.

In [15]:
def concat_segs(times, segs):
    """Merge back-to-back voiced segments into continuous chunks.

    Segment ``i + 1`` is treated as a continuation of segment ``i`` when
    ``times[i][1] == times[i + 1][0]``. The comparison is an exact equality,
    so adjacent boundaries are expected to be the very same values rather
    than independently recomputed floats.

    Args:
        times: sequence of ``(start, end)`` pairs, one per entry of ``segs``.
        segs: sequence of array-likes aligned with ``times`` (presumably 1-D
            audio sample arrays — confirm against the caller).

    Returns:
        A list with one entry per contiguous run, in input order. A run made
        of a single segment is returned as the original object; a longer run
        is the ``np.concatenate`` of its segments. Empty ``segs`` yields ``[]``.
    """
    # Nothing to merge. (Indexing segs[0] unconditionally used to raise
    # IndexError here.)
    if len(segs) == 0:
        return []

    def _join(run):
        # Join a whole run in one call instead of re-copying the growing
        # array every time a segment is appended to it.
        return run[0] if len(run) == 1 else np.concatenate(run)

    merged = []
    run = [segs[0]]
    for i in range(len(times) - 1):
        following = segs[i + 1]
        if times[i][1] == times[i + 1][0]:
            # Contiguous with the previous segment: extend the current run.
            run.append(following)
        else:
            # Gap between segments: close the current run and start a new one.
            merged.append(_join(run))
            run = [following]
    # The last run is still open when the loop ends.
    merged.append(_join(run))
    return merged

def get_STFTs(segs):
    """Cut each segment's log-mel spectrogram into fixed 24-frame windows.

    For every segment a power STFT is computed, projected onto a mel
    filterbank and log10-compressed. The result is then sliced into windows
    of 24 columns taken every ``int(.12 / hp.data.hop)`` frames. With a 10 ms
    hop this corresponds to 240 ms windows with 50% overlap — TODO confirm
    that ``hp.data.hop`` is 0.01, since the window width 24 is hard-coded.

    Args:
        segs: iterable of audio signals accepted by ``librosa.core.stft``
            (presumably 1-D float arrays sampled at ``hp.data.sr`` — confirm).

    Returns:
        A list of 2-D arrays with 24 columns each (one row per mel band),
        ordered by segment and then by time.
    """
    sr = hp.data.sr
    n_fft = hp.data.nfft
    win_length = int(hp.data.window * sr)
    hop_length = int(hp.data.hop * sr)
    stride = int(.12 / hp.data.hop)

    # The filterbank depends only on hyper-parameters, so build it once rather
    # than once per segment. `sr` is passed by keyword because librosa >= 0.10
    # made the arguments of `filters.mel` keyword-only; the keyword form also
    # works on older releases.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=hp.data.nmels)

    STFT_frames = []
    for seg in segs:
        spec = librosa.core.stft(y=seg, n_fft=n_fft,
                                 win_length=win_length, hop_length=hop_length)
        power = np.abs(spec) ** 2
        log_mel = np.log10(np.dot(mel_basis, power) + 1e-6)  # log mel spectrogram of the segment

        n_frames = log_mel.shape[1]
        for j in range(0, n_frames, stride):
            # NOTE(review): the strict comparison is kept from the original,
            # so a window that ends exactly on the last frame is dropped.
            if j + 24 >= n_frames:
                break
            STFT_frames.append(log_mel[:, j:j + 24])
    return STFT_frames

def align_embeddings(embeddings):
    """Average consecutive window embeddings into coarser steps.

    Window ``i`` is assigned the end position ``i * .12 + .24`` (presumably
    seconds, matching 240 ms windows hopped by 120 ms — confirm against the
    caller). Windows are grouped in order: a new group is started whenever the
    current window's end position is no longer below ``j * .401``, where ``j``
    is the 1-based index of the group being filled. Each group is then
    replaced by the mean of its embeddings.

    Args:
        embeddings: array-like of shape ``(n_windows, dim)``.

    Returns:
        A float64 array of shape ``(n_groups, dim)``. Empty input gives an
        array with zero rows (width 256 when no width can be read from the
        input) instead of a single NaN row.

    Raises:
        ValueError: if a non-empty input has fewer than two dimensions.
    """
    embeddings = np.asarray(embeddings)
    if embeddings.ndim < 2:
        if embeddings.size:
            raise ValueError("embeddings must have shape (n_windows, dim)")
        # e.g. an empty list: no width to infer, fall back to the historical 256.
        return np.zeros((0, 256))

    # Width comes from the data rather than a hard-coded 256, so embeddings of
    # any dimensionality can be aligned.
    n_windows, dim = embeddings.shape[0], embeddings.shape[-1]
    if n_windows == 0:
        return np.zeros((0, dim))

    # Group boundaries as half-open (start, end) index ranges.
    partitions = []
    start = 0
    j = 1
    for i in range(n_windows):
        if not (i * .12) + .24 < j * .401:
            # Window i crosses the current boundary: close the group before
            # it; window i becomes the first member of the next group.
            partitions.append((start, i))
            start = i
            j += 1
    # The last group is still open when the loop ends.
    partitions.append((start, n_windows))

    avg_embeddings = np.zeros((len(partitions), dim))
    for row, (lo, hi) in enumerate(partitions):
        avg_embeddings[row] = np.average(embeddings[lo:hi], axis=0)
    return avg_embeddings