In [1]:
import torch
from pyannote.database import get_protocol, FileFinder

# Pretrained 'emb' entry point of the pyannote/pyannote-audio repository,
# obtained through torch.hub.
emb = torch.hub.load(
    'pyannote/pyannote-audio',
    'emb',
)
print(f'Embedding has dimension {emb.dimension:d}.')

# A FileFinder is registered under the 'audio' key; presumably it resolves the
# audio path of every protocol file -- confirm against pyannote.database docs.
preprocessors = dict(audio=FileFinder())
protocol = get_protocol(
    'VOXCON.SpeakerDiarization.Sample',
    preprocessors=preprocessors,
)

Using cache found in /home/jovyan/.cache/torch/hub/pyannote_pyannote-audio_master


Embedding has dimension 512.


In [3]:
import time

# embs = []

# for file in protocol.test():
#     embeddings = emb(file)
#     embs.append(embeddings)
    
#     uri = file['uri']
#     print(uri, time.strftime("%H:%M:%S"))

    
def _print_timestamp(label):
    """Print `label` followed by the current wall-clock time (HH:MM:SS)."""
    print(label, time.strftime("%H:%M:%S"))


# First file yielded by the protocol's *train* subset.
# NOTE(review): the variable is called `test_file` although it comes from
# train() -- confirm which subset is intended.
test_file = next(protocol.train())
uri = test_file['uri']

# Bracket the extraction with two timestamps to see how long it takes.
_print_timestamp(uri)
embeddings = emb(test_file)
_print_timestamp(uri)

# The sliding window's step and duration are multiplied by 1000 so they are
# reported in milliseconds.
chunks = embeddings.sliding_window
print(f'Embeddings were extracted every {1000 * chunks.step:g}ms on {1000 * chunks.duration:g}ms-long windows.')

abjxc 03:09:14
abjxc 03:09:16
Embeddings were extracted every 1000ms on 4000ms-long windows.


In [8]:
import librosa
# Load the waveform of the file selected earlier, resampled to 16 kHz, and show
# its duration in seconds as the cell output.
audio_path = test_file['audio'].as_posix()
utter, sr = librosa.load(audio_path, sr=16000)

# The signal must be passed by keyword (`y=`): librosa >= 0.10 made the
# arguments of get_duration keyword-only, so the positional form raises
# TypeError there. The sample rate returned by load() is reused so the two
# calls cannot drift apart.
librosa.get_duration(y=utter, sr=sr)

68.376

In [None]:
import numpy as np
from IPython.display import display, clear_output
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

# Visualise the embeddings of one test file in 2-D (t-SNE), coloured by the
# speaker that dominates each embedding window in the reference annotation.

# Position of the file of interest within protocol.test().
fileIndex = 7

# Extract the embeddings of the selected file inside this cell. The previous
# version read them from a global `embs` list whose only producer is a
# commented-out loop, so the cell raised NameError on a fresh kernel.
selected_file = list(protocol.test())[fileIndex]
embeddings = emb(selected_file)
length = len(embeddings)
rttm = selected_file["annotation"]

X, Y = [], []

for idx, (window, embedding) in enumerate(embeddings):
    # One row per window. Iterating the embeddings appears to yield a single
    # 1-D vector per window (TODO confirm for the installed pyannote.core);
    # averaging such a vector over axis 0 would collapse it to a scalar, so it
    # is promoted to 2-D first. A 2-D (frames x dim) input is still averaged
    # over its frames, as before.
    X.append(np.nanmean(np.atleast_2d(embedding), axis=0))

    # keep track of speaker label (for later scatter plot)
    # argmax may return None when no annotated speech overlaps the window
    # (per pyannote.core docs -- confirm); np.unique cannot sort None together
    # with string labels, so every label is normalised to a string and None
    # becomes the empty string.
    y = rttm.argmax(window)
    Y.append("" if y is None else str(y))

    # progress indicator, overwritten in place on each iteration
    clear_output(wait=True)
    display(f'{idx+1} {100*(idx+1)/length:g}%')

X = np.vstack(X)
# integer code per distinct label, used as the colour of each point
y_true = np.unique(Y, return_inverse=True)[1]

# fixed seed so that the projection is identical from run to run
tsne = TSNE(n_components=2, metric="cosine", random_state=0)
X_2d = tsne.fit_transform(X)

# plot: draw on the axes that were created instead of clearing the figure
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(*X_2d.T, c=y_true);

In [None]:
# for resource in protocol.test():
#     print(resource["audio"])
#     print(resource["uri"])

# test_file = next(protocol.test())
# test_file["audio"]

###########################################

# sw = SlidingWindow(duration=4, step=1, start=0.0, end=len(embeddings))

# for segment in sw:
#     # "strict" only keeps embeddings strictly included in segment
#     x = embeddings.crop(segment, mode='strict')

############################################

# from pyannote.core import Segment
# import numpy as np

# for id, (window, emb) in enumerate(embeddings):
#     print(window, emb)
#     assert isinstance(window, Segment)
#     assert isinstance(emb, np.ndarray)