In [16]:
import os
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, euclidean, cosine
from glob import glob

from model import vggvox_model
from wav_reader import get_fft_spectrum
import constants as c

In [7]:
# Build the network returned by model.vggvox_model (its summary reports the name "VGGVox").
model = vggvox_model()
# Fill the freshly built network with the weights stored at c.WEIGHTS_FILE.
model.load_weights(c.WEIGHTS_FILE)
# Print the layer-by-layer table (layer type, output shape, parameter count).
model.summary()

Model: "VGGVox"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 512, None, 1)      0         
_________________________________________________________________
pad1 (ZeroPadding2D)         (None, 514, None, 1)      0         
_________________________________________________________________
conv1 (Conv2D)               (None, 254, None, 96)     4800      
_________________________________________________________________
bn1 (BatchNormalization)     (None, 254, None, 96)     384       
_________________________________________________________________
relu1 (Activation)           (None, 254, None, 96)     0         
_________________________________________________________________
mpool1 (MaxPooling2D)        (None, 126, None, 96)     0         
_________________________________________________________________
pad2 (ZeroPadding2D)         (None, 128, None, 96)     0    

In [8]:
def build_buckets(max_sec, step_sec, frame_step):
    """Tabulate candidate input lengths against their length after the network.

    Candidate lengths are frame counts running from 0 up to (and including)
    ``int(max_sec * frames_per_sec)`` in increments of
    ``int(step_sec * frames_per_sec)``, where
    ``frames_per_sec = int(1 / frame_step)``.  Each candidate is pushed through
    the chain of ``floor((size - kernel + padding) / stride) + 1`` reductions
    listed below (conv1 ... fc6).

    Args:
        max_sec: Longest duration, in seconds, to tabulate.
        step_sec: Spacing between candidates, in seconds.
        frame_step: Duration of one frame, in seconds.

    Returns:
        dict mapping input frame count (int) -> reduced size (int).  Candidates
        whose reduced size is not strictly positive are left out.
    """
    # (kernel, total padding, stride) for every size-changing stage, in order.
    reductions = (
        (7, 2, 2),  # conv1
        (3, 0, 2),  # mpool1
        (5, 2, 2),  # conv2
        (3, 0, 2),  # mpool2
        (3, 2, 1),  # conv3
        (3, 2, 1),  # conv4
        (3, 2, 1),  # conv5
        (3, 0, 2),  # mpool5
        (1, 0, 1),  # fc6
    )

    frames_per_sec = int(1 / frame_step)
    last_frame = int(max_sec * frames_per_sec)
    frame_stride = int(step_sec * frames_per_sec)

    bucket_sizes = {}
    for n_frames in range(0, last_frame + 1, frame_stride):
        size = n_frames
        for kernel, padding, stride in reductions:
            size = np.floor((size - kernel + padding) / stride) + 1
        # Inputs too short to survive every stage are not usable buckets.
        if size > 0:
            bucket_sizes[n_frames] = int(size)
    return bucket_sizes

In [9]:
def get_embeddings_from_list_file(model, list_file, max_sec):
    """Compute one embedding per file listed in a comma-separated list file.

    Args:
        model: Object exposing ``predict``; it is called once per listed file.
        list_file: Path of a CSV file that provides ``filename`` and
            ``speaker`` columns.
        max_sec: Forwarded to ``build_buckets`` as the longest duration to
            tabulate.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``speaker`` and
        ``embedding`` (the squeezed ``model.predict`` output for that file).
    """
    bucket_table = build_buckets(max_sec, c.BUCKET_STEP, c.FRAME_STEP)

    def _spectrum_for(path):
        # Feature extraction is delegated to wav_reader.get_fft_spectrum.
        return get_fft_spectrum(path, bucket_table)

    def _embed(spectrum):
        # Wrap the features with a leading and a trailing singleton axis
        # before prediction, then drop all singleton axes from the output.
        batch = spectrum.reshape((1, *spectrum.shape, 1))
        return np.squeeze(model.predict(batch))

    listing = pd.read_csv(list_file, delimiter=",")
    listing['features'] = listing['filename'].apply(_spectrum_for)
    listing['embedding'] = listing['features'].apply(_embed)

    wanted_columns = ['filename', 'speaker', 'embedding']
    return listing[wanted_columns]

In [17]:
# --- Embed the enrolment utterances -----------------------------------------
print("Processing enroll samples....")
enroll_result = get_embeddings_from_list_file(model, c.ENROLL_LIST_FILE, c.MAX_SEC)
# Stack the per-file embeddings into one 2-D array (one row per enrol file).
enroll_embs = np.array([emb.tolist() for emb in enroll_result['embedding']])
speakers = enroll_result['speaker']

# --- Embed the test utterances ----------------------------------------------
print("Processing test samples....")
test_result = get_embeddings_from_list_file(model, c.TEST_LIST_FILE, c.MAX_SEC)
test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

# --- Score every test file against every enrolled speaker -------------------
print("Comparing test samples against enroll samples....")
# Rows follow the test list order; columns are labelled with the enrol speakers.
distances = pd.DataFrame(cdist(test_embs, enroll_embs, metric=c.COST_METRIC), columns=speakers)

scores = pd.read_csv(c.TEST_LIST_FILE, delimiter=",",header=0,names=['test_file','test_speaker'])
scores = pd.concat([scores, distances],axis=1)
# Take the closest speaker straight from `distances` rather than re-selecting
# columns by label from the merged frame: label-based re-selection duplicates
# columns when a speaker has several enrol rows and could pick up a metadata
# column if a speaker label ever matched 'test_file' / 'test_speaker'.
scores['result'] = distances.idxmin(axis=1)
# Boolean match -> 1.0 / 0.0 (float), so the CSV keeps its "1.0"/"0.0" values.
scores['correct'] = (scores['result'] == scores['test_speaker'])*1.

# --- Persist the score table -------------------------------------------------
print("Writing outputs to [{}]....".format(c.RESULT_FILE))
result_dir = os.path.dirname(c.RESULT_FILE)
# dirname is '' when RESULT_FILE is a bare file name; makedirs('') would raise.
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
if result_dir:
    os.makedirs(result_dir, exist_ok=True)
# pandas requires newline='' on handles passed to to_csv; without it the rows
# are written with doubled line endings on Windows.
with open(c.RESULT_FILE, 'w', newline='') as f:
    scores.to_csv(f, index=False)

Processing enroll samples....
Processing test samples....
Comparing test samples against enroll samples....
Writing outputs to [res/results.csv]....


In [18]:
# Show the score table assembled above: one row per test file with its distance
# to each enrolled speaker, the closest speaker ("result") and whether that
# matches the labelled speaker ("correct").
scores

Unnamed: 0,test_file,test_speaker,19,26,27,result,correct
0,data/wav/test/19-198-0001.wav,19,0.090734,1.110295,1.190071,19,1.0
1,data/wav/test/27-123349-0001.wav,27,1.524265,0.830944,0.556515,27,1.0
