# Speaker Verification

Import libraries

In [1]:
import pickle
import librosa
import numpy as np
from itertools import combinations, product
import random
import tensorflow as tf

Load data

In [2]:
with open('hw4_trs.pkl', 'rb') as f:
    trs_data = pickle.load(f)
print(trs_data.shape)
with open('hw4_tes.pkl', 'rb') as f:
    tes_data = pickle.load(f)
print(tes_data.shape)

(500, 16180)
(200, 22631)


Process STFT and pad training data to match with test data 

In [3]:
trs_stft = []
tes_stft = []

for i in range(500):
    X=librosa.stft(trs_data[i], n_fft=1024, hop_length=512)
    X=np.pad(X,((0,0),(0,45-X.shape[1])),'constant')
    trs_stft.append(np.abs(np.transpose(X)))
for i in range(200):
    T=librosa.stft(tes_data[i], n_fft=1024, hop_length=512)
    tes_stft.append(np.abs(np.transpose(T)))

Define number of pairs

In [4]:
num_pairs = 45

Compute True pairs

In [5]:
def get_true_pairs(data):
    final_pairs = []
    random.seed(599)
    for i in range(0,len(data),10):
        comb = list(combinations(data[i:i+10], 2))
        final_pairs.append(random.choices(comb, k = num_pairs))
    return final_pairs

In [6]:
trs_true_pairs = get_true_pairs(trs_stft)

In [7]:
tes_true_pairs = get_true_pairs(tes_stft)

Compute False pairs

In [8]:
def get_false_pairs(data):
    final_false = []
    for i in range(0,len(data),10):
        random.seed(599)
        current = list(range(i,i+10))
        first = list(range(0,i))
        last = list(range(i+10,len(data)))
        final = first+ last
        prod = list(product(current, final))
        false_comb = (random.choices(prod, k = num_pairs))
        comb_list= []
        for comb in false_comb:
            comb_list.append((data[comb[0]],data[comb[1]]))
        final_false.append(comb_list)
        
    return final_false

In [9]:
trs_false_pairs = get_false_pairs(trs_stft)

In [10]:
tes_false_pairs = get_false_pairs(tes_stft)

Make data 

In [11]:
def generate_data(true_pairs, false_pairs):
    final_true_pairs = []
    final_false_pairs = []
    true_labels = []
    false_labels = []
    for i in true_pairs:
        for j in i:
            final_true_pairs.append(j)
            true_labels.append(1)
    
    for i in false_pairs:
        for j in i:
            final_false_pairs.append(j)
            false_labels.append(0)
    
    assert len(final_true_pairs) == len(final_false_pairs) == len(true_labels) == len(false_labels)
    
    temp_data = []
    temp_labels = []
    for i in range(0, len(final_true_pairs), num_pairs):
        temp_data.append(final_true_pairs[i:i+num_pairs])
        temp_data.append(final_false_pairs[i:i+num_pairs])
        temp_labels.append(true_labels[i:i+num_pairs])
        temp_labels.append(false_labels[i:i+num_pairs])
    
    final_labels =[]
    final_data = []
    for i in temp_data:
        for j in i:
            final_data.append(j)
    for i in temp_labels:
        for j in i:
            final_labels.append(j)
    
    return final_data, final_labels

In [12]:
train_data, train_labels=  generate_data(trs_true_pairs, trs_false_pairs)

In [13]:
test_data, test_labels = generate_data(tes_true_pairs, tes_false_pairs)

In [14]:
np.array(train_data).shape

(4500, 2, 45, 513)

Define network architecture

For the architecture, I chose to go with 6 convolution layers followed by max pooling and dropout after each 2 convolution layers and followed by a dense layers at the end.

For the loss function, I went with constrastive loss. Adam Optimizer was chosen with learning rate 0.0001 and trained the network for 30 epochs.

In [52]:
def base_network(inputs, re = False):
    with tf.variable_scope('basenet',reuse=re):
        h1 = tf.layers.conv2d(inputs, filters = 32, kernel_size=(3,7), strides=(2,2), activation="relu", padding = "same", kernel_initializer = tf.glorot_uniform_initializer)
        print("h1",h1.shape)

        h2 = tf.layers.conv2d(h1, filters = 32, kernel_size=(3,7), strides=(1,1), activation="relu", padding = "same", kernel_initializer = tf.glorot_uniform_initializer)
        print("h2",h2.shape)

        h2_maxpool = tf.layers.max_pooling2d(h2, pool_size=(2,2), strides =(2,2) , padding = 'valid')
        print("h2_maxpool", h2_maxpool.shape)
        dropout1=tf.layers.dropout(h2_maxpool,0.2)

        h3 = tf.layers.conv2d(dropout1, filters = 64, kernel_size=(3,7), strides=(2,2), activation="relu", padding = "same",kernel_initializer = tf.glorot_uniform_initializer)
        print("h3",h3.shape)

        h4 = tf.layers.conv2d(h3, filters = 64, kernel_size=(3,7), strides=(1,1), activation="relu", padding = "same", kernel_initializer = tf.glorot_uniform_initializer)
        print("h4",h4.shape)

        h4_maxpool = tf.layers.max_pooling2d(h4, pool_size=(2,2), strides =(2,2), padding = 'valid')
        print("h4_maxpool", h4_maxpool.shape)
        dropout2=tf.layers.dropout(h4_maxpool,0.2)
        
        h5 = tf.layers.conv2d(dropout2, filters = 128, kernel_size=(3,5), strides=(1,2), activation="relu", padding = "same", kernel_initializer = tf.glorot_uniform_initializer)
        print("h5",h5.shape)
        h6 = tf.layers.conv2d(h5, filters = 128, kernel_size=(3,5), strides=(1,1), activation="relu", padding = "same", kernel_initializer = tf.glorot_uniform_initializer)
        print("h6",h6.shape)
        h6_maxpool = tf.layers.max_pooling2d(h6, pool_size=(2,2), strides =(2,2), padding = 'valid')
        print("h6_maxpool", h6_maxpool.shape)
        dropout3=tf.layers.dropout(h6_maxpool,0.2)

        flatten_layer = tf.layers.flatten(dropout3)
        print("flatten", flatten_layer.shape)
        
        dense_1 = tf.layers.dense(flatten_layer, 512, activation = "relu", kernel_initializer = tf.glorot_uniform_initializer)
        print("dense1" ,dense_1.shape)
        return dense_1


Define Contrastive Loss function

In [53]:
#https://github.com/ardiya/siamesenetwork-tensorflow/blob/master/model.py
def contrastive_loss(a, b, label, margin = 1):
    with tf.variable_scope('contrastive',reuse=False):
        distance = tf.sqrt(tf.maximum(tf.reduce_sum(tf.pow(a - b, 2), 1), 1e-6))
        print("distance", distance.shape)
        similarity = label * tf.square(distance)                                           # keep the similar label (1) close to each other
        dissimilarity = (1 - label) * tf.square(tf.maximum((margin - distance), 0))        # give penalty to dissimilar label if the distance is bigger than margin
        return tf.reduce_mean(dissimilarity + similarity) , distance

Define training variables

In [56]:
tf.reset_default_graph()

#two inputs for part of each pair
input1 = tf.placeholder(tf.float32, [None, 45, 513, 1], name='input1')
print("input1",input1.shape)

input2 = tf.placeholder(tf.float32, [None, 45, 513, 1], name='input2')
print("input2",input2.shape)

#output labels
y= tf.placeholder(tf.float32, [None,1], name="pred")
print("y", y.shape)

#pass the inputs to basenetwork
output1 = base_network(input1, re = False)
output2 = base_network(input2, re = True)  

processed_output1=tf.layers.batch_normalization(output1,beta_regularizer="l2")
processed_output2=tf.layers.batch_normalization(output2,beta_regularizer="l2")
loss, eudistance = contrastive_loss(processed_output1, processed_output2, y)

optimizer = tf.train.AdamOptimizer(0.0001).minimize(loss)

epochs = 30
train_batches = int(len(train_data)/(num_pairs))
train_batch_size = int(num_pairs)
print(train_batches, train_batch_size)
init = tf.global_variables_initializer()

input1 (?, 45, 513, 1)
input2 (?, 45, 513, 1)
y (?, 1)
h1 (?, 23, 257, 32)
h2 (?, 23, 257, 32)
h2_maxpool (?, 11, 128, 32)
h3 (?, 6, 64, 64)
h4 (?, 6, 64, 64)
h4_maxpool (?, 3, 32, 64)
h5 (?, 3, 16, 128)
h6 (?, 3, 16, 128)
h6_maxpool (?, 1, 8, 128)
flatten (?, 1024)
dense1 (?, 512)
dense2 (?, 256)
h1 (?, 23, 257, 32)
h2 (?, 23, 257, 32)
h2_maxpool (?, 11, 128, 32)
h3 (?, 6, 64, 64)
h4 (?, 6, 64, 64)
h4_maxpool (?, 3, 32, 64)
h5 (?, 3, 16, 128)
h6 (?, 3, 16, 128)
h6_maxpool (?, 1, 8, 128)
flatten (?, 1024)
dense1 (?, 512)
dense2 (?, 256)
distance (?,)
100 45


In [57]:
# writer = tf.summary.FileWriter('./a4graph', graph = tf.get_default_graph())
# writer.close()
len(train_data)
tf.__version__

'1.14.0'

In [58]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
sess_a4_part1 = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
sess_a4_part1.run(init)

for epoch in range(epochs):
    temp_loss = 0
    #print(epoch)
    for i in range(0,len(train_data), train_batch_size):
     #   print(i, i+train_batch_size)
        loss_train,_ = sess_a4_part1.run([loss, optimizer], feed_dict = {input1: np.expand_dims(np.array(train_data)[i:i+train_batch_size,0],-1),\
                                                                    input2: np.expand_dims(np.array(train_data)[i:i+train_batch_size,1],-1),\
                                                                 y: np.expand_dims(np.array(train_labels)[i:i+train_batch_size],1)})
        temp_loss+=loss_train
    print("Epoch:",epoch,"Loss:",temp_loss/train_batches)




Epoch: 0 Loss: 0.23409556843340396
Epoch: 1 Loss: 0.18985908813774585
Epoch: 2 Loss: 0.16414411820471286
Epoch: 3 Loss: 0.14394067943096162
Epoch: 4 Loss: 0.1267944796010852
Epoch: 5 Loss: 0.11109062030911446
Epoch: 6 Loss: 0.0982693406380713
Epoch: 7 Loss: 0.08918127892538905
Epoch: 8 Loss: 0.0823256360553205
Epoch: 9 Loss: 0.07628320232033729
Epoch: 10 Loss: 0.07329390626400709
Epoch: 11 Loss: 0.07070019045844673
Epoch: 12 Loss: 0.06683644415810704
Epoch: 13 Loss: 0.06579456059262156
Epoch: 14 Loss: 0.0647602802887559
Epoch: 15 Loss: 0.06513416316360235
Epoch: 16 Loss: 0.05715098180808127
Epoch: 17 Loss: 0.05228254922665656
Epoch: 18 Loss: 0.046440651756711304
Epoch: 19 Loss: 0.04479751555249095
Epoch: 20 Loss: 0.04265551464166492
Epoch: 21 Loss: 0.04335604273248464
Epoch: 22 Loss: 0.04152660571038723
Epoch: 23 Loss: 0.03820983230834827
Epoch: 24 Loss: 0.03587978850584477
Epoch: 25 Loss: 0.036245230636559427
Epoch: 26 Loss: 0.036863068342208864
Epoch: 27 Loss: 0.03440144557971507
Epo

Testing:

In [64]:
#deine test batch size and batches
test_batches = int(len(test_data)/(num_pairs))
test_batch_size =int(num_pairs)

Get predictions on test data based on euclidean distance

In [60]:
predictions = []

for i in range(0,len(test_data),test_batch_size):
    print(i, i+train_batch_size)
    predictions.append(sess_a4_part1.run(eudistance, feed_dict = {input1: np.expand_dims(np.array(test_data)[i:i+test_batch_size,0],-1),\
                                                                    input2: np.expand_dims(np.array(test_data)[i:i+test_batch_size,1],-1),\
                                                                   y: np.expand_dims(np.array(test_labels)[i:i+test_batch_size],1)}))

0 45
45 90
90 135
135 180
180 225
225 270
270 315
315 360
360 405
405 450
450 495
495 540
540 585
585 630
630 675
675 720
720 765
765 810
810 855
855 900
900 945
945 990
990 1035
1035 1080
1080 1125
1125 1170
1170 1215
1215 1260
1260 1305
1305 1350
1350 1395
1395 1440
1440 1485
1485 1530
1530 1575
1575 1620
1620 1665
1665 1710
1710 1755
1755 1800


In [68]:
final_predictions = []
for i in predictions:
    for j in i:
        final_predictions.append(j)

To calculate accuracy, we check if the predicted value is less than 0.5. If the euclidean distance(prediction) is smaller than 0.5, which is picked as an optimal value between 0 and 1, our prediction is closer to the correct value. 

In [70]:
correct_predictions = 0
alltrue=[]
allfalse=[]
for i in range(len(test_labels)):
    temp = 0
    
    if final_predictions[i] < 0.5:
        temp=1
    if temp==test_labels[i]:
        correct_predictions+=1
accuracy = count/len(test_labels)*100
print("Accuracy:",accuracy)

Accuracy: 70.77777777777777
