## Speaker Verification using the Siamese Network

In [0]:
#Importing required libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import glob
from google.colab import drive
import pickle
import librosa
from itertools import combinations 


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:

f = open('/content/gdrive/My Drive/Deep Learning HW 4/hw4_trs.pkl','rb') 
train_file = pickle.load(f)


f = open('/content/gdrive/My Drive/Deep Learning HW 4/hw4_tes.pkl','rb') 
test_file = pickle.load(f)

In [0]:
#Converting the files to stft and getting the absolute value

train = []
test = []

for s in train_file:
  train.append(np.abs(librosa.stft(s, n_fft=1024, hop_length=512)).T)

for s in test_file:
  test.append(np.abs(librosa.stft(s, n_fft=1024, hop_length=512)).T)




In [0]:


#Sampling and creating the dataset for train and test

def generate_dataset(data):
  speaker_len = len(data)//10
  pairs_1 = []
  pairs_2 = []
  y_bool  = []  
  for i in range(1,speaker_len+1):
    pos_set = data[i*10 - 10:i*10]
    neg_set = np.delete(data,range(i*10 - 10,i*10),axis=0)
    idx = list(combinations(range(10), 2))
    l1 = []
    l2 = []
    for i in idx:
      l1.append(pos_set[i[0]])
    for j in idx:
      l2.append(pos_set[j[1]])
    pairs_1 = pairs_1 +  l1
    pairs_2 = pairs_2 + l2
    y_bool  = y_bool+[1]*len(idx)
    idx = list(zip(np.random.randint(10,size=45),np.random.randint(len(data)-10,size=45)))
    l3 = []
    l4 = []
    for i1 in idx:
      l3.append(pos_set[i1[0]])
    for j1 in idx:
      l4.append(neg_set[j1[1]])
    pairs_1 = pairs_1 + l3
    pairs_2 = pairs_2 + l4
    y_bool  = y_bool + [0]*len(idx)
  s = np.arange(np.array(pairs_1).shape[0])
  np.random.shuffle(s)
  return np.array(pairs_1)[s],np.array(pairs_2)[s],np.array(y_bool)[s]

#Train and test data for network
left_train,right_train,y_train = generate_dataset(train)
left_test,right_test,y_test = generate_dataset(test)

left_train_len = []
left_test_len = []

for i in left_train:
  left_train_len.append(i.shape[0])

for j in left_test:
  left_test_len.append(j.shape[0])




A GRU cell stacked model is used.
Batch Normalization is implemented
The layer outputs are being normalized for cosine similarity.
Dot product of the outputs is performed and are wrapper in a sigmoid function. Cross entropy loss function combined with Adam optimizer is implemented.


In [0]:

def next_batch(x1,x2,y,seq_len,start,batch_size):
     return x1[start:start + batch_size],x2[start:start + batch_size], y[start:start + batch_size],seq_len[start:start + batch_size]
                                  

n_classes = 512
n_channels = 1
hidden_units = 512


batch_size = 90
n_epoch = 100
display_f = 10
keep_prob = 0.9
learning_rate = 0.0001
dropout = 0.1


def gru_cell():
    gru_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.GRUCell(hidden_units, 
                                                          kernel_initializer =tf.contrib.layers.variance_scaling_initializer()),
                                                          output_keep_prob = 1 - dropout)
    return gru_cell

def batch_norm_layer(x,train_phase,momentum=0.9,epsilon=0.001):
  return tf.layers.batch_normalization(
        inputs=x,
        axis=-1,
        momentum=momentum,
        epsilon=epsilon,
        center=True,
        scale=True,
        reuse= False,
        training = train_phase)

  

tf.reset_default_graph()
left = tf.placeholder(tf.float32,[None,None,513])
right = tf.placeholder(tf.float32,[None,None,513])
y = tf.placeholder(tf.float32,[None,1])
succ_length = tf.placeholder(tf.int32,None)
flag_training=tf.placeholder(tf.bool)




def siamese_model(x,flag_training):
    with tf.name_scope("model"):

        with tf.variable_scope("rnn",reuse = tf.AUTO_REUSE ) as scope:
              rnn_cell = tf.contrib.rnn.MultiRNNCell([gru_cell() for cell in range(2)])
              dynamic_rnn, _  = tf.nn.dynamic_rnn(rnn_cell, x, dtype=tf.float32,sequence_length=succ_length)


        with tf.variable_scope("dense",reuse = tf.AUTO_REUSE ) as scope: 
              batchnorm_layer = batch_norm_layer(dynamic_rnn,flag_training)
              dense_layer = tf.layers.dense(batchnorm_layer, n_classes, kernel_initializer= tf.contrib.layers.variance_scaling_initializer(),activation=tf.nn.tanh)
        output = tf.layers.flatten(dense_layer)
        return output




left_output =  tf.nn.l2_normalize(siamese_model(left,flag_training),0)
right_output =  tf.nn.l2_normalize(siamese_model(right,flag_training),0)


layer_dot = tf.reduce_sum(tf.multiply( left_output, right_output ),1, keep_dims=True)

sig_layer = tf.sigmoid(layer_dot)


loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=sig_layer))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(sig_layer),y),tf.float32))




In [0]:
%%time
#Running the session
init_global = tf.global_variables_initializer()
init_local = tf.local_variables_initializer()
sess = tf.Session()
sess.run(init_global)
sess.run(init_local)




for epoch in range(n_epoch):
  for i in range(len(left_train)//batch_size):
      
      
    epoch_x1,epoch_x2,epoch_y,seq_len_batch = batch_next(left_train,right_train,y_train,left_train_len,i*batch_size,batch_size)
    epoch_y = epoch_y.reshape([batch_size,1])

                             
    _,c = sess.run([optimizer,loss], feed_dict={left:epoch_x1,right:epoch_x2,y:epoch_y,succ_length:seq_len_batch,flag_training:True})

    
    
    epoch_loss,acc = sess.run([loss,accuracy],\
                              feed_dict={left:left_test,
                                         right:right_test,
                                         y:y_test.reshape([len(y_test),1]),succ_length:left_test_len,flag_training:False})
    
    
    
         
  print("Epoch", epoch, " Test Loss =", epoch_loss, " Test Accuracy =", acc)
  if(acc>=0.70):
      break;
  
      

Epoch 0  Test Loss = 0.81112534  Test Accuracy = 0.5
Epoch 1  Test Loss = 0.80861586  Test Accuracy = 0.5038889
Epoch 2  Test Loss = 0.771661  Test Accuracy = 0.54555553
Epoch 3  Test Loss = 0.725836  Test Accuracy = 0.5966667
Epoch 4  Test Loss = 0.71141577  Test Accuracy = 0.62333333
Epoch 5  Test Loss = 0.71551013  Test Accuracy = 0.6077778
Epoch 6  Test Loss = 0.6962006  Test Accuracy = 0.6477778
Epoch 7  Test Loss = 0.7592695  Test Accuracy = 0.5422222
Epoch 8  Test Loss = 0.7186783  Test Accuracy = 0.62277776
Epoch 9  Test Loss = 0.6969394  Test Accuracy = 0.6483333
Epoch 10  Test Loss = 0.6936519  Test Accuracy = 0.6627778
Epoch 11  Test Loss = 0.68449235  Test Accuracy = 0.68333334
Epoch 12  Test Loss = 0.68582046  Test Accuracy = 0.67777777
Epoch 13  Test Loss = 0.6832045  Test Accuracy = 0.68333334
Epoch 14  Test Loss = 0.6813983  Test Accuracy = 0.69722223
Epoch 15  Test Loss = 0.6805864  Test Accuracy = 0.69
Epoch 16  Test Loss = 0.6780084  Test Accuracy = 0.69777775
Epoch 

In [0]:
acc = sess.run(accuracy,feed_dict={left:left_test,right:right_test,y:y_test.reshape([len(y_test),1]),succ_length:left_test_len,flag_training:False})

In [0]:
print("Test Accuracy is ",acc*100)

Test Accuracy is  71.38888835906982
