<a href="https://colab.research.google.com/github/srinithish/Deep-Learning/blob/master/HW4_Problem1_Srinithish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import glob
import itertools as itr
import math
import os
import pickle as p
import random
import warnings

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import tensorflow as tf
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')


In [0]:

"""
Change directory path here and leave it blank if its in the same directory as the 
Notebook
"""
# Directory holding the pickled utterances; "" means the notebook's own directory.
# Example when the files live on a mounted Drive:
#   dirpath = "Drive/My Drive/Deeplearning Assignment/timit-homework"
dirpath  = ""


# os.path.join supplies the path separator, so dirpath needs no trailing slash.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(os.path.join(dirpath, 'hw4_trs.pkl'), 'rb') as f:
  trainUtterances = p.load(f)

In [0]:
###load files
# !pip install librosa # in colab, you'll need to install this
import librosa
import io



### Loading the files

def gen_STFTnMagnitude(RawAmpVsTime):
  """
  Short-time Fourier transform of a single signal (n_fft=1024, hop_length=512).

  Returns a 2-tuple:
    the complex spectrogram, transposed relative to what librosa.stft returns
    the element-wise magnitude of that transposed spectrogram
  """
  complexSpec = np.transpose(
      librosa.stft(RawAmpVsTime, n_fft=1024, hop_length=512))
  return complexSpec, np.abs(complexSpec)

def getSTFTs(AllRawAmpVsTime):
  """
  Convert every raw signal into its magnitude spectrogram.

  Returns a list with one magnitude spectrogram per input signal, in input
  order; the complex spectrograms are computed but not kept.
  """
  return [gen_STFTnMagnitude(rawSignal)[1] for rawSignal in AllRawAmpVsTime]

## Sampling functions for generating pairs

1. For Train: generate all 45 possible positive pairs and sample 45 negative pairs for each of the 50 speakers.
   This gives an array of dimensions (50, 90, 32, 513), where 32 × 513 is the shape of each STFT'd utterance.

2. For Test: generate all 45 possible positive pairs and sample 45 negative pairs for each of the 20 speakers.
   This gives an array of dimensions (20, 90, 45, 513), where 45 × 513 is the shape of each STFT'd utterance.

In [0]:
def positiveSampling(ConsideredSpeakerIndices,sampleNum = 5 ):
  """
  Build positive index pairs from the given indices.

  Every unordered pair of ConsideredSpeakerIndices is enumerated. With
  sampleNum=None all of them are returned in itertools.combinations order;
  otherwise sampleNum pairs are drawn without replacement via random.sample.
  """
  candidatePairs = list(itr.combinations(ConsideredSpeakerIndices,2))
  if sampleNum is None:
    return candidatePairs
  return random.sample(candidatePairs,sampleNum)

def negativeSampling(ConsideredSpeakerIndices,RestSpeakerIndices, sampleNum = 5):
    """
    Build negative index pairs (first index from ConsideredSpeakerIndices,
    second from RestSpeakerIndices).

    The full cartesian product of the two index collections is enumerated.
    With sampleNum=None the whole product is returned in itertools.product
    order; otherwise sampleNum pairs are drawn without replacement via
    random.sample.
    """
    crossPairs = list(itr.product(ConsideredSpeakerIndices,RestSpeakerIndices))
    if sampleNum is None:
        return crossPairs
    return random.sample(crossPairs,sampleNum)

In [0]:
def genAllSpeaker(numSpeakers, AllStftArray, utterancesPerSpeaker = 10, numNegativePairs = None):
  """
  Build one batch of (left, right, label) pair examples per speaker.

  Utterances are assumed to be stored speaker after speaker, each speaker
  owning `utterancesPerSpeaker` consecutive rows of AllStftArray.

  For every speaker:
    * all unordered pairs of that speaker's utterances become positives (label 1)
    * `numNegativePairs` random pairs (utterance of this speaker, utterance of
      any other speaker) become negatives (label 0)

  Parameters
    numSpeakers          kept for backward compatibility; it does not influence
                         the pairing (groups are derived from the array length).
    AllStftArray         sequence of equally shaped 2-D spectrograms; converted
                         to a 3-D array (utterance, time, feature).
    utterancesPerSpeaker number of consecutive utterances per speaker (default 10).
    numNegativePairs     negatives sampled per speaker; None (default) means
                         "as many as there are positives", which is 45 for
                         10 utterances per speaker.

  Returns
    (left, right, Y): three lists with one entry per speaker. left[k] and
    right[k] are arrays of shape (pairs, time, feature) holding the two sides
    of each pair (positives first, then negatives); Y[k] is the matching
    array of 1/0 labels.

  Raises
    ValueError if the spectrograms do not stack into a 3-D array, if
    utterancesPerSpeaker is not positive, or if the number of utterances is
    not a multiple of utterancesPerSpeaker.
  """
  AllStftArray = np.array(AllStftArray)
  if AllStftArray.ndim != 3:
    raise ValueError("expected equally shaped 2-D spectrograms, got array of shape "
                     + str(AllStftArray.shape))
  if utterancesPerSpeaker < 1:
    raise ValueError("utterancesPerSpeaker must be a positive integer")

  numTotalUtterances = AllStftArray.shape[0]
  if numTotalUtterances % utterancesPerSpeaker != 0:
    raise ValueError("number of utterances (" + str(numTotalUtterances)
                     + ") is not a multiple of utterancesPerSpeaker ("
                     + str(utterancesPerSpeaker) + ")")

  AllIndices = np.arange(numTotalUtterances)

  trainLeft = []
  trainRight = []
  Y = []
  for eachSpeakerStartIndex in range(0,numTotalUtterances,utterancesPerSpeaker):
    ConsideredSpeakerIndices = list(range(eachSpeakerStartIndex,
                                          eachSpeakerStartIndex+utterancesPerSpeaker))

    # every same-speaker combination is used as a positive example
    positivePairs = positiveSampling(ConsideredSpeakerIndices,None)

    # candidates for the "other speaker" side of a negative pair
    RestSpeakerIndices = np.delete(AllIndices,ConsideredSpeakerIndices)

    negativeCount = len(positivePairs) if numNegativePairs is None else numNegativePairs
    negativePairs = negativeSampling(ConsideredSpeakerIndices,RestSpeakerIndices,
                                     sampleNum = negativeCount)

    # positives first, negatives second -- the labels below rely on this order
    allPairs = list(positivePairs) + list(negativePairs)
    leftIndices = np.array([pair[0] for pair in allPairs], dtype = np.intp)
    rightIndices = np.array([pair[1] for pair in allPairs], dtype = np.intp)

    YBatch = np.array([1]*len(positivePairs) + [0]*len(negativePairs))

    Y.append(YBatch)
    trainLeft.append(AllStftArray[leftIndices,:,:])
    trainRight.append(AllStftArray[rightIndices,:,:])

  return trainLeft,trainRight,Y
  
  


# Setting up training and Details

1. Generate a latent vector with each branch and take the dot product of the two latent vectors
2. Apply a sigmoid to the dot product to map it to the range 0 to 1
3. The loss function is cross entropy
4. The threshold is set to 0.5: if prob > 0.5 then 1 is predicted
5. The network uses a GRU cell with a 256-neuron output and a fully connected layer of 10 units with tanh activation

In [0]:

# Clear the default graph before any ops are defined
tf.reset_default_graph()

# Optimiser step size
learning_rate = 1e-3

# Layer sizes
RNNStructure = [256]            # a single recurrent layer with 256 units
FullyConnectedStructure = [10]  # a single fully connected layer with 10 outputs


In [0]:
# (numExamples,xDim) = XSpectrogramMag.shape
# (numExamples,yDim) = YSpectrogramMag.shape
xDim = 513  # size of the last axis of every network input
yDim = 10

# One input tensor per branch; the first two axes are left unspecified
XLeft = tf.placeholder(dtype = tf.float32, shape = [None, None, xDim])
XRight = tf.placeholder(dtype = tf.float32, shape = [None, None, xDim])

# One label per pair
Y = tf.placeholder(dtype = tf.float32, shape = [None])

# Scalar fed at run time; used to flatten the per-time-step outputs
timeSteps = tf.placeholder(dtype = tf.int32)

# NOTE(review): not referenced by the graph-building code visible in this notebook
RNNDropout_keepProb = tf.placeholder(dtype = tf.float32)

#### Defining the RNN layer structure

In [0]:
##function to stack LSTMCells

def stackLSTMCells(RNNStructure,reuse):
  """
  Build the stacked recurrent cell used by both branches of the network.

  Despite the function name, the layers are GRU cells.

  Parameters
    RNNStructure  iterable of layer widths; one GRUCell is created per entry,
                  named 'GRUCell<width>'.
    reuse         forwarded to every GRUCell; pass True to share the variables
                  created by an earlier call, False to create them.

  Returns
    a tf.nn.rnn_cell.MultiRNNCell wrapping the created cells in order.
  """
  gruLayers = [tf.nn.rnn_cell.GRUCell(num_units = numUnits,
                                      reuse = reuse,
                                      name = 'GRUCell'+str(numUnits))
               for numUnits in RNNStructure]

  return tf.nn.rnn_cell.MultiRNNCell(gruLayers)

#### Get network output


1.   Using a fully connected layer with tanh activation, which keeps the latent values within -1 and 1



In [0]:
###get network output
def getNetworkOutput(xInp,StackedCell,FullyConnectedStructure,timeSteps,reuse):
    """
    Build one branch of the pair network and return its flattened latent vector.

    The input is run through a stacked GRU, every time step's output is passed
    through tanh dense layer(s), and the per-time-step results are flattened
    into one vector per example.

    Parameters
      xInp                     input tensor handed to tf.nn.dynamic_rnn
                               (batch-major, the dynamic_rnn default).
      StackedCell              recurrent cell to use; when None a new one is
                               built from the module-level RNNStructure via
                               stackLSTMCells(RNNStructure, reuse).
      FullyConnectedStructure  widths of the dense layers applied after the
                               RNN; layer i is named 'denseOutput<i>' starting
                               at 1. An empty/None value falls back to a single
                               10-unit layer.
      timeSteps                number of time steps in xInp (int or scalar
                               tensor); needed to flatten the output.
      reuse                    False for the branch that creates the variables,
                               True for the branch that shares them.

    Returns
      tensor of shape [-1, timeSteps * width_of_last_dense_layer].
    """
    if StackedCell is None:
        StackedCell = stackLSTMCells(RNNStructure,reuse)

    # only the per-time-step outputs are needed; the final state is discarded
    outputsAtEachTimestamp, _ = tf.nn.dynamic_rnn(StackedCell,xInp,dtype = tf.float32,swap_memory = True)

    layerSizes = list(FullyConnectedStructure) if FullyConnectedStructure else [10]

    output = outputsAtEachTimestamp
    for layerNumber, numUnits in enumerate(layerSizes, start = 1):
        output = tf.layers.dense(inputs = output,
                                 units = numUnits,
                                 kernel_initializer = tf.contrib.layers.variance_scaling_initializer(),
                                 activation = tf.nn.tanh,
                                 bias_initializer = tf.zeros_initializer(),
                                 reuse = reuse,
                                 name = 'denseOutput'+str(layerNumber))

    # concatenate all time steps of an example into one latent vector
    reshapedOutput = tf.reshape(output,shape = [-1,timeSteps*layerSizes[-1]])
    return reshapedOutput

### Chaining all the functions

In [0]:
## required training

  
##left network
# Left branch: created with reuse = False
LastOutputLeft = getNetworkOutput(XLeft, None, FullyConnectedStructure, timeSteps, reuse = False)

# Right branch: same layer names with reuse = True
LastOutputRight = getNetworkOutput(XRight, None, FullyConnectedStructure, timeSteps, reuse = True)

# Similarity score of a pair = dot product of the two latent vectors
dotProduct = tf.reduce_sum(LastOutputLeft * LastOutputRight, axis = 1, name = 'dotproduct')

# Sigmoid of the score, then a hard decision at 0.5
yPred = tf.nn.sigmoid(dotProduct)
binarisedOutput = tf.cast(yPred > 0.5, tf.int16)

# NOTE(review): this metric is not evaluated by any cell visible in this notebook
accuracy = tf.metrics.accuracy(labels = Y, predictions = binarisedOutput)

# Sigmoid cross entropy on the raw score, summed over the batch
lossCalcu = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(labels = Y, logits = dotProduct))

gradOptimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
train = gradOptimizer.minimize(lossCalcu)

initialise = tf.global_variables_initializer()

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


#### Running the training procedure for targeting same (1) and not same (0) 


In [0]:
# The session stays open on purpose: the evaluation cell further down reuses `sess`.
sess = tf.InteractiveSession()
sess.run(initialise)

# One batch per speaker: all positive pairs plus sampled negative pairs
trainStfts = getSTFTs(trainUtterances)
trainXLeft,trainXRight,yTrain = genAllSpeaker(50,trainStfts)


for epoch in range(100):
  loss = []
  for xbatchLeft,xbatchRight,yBatch in zip(trainXLeft,trainXRight,yTrain):

    feedDict = {XLeft: xbatchLeft,
                XRight: xbatchRight,
                Y: yBatch,
                timeSteps: xbatchLeft.shape[1]}

    # Fetch the loss in the same run as the update instead of paying for a
    # second forward pass per batch just to report it.
    _, batchLoss = sess.run([train, lossCalcu], feed_dict = feedDict)
    loss.append(batchLoss)

  # summed loss over all speaker batches of this epoch
  print("Step " + str(epoch) + ", Loss= " + str(sum(loss)))
  

Step 0, Loss= 11666.8208694458
Step 1, Loss= 3732.192523956299
Step 2, Loss= 2518.712100982666
Step 3, Loss= 2150.1600608825684
Step 4, Loss= 1897.5947713851929
Step 5, Loss= 1718.8840990066528
Step 6, Loss= 1804.9715881347656
Step 7, Loss= 1595.3043308258057
Step 8, Loss= 1299.6066632270813
Step 9, Loss= 1077.1792073249817
Step 10, Loss= 886.3827857971191
Step 11, Loss= 771.3760724067688
Step 12, Loss= 675.0853085517883
Step 13, Loss= 588.5127789974213
Step 14, Loss= 596.3003215789795
Step 15, Loss= 430.77843260765076
Step 16, Loss= 429.9031431674957
Step 17, Loss= 374.20227658748627
Step 18, Loss= 276.2902705669403
Step 19, Loss= 255.90630501508713
Step 20, Loss= 282.39414632320404
Step 21, Loss= 287.11185759305954
Step 22, Loss= 224.50875449180603
Step 23, Loss= 123.96298438310623
Step 24, Loss= 84.43239688128233
Step 25, Loss= 56.74008375406265
Step 26, Loss= 37.98302336037159
Step 27, Loss= 30.53698781877756
Step 28, Loss= 25.19704645872116
Step 29, Loss= 21.728830941021442
Step 3

# Testing

Here I sample pairs from the test file in the same way as for training, so for each speaker I'll have 45 positive pairs and 45 negative pairs.

There are 20 speakers in total, hence the shape of XLeft is (20, 90, Len, 513).

In [0]:

# os.path.join supplies the path separator, so dirpath needs no trailing slash.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(os.path.join(dirpath, 'hw4_tes.pkl'), 'rb') as f:
  testUtterances = p.load(f)


testStfts = getSTFTs(testUtterances)

# NOTE(review): the first argument does not influence how genAllSpeaker forms
# pairs; the surrounding text mentions 20 test speakers -- confirm the intended value.
testXLeft,testXRight,yTest = genAllSpeaker(10,testStfts)

# Merge the per-speaker batches into one batch along the pair axis. Taking the
# remaining dimensions from the data avoids hard-coding the number of frames.
testXLeft = np.concatenate(testXLeft, axis = 0)
testXRight = np.concatenate(testXRight, axis = 0)
yTest = np.concatenate(yTest, axis = 0)

yPrediction = sess.run(binarisedOutput, feed_dict ={XLeft: testXLeft,
                                      XRight: testXRight,Y:yTest,
                                      timeSteps:testXLeft.shape[1]})

# Fraction of correctly classified pairs, as a scalar percentage
testAccuracy = 100 * np.mean(yPrediction == yTest)
print("The accuracy on the test set is :", testAccuracy)


The accuracy on the test set is : [69.61111111]
