In [None]:
# tensorflow version 1.3

# personal note: the TensorFlow CTC documentation is not very clear or intuitive, so refer to a worked example if confused

# trying to implement word level CTC

# note for the decoder: its input must be a 3-D float Tensor of shape [max_time x batch_size x num_classes];
# unlike the loss, it does not support a time_major option

In [1]:
import tensorflow as tf
from __future__ import print_function
from tensorflow.contrib import rnn
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc
import time


In [2]:
def audio_to_mfcc(fileurl):
    """Read a WAV file and return its MFCC features.

    Args:
        fileurl: location of the WAV file, handed straight to ``wav.read``.

    Returns:
        The result of ``mfcc(signal, sample_rate)``; for the clip loaded in
        this notebook that is an array of shape (299, 13).
    """
    sample_rate, signal = wav.read(fileurl)
    # Only plain MFCCs are returned (no delta or log-filterbank features).
    return mfcc(signal, sample_rate)

In [3]:
# Load the single training utterance as MFCC features.
# NOTE(review): absolute, machine-specific path; point this at a configurable
# data directory before sharing the notebook.
# NOTE(review): the name `inputs` is rebound to a TensorFlow placeholder in a
# later cell, so this cell and the normalisation cell must run before it.
inputs = audio_to_mfcc('/home/saurabh/Documents/tf_orange/tf_orange/data/test.wav')

In [4]:
# Display the shape of the MFCC array loaded above.
inputs.shape

(299, 13)

In [5]:
# Add a leading batch axis, then normalise with the global mean and standard
# deviation of the whole array.
train_inputs = np.expand_dims(inputs, axis=0)
train_inputs = (train_inputs - train_inputs.mean()) / train_inputs.std()
# One sequence length per batch entry: the size of axis 1 of the batch.
train_seq_len = [train_inputs.shape[1]]
print(train_inputs.shape)

(1, 299, 13)


In [6]:
# we also need a fixed vocabulary 
import re

# Fixed vocabulary: every known word, plus the space separator, gets a label index.
word_dictionary = {'hello': 0, 'world': 1, ' ': 2}


def word_to_index(sentence):
    """Map a transcript to a list of label indices.

    The sentence is split on single spaces. Every word found in
    ``word_dictionary`` contributes its index, and consecutive kept words are
    separated by the index of ``' '``. Words missing from the dictionary are
    skipped.

    Args:
        sentence: transcript string with words separated by single spaces.

    Returns:
        List of int label indices. An empty list is returned when no word of
        the sentence is in the vocabulary (previously this raised IndexError
        from popping an empty list).
    """
    space_index = word_dictionary[' ']
    index_list = []
    for word in sentence.split(' '):
        if word not in word_dictionary:
            continue
        # Put the separator only between two kept words, never at the end.
        if index_list:
            index_list.append(space_index)
        index_list.append(word_dictionary[word])
    return index_list
  

In [7]:
# this function is required for the CTC loss
# for its input, first convert the transcription / ground truth to a number representation

def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of a batch of sequences.

    Args:
        sequences: a list of lists of type dtype where each element is a sequence
        dtype: numpy dtype of the returned ``values`` array.
    Returns:
        A tuple with (indices, values, shape):
            indices: int64 array of shape (total_elements, 2) holding
                (sequence number, position within the sequence) pairs,
            values: array of ``dtype`` with the elements in the same order,
            shape: int64 array [number of sequences, longest sequence length].
        When there are no elements at all, ``indices`` has shape (0, 2) and
        the second entry of ``shape`` is 0.
    """
    indices = []
    values = []
    max_len = 0

    for n, seq in enumerate(sequences):
        length = len(seq)
        indices.extend((n, t) for t in range(length))
        values.extend(seq)
        max_len = max(max_len, length)

    # The reshape keeps the (N, 2) layout even when the list is empty, and the
    # longest length is tracked directly instead of taking the max over the
    # index array, which fails when that array has no rows.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape

In [8]:
# Encode the ground-truth transcript as label indices, wrap it in a
# one-element batch and convert that batch to the sparse
# (indices, values, shape) tuple.
output_transcript = word_to_index('hello world hello world')
batch_list = [output_transcript]
train_targets = sparse_tuple_from(batch_list)

In [9]:
# Training Parameters
learning_rate = 0.001 # passed to the optimizer
training_steps = 10 # number of iterations of the training loop
batch_size = 1 # also used as the leading axis of the output weight
display_step = 200 # NOTE(review): not referenced in the cells shown here
num_features = 13 # last axis of the model input placeholder (MFCC array is (299, 13))

# Network Parameters
num_input = 28 # NOTE(review): only used by X below; the value looks like a leftover from an MNIST example
timesteps = 299 # number of time steps the RNN input is unstacked into (the clip has 299 frames)
num_hidden = 128 # number of units in each LSTM cell
num_classes = 4 # hello , world , blank and space

# tf Graph input
# NOTE(review): X and Y are not referenced by the model, loss or training
# cells shown here (those build their own placeholders) - confirm and remove.
X = tf.placeholder("float", [None, timesteps, num_input])
Y = tf.placeholder("float", [None, num_classes])


In [10]:
# Randomly initialised output projection and bias, consumed by RNN().
# NOTE(review): the projection weight carries a leading batch_size axis and is
# multiplied against a (batch, timesteps, 2 * num_hidden) tensor, so it
# presumably only lines up for batches of exactly batch_size - confirm before
# training with larger batches.
weights = dict(
    out=tf.Variable(tf.random_normal([batch_size, 2 * num_hidden, num_classes])),
)
biases = dict(
    out=tf.Variable(tf.random_normal([num_classes])),
)

In [11]:
def RNN(x, weights, biases):
    """Run a bidirectional LSTM over ``x`` and project every time step.

    Reads the module-level ``timesteps`` and ``num_hidden`` settings.

    Args:
        x: input tensor laid out as (batch_size, timesteps, n_input).
        weights: dict whose ``'out'`` entry is the output projection weight.
        biases: dict whose ``'out'`` entry is the output bias.

    Returns:
        Batch-major tensor of per-time-step scores: the matmul of the
        (batch, timesteps, features) LSTM outputs with ``weights['out']``,
        plus ``biases['out']``.
    """
    # The static RNN API wants a list of `timesteps` tensors, each of shape
    # (batch_size, n_input), so split the input along its time axis.
    step_inputs = tf.unstack(x, timesteps, 1)

    forward_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)
    backward_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0)

    # Only the per-step outputs are needed; both final states are discarded.
    step_outputs, _, _ = rnn.static_bidirectional_rnn(
        forward_cell, backward_cell, step_inputs, dtype=tf.float32)

    # The list of per-step outputs is time-major (timesteps, batch, features);
    # swap the first two axes to get (batch, timesteps, features).
    batch_major = tf.transpose(step_outputs, [1, 0, 2])

    return tf.matmul(batch_major, weights['out']) + biases['out']

In [12]:

# Graph inputs: a batch of feature sequences, the sparse label targets and
# the length of every sequence in the batch.
inputs = tf.placeholder(tf.float32, shape=[None, None, num_features])
targets = tf.sparse_placeholder(tf.int32)
seq_len = tf.placeholder(tf.int32, shape=[None])

logits = RNN(inputs, weights, biases)

# RNN() returns batch-major logits, hence time_major=False for the loss.
loss = tf.nn.ctc_loss(labels=targets,
                      inputs=logits,
                      sequence_length=seq_len,
                      time_major=False)
cost = tf.reduce_mean(loss)
# `optimizer` is the op returned by minimize(); running it performs one update.
optimizer = tf.train.MomentumOptimizer(
    learning_rate=learning_rate, momentum=0.9).minimize(cost)

# The decoder takes time-major input, so swap the batch and time axes first.
# (A beam-search decoder is the slower alternative that may give better results.)
decoder_input = tf.transpose(logits, [1, 0, 2])
decoded, log_prob = tf.nn.ctc_greedy_decoder(decoder_input, seq_len)

    

In [13]:
# Op that initialises the global variables; the training cell runs it once
# at the start of its session.
init = tf.global_variables_initializer()

In [16]:
# Train on the single utterance for `training_steps` steps, then greedily
# decode it with the trained weights.

# The feed is identical for every step, so build it once. Defining it outside
# the loop also keeps it available to the decoding run when training_steps is 0.
feed = {inputs: train_inputs,
        targets: train_targets,
        seq_len: train_seq_len}

with tf.Session() as sess:
    # Run the initializer
    sess.run(init)

    train_cost = 0
    for curr_epoch in range(training_steps):
        batch_cost, _ = sess.run([cost, optimizer], feed)
        print(batch_cost)
        # Each epoch consists of exactly one batch, so the epoch cost is this
        # batch's cost scaled by the batch size.
        train_cost = batch_cost * batch_size
    # Cost of the final epoch (0 if no training step was run).
    print(train_cost)

    # Decoding: decoded[0] evaluates to a SparseTensorValue whose `values`
    # are the predicted label indices.
    d = sess.run(decoded[0], feed_dict=feed)
    print(d)

    # Map the label indices back to vocabulary entries. The batch holds a
    # single utterance, so all values belong to the same transcript.
    index_to_word = {index: word for word, index in word_dictionary.items()}
    print(''.join(index_to_word.get(int(label), '?') for label in d.values))

176.167
170.99
197.128
158.713
205.752
122.451
116.824
92.2424
58.1107
61.2372
61.2372131348
SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [0, 4],
       [0, 5],
       [0, 6]]), values=array([2, 1, 2, 0, 2, 1, 0]), dense_shape=array([1, 7]))
