In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np
import os

from six.moves import xrange as range

try:
    from tensorflow.python.ops import ctc_ops
except ImportError:
    from tensorflow.contrib.ctc import ctc_ops
try:
    from python_speech_features import mfcc
except ImportError:
    print("Failed to import python_speech_features.\n Try pip install python_speech_features.")
    raise ImportError

from utils import maybe_download as maybe_download
from utils import sparse_tuple_from as sparse_tuple_from


In [2]:
# Label alphabet: the space token owns index 0 and ord(ch) - FIRST_INDEX
# maps 'a' to 1, 'b' to 2, and so on.
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1

# Model dimensions
num_features = 13
# 26 letters plus the space label plus the blank label = 28 classes
num_classes = (ord('z') - ord('a') + 1) + 2

# Training hyper-parameters
num_epochs = 10
num_hidden = 50
num_layers = 2
batch_size = 1
initial_learning_rate = 1e-4
momentum = 0.9


In [3]:

# Loading the data
from os import listdir
from os.path import isfile, join

# Directory that holds the paired <name>.wav / <name>.txt files. The
# AN4_DATA_DIR environment variable overrides the original default location so
# the notebook can run on another machine without editing this cell.
mypath = os.environ.get('AN4_DATA_DIR',
                        '/home/saurabh/Documents/ctc_tensorflow_new/an4/data/')
data_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# Keep only the extension we actually want (rather than "everything that is
# not the other extension") so stray files in the directory are ignored, and
# sort because listdir() returns entries in arbitrary order: sorting keeps
# the example indices stable between runs.
txt_files = sorted(os.path.splitext(f)[0]
                   for f in data_files if f.endswith('.txt'))
wav_files = sorted(os.path.splitext(f)[0]
                   for f in data_files if f.endswith('.wav'))

num_examples = len(wav_files)
print("number of data examples : " + str(num_examples))
# Floor division: a trailing partial batch is dropped.
num_batches_per_epoch = num_examples // batch_size

number of data examples : 948


In [4]:
# Empty containers that the preprocessing loop fills in, one entry per
# utterance, keyed by the utterance's integer position in wav_files.
audio_filename, target_filename = {}, {}
fs, audio = {}, {}
inputs, train_inputs, train_seq_len = {}, {}, {}
targets, train_targets = {}, {}
original = {}

In [5]:
# Build the per-utterance data: read each wav file, compute its MFCC features,
# and turn the matching transcript into a sparse label tuple.
# Indentation is spaces only; the earlier mix of tabs and spaces is rejected
# by Python 3 with a TabError.
for i, j in enumerate(wav_files):
    # Reuse the directory chosen in the data-loading cell instead of
    # repeating the absolute path literal.
    audio_filename[i] = os.path.join(mypath, j + '.wav')
    target_filename[i] = os.path.join(mypath, j + '.txt')

    fs[i], audio[i] = wav.read(audio_filename[i])
    inputs[i] = mfcc(audio[i], samplerate=fs[i])

    # Transform into a 3D array by adding a leading axis of size 1; axis 1 of
    # the result is then the first axis of the MFCC output (presumably the
    # frame/time axis -- confirm against python_speech_features).
    train_inputs[i] = np.asarray(inputs[i][np.newaxis, :])
    train_seq_len[i] = [train_inputs[i].shape[1]]

    with open(target_filename[i], 'r') as f:
        # Only the last line is necessary
        line = f.readlines()[-1]

    # Normalise the transcript: lower-case, strip, and drop periods
    original[i] = ' '.join(line.strip().lower().split(' ')).replace('.', '')

    # Doubling every space makes split(' ') yield an empty string at each
    # word boundary; those empty strings become the space token below.
    targets[i] = original[i].replace(' ', '  ').split(' ')

    # Flatten to one entry per character, with SPACE_TOKEN between words
    targets[i] = np.hstack([SPACE_TOKEN if x == '' else list(x)
                            for x in targets[i]])

    # Transform char into index
    targets[i] = np.asarray([SPACE_INDEX if x == SPACE_TOKEN
                             else ord(x) - FIRST_INDEX
                             for x in targets[i]])

    # Creating sparse representation to feed the placeholder
    train_targets[i] = sparse_tuple_from([targets[i]])

In [8]:
# There is no separate validation split: the "validation" names are aliases
# of the training dictionaries.
val_inputs, val_targets, val_seq_len = train_inputs, train_targets, \
                                       train_seq_len

if num_batches_per_epoch < 1:
    raise ValueError("No training batches available; check the data "
                     "directory and batch_size.")

# THE MAIN CODE!

graph = tf.Graph()
with graph.as_default():
    # NOTE: `inputs` and `targets` are rebound here from the per-utterance
    # dicts to graph placeholders. Re-run the dict-initialisation cell before
    # re-running the preprocessing loop in the same kernel.

    # e.g: log filter bank or MFCC features
    # Has size [batch_size, max_stepsize, num_features], but the
    # batch_size and max_stepsize can vary along each step
    inputs = tf.placeholder(tf.float32, [None, None, num_features])

    # Here we use sparse_placeholder that will generate a
    # SparseTensor required by ctc_loss op.
    targets = tf.sparse_placeholder(tf.int32)

    # 1d array of size [batch_size]
    seq_len = tf.placeholder(tf.int32, [None])

    # Stacking rnn cells. Every layer gets its OWN LSTMCell instance:
    # appending one shared instance num_layers times makes the layers reuse a
    # single cell object, which newer TensorFlow 1.x releases reject (or tie
    # the weights of). The depth comes from num_layers, not a literal.
    stacked_rnn = []
    for _layer in range(num_layers):
        cell = tf.contrib.rnn.LSTMCell(num_hidden)
        stacked_rnn.append(cell)
    stack = tf.contrib.rnn.MultiRNNCell(cells=stacked_rnn, state_is_tuple=True)

    # The second output is the last state and we will not use that
    outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

    shape = tf.shape(inputs)
    batch_s, max_timesteps = shape[0], shape[1]

    # Reshaping to apply the same weights over the timesteps
    outputs = tf.reshape(outputs, [-1, num_hidden])

    # Truncated normal with mean 0 and stdev=0.1
    W = tf.Variable(tf.truncated_normal([num_hidden, num_classes],
                                        stddev=0.1))
    # Zero initialization
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    # Keyword arguments: the positional order of labels/inputs differs
    # between TensorFlow releases, the keyword names do not.
    loss = ctc_ops.ctc_loss(labels=targets, inputs=logits,
                            sequence_length=seq_len)
    cost = tf.reduce_mean(loss)

    # Use the configured momentum constant rather than a repeated literal.
    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           momentum).minimize(cost)

    # Option 2: ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

    # Created after the optimizer so its slot variables are initialised too.
    init_op = tf.global_variables_initializer()

    saver = tf.train.Saver()

with tf.Session(graph=graph) as session:
    # Initialize the weights and biases
    init_op.run()

    for curr_epoch in range(num_epochs):
        train_cost = train_ler = 0
        start = time.time()

        for batch in range(num_batches_per_epoch):
            feed = {inputs: train_inputs[batch],
                    targets: train_targets[batch],
                    seq_len: train_seq_len[batch]}
            batch_cost, _ = session.run([cost, optimizer], feed)
            train_cost += batch_cost * batch_size
            # Evaluated after the update step, in a second run.
            train_ler += session.run(ler, feed_dict=feed) * batch_size

        train_cost /= num_examples
        train_ler /= num_examples

        # "Validation" is measured on the last training example of the epoch,
        # now selected explicitly instead of through variables leaked from
        # the training loop.
        val_index = num_batches_per_epoch - 1
        val_feed = {inputs: val_inputs[val_index],
                    targets: val_targets[val_index],
                    seq_len: val_seq_len[val_index]}

        val_cost, val_ler = session.run([cost, ler], feed_dict=val_feed)

        log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"
        print(log.format(curr_epoch+1, num_epochs, train_cost, train_ler,
                         val_cost, val_ler, time.time() - start))

        # Decode every example; range(1, num_examples) with an i-1 index
        # would never reach the last one.
        for i in range(num_examples):
            decode_feed = {inputs: train_inputs[i],
                           targets: train_targets[i],
                           seq_len: train_seq_len[i]}
            d = session.run(decoded[0], feed_dict=decode_feed)

            # d[1] holds the values of the decoded sparse tensor
            str_decoded = ''.join([chr(x)
                                   for x in np.asarray(d[1]) + FIRST_INDEX])
            # Replacing blank label to none
            str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
            # Replacing space label to space
            str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')

            print('Original:\n%s' % original[i])
            print('Decoded:\n%s' % str_decoded)

        # Checkpoint after every epoch
        save_path = saver.save(session, "./orange4.ckpt")
        print("Model saved in file: %s" % save_path)
    


Epoch 1/10, train_cost = 103.016, train_ler = 0.956, val_cost = 83.254, val_ler = 1.000, time = 92.136
Original:
one five two one seven
Decoded:

Original:
p i t t s b u r g h
Decoded:

Original:
rubout h f d w q six two two
Decoded:

Original:
two two nine three
Decoded:

Original:
b a r r y
Decoded:

Original:
one five one four seven
Decoded:

Original:
help
Decoded:

Original:
go
Decoded:

Original:
m a t t h e w
Decoded:

Original:
erase f h b k z sixty nine
Decoded:

Original:
thirty three
Decoded:

Original:
rubout n x f e eight nine six
Decoded:

Original:
c h r i s t o p h e r
Decoded:

Original:
one five two one three
Decoded:

Original:
t i m o t h y
Decoded:

Original:
enter six
Decoded:

Original:
rubout l d r w twenty six
Decoded:

Original:
f e g b forty eight
Decoded:

Original:
five t h a v e n u e
Decoded:

Original:
g r e e n
Decoded:

Original:
rubout v a n b x nine
Decoded:

Original:
a m p e r e
Decoded:

Original:
one oh seven oh eight
Decoded:

Original:
one seve

Original:
m a r g a r e t
Decoded:

Original:
t o m
Decoded:

Original:
w r i g h t s v i l l e
Decoded:

Original:
enter seven
Decoded:

Original:
no
Decoded:

Original:
no
Decoded:

Original:
rubout t b p b six
Decoded:

Original:
rubout u j m k nine
Decoded:

Original:
p i t t s b u r g h
Decoded:

Original:
erase x b m n t four one oh nine
Decoded:

Original:
o a m j one five three
Decoded:

Original:
s u n n y v a l e
Decoded:

Original:
one five two zero seven
Decoded:

Original:
p t q d q one
Decoded:

Original:
p i t t s b u r g h
Decoded:

Original:
one ten
Decoded:

Original:
two ten
Decoded:

Original:
e a s t e n d a v e n u e
Decoded:

Original:
rubout j u i p three two eight
Decoded:

Original:
go
Decoded:

Original:
stop
Decoded:

Original:
f r a n z i n i
Decoded:

Original:
r i c h a r d
Decoded:

Original:
rubout o k t l e two eight one
Decoded:

Original:
six three eight
Decoded:

Original:
j e a n e t t e
Decoded:

Original:
g r o s s m a n
Decoded:

Original:
one f

Original:
enter one nine
Decoded:

Original:
k a r l a
Decoded:

Original:
g l e n n
Decoded:

Original:
n e l s o n
Decoded:

Original:
yes
Decoded:

Original:
h a s t i n g s
Decoded:

Original:
enter two oh seven
Decoded:

Original:
two one two five seven oh two seven eight four
Decoded:

Original:
no
Decoded:

Original:
two six eight four one six one
Decoded:

Original:
e a s t e n d
Decoded:

Original:
one sixteen forty eight
Decoded:

Original:
four seven sixty four
Decoded:

Original:
p i t t s b u r g h
Decoded:

Original:
s z k w w two thirty seven
Decoded:

Original:
four four three six three one two
Decoded:

Original:
j s p s z nine six nine
Decoded:

Original:
one five two three two
Decoded:

Original:
m a r n e l l
Decoded:

Original:
v e r o n a
Decoded:

Original:
twelve twenty four sixty eight
Decoded:

Original:
one oh two
Decoded:

Original:
q c j k eight five eight
Decoded:

Original:
b e v e r l y
Decoded:

Original:
two six eight three zero two seven
Decoded:

Ori

Original:
v a n d e r l a a n
Decoded:

Original:
j a m e s
Decoded:

Original:
stop
Decoded:

Original:
a g l g a six hundred forty nine
Decoded:

Original:
p i n e h u r s t
Decoded:

Original:
f y z o two two two
Decoded:

Original:
four two one oh one nine six
Decoded:

Original:
eight oh three two three four five nine seven two
Decoded:

Original:
x h b a c five three four four
Decoded:

Original:
z i r k six four four
Decoded:

Original:
f e d o r
Decoded:

Original:
enter eighty nine forty three
Decoded:

Original:
enter eight seven
Decoded:

Original:
enter seven
Decoded:

Original:
repeat
Decoded:

Original:
one five one three three
Decoded:

Original:
s n o w
Decoded:

Original:
p i t t s b u r g h
Decoded:

Original:
s t o d o l s k y
Decoded:

Original:
o l a c k
Decoded:

Original:
d a n i e l
Decoded:

Original:
yes
Decoded:

Original:
one oh oh two eight
Decoded:

Original:
rubout a d e u y three sixty four
Decoded:

Original:
no
Decoded:

Original:
four eight four two
D

Original:
rubout u g a m thirty nine
Decoded:

Original:
one five two one seven
Decoded:

Original:
one five two one seven
Decoded:

Original:
stop
Decoded:

Original:
may ninth sixty two
Decoded:

Original:
start
Decoded:

Original:
h q f x four thousand three hundred and eighty six
Decoded:

Original:
s p r i n g h o u s e l a n e
Decoded:

Original:
s m i t h
Decoded:

Original:
m a r k
Decoded:

Original:
enter thirty seven
Decoded:

Original:
six three sixty eight
Decoded:

Original:
three eleven sixty seven
Decoded:

Original:
help
Decoded:

Original:
rubout j l y z two
Decoded:

Original:
january twelfth nineteen sixty six
Decoded:

Original:
erase n t d i c five nine one
Decoded:

Original:
enter twenty nine
Decoded:

Original:
l r l y u seven three
Decoded:

Original:
n o o n b u r g
Decoded:

Original:
n e i l
Decoded:

Original:
nine thirty sixty five
Decoded:

Original:
p i t t s b u r g h
Decoded:

Original:
enter seven two one six
Decoded:

Original:
erase v x s x two fou

KeyboardInterrupt: 