In [1]:
#  Compatibility imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import time

import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np
import os

from six.moves import xrange as range

try:
    from tensorflow.python.ops import ctc_ops
except ImportError:
    from tensorflow.contrib.ctc import ctc_ops

try:
    from python_speech_features import mfcc
except ImportError:
    print("Failed to import python_speech_features.\n Try pip install python_speech_features.")
    raise ImportError

from utils import maybe_download as maybe_download
from utils import sparse_tuple_from as sparse_tuple_from


In [2]:
def wer2(r, h): 

# taken from https://martin-thoma.com/word-error-rate-calculation/
    """
    Calculation of WER with Levenshtein distance.

    Works only for iterables up to 254 elements (uint8).
    O(nm) time ans space complexity.

    Parameters
    ----------
    r : list
    h : list

    Returns
    -------
    int

    Examples
    --------
    >>> wer("who is there".split(), "is there".split())
    1
    >>> wer("who is there".split(), "".split())
    3
    >>> wer("".split(), "who is there".split())
    3
    """
    # initialisation
    import numpy
    d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8)
    d = d.reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        for j in range(len(h)+1):
            if i == 0:
                d[0][j] = j
            elif j == 0:
                d[i][0] = i

    # computation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitution = d[i-1][j-1] + 1
                insertion    = d[i][j-1] + 1
                deletion     = d[i-1][j] + 1
                d[i][j] = min(substitution, insertion, deletion)

    return d[len(r)][len(h)]


In [3]:
# Constants
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved to space

# Some configs
num_features = 13
# Accounting the 0th indice +  space + blank label = 28 characters
num_classes = ord('z') - ord('a') + 1 + 1 + 1

# Hyper-parameters
num_epochs = 5
num_hidden = 50
num_layers = 2
batch_size = 1
initial_learning_rate = 7e-5
momentum = 0.9


In [4]:
# Loading the data
from os import listdir
from os.path import isfile, join

mypath='/home/saurabh/Documents/ctc_tensorflow_new/an4/data'
data_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
txt_files = [ fi for fi in data_files if not fi.endswith(".wav") ]
txt_files = lst = [os.path.splitext(x)[0] for x in txt_files]
wav_files = [ fi for fi in data_files if not fi.endswith(".txt") ]
wav_files = lst = [os.path.splitext(x)[0] for x in wav_files]

num_examples = len(wav_files)
print ("number of data examples : " + str(num_examples) )
num_batches_per_epoch = int(num_examples/batch_size)


#print (wav_files)
audio_filename={}
target_filename={}
fs={}
audio={}
inputs={}
train_inputs={}
train_seq_len = {}
targets={}
train_targets={}
original={}


number of data examples : 948


In [5]:
for i,j in enumerate(wav_files):
        #print (i,j)
	audio_filename[i] =  '/home/saurabh/Documents/ctc_tensorflow_new/an4/data/' + j + '.wav'
	#print ( audio_filename[i])
	#print (audio_filename)
	target_filename[i] =  '/home/saurabh/Documents/ctc_tensorflow_new/an4/data/' + j + '.txt'
	fs[i], audio[i] = wav.read( audio_filename[i])
	#print (audio[i])

	
	#print (temp2)
	inputs[i] = mfcc(audio[i], samplerate=fs[i])
	#print ( inputs[i].shape , fs[i] )
	temp=inputs[i]
	# Tranform in 3D array
	
	train_inputs[i] = np.asarray(temp[np.newaxis, :])
	train_seq_len[i]=[train_inputs[i].shape[1]]
	#print ("new shape " + str(train_inputs[i].shape))
	with open(target_filename[i], 'r') as f:
    
   	 #Only the last line is necessary
    		line = f.readlines()[-1]    

   	 # Get only the words between [a-z] and replace period for none
    		original[i] = ' '.join(line.strip().lower().split(' ')).replace('.', '')
    		targets[i] = original[i].replace(' ', '  ')
    		targets[i] = targets[i].split(' ')
    	#np.append(Targets,targets)

	# Adding blank label
		targets[i] = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets[i]])


	# Transform char into index
		targets[i] = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                      for x in targets[i]])
	


	# Creating sparse representation to feed the placeholder

		train_targets[i] = sparse_tuple_from([targets[i]])

In [6]:
val_inputs, val_targets, val_seq_len = train_inputs, train_targets, \
                                       train_seq_len

In [7]:
graph = tf.Graph()
with graph.as_default():
    # e.g: log filter bank or MFCC features
    # Has size [batch_size, max_stepsize, num_features], but the
    # batch_size and max_stepsize can vary along each step
    inputs = tf.placeholder(tf.float32, [None, None, num_features])

    # Here we use sparse_placeholder that will generate a
    # SparseTensor required by ctc_loss op.
    targets = tf.sparse_placeholder(tf.int32)

    # 1d array of size [batch_size]
    seq_len = tf.placeholder(tf.int32, [None])

    # Defining the cell
    # Can be:
    #   tf.nn.rnn_cell.RNNCell
    #   tf.nn.rnn_cell.GRUCell
    cell = tf.contrib.rnn.BasicLSTMCell(num_hidden, state_is_tuple=True)

    # Stacking rnn cells
    #[[cell] for _ in range(num_layers)]
    stack =  tf.contrib.rnn.MultiRNNCell([cell] * num_layers,
                                       state_is_tuple=True)

    #stack =  tf.contrib.rnn.MultiRNNCell([[cell] for _ in range(num_layers)],
                                       # state_is_tuple=True)

    # The second output is the last state and we will no use that
    outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

    shape = tf.shape(inputs)
    batch_s, max_timesteps = shape[0], shape[1]

    # Reshaping to apply the same weights over the timesteps
    outputs = tf.reshape(outputs, [-1, num_hidden])

    # Truncated normal with mean 0 and stdev=0.1
    # Tip: Try another initialization
    # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
    W = tf.Variable(tf.truncated_normal([num_hidden,
                                         num_classes],
                                        stddev=0.1))
    # Zero initialization
    # Tip: Is tf.zeros_initializer the same?
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc_ops.ctc_loss( targets, logits , seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           0.9).minimize(cost)

    # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

    saver = tf.train.Saver()

In [9]:
with tf.Session(graph=graph) as session:
    # Initializate the weights and biases
    init_op = tf.global_variables_initializer()

    #init_op.run()
    saver.restore(session, './orange15.ckpt')
    print("Model restored.")


   
    totalwer=0

	  
    for i in range(1, num_examples):

        
        feed2 = {inputs: train_inputs[i-1],
                    targets: train_targets[i-1],
                    seq_len: train_seq_len[i-1]}
        d = session.run(decoded[0], feed_dict=feed2)
        #print (d)
    	str_decoded = ''.join([chr(x) for x in np.asarray(d[1]) + FIRST_INDEX])
    	# Replacing blank label to none
    	str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
    	# Replacing space label to space
    	str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')
    	
	print('Original:\n%s' % original[i-1])
        print('Decoded:\n%s' % str_decoded)
        totalwer=totalwer + wer2(original[i-1].split(),str_decoded.split())
      #  print(wer2(original[i-1].split(),str_decoded.split()))
	
    print("average wer = " + str(totalwer/num_examples))

Model restored.
Original:
one five two one seven
Decoded:
one five two one seven
Original:
p i t t s b u r g h
Decoded:
p i t t s bu r g 
Original:
rubout h f d w q six two two
Decoded:
rubout eh kf b w  six twotwo
Original:
two two nine three
Decoded:
t t nine thre
Original:
b a r r y
Decoded:
  r r i
Original:
one five one four seven
Decoded:
one five one four seven
Original:
help
Decoded:
l
Original:
go
Decoded:
l
Original:
m a t t h e w
Decoded:
m a t t h  w
Original:
erase f h b k z sixty nine
Decoded:
rs   b k z sixty nine
Original:
thirty three
Decoded:
teirt tere
Original:
rubout n x f e eight nine six
Decoded:
rubout  s  f e eight nine six
Original:
c h r i s t o p h e r
Decoded:
c  r i s t o  h  r
Original:
one five two one three
Decoded:
one fie to one thire
Original:
t i m o t h y
Decoded:
tp i m o g  y
Original:
enter six
Decoded:
nter six
Original:
rubout l d r w twenty six
Decoded:
rubout l d r w wnt six
Original:
f e g b forty eight
Decoded:
f g b forteight
Original:
fi

Original:
j c h e q fifty eight thousand nine thirty nine
Decoded:
j c h g g fty eight n nine thir nine
Original:
e l l s w o r t h
Decoded:
 l  s w  r t 
Original:
stop
Decoded:
st
Original:
erase w f s o l eight four four
Decoded:
ers w f s ol e four four
Original:
enter zero
Decoded:
nter zero
Original:
m a c a l u s o
Decoded:
n  c  l  s
Original:
one oh six
Decoded:
one o six
Original:
a s b u r y p l a c e
Decoded:
 s b  r y  l  c 
Original:
sixty three sixty four and a half
Decoded:
sixty thre sixty four ni n 
Original:
three four eight zero
Decoded:
thre four eih tro
Original:
stop
Decoded:
top
Original:
w c v y h seventy seven forty four
Decoded:
tw c d y h seventy seven forty four
Original:
erase c b j v c thirty five
Decoded:
erase c    c thir five
Original:
p i t t s b u r g h
Decoded:
 tt s b u r g 
Original:
s t e e r e
Decoded:
s  e r 
Original:
n g n n three nine nine
Decoded:
 g n n thre nine nine
Original:
j a m e s
Decoded:
g n  s
Original:
stop
Decoded:
sto
Original

Original:
m o h n k e r n
Decoded:
mo h  k r 
Original:
five two one four nine five four
Decoded:
fivetw one fun nine five four
Original:
four one two five two one eight two six oh
Decoded:
four one two five two one eight two six oh
Original:
one five two oh six
Decoded:
one five two oh six
Original:
o l k f five forty five
Decoded:
o l  five forty five
Original:
yes
Decoded:
s
Original:
m o d u g n o
Decoded:
n oi d  u g o
Original:
yes
Decoded:
bs
Original:
one zero zero six
Decoded:
one  zeo er six
Original:
erase y b e j q five oh seven
Decoded:
rse y e e  g fiveon seven
Original:
rubout h w o n j four thousand seven hundred twenty three
Decoded:
rubout i w l m k four teo seven oh wonty thre
Original:
p g h
Decoded:
p g h
Original:
three twenty seven sixty eight
Decoded:
ere fiwvnt seven sixty eight
Original:
g l l k h one
Decoded:
g l l k h one
Original:
ten thirty one fifty eight
Decoded:
n threy one fifty eightt
Original:
enter one thirty one
Decoded:
enter one thirty one
Origin

Original:
one five two one three
Decoded:
one five two one thr
Original:
j o n a t h a n
Decoded:
j o y  t h n
Original:
g i b b s
Decoded:
g i b b s
Original:
eight two six eight six eight seven
Decoded:
eih two six eight six eight seven
Original:
go
Decoded:
l
Original:
one five two one seven
Decoded:
one five tw one seven
Original:
four four one one seven two four
Decoded:
four four one one seven two four
Original:
five two five five five
Decoded:
five two five five five
Original:
v e r k e thirty five thirty
Decoded:
e  r   thrt five hir
Original:
one five oh one five
Decoded:
one five oh one five
Original:
u s i q n seventy one
Decoded:
u s i g n seventy one
Original:
fifty one fifty six
Decoded:
fife nine fifty six
Original:
enter ninety four
Decoded:
er nine four
Original:
one five two one seven
Decoded:
one five two one seven
Original:
n e w y o r k
Decoded:
n   y o r 
Original:
yes
Decoded:
f
Original:
help
Decoded:

Original:
two six eight three eight zero two
Decoded:
tw six

Original:
erase k j z f m fifty four
Decoded:
ers k k z f  fwty four
Original:
g a r l a n d
Decoded:
g   r l  n 
Original:
s a t t e r f i e l d
Decoded:
s  t t e r f  i l 
Original:
enter nine seventy two
Decoded:
enter nine seventy two
Original:
thirty two thirty one
Decoded:
heir two hry oh
Original:
a m s j seven thousand one hundred and eighty six
Decoded:
m s  seven othn one enn eight six
Original:
start
Decoded:
tr
Original:
d o w n i n g t o w n
Decoded:
p o  n i n  t o n
Original:
a r a p a h o e
Decoded:
 r  g   o e
Original:
five sixteen sixty nine
Decoded:
five sixtyn sixty nine
Original:
two seven one zero eight two six
Decoded:
tiwo seven one zr oh eight t six
Original:
enter fifty one
Decoded:
nter fifty one
Original:
stop
Decoded:
to
Original:
one five six six eight
Decoded:
one five six six eight
Original:
m a r k
Decoded:
m r k
Original:
stop
Decoded:
so
Original:
yes
Decoded:
es
Original:
rubout h a x x n six seven nine one
Decoded:
rubout   s s n six seven nine one

s h  ni l 
Original:
l a m p s o n
Decoded:
l  m g s o  n
Original:
f i f t h
Decoded:
f i f t h
Original:
rubout y y f x zero
Decoded:
rubout y yf s r
Original:
erase w e s f twenty four oh three
Decoded:
prse w e s  twnt four oh thr
Original:
january fifteenth nineteen sixty three
Decoded:
n o tn nine te ixty thre
Original:
one four eight five oh
Decoded:
one foureight fiveo
Original:
t h a d
Decoded:
t   b
Original:
rubout e z u a i fifteen
Decoded:
rubout z u  i fte
Original:
e z a k o eight
Decoded:
e z  k o h
Original:
j u l i e
Decoded:
  l i 
Original:
enter five three four three
Decoded:
nter five thre four three
Original:
yes
Decoded:
s
Original:
erase c k c w fourteen eighty five
Decoded:
eurs c  c   fourtn eight five
Original:
h a h l e four ninety one
Decoded:
h k h kl b four ine one
Original:
r o s e n f e l d
Decoded:
r o s  n f   l b
Original:
enter five
Decoded:
enter five
Original:
eight forty one
Decoded:
eigh forty one
Original:
b r o n x v i l l e
Decoded:
e r o n 

Original:
b v i q g eighty six
Decoded:
 i t g h six
Original:
erase b m e e three five two
Decoded:
ersx b    thre five two
Original:
rubout j b x r z nine twenty
Decoded:
ubout k b  r z ne twnt
Original:
enter eight two four two
Decoded:
e igt two four to
Original:
j o h n s o n
Decoded:
 o h n s o n
Original:
start
Decoded:
ssttrt
Original:
d e r e k
Decoded:
 r  
Original:
two oh three seven five seven eight eight nine nine
Decoded:
two oh thre seve five seven eight eight nine nine
Original:
repeat
Decoded:
et
Original:
three two seven one six nine five
Decoded:
thre two seven one six ne five
Original:
start
Decoded:
str
Original:
march third nineteen twenty eight
Decoded:
r ter ninete twnty eight
Original:
rubout i t d k m four seventy one
Decoded:
rubout i t  m four sevey one
Original:
four one two four two one eight eight nine six
Decoded:
four one t four two oneeighteht nine six
Original:
t e r i n a
Decoded:
t  r i  
Original:
go
Decoded:
o
Original:
g a b h two
Decoded:
g  b 