In [45]:
import numpy as np
import librosa
from scipy.io import wavfile
from python_speech_features import mfcc
import tensorflow as tf
import tensorflow.compat.v1 as tft
tft.compat.v1.disable_eager_execution()
import os
import re
from tensorflow.python import ops
import time

chars = "abcdefghijklmnopqrstuvwxyz' "
n_inp=26
n_ctx = 9
n_h = 1024
n_chars = len(chars)+1
pb1 = 0.9
pb2 = 0.999
peps = 1e-8
plr = 0.001
epochs=100

In [46]:
def audiofile_to_vector(audio_fname, n_mfcc_features, nctx):
    sampling_rate, raw_w = wavfile.read(audio_fname)
    mfcc_ft = mfcc(raw_w, samplerate=sampling_rate, numcep=n_mfcc_features)
    mfcc_ft = mfcc_ft[::2]
    n_strides = len(mfcc_ft)
    dummy_ctx = np.zeros((nctx, n_mfcc_features), dtype=mfcc_ft.dtype)
    mfcc_ft = np.concatenate((dummy_ctx, mfcc_ft, dummy_ctx))
    w_size = 2*nctx+1
    input_vector = np.lib.stride_tricks.as_strided(mfcc_ft,(n_strides, w_size, n_mfcc_features),
        (mfcc_ft.strides[0], mfcc_ft.strides[0], mfcc_ft.strides[1]),
        writeable=False)
    input_vector = np.reshape(input_vector, [n_strides, -1])
    input_vector = np.copy(input_vector)
    input_vector = (input_vector - np.mean(input_vector))/np.std(input_vector)
    return input_vector

In [47]:
mfcc_features = audiofile_to_vector('C://Users//shash//Desktop//speech recognition//timit//dr1-fvmh0//sa1.wav',26,9)
print(mfcc_features.shape)

(171, 494)


In [48]:
regexp_alphabets = "[^a-zA-Z']+"
cnt=0
def get_label(ch):
    global cnt
    label = cnt
    cnt+=1
    return label
chr2lbl = {c:get_label(c) for c in list(chars)}
lbl2chr = {chr2lbl[c]:c for c in list(chars)}
def get_string2label(strval):
    strval = strval.lower()
    idlist = []
    for c in list(strval):
        if c in chr2lbl:
            idlist.append(chr2lbl[c])
    return np.array(idlist)
def get_label2string(lblarr):
    strval = []
    for idv in lblarr:
        strval.append(lbl2chr[idv])
    return ''.join(strval)
def decoded_val_to_text(decoded_val):
    idxs = decoded_val[0]
    vals = decoded_val[1]
    res = [''] * decoded_val[2][0]
    for i in range(len(idxs)):
        idx = idxs[i][0]
        char = lbl2chr[vals[i]]
        res[idx] = res[idx] + char
    return res
def array2txt(arr_val):
    res = ''
    for i in range(len(arr_val)):
        if arr_val[i] in lbl2chr:
            res += lbl2chr[arr_val[i]]
        else:
            res += ''
    return res.replace('`', ' ')

In [49]:
idlist = get_string2label("This is a test")
print(idlist)
strval = get_label2string(idlist)
print(strval)

[19  7  8 18 27  8 18 27  0 27 19  4 18 19]
this is a test


In [50]:
def get_wav_trans(fpath,X, y):
    files = os.listdir(fpath)
    for fname in files:
        next_path = fpath + "/" + fname
        if os.path.isdir(next_path):
            get_wav_trans(next_path,X,y)
        else:
            if fname.endswith('wav'):
                fname_without_ext = fname.split(".")[0]
                trans_fname = fname_without_ext + ".txt"
                trans_fname_path = fpath + "/" + trans_fname
                if os.path.isfile(trans_fname_path):
                    mfcc_ft = audiofile_to_vector(next_path,n_inp,n_ctx)
                    with open(trans_fname_path,'r') as content:
                        transcript = content.read()
                        transcript = re.sub(regexp_alphabets, ' ', transcript).strip().lower()
                    trans_lbl = get_string2label(transcript)
                    X.append(mfcc_ft)
                    y.append(trans_lbl)

In [51]:
def get_layers(X_batch,seq_len):
    X_batch_shape = tft.shape(X_batch)
    X_batch = tft.transpose(X_batch, [1, 0, 2])
    X_batch = tft.reshape(X_batch, [-1, n_inp + 2*n_inp*n_ctx])
    
    with tft.name_scope('Lyr1'):
        B1 = tft.get_variable(name='B1', shape=[n_h], 
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        H1 = tft.get_variable(name='H1', shape=[n_inp + 2*n_inp*n_ctx, n_h],
                             initializer=tf.contrib.layers.xavier_initializer(uniform=False))
        logits1 = tft.add(tf.matmul(X_batch, H1), B1)
        relu1 = tft.nn.relu(logits1)
        clipped_relu1 = tft.minimum(relu1,20.0)
        Lyr1 = tft.nn.dropout(clipped_relu1, 0.5)
    with tft.name_scope('Lyr2'):
        B2 = tft.get_variable(name='B2', shape=[n_h], 
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        H2 = tft.get_variable(name='H2', shape=[n_h,n_h],
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        logits2 = tft.add(tft.matmul(Lyr1, H2), B2)
        relu2 = tft.nn.relu(logits2)
        clipped_relu2 = tft.minimum(relu2,20.0)
        Lyr2 = tft.nn.dropout(clipped_relu2, 0.5)
    with tf.name_scope('Lyr3'):
        B3 = tft.get_variable(name='B3', shape=[2*n_h], 
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        H3 = tft.get_variable(name='H3', shape=[n_h,2*n_h],
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        logits3 = tft.add(tft.matmul(Lyr2, H3), B3)
        relu3 = tft.nn.relu(logits3)
        clipped_relu3 = tft.minimum(relu3,20.0)
        Lyr3 = tft.nn.dropout(clipped_relu3, 0.5)
    
    with tf.name_scope('RNN_Lyr'):
        fw_c = tft.contrib.rnn.BasicLSTMCell(n_h, forget_bias=1.0, state_is_tuple=True, 
                                            reuse=tft.get_variable_scope().reuse)
        fw_c = tft.contrib.rnn.DropoutWrapper(fw_c, input_keep_prob=0.7, output_keep_prob=0.7,seed=123)
        bw_c = tft.contrib.rnn.BasicLSTMCell(n_h, forget_bias=1.0, state_is_tuple=True, 
                                                    reuse=tft.get_variable_scope().reuse)
        bw_c = tft.contrib.rnn.DropoutWrapper(bw_c,input_keep_prob=0.7, output_keep_prob=0.7,
                                                    seed=123)
        Lyr3 = tft.reshape(Lyr3, [-1, X_batch_shape[0], 2*n_h])
        outs, out_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_c,
                                                                     cell_bw=bw_c,
                                                                     inputs=Lyr3,
                                                                     dtype=tf.float32,
                                                                     time_major=True,
                                                                     sequence_length=seq_len)
        outs = tft.concat(outs, 2)
        outs = tft.reshape(outs, [-1, 2 * n_h])
    
    with tft.name_scope('Lyr4'):
        B4 = tft.get_variable(name='B4', shape=[n_h], 
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        H4 = tft.get_variable(name='H4', shape=[(2 * n_h), n_h],
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        logits4 = tft.add(tft.matmul(outs, H4), B4)
        relu4 = tft.nn.relu(logits4)
        clipped_relu4 = tft.minimum(relu4,20.0)
        Lyr4 = tft.nn.dropout(clipped_relu4, 0.5)
    
    with tft.name_scope('Lyr5'):
        B5 = tft.get_variable(name='B5', shape=[n_chars], 
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        H5 = tft.get_variable(name='H5', shape=[n_h, n_chars],
                             initializer=tft.random_normal_initializer(stddev=0.046875))
        Lyr5 = tft.add(tft.matmul(Lyr4, H5), B5)
        Lyr5 = tft.reshape(Lyr5, [-1, X_batch_shape[0], n_chars])
    
    return Lyr5
    

def get_logits(X_batch,seq_len):
    logits = get_layers(X_batch,seq_len)
    return logits

In [52]:
class Batch:
    def __init__(self):
        self.start_idx = 0
        self.batch_size = 32
        self.audio = []
        self.transcript = []
        get_wav_trans("C://Users//shash//Desktop//speech recognition//timit",self.audio,self.transcript)     
    def pad_seq(self,seqs):
        seq_lens = np.asarray([len(st) for st in seqs], dtype=np.int64)
        n_s = len(seqs)
        max_seq_len = np.max(seq_lens)
        s_shape = tuple()
        for s in seqs:
            if len(s) > 0:
                s_shape = np.asarray(s).shape[1:]
                break
        seqs_trc = (np.ones((n_s, max_seq_len) + s_shape) * 0.).astype(np.float32)
        for ix, s in enumerate(seqs):
            if len(s) == 0:
                continue  
            trc = s[:max_seq_len]
            trc = np.asarray(trc, dtype=np.int64)
            if trc.shape[1:] != s_shape:
                raise ValueError("ERROR in truncation shape")
            seqs_trc[ix, :len(trc)] = trc
        return seqs_trc, seq_lens
    def get_sp_tuple(self,seqs):
        ixs = []
        vals = []
        for n, s in enumerate(seqs):
            ixs.extend(zip([n] * len(s), range(len(s))))
            vals.extend(s)
        ixs = np.asarray(ixs, dtype=np.int64)
        vals = np.asarray(vals, dtype=np.int32)
        shape = np.asarray([len(seqs), ixs.max(0)[1] + 1], dtype=np.int64)
        return ixs, vals, shape
    def get_next_batch(self):
        src = self.audio[self.start_idx:self.start_idx+self.batch_size]
        tgt = self.transcript[self.start_idx:self.start_idx+self.batch_size]
        self.start_idx += self.batch_size
        if(self.start_idx>len(self.audio)):
            self.start_idx=0
        src,src_len = self.pad_seq(src)
        sp_lbls = self.get_sp_tuple(tgt)
        return src, src_len, sp_lbls

In [53]:
def get_model():
    input_t = tft.placeholder(tft.float32, [None, None, n_inp + 
                                                (2 * n_inp * n_ctx)], name='inp')
    tgts = tft.sparse_placeholder(tft.int32, name='tgts')
    len_seq = tft.placeholder(tft.int32, [None], name='len_seq')
    logits = get_logits(input_t,tft.to_int64(len_seq))
    return input_t, tgts, len_seq, logits

In [54]:
def get_cost(tgts,logits,len_seq):
    loss_t = ops.ctc_ops.ctc_loss(tgts, logits, len_seq)
    loss_avg = tf.reduce_mean(loss_t)
    return loss_avg

In [55]:
def get_optimizer(logits,len_seq,loss_avg):
    adm_opt = tf.train.AdamOptimizer(learning_rate=plr,beta1=pb1,beta2=pb2,epsilon=peps)
    adm_opt = adm_opt.minimize(loss_avg)
    dec, prob_log = ops.ctc_ops.ctc_beam_search_decoder(logits, len_seq, merge_repeated=False)
    return adm_opt,dec

In [56]:
def get_error_rates(dec,tgts):
    edit_dist = tf.edit_distance(tf.cast(dec[0], tf.int32), tgts)
    error_rate4label = tf.reduce_mean(edit_dist, name='error_rate4label')
    return error_rate4label

In [57]:
gr = tf.Graph()
with gr.as_default():
    input_t,tgts,len_seq,logits = get_model()
    loss_avg = get_cost(tgts,logits,len_seq)
    adm_opt, dec = get_optimizer(logits,len_seq,loss_avg)
    error_rate = get_error_rates(dec,tgts)
    sess = tf.Session()
    writer = tf.summary.FileWriter('/tmp/models/', graph=sess.graph)
    loss_summary = tf.summary.scalar("loss_avg", loss_avg)
    sum_op = tf.summary.merge_all()
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    for ep in range(epochs):
        train_cost = 0
        label_err_rate = 0
        batch_feeder = Batch()
        n_batches = np.ceil(len(batch_feeder.audio)/batch_feeder.batch_size)
        n_batches = int(n_batches)
        st = time.time()
        for batch in range(n_batches):
            src,len_src,labels_src = batch_feeder.get_next_batch()
            data_dict = {input_t: src, tgts: labels_src,len_seq:len_src}
            batch_cost, _,summ = sess.run([loss_avg, adm_opt,sum_op], data_dict)
            train_cost += batch_cost * batch_feeder.batch_size
            print("Batch cost: {0}, Train cost: {1}".format(batch_cost,train_cost))
            label_err_rate += sess.run(error_rate, feed_dict=data_dict) * batch_feeder.batch_size
            print('Label error: {}'.format(label_err_rate))
            writer.add_summary(summ,ep*batch_feeder.batch_size+batch)
        saver = tf.train.Saver() 
        saver.save(sess, '/tmp/models/speech2txt.ckpt')
        decoded_val = sess.run(dec[0], feed_dict=data_dict)
        d_decoded_val = tf.sparse_tensor_to_dense(decoded_val, default_value=-1).eval(session=sess)
        d_lbl = decoded_val_to_text(labels_src)
        cnt = 0
        cnt_max = 4
        if cnt < cnt_max:
            for actual_val, decoded_val in zip(d_lbl, d_decoded_val):
                d_str = array2txt(decoded_val)
                print('Batch {}'.format(batch))
                print('Actual: {}'.format(actual_val))
                print('Predicted:  {}'.format(d_str))
                cnt += 1
        time_taken = time.time() - st
        log = 'Epoch {}/{}, training_cost: {:.3f}, error_rate: {:.3f}, time: {:.2f} sec'
        print(log.format(ep,epochs,train_cost/len(batch_feeder.audio),
                (label_err_rate/len(batch_feeder.audio)), time_taken))

AttributeError: module 'tensorflow' has no attribute 'contrib'