### Imports

In [1]:
import os
import pickle
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import LSTM, TimeDistributed, Activation, Bidirectional, ConvLSTM2D, Attention, Dense, Flatten, MaxPool3D, MaxPool2D,BatchNormalization, Conv3D, GRU
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras import Model
from tensorflow.keras.backend import ctc_batch_cost, ctc_decode, ctc_label_dense_to_sparse, get_value
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input

import Levenshtein as Lev
import sys
from string import ascii_uppercase

In [2]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.0.0


In [3]:
# Ask TensorFlow whether a GPU is available; the returned bool is shown as the cell output.
tf.test.is_gpu_available()

True

### Paths

In [4]:
# Directories of the train / dev / test splits; every path keeps its trailing slash.
train_path, dev_path, test_path = (
    "./LibriSpeech100/train/train_all/",
    "./LibriSpeech100/dev/dev_all/",
    "./LibriSpeech100/test/test_all/",
)

### Data Generator


In [5]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding one padded batch per pickle/transcript file pair.

    The directory ``path`` is scanned for ``*.pkl`` files (feature dictionaries)
    and ``*.txt`` files (transcripts).  Both lists are sorted by file name and
    the i-th pickle is paired with the i-th text file, so matching files are
    expected to sort into the same position.

    Each item is ``([X, y_strings, input_len, label_len], Y)`` where

    * ``X``         -- features zero-padded along time, shape (batch, max_time, n_features)
    * ``y_strings`` -- list with the raw transcript of every utterance
    * ``input_len`` -- un-padded number of frames per utterance, shape (batch,)
    * ``label_len`` -- un-padded transcript length per utterance, shape (batch,)
    * ``Y``         -- label indices, shape (batch, max_label_len + 1)

    Args:
        path: Directory containing the ``.pkl`` / ``.txt`` files.
        to_fit: Stored on the instance; not consulted by any method of this class.
    """

    # Label alphabet: A-Z (0-25), space (26), apostrophe (27),
    # padding symbol '%' (28) and end-of-sentence symbol '>' (29).
    _BLANK_TOKEN = '%'
    _END_TOKEN = '>'
    _ALPHABET = list(ascii_uppercase) + [' ', '\'', _BLANK_TOKEN, _END_TOKEN]
    # Built once for the class instead of on every generate_Y_array call.
    _CHAR_TO_INDEX = {char: idx for idx, char in enumerate(_ALPHABET)}

    def __init__(self, path, to_fit = True):
        self.path = path
        self.list_X, self.list_Y = self.getLists()
        self.to_fit = to_fit

    def __len__(self):
        """Number of batches, i.e. number of ``.pkl`` files found in ``path``."""
        return len(self.list_X)

    def __getitem__(self, index):
        """Load the ``index``-th file pair and return it as a padded batch."""
        dict_X = self.get_dict_X(index)
        dict_Y = self.get_dict_Y(index)

        X, Y, input_len, label_len, y_strings = self.generate_XY(dict_X, dict_Y)

        return [X, y_strings, input_len, label_len], Y

    def getLists(self):
        """Return the sorted ``.pkl`` and ``.txt`` file names found in ``self.path``."""
        list_X = []
        list_Y = []
        for item in sorted(os.listdir(self.path)):
            ext = item.split(".")[-1]
            if ext == 'pkl':
                list_X.append(item)
            elif ext == 'txt':
                list_Y.append(item)
        return list_X, list_Y

    def get_dict_X(self, index):
        """Unpickle the ``index``-th feature file and return the loaded object.

        ``generate_XY`` treats the result as a mapping from utterance key to a
        2-D array whose second axis is time.
        """
        # os.path.join works whether or not self.path ends with a separator.
        file_name = os.path.join(self.path, self.list_X[index])
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this generator at feature files from a trusted source.
        with open(file_name, 'rb') as pickle_file:
            dict_X = pickle.load(pickle_file)
        return dict_X

    def get_dict_Y(self, index):
        """Parse the ``index``-th transcript file into ``{utterance_key: sentence}``.

        Every non-blank line is split on whitespace: the first token is the key
        and the remaining tokens, re-joined with single spaces, are the sentence.
        """
        filename = os.path.join(self.path, self.list_Y[index])
        dict_Y = {}
        # The context manager guarantees the file handle is closed.
        with open(filename) as transcript_file:
            for line in transcript_file:
                data = line.split()
                if not data:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                dict_Y[data[0]] = ' '.join(data[1:])
        return dict_Y

    def generate_XY(self, dict_X, dict_Y):
        """Pad every utterance of one file pair to a common size and stack them.

        Args:
            dict_X: Mapping key -> 2-D feature array; axis 1 is padded and its
                original size is reported as the input length.
            dict_Y: Mapping key -> transcript string; must contain every key of
                ``dict_X`` (``KeyError`` otherwise).

        Returns:
            Tuple ``(X, Y, input_len, label_len, Y_strings)`` as described in the
            class docstring.

        Raises:
            ValueError: from ``np.stack`` when ``dict_X`` is empty.
        """
        keys = list(dict_X)
        max_x = max((dict_X[key].shape[1] for key in keys), default=0)
        max_y = max((len(dict_Y[key]) for key in keys), default=0)

        X = []
        Y = []
        Y_strings = []
        input_len = []
        label_len = []

        for key in keys:
            features = dict_X[key]
            transcript = dict_Y[key]

            Y_strings.append(transcript)
            input_len.append(features.shape[1])
            label_len.append(len(transcript))

            # Zero-pad on the right of axis 1 only, then transpose so that the
            # padded axis comes first in each stacked sample.
            pad_width = ((0, 0), (0, max_x - features.shape[1]))
            padded = np.pad(features, pad_width = pad_width, mode='constant', constant_values=0)
            X.append(padded.T)
            Y.append(self.generate_Y_array(transcript, max_y))

        return np.stack(X), np.stack(Y), np.stack(input_len), np.stack(label_len), Y_strings

    def generate_Y_array(self, sentence, maxlen):
        """Encode ``sentence`` as label indices, padded to ``maxlen`` plus an end token.

        The sentence is right-padded with ``'%'`` up to ``maxlen`` characters and
        a single ``'>'`` is appended, so the result has ``maxlen + 1`` entries
        whenever ``len(sentence) <= maxlen``.

        Raises:
            KeyError: if the sentence contains a character outside the alphabet.
        """
        padded = sentence.ljust(maxlen, self._BLANK_TOKEN) + self._END_TOKEN
        return np.array([self._CHAR_TO_INDEX[char] for char in padded])

### Create DataGenerator objects

In [6]:
# One generator per split, constructed in train -> dev -> test order.
train_data, val_data, test_data = (
    DataGenerator(split_dir) for split_dir in (train_path, dev_path, test_path)
)

### Check Data loaded by the DataGenerator objects

In [7]:
# Pull the first training batch apart in one step: the generator returns
# ([features, transcripts, input lengths, label lengths], labels).
(x, y_strings, input_len, label_len), y = train_data[0]

In [8]:
# Shapes of the feature batch and of the two length vectors ...
for batch_array in (x, input_len, label_len):
    print(batch_array.shape)

# ... followed by the number of transcripts and the label matrix shape.
print(len(y_strings))
print(y.shape)

(58, 727, 20)
(58,)
(58,)
58
(58, 319)


In [9]:
# Element types of everything the generator hands to the model / loss.
for batch_array in (x, y, input_len, label_len):
    print(batch_array.dtype)

float32
int64
int64
int64


In [10]:
# Show the raw transcript of the first utterance in the batch.
print(y_strings[0])

AND THAT IF SHE NOTICED ANYTHING ODD OR OUT OF PLACE SHE WOULD NEVER REST UNTIL SHE HAD FERRETED OUT THE WHYS AND WHEREFORES THEREOF THERE ARE PLENTY OF PEOPLE IN AVONLEA AND OUT OF IT WHO CAN ATTEND CLOSELY TO THEIR NEIGHBOR'S BUSINESS BY DINT OF NEGLECTING THEIR OWN


### Word Error Rate

In [11]:
def wer(s1, s2):
    """Word error rate of hypothesis ``s1`` against reference ``s2``.

    Both strings are lower-cased and split on whitespace.  Every distinct word
    is mapped to one character so that the character-level Levenshtein
    distance between the encoded strings equals the word-level edit distance.

    Args:
        s1: Hypothesis (predicted) sentence.
        s2: Reference (ground-truth) sentence.

    Returns:
        Word-level edit distance divided by the number of reference words.
        When the reference contains no words the distance is divided by 1
        instead, so the result is the number of hypothesis words (0.0 when
        both are empty) rather than a ``ZeroDivisionError``.
    """
    hyp_words = s1.lower().split()
    ref_words = s2.lower().split()

    vocab = set(hyp_words + ref_words)
    word2char = dict(zip(vocab, range(len(vocab))))

    hyp_encoded = ''.join(chr(word2char[w]) for w in hyp_words)
    ref_encoded = ''.join(chr(word2char[w]) for w in ref_words)

    # max(..., 1) guards the empty-reference case against division by zero.
    return Lev.distance(hyp_encoded, ref_encoded) / float(max(len(ref_words), 1))

### Convert a sequence of label indices to a sentence

In [12]:
def indices_to_string(indices):
    """Map an iterable of label indices back to the characters they encode.

    The alphabet is A-Z (0-25), space (26), apostrophe (27), '%' (28) and
    '>' (29).  Negative indices are skipped: ``ctc_decode`` pads its dense
    output with -1, and indexing the alphabet with -1 would otherwise wrap
    around and emit the end token '>' for every padded position.

    Args:
        indices: Iterable of integer-like values (Python ints, NumPy integers
            or scalar tensors that support ``int()``).

    Returns:
        The decoded string; empty when ``indices`` is empty or all padding.
    """
    space_token = ' '
    end_token = '>'
    blank_token = '%'
    apos_token = '\''

    alphabet = list(ascii_uppercase) + [space_token, apos_token, blank_token, end_token]

    chars = []
    for idx in indices:
        idx = int(idx)
        if idx < 0:
            # Padding value from ctc_decode, not a real label.
            continue
        chars.append(alphabet[idx])

    return ''.join(chars)

# Model Architecture

In [13]:
class BaseModel(Model):
    """Recurrent encoder producing one un-normalised score vector per time step.

    Layers, applied in this order: a 20-unit GRU that returns the full
    sequence, batch normalisation, and a time-distributed ``Dense`` layer
    with ``op_dim`` outputs and no activation.

    Args:
        op_dim: Size of the per-step output vector (default 30).
    """

    def __init__(self, op_dim = 30):
        super().__init__()
        self.rnn = GRU(20, return_sequences= True)
        self.batchnorm = BatchNormalization()
        self.time_dense = TimeDistributed(Dense(op_dim))

    def call(self, inputs):
        """Run ``inputs`` through GRU -> BatchNormalization -> per-step Dense."""
        return self.time_dense(self.batchnorm(self.rnn(inputs)))

In [14]:
class ASRModel(Model):
    """``BaseModel`` followed by a softmax, giving per-step class probabilities."""

    def __init__(self):
        super().__init__()
        self.base_model = BaseModel()
        self.activation = Activation('softmax')

    def call(self, inputs):
        """Return the softmax of the base model's per-step scores."""
        scores = self.base_model(inputs)
        return self.activation(scores)

### Build Model

In [15]:
# Instantiate the model and build its weights for inputs with a variable
# batch size, a variable number of time steps and 20 features per step.
model = ASRModel()
model.build(input_shape = (None, None, 20))
# Adam with its default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()

### One Training Step

In [16]:
def train_one_step(model, optimizer, x, y_true, input_len, label_len, y_strings):
    """Run one optimisation step on a batch and report its loss and WER.

    Args:
        model: Callable Keras model mapping ``x`` to per-step class probabilities.
        optimizer: Keras optimizer used to apply the gradients.
        x: Batch of input features.
        y_true: Dense label indices for the batch.
        input_len: 1-D array with the un-padded input length of every sample.
        label_len: 1-D array with the un-padded label length of every sample.
        y_strings: Reference transcripts, one per sample.

    Returns:
        Tuple ``(mean CTC loss over the batch, mean word error rate over the batch)``.
        The WER is computed from the predictions made *before* the weight update.

    NOTE(review): the model is called without ``training=True``; layers such as
    BatchNormalization may therefore run in inference mode here -- confirm.
    """
    # ctc_batch_cost wants the lengths as (batch, 1) columns.
    input_len_col = np.expand_dims(input_len, axis = 1)
    label_len_col = np.expand_dims(label_len, axis = 1)

    with tf.GradientTape() as tape:
        y_pred = model(x)
        loss = ctc_batch_cost(y_true, y_pred, input_len_col, label_len_col)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # ctc_decode wants the lengths as a 1-D vector.  Use the caller's 1-D array
    # directly: an axis-less np.squeeze of the column would collapse a batch
    # of one into a 0-d scalar.
    y_decode = ctc_decode(y_pred, input_len)[0][0]

    total_wer = 0.0
    for i in range(len(y_strings)):
        predicted_sentence = indices_to_string(y_decode[i])
        total_wer += wer(predicted_sentence, y_strings[i])

    return tf.reduce_mean(loss), total_wer/len(y_strings)

### Validation, Evaluation and Training Loops

In [17]:
def validate(model, x, y_true, input_len, label_len, y_strings, test = False):
    """Score one batch without updating the model.

    Args:
        model: Callable Keras model mapping ``x`` to per-step class probabilities.
        x: Batch of input features.
        y_true: Dense label indices for the batch.
        input_len: 1-D array with the un-padded input length of every sample.
        label_len: 1-D array with the un-padded label length of every sample.
        y_strings: Reference transcripts, one per sample.
        test: When true, print every reference / predicted sentence pair.

    Returns:
        Tuple ``(mean CTC loss over the batch, mean word error rate over the batch)``.
    """
    # ctc_batch_cost wants the lengths as (batch, 1) columns.
    input_len_col = np.expand_dims(input_len, axis = 1)
    label_len_col = np.expand_dims(label_len, axis = 1)

    y_pred = model(x)
    loss = ctc_batch_cost(y_true, y_pred, input_len_col, label_len_col)

    # ctc_decode wants the lengths as a 1-D vector.  Use the caller's 1-D array
    # directly: an axis-less np.squeeze of the column would collapse a batch
    # of one into a 0-d scalar.
    y_decode = ctc_decode(y_pred, input_len)[0][0]

    total_wer = 0.0
    for i in range(len(y_strings)):
        predicted_sentence = indices_to_string(y_decode[i])
        total_wer += wer(predicted_sentence, y_strings[i])

        if test:
            print("Correct Sentence:", y_strings[i])
            print("Predicted Sentence:", predicted_sentence)

    return tf.reduce_mean(loss), total_wer/len(y_strings)

In [18]:
def model_evaluate(model, val_ds, test = False):
    """Average the loss and word error rate of ``model`` over every batch of ``val_ds``.

    Args:
        model: Model passed through to ``validate``.
        val_ds: Iterable yielding ``([x, y_strings, input_len, label_len], y)`` batches.
        test: Forwarded to ``validate`` (prints every sentence pair) and used
            to label the summary line as "Test" instead of "Validation".

    Returns:
        Tuple ``(mean loss, mean WER)``; both are unweighted means over batches.

    Raises:
        ValueError: if ``val_ds`` yields no batches.
    """
    n_batches = 0
    total_loss = 0.0
    total_wer = 0.0

    for inputs, y in val_ds:
        x, y_strings, ip_len, label_len = inputs
        loss, batch_wer = validate(model, x, y, ip_len, label_len, y_strings, test)
        total_loss += loss
        total_wer += batch_wer
        n_batches += 1

    if n_batches == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("model_evaluate received a dataset that yields no batches")

    val_loss = total_loss / n_batches
    val_accuracy = total_wer / n_batches

    # Label the summary after the split that was actually evaluated.
    split = 'Test' if test else 'Validation'
    tf.print(' ' + split + ' Loss:', val_loss, ' ' + split + ' WER: ', val_accuracy)

    return val_loss, val_accuracy

In [19]:
def model_fit(model, optimizer, train_ds, val_ds = None,epochs=20, log_every=78):
    """Train ``model`` for ``epochs`` passes over ``train_ds``.

    Args:
        model: Model passed through to ``train_one_step``.
        optimizer: Optimizer passed through to ``train_one_step``.
        train_ds: Iterable yielding ``([x, y_strings, input_len, label_len], y)`` batches.
        val_ds: Optional dataset evaluated with ``model_evaluate`` after every
            epoch.  It is skipped when falsy (``None`` or zero length).
        epochs: Number of passes over ``train_ds``.
        log_every: Print the running step count every ``log_every`` steps;
            a falsy value disables this progress output.

    Returns:
        ``(losses, wers)`` when ``val_ds`` is falsy, otherwise
        ``(losses, wers, val_losses, val_wers)``; each list has one entry per epoch.

    Raises:
        ValueError: if ``train_ds`` yields no batches.
    """
    losses = []
    accuracies = []
    val_losses = []
    val_acc = []

    for epoch in range(epochs):
        step = 0
        epoch_loss = 0.0
        epoch_wer = 0.0
        for inputs, y in train_ds:
            x, y_strings, ip_len, label_len = inputs
            step += 1
            loss, batch_wer = train_one_step(model, optimizer, x, y, ip_len, label_len, y_strings)
            epoch_loss += loss
            epoch_wer += batch_wer
            if log_every and step % log_every == 0:
                print(step)

        if step == 0:
            # Fail with a clear message instead of an opaque ZeroDivisionError.
            raise ValueError("model_fit received a training dataset that yields no batches")

        epoch_loss /= step
        epoch_wer /= step

        losses.append(epoch_loss)
        accuracies.append(epoch_wer)

        tf.print('Epoch: ', epoch+1, ' Loss:', epoch_loss, ' WER: ', epoch_wer)

        if val_ds:
            val_loss, val_accuracy = model_evaluate(model, val_ds)
            val_losses.append(val_loss)
            val_acc.append(val_accuracy)

    if not val_ds:
        return losses, accuracies

    return losses, accuracies, val_losses, val_acc

In [20]:
# Train for the default 20 epochs, evaluating on the dev split after every epoch.
losses, accuracies, val_losses, val_acc = model_fit(model, optimizer, train_data, val_ds = val_data)

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
58
116
174
232
290
348
406
464
522
580
Epoch:  1  Loss: 907.079834  WER:  0.9999876533324197
 Validation Loss: 532.587646  Validation WER:  1.000229095074456
58
116
174
232
290
348
406
464
522
580
Epoch:  2  Loss: 588.869385  WER:  1.0
 Validation Loss: 321.562958  Validation WER:  1.0
58
116
174
232
290
348
406
464
522
580
Epoch:  3  Loss: 529.442871  WER:  1.0
 Validation Loss: 320.867584  Validation WER:  1.0
58
116
174
232
290
348
406
464
522
580
Epoch:  4  Loss: 529.120483  WER:  1.0
 Validation Loss: 320.663605  Validation WER:  1.0
58
116
174
232
290
348
406
464
522
580
Epoch:  5  Loss: 529.008179  WER:  1.0
 Validation Loss: 320.563293  Validation WER:  1.0
58
116
174
232
290
348
406
464
522
580
Epoch:  6  Loss: 528.943237  WER:  1.0
 Validation Loss: 320.489929  Validation WER:  1.0
58
116
174
232
290
348
406
464
522
580
Epoch:  7  Loss: 528.894  WER:  1.0
 Validation Loss: 320.4

InternalError: Failed to call ThenRnnBackward with model config: [rnn_mode, rnn_input_mode, rnn_direction_mode]: 3, 0, 0 , [num_layers, input_size, num_units, dir_count, max_seq_length, batch_size, cell_num_units]: [1, 20, 20, 1, 705, 133, 0]  [Op:CudnnRNNBackprop]

In [None]:
# Evaluate on the test split; test=True also prints every reference / predicted sentence pair.
# Despite its name, `acc` receives the mean word error rate returned by model_evaluate.
_, acc = model_evaluate(model, test_data, test=True)

In [None]:
# Mean word error rate over the test batches.
print(acc)