# DS MODEL

## 0. IMPORTS

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.display import HTML, display
from IPython.display import display, Audio

import sys
import base64
import struct 
import os
import fnmatch
import re

import scipy.io.wavfile as wav
import python_speech_features as p

import pandas as pd

# import librosa

## 1. DATA

assume data is already converted from NIST to RIF format .wav files. If not run the mfcc_data_convert script

In [2]:
datapath = "/home/rob/Dropbox/UCL/DIS/Admin/LDC/timit/"
target = datapath+"TIMIT/"

train_list_wavs, train_list_trans, train_list_mfcc, train_list_fin = [],[],[],[]
valid_list_wavs, valid_list_trans, valid_list_mfcc, valid_list_fin = [],[],[],[]
test_list_wavs,  test_list_trans,  test_list_mfcc,  test_list_fin  = [],[],[],[]

file_count = 0

#token = re.compile("[\w-]+|'m|'t|'ll|'ve|'d|'s|\'")
def clean(word):
    ## LC ALL & strip fullstop, comma and semi-colon which are not required
    new = word.lower().replace('.','')
    new = new.replace(',','')
    new = new.replace(';','')
    new = new.replace('"','')
    new = new.replace('!','')
    new = new.replace('?','')
    new = new.replace(':','')
    new = new.replace('-','')
    return new

def read_text(full_wav):
    #need to remove _rif.wav (8chars) then add .TXT     
    trans_file = full_wav[:-8]+".TXT"
    with open(trans_file, "r") as f:
        for line in f:
            split  = line.split()
            start  = split[0]
            end    = split[1]
            t_list = split[2:]
            trans  = ""
        # insert cleaned word (lowercase plus removed bad punct)
        for t in t_list:
            trans = trans+' '+clean(t)
    
    return start, end, trans

for root, dirnames, filenames in os.walk(target):
    for filename in fnmatch.filter(filenames, "*.wav"):
        
        full_wav = os.path.join(root, filename)
                    
## Load file
#         fs, audio = wav.read(full_wav)
#         mfcc = p.mfcc(audio, samplerate=fs, numcep=26) # produces time x cep
        
        _, end, trans = read_text(full_wav)
    
        if 'train' in full_wav.lower():
            train_list_wavs.append(full_wav)
            train_list_trans.append(trans)
            train_list_fin.append(end) 

        elif 'test' in full_wav.lower():
            ##split 50/50 into validation and test (note not random)
            if file_count % 2 == 0:
                test_list_wavs.append(full_wav)
                test_list_trans.append(trans)            
                test_list_fin.append(end) 
            else:
                valid_list_wavs.append(full_wav)
                valid_list_trans.append(trans)           
                valid_list_fin.append(end) 
        else:
            raise IOError

        file_count=file_count+1
        
a = {'wavs' : train_list_wavs,
     'fin'  : train_list_fin, 
     'trans': train_list_trans}

b = {'wavs' : valid_list_wavs,
     'fin'  : valid_list_fin,
     'trans': valid_list_trans}

c = {'wavs' : test_list_wavs,
     'fin'  : test_list_fin,
     'trans': test_list_trans}

df_train = pd.DataFrame(a, columns=['fin','trans','wavs'],dtype=int)
df_valid = pd.DataFrame(b, columns=['fin','trans','wavs'],dtype=int)
df_test = pd.DataFrame(c, columns=['fin','trans','wavs'],dtype=int)
    
sortagrad = True
if sortagrad:
    df_train = df_train.sort_values(by='fin',ascending=True)
    df_valid = df_valid.sort_values(by='fin',ascending=True)
    df_test = df_test.sort_values(by='fin',ascending=True)

In [4]:
df_train.to_csv("./df_train.csv")

Here we have 5 seperate lists so if we iterate through with a counter we'll find the ith value is correct for what we need. However later we will need to sort the data which would break this approach therefore we should consider putting the data into a more managable dataformat. We will also need to transform the data (words/characters) into numbers.

Baidu's ba-dls-deepspeech implementation uses a character based implementation https://github.com/baidu-research/ba-dls-deepspeech

In [3]:
print(len(train_list_wavs)+len(test_list_wavs)+len(valid_list_wavs))
print(len(train_list_wavs),len(test_list_wavs),len(valid_list_wavs))
#6300
#(4620, 840, 840)

6300
(4620, 840, 840)


## 2. CORPUS

Before we train and build a model, let's see what the max length of a transcription is and build a list of the corpora for later. "does society really exist as an entity over and above the agglomeration of man?"

In [4]:
## looks like index 150 is the longest sentence with 80chars so we'll have to pad on this.
max_mfcc_index = 0
max_mfcc_len = 0
max_trans_charlength = 0
all_words = []

comb = train_list_trans+test_list_trans+valid_list_trans
# comb_mfcc = train_list_mfcc+test_list_mfcc+valid_list_mfcc

for count,sent in enumerate(comb):
    #count length
    if len(sent)>max_trans_charlength:
        max_trans_charlength = len(sent)
    #build vocab
    for w in sent.split():
        all_words.append(clean(w)) 
    #check mfcc    
#     if(comb_mfcc[count].shape[0]>max_mfcc_len):
#         max_mfcc_len=comb_mfcc[count].shape[0]
#         max_mfcc_index=count
    
print("max_trans_charlength:",max_trans_charlength)
# print("max_mfcc_len:",max_mfcc_len, "at comb index:",max_mfcc_index)

#('max_trans_charlength:', 80)
#('max_mfcc_len:', 778, 'at comb index:', 541)

('max_trans_charlength:', 80)


In [5]:
## Longest MFCC shape = (476, 26)
# print(list_of_mfcc[150].shape)

In [6]:
## CORPUS - useful for the LM

all_vocab = set(all_words) 
## do some analysis here on the types of words. E.g. a ? will change the sound of a word a lot. 
print("Words:",len(all_words))
print("Vocab:",len(all_vocab))

('Words:', 54198)
('Vocab:', 6145)


In [7]:
# From Baidu ba-dls-deepspeech - https://github.com/baidu-research/ba-dls-deepspeech

char_map_str = """
' 1
<SPACE> 2
a 3
b 4
c 5
d 6
e 7
f 8
g 9
h 10
i 11
j 12
k 13
l 14
m 15
n 16
o 17
p 18
q 19
r 20
s 21
t 22
u 23
v 24
w 25
x 26
y 27
z 28
"""

char_map = {}
index_map = {}

for line in char_map_str.strip().split('\n'):
    ch, index = line.split()
    char_map[ch] = int(index)
    index_map[int(index)] = ch
index_map[2] = ' '

def text_to_int_sequence(text):
    """ Use a character map and convert text to an integer sequence """
    int_sequence = []
    for c in text:
        if c == ' ':
            ch = char_map['<SPACE>']
        else:
            ch = char_map[c]
        int_sequence.append(ch)
    return int_sequence

##CHECK
max_intseq_length = 0
for x in train_list_trans+test_list_trans+valid_list_trans:
    try:
        y=text_to_int_sequence(x)
        if len(y)>max_intseq_length:
            max_intseq_length=len(y)
    except:
        print("error at:",x)
        
print(max_intseq_length)

80


In [8]:
num_classes = len(char_map)

print(num_classes)



28


## 6. KERAS

We can load this mfcc data into a DS1 NN

In [52]:
# imports
import tensorflow as tf

import keras
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers.recurrent import SimpleRNN
from keras.layers import Dense, Activation, Bidirectional, Reshape, Lambda, Input
from keras.optimizers import SGD, adam
from keras.preprocessing.sequence import pad_sequences
# from keras.utils.data_utils import Sequence

from keras.layers.merge import add, concatenate
import keras.callbacks

print(keras.__version__) ##be careful with 2.0.6 as 2.0.4 tested with CoreML

2.0.4


## STEPS

DONE:
0. merge the mfcc TIMIT code into this notebook so that variables are accessible
2. Build a generator (or sequence keras 2.0.6) include parameters:
2. sorting into smaller batches (sortagrad)

TODO:

4. language model does it get added here or after as output on own?
5. WER calc (take from DS)
6. exporting (without weird i/o in coreml)

In [53]:
global batch_size
batch_size = 8


In [101]:
def get_intseq(trans):
    #PAD
    while(len(trans)<max_intseq_length):
        trans=trans+' ' #replace with a space char to pad
    t = text_to_int_sequence(trans)
    return t

def get_mfcc(filename):
    fs, audio = wav.read(filename)
    r = p.mfcc(audio, samplerate=fs, numcep=26) # 2D array -> timesamples x mfcc_features
    t = np.transpose(r) # 2D array ->  mfcc_features x timesamples
    X = pad_sequences(t, maxlen=778, dtype='float', padding='post', truncating='post').T
    return X # 2D array -> MAXtimesamples x mfcc_features {778 x 26}

def get_xsize(val):
    return val.shape[0]
    
def get_ylen(val):
    return len(val)

class timitWavSeq(keras.callbacks.Callback):
        def __init__(self, wavpath, transcript, finish):
            
            self.wavpath = wavpath
            self.transcript = transcript
            self.start = np.zeros(len(finish))
            self.finish = finish
            self.length = self.finish
            self.batch_size = batch_size
            self.cur_train_index = 0
            self.cur_val_index = 0
            self.cur_test_index = 0
            
#         def __len__(self):
#             ## returns number of batches in the sequence
#             return len(self.wavpath) // self.batch_size
        
        def get_batch(self,idx):
                     
            batch_x = self.wavpath[idx*self.batch_size:(idx+1)*self.batch_size]
            batch_y_trans = self.transcript[idx*self.batch_size:(idx+1)*self.batch_size] 
            
            assert(len(batch_x)==self.batch_size)
            assert(len(batch_y_trans)==self.batch_size) 
            
            X_data = np.array([get_mfcc(file_name) for file_name in batch_x]) 
#             print("1. X_data.shape:",X_data.shape)  # ('1. X_data.shape:', (2, 778, 26))
            
            assert(X_data.shape == (self.batch_size,778,26))
            
            labels = np.array([get_intseq(l) for l in batch_y_trans])
#             print("2. labels.shape:",labels.shape) # ('2. labels.shape:', (2, 80))
            
            assert (labels.shape == (self.batch_size,max_intseq_length))
            
            input_length = np.array([get_xsize(mfcc) for mfcc in X_data])
#             print("3. input_length:",input_length.shape) # ('3. input_length:', (2,))
            assert (input_length.shape == (self.batch_size,))
                  
            label_length = np.array([get_ylen(y) for y in labels])
#             print("4. label_length:",label_length.shape) # ('4. label_length:', (2,))
            assert (label_length.shape == (self.batch_size,))
            
            inputs = {
                 'the_input': X_data,
                 'the_labels': labels,
                 'input_length': input_length,
                 'label_length': label_length
                 }

            outputs = {'ctc': np.zeros([batch_size])}
            
            return (inputs, outputs)

        def next_train(self):
            while 1:
                ret = self.get_batch(self.cur_train_index)
                self.cur_train_index += self.batch_size
                if self.cur_train_index >= len(self.wavpath):
                    self.cur_train_index=0
                yield ret
                
        def next_val(self):
            while 1:
                ret = self.get_batch(self.cur_val_index)
                self.cur_val_index += self.batch_size
                if self.cur_val_index >= len(self.wavpath):
                    self.cur_val_index=0
                yield ret
                
        def next_test(self):
            while 1:
                ret = self.get_batch(self.cur_test_index)
                
                self.cur_test_index += self.batch_size
                if self.cur_test_index >= len(self.wavpath):
                    self.cur_test_index=0
                yield ret
        
        def on_train_begin(self, logs={}):
            print("train begin")
            
        def on_epoch_begin(self, epochs, logs={}):
            print("on epoch begin")
            
        def on_epoch_end(self, epoch, logs={}):
            print("EPOCH END")
    

In [102]:
sort_train_fin_list = df_train['fin'].tolist()
sort_train_trans_list = df_train['trans'].tolist()
sort_train_wav_list = df_train['wavs'].tolist()

sort_valid_fin_list = df_valid['fin'].tolist()
sort_valid_trans_list = df_valid['trans'].tolist()
sort_valid_wav_list = df_valid['wavs'].tolist()

sort_test_fin_list = df_test['fin'].tolist()
sort_test_trans_list = df_test['trans'].tolist()
sort_test_wav_list = df_test['wavs'].tolist()

traindata = timitWavSeq(wavpath=sort_train_wav_list, transcript=sort_train_trans_list, finish=sort_train_fin_list)
validdata = timitWavSeq(wavpath=sort_valid_wav_list, transcript=sort_valid_trans_list, finish=sort_valid_fin_list)
testdata = timitWavSeq(wavpath=sort_test_wav_list, transcript=sort_test_trans_list, finish=sort_test_fin_list)

In [88]:
# Define CTC loss
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    ''' from TF: Input requirements
    1. sequence_length(b) <= time for all b
    2. max(labels.indices(labels.indices[:, 1] == b, 2)) <= sequence_length(b) for all b.
    '''

    print(labels.shape) #(?, 80)
    print(y_pred.shape) #(?, 778, 2048)
    print(input_length.shape) #(?, 1)
    print(label_length.shape) #(?, 1)
    
    # the 2 is critical here since the first couple outputs of the RNN
    # tend to be garbage:
    
    #y_pred = y_pred[:, 2:, :] ## want to change this?
    
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


## 7. MODEL

Let's build the model

In [89]:
# Network Params
fc_size = 2048
rnn_size = 512
mfcc_features = 26
max_mfcclength_audio = 778

X = np.zeros([batch_size, max_mfcclength_audio, mfcc_features])

# Creates a tensor there are always 26 MFCC
input_data = Input(name='the_input', shape=X.shape[1:]) # >>(?, 778, 26)

# First 3 FC layers
x = Dense(fc_size, name='fc1', activation='relu')(input_data) # >>(?, 778, 2048)
x = Dense(fc_size, name='fc2', activation='relu')(x) # >>(?, 778, 2048)
x = Dense(fc_size, name='fc3', activation='relu')(x) # >>(?, 778, 2048)

# Layer 4 BiDirectional RNN

rnn_1f = SimpleRNN(rnn_size, return_sequences=True, go_backwards=False, 
                   kernel_initializer='he_normal', name='rnn_f')(x) #>>(?, ?, 512)

rnn_1b = SimpleRNN(rnn_size, return_sequences=True, go_backwards=True, 
                   kernel_initializer='he_normal', name='rnn_b')(x) #>>(?, ?, 512)

rnn_merged = add([rnn_1f, rnn_1b]) #>>(?, ?, 512)

#TODO TRY THIS FROM: https://github.com/fchollet/keras/issues/2838
# rnn_bidir1 = merge([rnn_fwd1, rnn_bwd1], mode='concat')
# predictions = TimeDistributed(Dense(output_class_size, activation='softmax'))(rnn_bidir1) 

x = Activation('relu', name='birelu')(rnn_merged) #>>(?, ?, 512)

# Layer 5 FC Layer
y_pred = Dense(fc_size, name='fc5', activation='relu')(x) #>>(?, 778, 2048)



In [90]:
# Change shape 
labels = Input(name='the_labels', shape=[80], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# Keras doesn't currently support loss funcs with extra parameters
# so CTC loss is implemented in a lambda layer
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
                                                                   labels, 
                                                                   input_length, 
                                                                   label_length])

sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

# the loss calc occurs elsewhere, so use a dummy lambda func for the loss
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)

(?, 80)
(?, 778, 2048)
(?, 1)
(?, 1)


## What's the output of the model? 



In [213]:
print(model.summary(line_length=80))

________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
the_input (InputLayer)    (None, 778, 26)   0                                   
________________________________________________________________________________
fc1 (Dense)               (None, 778, 2048) 55296    the_input[0][0]            
________________________________________________________________________________
fc2 (Dense)               (None, 778, 2048) 4196352  fc1[0][0]                  
________________________________________________________________________________
fc3 (Dense)               (None, 778, 2048) 4196352  fc2[0][0]                  
________________________________________________________________________________
rnn_f (SimpleRNN)         (None, 778, 512)  1311232  fc3[0][0]                  
________________________________________________________________________________
rnn_b (SimpleRNN)         (N


## 8. TRAIN




In [92]:
## Make it smaller for perpose of demo
train_steps = len(train_list_wavs)/25//batch_size
valid_steps = len(valid_list_wavs)/25//batch_size

print(train_steps, valid_steps)

(23, 4)


In [93]:

model.fit_generator(generator=traindata.next_train(),
                    steps_per_epoch=train_steps, #28
                    epochs=2, 
                    callbacks=[traindata], ##create custom callback to handle stop for valid
                    
                    validation_data=None,
                    validation_steps=None,
                    initial_epoch=0)


train begin
on epoch begin
Epoch 1/2
on epoch begin
Epoch 2/2


<keras.callbacks.History at 0x7f4f7f98c4d0>

BUG WITH TF & Batch Size = 1

https://github.com/fchollet/keras/issues/6635



## BIG CTC ERROR 

#BATCH_SIZE=2
InvalidArgumentError (see above for traceback): label SparseTensor is not valid: 
indices[24] = [0,24] is out of bounds: need 0 <= index < [2,24]

#BATCH_SIZE=10
InvalidArgumentError (see above for traceback): label SparseTensor is not valid: 
indices[24] = [0,24] is out of bounds: need 0 <= index < [10,24]
    
#BATCH_SIZE=32  
InvalidArgumentError (see above for traceback): label SparseTensor is not valid:
indices[24] = [0,24] is out of bounds: need 0 <= index < [32,24]

## DIFFERENT DIMS SWAP [batch_size, mfcc, 778] vs old [batch_size, 778, mfcc]

InvalidArgumentError (see above for traceback): sequence_length(0) <= 776
	 [[Node: ctc_22/CTCLoss = CTCLoss[ctc_merge_repeated=true, preprocess_collapse_repeated=false, _device="/job:localhost/replica:0/task:0/cpu:0"](ctc_22/Log, ctc_22/ToInt64, ctc_22/ToInt32_2, ctc_22/ToInt32_1)]]


InvalidArgumentError (see above for traceback): Not enough time for target transition sequence (required: 80, available: 26), skipping data instance in batch: 0


## TEST TIME

In [103]:
## not sure this is best use of gen / test data creation

model.predict_generator( testdata.next_test(), 5, workers=1, verbose=1)
#model.evaluate(x, y, batch_size=32, verbose=1, sample_weight=None)



Exception in thread Thread-32:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/rob/py27/local/lib/python2.7/site-packages/keras/engine/training.py", line 612, in data_generator_task
    generator_output = next(self._generator)
  File "<ipython-input-101-aabe4a3ca035>", line 93, in next_test
    ret = self.get_batch(self.cur_test_index)
  File "<ipython-input-101-aabe4a3ca035>", line 43, in get_batch
    assert(len(batch_x)==self.batch_size)
AssertionError





array([[ 131.88670349],
       [ 189.01681519],
       [ 282.64770508],
       [ 107.13787842],
       [ 258.87561035],
       [ 236.39028931],
       [ 223.16127014],
       [ 161.07373047],
       [ 281.67977905],
       [ 248.79734802],
       [ 233.22113037],
       [ 281.21868896],
       [ 211.15124512],
       [ 208.48600769],
       [ 216.82899475],
       [ 204.44763184],
       [ 197.55986023],
       [ 238.77764893],
       [ 298.02575684],
       [ 231.60652161],
       [ 415.53540039],
       [ 292.26651001],
       [ 258.68112183],
       [ 235.75234985],
       [ 306.95895386],
       [ 295.81634521],
       [ 280.23269653],
       [ 283.27096558],
       [ 287.94567871],
       [ 326.68359375],
       [ 300.39996338],
       [ 269.24191284],
       [ 282.01095581],
       [ 344.5602417 ],
       [ 326.61203003],
       [ 289.81646729],
       [ 362.6890564 ],
       [ 298.21893311],
       [ 346.61257935],
       [ 303.37142944]], dtype=float32)

## 9. EXPORT

We can export the model ready for CoreML to convert.

In [104]:

# serialize model to JSON
with open("ds_ctc_model.json", "w") as json_file:
    json_file.write(model.to_json())

# serialize weights to HDF5
model.save_weights("ds_ctc_model_weights.h5")

# # save data to .npz
# np.savez('xor_data.npz', training_data=training_data, target_data=target_data, test_data=test_data)


## 10. CONVERT

Converting is expected to fail here due to the Lambda layer required for the CTC function. 

In [105]:
import coremltools

from keras.models import model_from_json

In [166]:
# load json and create model
json_file = open('ds_ctc_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into loaded model
loaded_model.load_weights("ds_ctc_model_weights.h5")
print("Loaded model/weights from disk")

# it looks like this has worked. We can now convert to 
print("Convert Model")
try:
    coreml_model = coremltools.converters.keras.convert(loaded_model)
except Exception as e:
    print(e)
    

(?, 80)
(?, 778, 2048)
(?, 1)
(?, 1)
Loaded model/weights from disk
Convert Model
Keras layer '<class 'keras.layers.core.Lambda'>' not supported. 


## 11. REMOVE LAMBDA

We are required to remove the lambda layer and use a normal activation function. In theory it would be possible to build a graph that is identical to the tensorflow model and transfer the weights. We still want the majority of the graph to be converted to coreml though.

Things to try:

1. Use Keras pop layer to remove lambda 
2. Recreate model without Lambda



### 1. POP KERAS

In [167]:

### Doesn't work, must be an issue with the outputs. Instead we can transfer to new

# 1. Use Keras pop layer to remove lambda

# METHOD1 use pop and correct output

# print("Before")
# loaded_model.layers.pop()
# loaded_model.layers.pop()
# loaded_model.layers.pop()
# loaded_model.layers.pop()

# loaded_model.summary(line_length=80)

# # Get input
# new_input = loaded_model.input
# # Find the layer to connect
# hidden_layer = loaded_model.layers[-1].output
# # Connect a new layer on it
# out = Dense(fc_size, name='final_out')(hidden_layer)
# # Build a new model
# model3 = Model(new_input, out)
# # print("After")
# model3.summary(line_length=80)



In [169]:

# it looks like this has worked. We can now convert to 
print("Convert Model")

try:
    coreml_model = coremltools.converters.keras.convert(loaded_model)
    # Set model metadata
    coreml_model.author = 'Rob Smith'
    coreml_model.license = 'BSD'
    coreml_model.short_description = 'Performs keras ds '

    # SAVE
    coreml_model.save('kds.mlmodel')
    
except Exception as e:
    print(e)


Convert Model
Keras layer '<class 'keras.layers.core.Lambda'>' not supported. 


## 12. REBUILD WITHOUT LAMBDA

let's see if we get the same error building from scratch. Also copy over the weights.

In [210]:
# Network Params
fc_size = 2048
rnn_size = 512
mfcc_features = 26
max_mfcclength_audio = 778

X = np.zeros([batch_size, max_mfcclength_audio, mfcc_features])

# Creates a tensor there are always 26 MFCC
input_data = Input(name='the_input', shape=X.shape[1:]) # >>(?, 778, 26)

# First 3 FC layers
x = Dense(fc_size, name='fc1', activation='relu', 
          weights=loaded_model.layers[1].get_weights())(input_data) # >>(?, 778, 2048)
x = Dense(fc_size, name='fc2', activation='relu',
         weights=loaded_model.layers[2].get_weights())(x) # >>(?, 778, 2048)
x = Dense(fc_size, name='fc3', activation='relu',
         weights=loaded_model.layers[3].get_weights())(x) # >>(?, 778, 2048)

# Layer 4 BiDirectional RNN

rnn_1f = SimpleRNN(rnn_size, return_sequences=True, go_backwards=False, 
                   kernel_initializer='he_normal', name='rnn_f',
                   weights=loaded_model.layers[4].get_weights())(x) #>>(?, ?, 512)

rnn_1b = SimpleRNN(rnn_size, return_sequences=True, go_backwards=True, 
                   kernel_initializer='he_normal', name='rnn_b',
                   weights=loaded_model.layers[5].get_weights())(x) #>>(?, ?, 512)

rnn_merged = add([rnn_1f, rnn_1b],
                weights=loaded_model.layers[6].get_weights()) #>>(?, ?, 512)
x = Activation('relu', name='birelu',
              weights=loaded_model.layers[7].get_weights())(rnn_merged) #>>(?, ?, 512)

# Layer 5 FC Layer
y_pred = Dense(fc_size, name='fc5', activation='relu',
              weights=loaded_model.layers[8].get_weights())(x) #>>(?, 778, 2048)



In [211]:


#####################################

# Change shape 
labels = Input(name='the_labels', shape=[80], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# Keras doesn't currently support loss funcs with extra parameters
# so CTC loss is implemented in a lambda layer
# loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred,
#                                                                    labels, 
#                                                                    input_length, 
#                                                                    label_length])

# sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)

# model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)

# # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
# model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=sgd)


out = Dense(fc_size, name='final_out')(y_pred)
sgd = SGD(lr=0.02, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
model3 = Model(inputs=input_data, outputs=out)

model3.compile(loss='mean_squared_error',
               optimizer=sgd)




In [212]:
print("Convert Model")
coreml_model = coremltools.converters.keras.convert(model3)

# Set model metadata
coreml_model.author = 'Rob Smith'
coreml_model.license = 'BSD'
coreml_model.short_description = 'Performs keras ds '

# Set feature descriptions manually
#coreml_model.input_description['the_input'] = 'Audio input'
# coreml_model.input_description['bathrooms'] = 'Number of bathrooms'
# coreml_model.input_description['size'] = 'Size (in square feet)'

# Set the output descriptions
# coreml_model.output_description['out'] = 'Audio transcription'

# SAVE
coreml_model.save('kds.mlmodel')

Convert Model
0 : the_input, <keras.engine.topology.InputLayer object at 0x7f4f64b37d90>
1 : fc1, <keras.layers.core.Dense object at 0x7f4f64ac9f10>
2 : fc1__activation__, <keras.layers.core.Activation object at 0x7f4f66c385d0>
3 : fc2, <keras.layers.core.Dense object at 0x7f4f64ac9ed0>
4 : fc2__activation__, <keras.layers.core.Activation object at 0x7f4f64ac98d0>
5 : fc3, <keras.layers.core.Dense object at 0x7f4f66c38f50>
6 : fc3__activation__, <keras.layers.core.Activation object at 0x7f4f64ac9c50>
7 : rnn_f, <keras.layers.recurrent.SimpleRNN object at 0x7f4f64accf50>
8 : rnn_b, <keras.layers.recurrent.SimpleRNN object at 0x7f4f64a91e50>
9 : add_11, <keras.layers.merge.Add object at 0x7f4f64a69a50>
10 : birelu, <keras.layers.core.Activation object at 0x7f4f64a7bc90>
11 : fc5, <keras.layers.core.Dense object at 0x7f4f6292af50>
12 : fc5__activation__, <keras.layers.core.Activation object at 0x7f4f64ac9bd0>
13 : final_out, <keras.layers.core.Dense object at 0x7f4f64b37a90>


## CONCLUSION

So it finally was exported as a model however it doesn't have a CTC function. Next step will be to train on data with this model with the CTC function and see how it performs in iOS