In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [23]:
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Input, Conv2D, BatchNormalization, Concatenate
from keras.layers import merge, concatenate
from keras.layers import MaxPool2D, MaxPooling2D, Reshape, Dropout, SeparableConv2D
from keras.models import Model
from keras.utils import to_categorical
import os

# matplotlib for displaying the output
import matplotlib.pyplot as plt
import matplotlib.style as ms
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-muted') — confirm the installed version accepts this name.
ms.use('seaborn-muted')
# Render figures inline in the notebook output.
%matplotlib inline
from WavDataLoader import WavDataLoader
from WavDataGenerator import WavDataGenerator

In [24]:
# Class vocabulary in alphabetical order; presumably a sample's integer target
# is its position in this list (verify against WavDataLoader).
labels = [
    'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four',
    'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off',
    'on', 'one', 'right', 'seven', 'sheila', 'silence', 'six', 'stop',
    'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero',
]
# Number of classes; used as the width of the softmax output layer.
num_labels = len(labels)

In [25]:
# Pick the training-audio directory for the current platform.
# Compare with `==`: `is` tests object identity, which only works for strings
# by accident of interning and raises a SyntaxWarning on Python 3.8+.
if os.name == 'nt':
    data_dir = r'C:\Development\kaggle\tensorflow-speech-recognition-challenge\data\train\audio'
else:
    data_dir = r'/home/shaur141/Development/kaggle/tensorflow-speech-recognition-challenge/data/train/audio'

# Loader for the audio under data_dir, restricted to the known label set.
wav_data_loader = WavDataLoader(data_dir, labels)


In [49]:
def build_model(num_classes=None, logmel_shape=(128, 32, 1), mfcc_shape=(40, 32, 1)):
    """Build and compile the two-branch CNN classifier.

    Each input (log-mel spectrogram and MFCC) goes through its own stack of
    three Conv2D stages, each followed by max pooling and dropout, and is then
    flattened and projected to a 256-unit dense layer. The two 256-unit
    vectors are concatenated and passed through a 128-unit dense layer and a
    softmax output layer.

    Args:
        num_classes: Width of the softmax output. Defaults to the
            notebook-level ``num_labels`` when omitted.
        logmel_shape: Shape (without batch axis) of the 'log_melspectrogram'
            input.
        mfcc_shape: Shape (without batch axis) of the 'mfcc' input.

    Returns:
        A Keras ``Model`` taking ``[log_melspectrogram, mfcc]`` as inputs,
        compiled with the Nadam optimizer, categorical cross-entropy loss and
        the accuracy metric.
    """
    if num_classes is None:
        # Preserve the original behaviour of sizing the output from the label list.
        num_classes = num_labels

    # Layers are created in the same order as before so that Keras'
    # auto-generated layer names stay stable.

    # --- log-mel spectrogram branch ---
    log_melspectrogram_inputs = Input(shape=logmel_shape, name='log_melspectrogram')
    x_logmel = Conv2D(16, (5, 5), strides=(1, 1), activation='relu')(log_melspectrogram_inputs)
    x_logmel = MaxPool2D(strides=(1, 1))(x_logmel)
    x_logmel = Dropout(0.25)(x_logmel)
    x_logmel = Conv2D(32, (3, 3), strides=(2, 2), activation='relu')(x_logmel)
    x_logmel = BatchNormalization()(x_logmel)
    x_logmel = MaxPool2D(strides=(1, 1))(x_logmel)
    x_logmel = Dropout(0.25)(x_logmel)
    x_logmel = Conv2D(64, (3, 3), strides=(1, 1), activation='relu')(x_logmel)
    x_logmel = MaxPool2D(strides=(2, 2))(x_logmel)
    x_logmel = Dropout(0.25)(x_logmel)
    # Flatten the feature maps to one vector per example.
    x_logmel = Reshape((-1,))(x_logmel)
    x_logmel = Dense(256, activation='relu')(x_logmel)

    # --- MFCC branch ---
    mfcc_inputs = Input(shape=mfcc_shape, name='mfcc')
    x_mfcc = Conv2D(16, (3, 3), strides=(1, 1), activation='relu')(mfcc_inputs)
    x_mfcc = MaxPool2D(strides=(1, 1))(x_mfcc)
    x_mfcc = Dropout(0.25)(x_mfcc)
    x_mfcc = Conv2D(32, (3, 3), strides=(2, 2), activation='relu')(x_mfcc)
    x_mfcc = BatchNormalization()(x_mfcc)
    x_mfcc = MaxPool2D(strides=(1, 1))(x_mfcc)
    x_mfcc = Dropout(0.25)(x_mfcc)
    x_mfcc = Conv2D(64, (3, 3), strides=(2, 2), activation='relu')(x_mfcc)
    x_mfcc = MaxPool2D(strides=(1, 1))(x_mfcc)
    x_mfcc = Dropout(0.25)(x_mfcc)
    # Flatten the feature maps to one vector per example.
    x_mfcc = Reshape((-1,))(x_mfcc)
    x_mfcc = Dense(256, activation='relu')(x_mfcc)

    # --- fusion head ---
    x = concatenate([x_logmel, x_mfcc])
    x = Dense(128, activation='relu')(x)
    predictions = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=[log_melspectrogram_inputs, mfcc_inputs], outputs=predictions)
    model.compile(optimizer='Nadam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [50]:
# Instantiate the compiled two-input network; re-run this cell to start again
# from freshly initialised weights.
model = build_model()

In [51]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
log_melspectrogram (InputLayer)  (None, 128, 32, 1)    0                                            
____________________________________________________________________________________________________
mfcc (InputLayer)                (None, 40, 32, 1)     0                                            
____________________________________________________________________________________________________
conv2d_40 (Conv2D)               (None, 124, 28, 16)   416         log_melspectrogram[0][0]         
____________________________________________________________________________________________________
conv2d_43 (Conv2D)               (None, 38, 30, 16)    160         mfcc[0][0]                       
___________________________________________________________________________________________

In [52]:
# Train on both feature views at once. The list order must match the order of
# `inputs` given to Model(...) in build_model: log-mel first, then MFCC.
# num_classes is pinned to num_labels so the one-hot target width always equals
# the softmax width; without it to_categorical infers the width from max(y) + 1,
# which is too narrow whenever the highest label index is absent from the data.
# NOTE(review): validation_split holds out the trailing 15% of the arrays as
# given — confirm WavDataLoader shuffles, otherwise validation may cover only
# a few classes.
model.fit([wav_data_loader.data['log_melspectrogram'], wav_data_loader.data['mfcc']],
          y=to_categorical(wav_data_loader.y, num_classes=num_labels),
          validation_split=0.15, epochs=20)

# Generator-based alternative (currently disabled; `wav_data_generator` is not
# constructed anywhere in the cells shown):
# model.fit_generator(wav_data_generator.generator(),
#                     steps_per_epoch=wav_data_generator.num_examples//wav_data_generator.batch_size,
#                    workers=4)
                    

Train on 658 samples, validate on 117 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

In [21]:
# Persist the model to 'model.h5' (relative to the kernel's working directory).
# NOTE(review): the training run shown above ended with KeyboardInterrupt, so
# the saved model may reflect a partial run — confirm before reusing the file.
model.save('model.h5')