# TensorFlow Speech Recognition 

A Keras speech-recognition model: given a short speech clip, map it to a short text label.
Download the data from https://research.googleblog.com/2017/08/launching-speech-commands-dataset.html.

In [1]:
from pathlib import Path
import time

from scipy.io import wavfile
import numpy as np
import pandas as pd
from scipy import signal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

import keras

from keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Dense, Input, Dropout, Flatten
from keras.models import Model
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def load_data(datadir):
    '''Index the wav files below ``datadir`` and label each by its folder.

    Every file matching ``<datadir>/<folder>/<name>.wav`` becomes one row.
    The folder name is used as the word; any folder name that is not in the
    command list below (including ``_background_noise_``) is relabelled
    ``'unknown'``.

    Parameters
    ----------
    datadir : str or Path
        Directory whose immediate sub-folders contain the wav files.

    Returns
    -------
    pandas.DataFrame
        Columns ``path`` (file path as a string) and ``word`` (label).
    '''
    root = Path(datadir)

    # One (path, parent-folder) record per wav file, one directory level down.
    records = [(str(wav), wav.parts[-2]) for wav in root.glob('*/*.wav')]
    frame = pd.DataFrame(records, columns=['path', 'word'])

    # Labels that are kept verbatim; everything else collapses to 'unknown'.
    # NOTE(review): background-noise clips end up as 'unknown', not 'silence'
    # - confirm this is the intended labelling.
    kept_words = 'yes no up down left right on off stop go silence unknown'.split()
    frame.loc[~frame.word.isin(kept_words), 'word'] = 'unknown'

    return frame

def get_specgrams(paths, nsamples=16000):
    '''Read wav files and convert each one to a fixed-length spectrogram.

    Every clip is forced to exactly ``nsamples`` samples before the
    spectrogram is computed: shorter clips are zero-padded at the front,
    longer clips are cut off at the end.

    Parameters
    ----------
    paths : iterable of str
        Paths of the wav files to read.
    nsamples : int, optional
        Number of samples each clip is padded or truncated to.

    Returns
    -------
    list of numpy.ndarray
        One array per path, shaped like the spectrogram with a trailing axis
        of length 1 appended. With the default ``nsamples`` this is
        ``(129, 124, 1)``.
    '''
    specgrams = []
    for path in paths:
        # wavfile.read returns (sample_rate, samples); only the samples are used.
        wav = wavfile.read(path)[1]

        # Compare against nsamples (not a fixed 16000) so that a non-default
        # length never produces a negative pad width or an unpadded clip.
        if wav.size < nsamples:
            wav = np.pad(wav, (nsamples - wav.size, 0), mode='constant')
        else:
            wav = wav[:nsamples]

        spec = signal.spectrogram(wav, nperseg=256, noverlap=128)[2]

        # Append the trailing axis without hard-coding the spectrogram size,
        # which depends on nsamples.
        specgrams.append(spec[..., np.newaxis])

    return specgrams


def batch_generator(X, y, batch_size=16):
    '''Endlessly yield random ``(spectrograms, labels)`` batches.

    Parameters
    ----------
    X : numpy.ndarray
        Array of wav file paths.
    y : numpy.ndarray
        Labels aligned with ``X`` along the first axis.
    batch_size : int, optional
        Number of samples drawn for each batch.

    Yields
    ------
    tuple
        ``(batch, labels)`` where ``batch`` stacks the spectrograms returned
        by ``get_specgrams`` for the drawn paths and ``labels`` holds the
        matching rows of ``y``.
    '''
    n_items = X.shape[0]

    while True:
        # Indices are drawn independently, so a sample may repeat in a batch.
        picks = np.random.randint(0, n_items, batch_size)
        batch_paths, batch_labels = X[picks], y[picks]

        yield np.asarray(get_specgrams(batch_paths)), batch_labels


def build_model(shape, nclasses=11):
    '''Create a small convolutional Keras classifier.

    Layers, in order: batch normalisation, a 3x3 convolution with 16 filters
    (ELU), dropout, 2x2 max-pooling, flatten, a 32-unit dense layer (ELU),
    dropout, and a softmax output layer.

    Parameters
    ----------
    shape : tuple
        Shape of one input sample, passed straight to ``Input``.
    nclasses : int, optional
        Number of output units. The default of 11 is the ten command words
        plus 'unknown' (background noise is relabelled 'unknown' by
        ``load_data``, so there is no separate 'silence' class).

    Returns
    -------
    keras.models.Model
        The model, not yet compiled.
    '''
    inputlayer = Input(shape=shape)

    model = BatchNormalization()(inputlayer)
    model = Conv2D(16, (3, 3), activation='elu')(model)
    model = Dropout(0.25)(model)
    model = MaxPooling2D((2, 2))(model)

    model = Flatten()(model)
    model = Dense(32, activation='elu')(model)
    # Keras' Dropout rate is the fraction of units that is DROPPED.
    # NOTE(review): 0.8 discards 80% of this layer's activations - confirm
    # this was not meant as a keep-probability.
    model = Dropout(0.8)(model)

    # Each clip carries exactly one label, so the outputs should form a
    # probability distribution over the classes: softmax, not independent
    # per-class sigmoids.
    model = Dense(nclasses, activation='softmax')(model)

    model = Model(inputs=inputlayer, outputs=model)

    return model


# Index the labelled training clips (one row per wav file).
train = load_data('../train/audio/')

# Matches the arrays returned by get_specgrams at its default settings.
shape = (129, 124, 1)
model = build_model(shape)

# Every clip has exactly one label and the targets are one-hot rows, so this
# is a single-label multi-class problem: use categorical cross-entropy.
# With 'binary_crossentropy' Keras resolves metrics=['accuracy'] to per-output
# binary accuracy, which stays high even when the predicted class is wrong.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=['accuracy'])


In [3]:
# One-hot encode the word labels; the fitted binarizer is reused later to map
# predictions back to words.
labelbinarizer = LabelBinarizer()
X = train.path
y = labelbinarizer.fit_transform(train.word)

# Hold out 30% for validation, keeping the class proportions of the full set.
# A fixed random_state keeps the train/validation membership identical across
# kernel restarts, so results from different runs are comparable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, stratify=train.word, random_state=42)


In [4]:
# One batch size shared by both generators and by the step counts below.
BATCH_SIZE = 32

train_gen = batch_generator(Xtrain.values, ytrain, batch_size=BATCH_SIZE)
valid_gen = batch_generator(Xtest.values, ytest, batch_size=BATCH_SIZE)

# The generators never terminate, so the number of batches per epoch is set
# explicitly: as many full batches as each split contains samples.
train_steps = Xtrain.shape[0] // BATCH_SIZE
valid_steps = Xtest.shape[0] // BATCH_SIZE

model.fit_generator(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=1,
    validation_data=valid_gen,
    validation_steps=valid_steps)


Epoch 1/1





<keras.callbacks.History at 0x1cc9cd78da0>

In [5]:


# Index the unlabelled test clips; only the 'path' column is used here.
test = load_data('../test/')
test_paths = test.path.tolist()

# Predict in chunks instead of one file per model.predict call: far fewer
# calls into Keras, while only one chunk of spectrograms is held in memory.
PREDICT_CHUNK = 256
predictions = []
for start in range(0, len(test_paths), PREDICT_CHUNK):
    chunk_paths = test_paths[start:start + PREDICT_CHUNK]
    chunk_specgrams = np.array(get_specgrams(chunk_paths))
    predictions.extend(model.predict(chunk_specgrams))

# Map score rows back to words in one call. For multi-class labels
# LabelBinarizer picks the highest-scoring class, so no threshold is needed.
if predictions:
    labels = labelbinarizer.inverse_transform(np.array(predictions)).tolist()
else:
    labels = []
test['labels'] = labels

# Keep only the file name. Path(...).name works with both '/' and '\\'
# separators, unlike splitting on '/'.
test.path = test.path.apply(lambda p: Path(p).name)
submission = pd.DataFrame({'fname': test.path.tolist(), 'label': labels})

# Create the output folder first so a missing directory cannot discard the
# predictions after the long inference loop.
output_path = Path('../output/speech.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(str(output_path), index=False)