In [84]:
import re
import gc
import os
import time
import datetime
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import signal
from scipy.io import wavfile
from scipy.fftpack import fft

from IPython.display import clear_output

import keras
from keras import optimizers
from keras.layers import Conv2D, BatchNormalization, MaxPooling2D, Dense, Input, Dropout, Flatten, Activation, Conv1D, LSTM
from keras.layers.advanced_activations import PReLU
from keras.initializers import Constant
from keras.initializers import he_normal, he_uniform
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.regularizers import l2
from keras.models import Sequential
from keras.callbacks import LearningRateScheduler

from glob import glob

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
# seems to reduce the chance of gpu errors, also doesn't blindly allocate all vram 

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        print(e)

In [3]:
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def list_wavs_fname(dirpath, ext='wav'):
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    pat = r'.+\\(\w+)\\\w+\.wav$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
    pat = r'.+\\(\w+\.wav)$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
    return labels, fnames

def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

In [4]:
L = 16000
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

#src folders
root_path = r'.\\data'
out_path = r'.'
model_path = r'.'
train_data_path = os.path.join(root_path, 'input', 'train', 'audio')
test_data_path = os.path.join(root_path, 'input', 'test', 'audio')

In [41]:
labels, fnames = list_wavs_fname(train_data_path)

new_sample_rate = 16000
y = []
x = []

for label, fname in zip(labels, fnames):
    if label not in legal_labels and label != '_background_noise_' and np.random.randint(10) > 0:
        continue
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    samples = pad_audio(samples)
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y.append(label)
        x.append(specgram)
x = np.array(x)
y = label_transform(y)
label_index = y.columns.values
y = y.values
y = np.array(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
del x, y
gc.collect()

.\\data\input\train\audio


  # Remove the CWD from sys.path while we load stuff.


33

In [133]:
# def cnn_lstm():

#     model = Sequential()
#     model.add(Conv1D(256, 10, strides=4, input_shape=(99, 161)))
#     model.add(BatchNormalization())
#     model.add(Activation('relu'))
#     model.add(Dropout(0.2))
        
#     model.add(LSTM(128, activation='relu', return_sequences=True, dropout=0.2))
#     model.add(LSTM(128, activation='relu', return_sequences=True, dropout=0.2))

#     # 1 fully connected layer DNN ReLu with default 20% dropout
#     model.add(Dense(64))
#     model.add(Activation('relu'))
#     model.add(Dropout(0.2))
    
#     # Output layer with softmax
#     model.add(Dense(12))
#     model.add(Activation('softmax'))
    
#     return model

# model = cnn_lstm()
# model.summary()

In [138]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Bidirectional, TimeDistributed, Conv1D, ZeroPadding1D, GRU
from tensorflow.keras.layers import Lambda, Input, Dropout, Masking, BatchNormalization, Activation
from tensorflow.keras.models import Model

def cnn_lstm():
    input_data = Input(shape=(99, 161))

    x = Conv1D(filters=256, kernel_size=10, strides=4)(input_data)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.2)(x)
        
    x = LSTM(128, activation='relu', return_sequences=True, dropout=0.2, recurrent_dropout=0.3)(x)
    x = LSTM(128, activation='relu', return_sequences=False, dropout=0.2, recurrent_dropout=0.4)(x)

    x = Dense(units=256, activation='relu', kernel_regularizer=l2(1e-6), kernel_initializer=he_uniform())(x)
    x = Dropout(0.5)(x)

    # Output layer with softmax
    y_pred = Dense(units=12, activation='softmax')(x)

    network_model = Model(inputs=input_data, outputs=y_pred)
    
    return network_model

K.clear_session()
model = cnn_lstm()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 99, 161)]         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 23, 256)           412416    
_________________________________________________________________
batch_normalization (BatchNo (None, 23, 256)           1024      
_________________________________________________________________
activation (Activation)      (None, 23, 256)           0         
_________________________________________________________________
dropout (Dropout)            (None, 23, 256)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 23, 128)           197120    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584

In [139]:
from tensorflow.keras.optimizers import Adam

opt = Adam(lr=1e-3, epsilon=1e-4)

model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

#Adam
def step_scheduler(epoch, lr):
    return 1e-3 - 2e-5*epoch

history = model.fit(x_train, y_train,
                    batch_size=128, epochs=50,
                    validation_data=(x_test, y_test),
                    callbacks=[LearningRateScheduler(step_scheduler, verbose=0)])

Train on 22392 samples, validate on 5598 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
Epoch 40/50
22392/22392 [==============================] - 12s 551us/sample - loss: 0.0936 - accuracy: 0.9691 - val_loss: 0.2047 - val_accuracy: 0.9489
Epoch 41/50
22392/22392 [==============================] - 12s 549us/sample - loss: 0.0863 - accuracy: 0.9713 - val_loss: 0.2121 - val_accuracy: 0.9471
Epoch 42/50
22392/22392 [==============================] - 12s 545us/sample - loss: 0.0867 - accuracy: 0.9717 - val_loss: 0.2081 - val_accuracy: 0.9496
Epoch 43/50
22392/22392 [==============================] - 12s 547us/sample - loss: 0.0805 - accuracy: 0.9727 - val_loss: 0.2127 - val_accuracy: 0.9496
Epoch 44/50
22392/22392 [==============================] - 12s 550us/sample - loss: 0.0747 - accuracy: 0.9753 - val_loss: 0.2151 - val_accuracy: 0.9478
Epoch 45/50
22392/22392 [==============================] - 12s 544us/sample - loss: 0.0779 - accuracy: 0.9740 - val_loss: 0.2125 - val_accuracy: 0.9505
Epoch 46/50
22392/22392 [==============================] - 12s 548us/sample - loss: 0.0735 - accuracy: 0.9758 - val_loss: 0.2147 - val_accuracy: 0.9512
Epoch 47/50
22392/22392 [==============================] - 12s 545us/sample - loss: 0.0742 - accuracy: 0.9737 - val_loss: 0.2159 - val_accuracy: 0.9514
Epoch 48/50
22392/22392 [==============================] - 12s 548us/sample - loss: 0.0689 - accuracy: 0.9773 - val_loss: 0.2066 - val_accuracy: 0.9512
Epoch 49/50
22392/22392 [==============================] - 12s 547us/sample - loss: 0.0679 - accuracy: 0.9764 - val_loss: 0.2133 - val_accuracy: 0.9518
Epoch 50/50
22392/22392 [==============================] - 12s 545us/sample - loss: 0.0676 - accuracy: 0.9770 - val_loss: 0.2113 - val_accuracy: 0.9516

In [140]:
import collections
collections.Counter(label_index[np.argmax(y_train, axis=1)])

Counter({'up': 1898,
         'off': 1890,
         'right': 1906,
         'yes': 1901,
         'go': 1888,
         'down': 1885,
         'left': 1885,
         'unknown': 3343,
         'no': 1909,
         'on': 1881,
         'stop': 1910,
         'silence': 96})

In [141]:
def test_data_generator(batch):
    fpaths = glob(os.path.join(test_data_path, '*wav'))
    i = 0
    for path in fpaths:
        if i == 0:
            imgs = []
            fnames = []
        i += 1
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        fnames.append(path.split('\\')[-1])
        if i == batch:
            i = 0
            imgs = np.array(imgs)
            yield fnames, imgs
    if i < batch:
        imgs = np.array(imgs)
        yield fnames, imgs
    raise StopIteration()

In [142]:
gc.collect()

index = []
results = []
t0 = datetime.datetime.now()
i = 1
b = 128

for fnames, imgs in test_data_generator(b):
    predicts = model.predict(imgs)
    predicts = np.argmax(predicts, axis=1)
    predicts = [label_index[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)
    delta = datetime.datetime.now() - t0
    clear_output(wait=True)
    print('%.2f%% %.1f seconds left' % (b*i/1585.38, delta.total_seconds()*(158538/b/i-1)))
    i+=1

100.03% -0.3 seconds left


RuntimeError: generator raised StopIteration

In [143]:
df = pd.DataFrame(columns=['fname', 'label'])
df['fname'] = index
df['label'] = results
df.to_csv(os.path.join(out_path, 'submission.csv'), index=False)

In [144]:
df['label'].value_counts()

unknown    67155
on         12437
right      11284
no          9058
go          8344
silence     8034
left        7624
up          7575
off         7521
down        6667
yes         6465
stop        6374
Name: label, dtype: int64