In [1]:
from __future__ import print_function
import time
from sklearn.utils import class_weight
import numpy as np # linear algebra
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, Convolution2D, Activation
from keras import backend as K
import numpy as np
from scipy import signal
import os
import random
from scipy.io import wavfile
from subprocess import check_output
import matplotlib.pyplot as plt
import librosa
import acoustics
from data_augment import augment_data
import gc

Using TensorFlow backend.


In [2]:
import time
import numpy as np
import librosa
import os
import random

def timer():
    """Create a stopwatch closure.

    Returns:
        A zero-argument function. Each call returns a string reporting the
        whole minutes/seconds elapsed since ``timer()`` was called and since
        the previous call of the returned function (or since ``timer()`` for
        the first call).
    """
    # A one-element list is used as a mutable cell so the closure can rebind
    # the lap start without needing ``nonlocal``.
    lap_start = [int(time.time())]
    overall_start = int(time.time())

    def time_taken():
        now = int(time.time())
        total_min, total_sec = divmod(now - overall_start, 60)
        lap_min, lap_sec = divmod(now - lap_start[0], 60)
        lap_start[0] = now
        pieces = [
            ' overall_time: ', str(total_min), 'm', str(total_sec), 's',
            ' time_from_previous_call: ', str(lap_min), 'm', str(lap_sec), 's',
        ]
        return ''.join(pieces)

    return time_taken


# Notebook-wide stopwatch shared by the cells below.
time_taken = timer()

# Sample rate (Hz) every clip is loaded at; the helpers below also use it as
# the number of samples in one clip.
sr = 16000

# Words that are all folded into the single 'unknown' class.
unknowns = [
    'bed', 'bird', 'cat', 'dog', 'eight', 'five', 'four', 'happy', 'house',
    'marvin', 'nine', 'one', 'seven', 'sheila', 'six', 'three', 'tree', 'two',
    'wow', 'zero',
]
# Words that each get a class of their own.
knowns = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
silence = ['silence']

# Final class order: the known words, then 'silence', then 'unknown'.
labels = list(knowns)
labels.extend(silence)
labels.append('unknown')
num_classes = len(labels)

# Map each class label to the paths of its .wav files. Known words keep their
# own label; every word in `unknowns` is pooled under the 'unknown' label.
LABEL_TO_FILE_NAMES = {}
folder_and_class = [(word, word) for word in knowns] + [(word, 'unknown') for word in unknowns]
for label, class_label in folder_and_class:
    for filename in os.listdir('../train/audio/' + label):
        if filename.endswith('.wav'):
            # The class entry is created lazily, i.e. only once a .wav file is found.
            LABEL_TO_FILE_NAMES.setdefault(class_label, []).append(
                '../train/audio/' + label + '/' + filename)


# Split every class into validation and training file lists.
#
# The split is made reproducible on purpose: files are sorted and then shuffled
# with a private, fixed-seed RNG, and classes are visited in sorted order.
# With an unseeded shuffle each kernel run draws a different split, so a model
# resumed from a checkpoint of an earlier run (as done in the training cell)
# would be validated on files it had presumably already been trained on.
# LABEL_TO_FILE_NAMES itself is left unmodified, so re-running this cell
# always yields the same split.
VALIDATION_LABEL_TO_FILE_NAMES = {}
TRAIN_LABEL_TO_FILE_NAMES = {}
_split_rng = np.random.RandomState(0)
for label in sorted(LABEL_TO_FILE_NAMES):
    filenames = sorted(LABEL_TO_FILE_NAMES[label])
    _split_rng.shuffle(filenames)
    # The pooled 'unknown' class holds out 4000 files, every other class 400.
    validation_count = 4000 if label == 'unknown' else 400
    VALIDATION_LABEL_TO_FILE_NAMES[label] = filenames[:validation_count]
    TRAIN_LABEL_TO_FILE_NAMES[label] = filenames[validation_count:]

# Decode every clip once up front so the batch generators never touch the disk.
# FILE_TO_SAMPLES maps path -> waveform, FILE_TO_LABEL maps path -> class label.
FILE_TO_SAMPLES, FILE_TO_LABEL = {}, {}
for label, filenames in LABEL_TO_FILE_NAMES.items():
    for filename in filenames:
        # librosa.load returns (waveform, sample_rate); only the waveform is kept.
        samples = librosa.load(filename, sr=sr)[0]
        FILE_TO_SAMPLES[filename], FILE_TO_LABEL[filename] = samples, label
        loaded_so_far = len(FILE_TO_LABEL)
        # Progress line every 5000 files.
        if not loaded_so_far % 5000:
            print(loaded_so_far, time_taken())
    

# One long Python list holding the samples of the four background recordings
# back to back; get_silence() cuts random windows out of it.
UNCOLORED_NOISES = []
for noise_path in (
        '../train/audio/_background_noise_/doing_the_dishes.wav',
        '../train/audio/_background_noise_/dude_miaowing.wav',
        '../train/audio/_background_noise_/exercise_bike.wav',
        '../train/audio/_background_noise_/running_tap.wav'):
    UNCOLORED_NOISES.extend(librosa.load(noise_path, sr=sr)[0].tolist())


def get_silence():
    """Return `sr` samples of float32 audio representing the 'silence' class.

    One of three sources is drawn at random:
      * 1%  - digital silence (all zeros),
      * 15% - a random window cut from UNCOLORED_NOISES,
      * 84% - generated pink or white noise, scaled down by a factor of 3.

    The length follows the module-level `sr` instead of a separate hard-coded
    16000, and every branch returns float32 (the all-zero branch previously
    returned float64).
    """
    choice = np.random.choice([0, 1, 2], p=[0.01, 0.15, 0.84])
    if choice == 0:
        return np.zeros(sr, dtype=np.float32)
    if choice == 1:
        # randint is inclusive on both ends, so the window always fits.
        idx = random.randint(0, len(UNCOLORED_NOISES) - sr)
        return np.array(UNCOLORED_NOISES[idx:idx + sr], dtype=np.float32)
    color = np.random.choice(['pink', 'white'])
    return np.array(acoustics.generator.noise(sr, color=color) / 3, np.float32)

def pad_zeros(samples, length=None):
    """Centre-pad a 1-D clip with zeros so that it has `length` samples.

    Args:
        samples: 1-D array of audio samples.
        length: target number of samples; defaults to the module-level `sr`.

    Returns:
        The padded array. When the missing amount is odd, the extra zero goes
        at the end. Clips that already have `length` samples or more are
        returned unchanged (they are not truncated).
    """
    if length is None:
        length = sr
    missing = length - len(samples)
    if missing <= 0:
        return samples
    left = missing // 2
    # np.pad is the public API; the np.lib.pad alias used before is not part of
    # it and is no longer available in NumPy 2.x.
    return np.pad(samples, (left, missing - left), 'constant', constant_values=(0, 0))

def pitch_shift(samples, sr=sr):
    """Shift the pitch of `samples` upwards by a random whole number of steps.

    The number of steps is drawn uniformly from 1..5 (inclusive) and handed to
    librosa.effects.pitch_shift as `n_steps`, together with the sample rate `sr`.
    """
    steps = random.randint(1, 5)
    shifted = librosa.effects.pitch_shift(samples, sr=sr, n_steps=steps)
    return shifted

def get_shuffled_XY(X, Y):
    """Shuffle a batch along its first axis, keeping X and Y rows aligned.

    Args:
        X: 4-D array (it is indexed with four subscripts).
        Y: 2-D array with the same first dimension as X.

    Returns:
        (shuffled_X, shuffled_Y): copies of X and Y reordered by one shared
        random permutation of the first axis.
    """
    order = np.random.permutation(X.shape[0]).tolist()
    return X[order, :, :, :], Y[order, :]

def time_shift(arr, min_fraction=0.0, max_fraction=0.2):
    """Shift a clip in time by a random amount, filling the gap with zeros.

    A fraction is drawn uniformly from [min_fraction, max_fraction) and
    multiplied by len(arr) (truncated towards zero) to get the shift in
    samples. A positive shift delays the clip (zeros at the start), a negative
    shift advances it (zeros at the end).

    With the defaults the drawn fraction is never negative, so clips are only
    ever delayed - exactly the behaviour of the previous hard-coded
    uniform(0, 0.2), whose left-shift branch could never run. Pass a negative
    `min_fraction` (e.g. -0.2) to get shifts in both directions.

    Args:
        arr: 1-D array of samples.
        min_fraction: lower bound of the shift, as a fraction of len(arr).
        max_fraction: upper bound of the shift, as a fraction of len(arr).

    Returns:
        A new array of the same shape and dtype; `arr` is not modified.
    """
    num = int(np.random.uniform(min_fraction, max_fraction) * len(arr))
    if num == 0:
        return np.array(arr)
    result = np.zeros_like(arr)
    if num > 0:
        result[num:] = arr[:-num]
    else:
        result[:num] = arr[-num:]
    return result
        
def flip_transform(wave):
    """Invert the polarity of `wave` with probability 0.5.

    Returns:
        `-wave` when the coin flip comes up 1, otherwise `wave` unchanged.
        (Previously the unflipped case fell off the end of the function and
        returned None.)
    """
    if np.random.choice([0, 1]):
        return -wave
    return wave

def noise_mix(wave):
    """Blend a small random amount of background noise into `wave`.

    A noise clip is taken from get_silence() and a mixing weight is drawn
    uniformly from [0, 0.1]; the result is the convex combination
    (1 - weight) * wave + weight * noise.
    """
    background = get_silence()
    weight = random.uniform(0, 0.1)
    return (1 - weight) * wave + weight * background

def get_melspectrogram(samples):
    """Turn a waveform into a 3-channel log-mel spectrogram "image".

    Args:
        samples: 1-D waveform sampled at the module-level `sr`.

    Returns:
        Array of shape (1, 72, n_frames, 3): a leading batch axis, 72 mel
        bands, one column per hop of 223 samples, and the single dB channel
        repeated three times along the last axis. The batch generators store
        it into (72, 72, 3) slots, i.e. they expect n_frames == 72.
    """
    # The waveform is passed as `y=`: recent librosa releases accept it only
    # as a keyword argument, and older releases accept the keyword as well.
    S = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=72, hop_length=223, n_fft=512)
    # dB scale relative to the clip's own peak power.
    spec = librosa.power_to_db(S, ref=np.max)
    spec = np.repeat(spec[np.newaxis, :, :, np.newaxis], 3, axis=3)
    return spec

def get_transformed_samples(samples):
    """Apply the augmentation pipeline to one clip.

    Steps, in order: zero-pad (pad_zeros), random time shift (time_shift),
    background-noise mix (noise_mix), polarity inversion with probability 0.5,
    and scaling to unit standard deviation.

    Returns:
        The augmented waveform.
    """
    samples = pad_zeros(samples)
    samples = time_shift(samples)
    samples = noise_mix(samples)
    if np.random.choice([0, 1]):
        samples = -samples
    stdx = np.std(samples)
    # Skip the division for an all-constant clip (std == 0).
    if stdx:
        # The result used to be assigned to a misspelled name ("sampels"), so
        # the normalisation silently never happened.
        # NOTE(review): checkpoints saved before this fix saw un-normalised
        # audio; get_melspectrogram references dB to the clip's own peak, so
        # the effect on the model input is presumably small - confirm.
        samples = samples / stdx
    return samples

5000  overall_time: 0m2s time_from_previous_call: 0m2s
10000  overall_time: 0m4s time_from_previous_call: 0m2s
15000  overall_time: 0m6s time_from_previous_call: 0m2s
20000  overall_time: 0m8s time_from_previous_call: 0m2s
25000  overall_time: 0m9s time_from_previous_call: 0m1s
30000  overall_time: 0m11s time_from_previous_call: 0m2s
35000  overall_time: 0m13s time_from_previous_call: 0m2s
40000  overall_time: 0m15s time_from_previous_call: 0m2s
45000  overall_time: 0m17s time_from_previous_call: 0m2s
50000  overall_time: 0m19s time_from_previous_call: 0m2s
55000  overall_time: 0m21s time_from_previous_call: 0m2s
60000  overall_time: 0m23s time_from_previous_call: 0m2s


In [3]:
# Show the final class order; the generators use a label's position in this
# list (labels.index) as its one-hot class index.
print(labels)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']


In [4]:
# Sanity check: number of files per class in each split.
for heading, label_to_files in (
        ('printing train label to filename lengths', TRAIN_LABEL_TO_FILE_NAMES),
        ('printing validation label to filename lengths', VALIDATION_LABEL_TO_FILE_NAMES)):
    print(heading)
    for label, filenames in label_to_files.items():
        print(label, len(filenames))

printing train label to filename lengths
on 1967
off 1957
down 1959
up 1975
no 1975
stop 1980
left 1953
right 1967
go 1972
yes 1977
unknown 37039
printing validation label to filename lengths
on 400
off 400
down 400
up 400
no 400
stop 400
left 400
right 400
go 400
yes 400
unknown 4000


In [5]:
# Elapsed time so far (overall, and since the previous time_taken() call).
print(time_taken())

 overall_time: 0m43s time_from_previous_call: 0m20s


In [6]:
def _batch_generator(label_to_file_names, batch_size=256):
    """Yield (X, Y) batches forever, sampling classes uniformly.

    For every sample a class is drawn uniformly from `labels`. 'silence'
    samples come straight from get_silence(); all other classes pick a random
    file of that class from `label_to_file_names` and run it through
    get_transformed_samples(). Each waveform is converted with
    get_melspectrogram().

    Args:
        label_to_file_names: dict mapping class label -> list of file paths
            (keys of FILE_TO_SAMPLES).
        batch_size: number of samples per yielded batch.

    Yields:
        X of shape (batch_size, 72, 72, 3) and one-hot Y of shape
        (batch_size, num_classes).
    """
    while True:
        # Allocate fresh arrays for every batch. fit_generator prefetches
        # batches on a background thread, so yielding the same buffers and
        # then overwriting them in place would modify batches that are still
        # queued or being trained on.
        X = np.zeros((batch_size, 72, 72, 3))
        Y = np.zeros((batch_size, num_classes))
        for idx in range(batch_size):
            label_idx = np.random.randint(num_classes)
            label = labels[label_idx]
            if label == 'silence':
                samples = get_silence()
            else:
                # Draw an index instead of np.random.choice(list): the latter
                # converts the whole (large) path list to an array per sample.
                file_names = label_to_file_names[label]
                file_name = file_names[np.random.randint(len(file_names))]
                samples = get_transformed_samples(FILE_TO_SAMPLES[file_name])
            # get_melspectrogram returns a leading batch axis of size 1.
            X[idx] = get_melspectrogram(samples)[0]
            Y[idx, label_idx] = 1
        yield X, Y


def validation_data_generator(batch_size=256):
    """Endless batch generator over the validation split.

    NOTE(review): validation samples are drawn at random and go through the
    same random augmentation as training data, so validation metrics vary
    between evaluations - confirm this is intended.
    """
    return _batch_generator(VALIDATION_LABEL_TO_FILE_NAMES, batch_size)


def train_data_generator(batch_size=256):
    """Endless batch generator over the training split."""
    return _batch_generator(TRAIN_LABEL_TO_FILE_NAMES, batch_size)

In [None]:
epochs = 200

# From-scratch setup, kept for reference; this cell resumes from a saved checkpoint instead.
# model = keras.applications.xception.Xception(include_top=True, weights=None, input_tensor=None, input_shape=None, pooling=None, classes=num_classes)
# model.compile(loss=keras.losses.categorical_crossentropy,
#               optimizer=keras.optimizers.Adam(),
#               metrics=['accuracy'])
model = load_model('model-xception-resumed-07-0.97-0.13.h5')

# Save the full model whenever validation accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    'model-xception-resumed2-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.h5',
    monitor='val_acc', mode='max',
    save_best_only=True, save_weights_only=False,
    verbose=1)
# NOTE(review): earlystopping is constructed but is not part of callback_list,
# so training always runs for the full `epochs` - confirm this is intentional.
earlystopping = EarlyStopping(monitor='val_acc', patience=5)
# Multiply the learning rate by 0.2 after 4 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4,
                              min_lr=1e-8, verbose=1)
callback_list = [checkpoint, reduce_lr]

train_generator = train_data_generator()
validation_generator = validation_data_generator()

# One "epoch" is 200 training batches followed by 20 validation batches.
model.fit_generator(
    train_generator,
    steps_per_epoch=200,
    epochs=epochs,
    verbose=1,
    callbacks=callback_list,
    validation_data=validation_generator,
    validation_steps=20)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200

Epoch 00007: reducing learning rate to 1.6000001778593287e-06.
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200

Epoch 00011: reducing learning rate to 3.200000264769187e-07.
Epoch 12/200

In [None]:
# Elapsed time once training has returned (overall, and since the previous time_taken() call).
print(time_taken())