In [1]:
from __future__ import print_function
import time
from sklearn.utils import class_weight
import numpy as np # linear algebra
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, Convolution2D, Activation
from keras import backend as K
import numpy as np
from scipy import signal
import os
import random
from scipy.io import wavfile
from subprocess import check_output
import matplotlib.pyplot as plt
import librosa
import acoustics
import gc

Using TensorFlow backend.


In [2]:
import time
import numpy as np
import librosa
import os
import random

def timer():
    """Create a stopwatch closure.

    Returns:
        A zero-argument function. Each call returns a string with the whole
        minutes and seconds elapsed since ``timer()`` was called, and since
        the closure itself was last called (since creation on the first call).
        Calling it also resets the "previous call" mark.
    """
    # A dict is used as a mutable cell so the closure can update the lap mark
    # without ``nonlocal``.
    marks = {'lap': int(time.time())}
    marks['overall'] = int(time.time())

    def time_taken():
        now = int(time.time())
        total_min, total_sec = divmod(now - marks['overall'], 60)
        lap_min, lap_sec = divmod(now - marks['lap'], 60)
        marks['lap'] = now
        pieces = [
            ' overall_time: ', str(total_min), 'm', str(total_sec), 's',
            ' time_from_previous_call: ', str(lap_min), 'm', str(lap_sec), 's',
        ]
        return ''.join(pieces)

    return time_taken


# Notebook-wide stopwatch; started when this cell runs.
time_taken = timer()

# Sampling rate (Hz) that every clip is loaded at.
sr = 16000

# The ten command words; each one is its own class.
knowns = 'yes no up down left right on off stop go'.split()
# The remaining spoken words. This list is not referenced elsewhere in this
# section; presumably these are the words folded into 'unknown' -- TODO confirm.
unknowns = 'bed bird cat dog eight five four happy house marvin nine one seven sheila six three tree two wow zero'.split()
silence = 'silence'.split()

# Order matters: a label's position in this list is its class index.
labels = knowns + silence + ['unknown']
num_classes = len(labels)

# Indexes built from the two split CSVs:
#   LABEL_TO_FILE_NAMES            label -> file names from both splits
#   TRAIN_LABEL_TO_FILE_NAMES      label -> file names from train-88.csv
#   VALIDATION_LABEL_TO_FILE_NAMES label -> file names from validation-88.csv
#   FILE_TO_LABEL                  file name -> label
LABEL_TO_FILE_NAMES = {}
VALIDATION_LABEL_TO_FILE_NAMES = {}
TRAIN_LABEL_TO_FILE_NAMES = {}
FILE_TO_LABEL = {}

# Each row is read as "<file name>,<label>"; the train split is processed first.
for csv_path, split_index in (('train-88.csv', TRAIN_LABEL_TO_FILE_NAMES),
                              ('validation-88.csv', VALIDATION_LABEL_TO_FILE_NAMES)):
    with open(csv_path) as f:
        for row in f:
            fields = row.strip().split(',')
            file_name, label = fields[0], fields[1]
            LABEL_TO_FILE_NAMES.setdefault(label, []).append(file_name)
            split_index.setdefault(label, []).append(file_name)
            FILE_TO_LABEL[file_name] = label


def file_to_sample(filename):
    """Load ``filename`` with librosa at the global rate ``sr``.

    Returns only the sample array; the sampling rate that ``librosa.load``
    also returns is discarded.
    """
    return librosa.load(filename, sr=sr)[0]
    

# Pool of recorded background noise: the four recordings below are loaded at
# ``sr`` and concatenated, in this order, into one flat Python list of samples.
UNCOLORED_NOISES = []
for noise_path in (
    '../train/audio/_background_noise_/doing_the_dishes.wav',
    '../train/audio/_background_noise_/dude_miaowing.wav',
    '../train/audio/_background_noise_/exercise_bike.wav',
    '../train/audio/_background_noise_/running_tap.wav',
):
    noise_samples, _ = librosa.load(noise_path, sr=sr)
    UNCOLORED_NOISES.extend(noise_samples.tolist())


def get_silence():
    """Return one clip to be used as a 'silence' example (or as mix-in noise).

    The source is picked at random:
      *  1% -- all zeros (``sr`` samples)
      * 10% -- a random ``sr``-sample window of ``UNCOLORED_NOISES``
      * 60% -- a random file listed under ``LABEL_TO_FILE_NAMES['silence']``
      * 29% -- generated pink or white noise (``sr`` samples), scaled by 1/3

    Returns:
        1-D numpy array of samples.
    """
    # The draw used to be from [0, 1, 2, 4] while the branch below tested for
    # 3, so the generated-noise branch could never run and its 29% silently
    # fell through to the silence files.
    choice = np.random.choice([0, 1, 2, 3], p=[0.01, 0.10, 0.60, 0.29])
    if choice == 0:
        return np.zeros(sr)
    if choice == 1:
        # random.randint is inclusive at both ends, so the window always fits.
        idx = random.randint(0, len(UNCOLORED_NOISES) - sr)
        return np.array(UNCOLORED_NOISES[idx:idx + sr], dtype=np.float32)
    if choice == 3:
        color = np.random.choice(['pink', 'white'])
        return np.array(acoustics.generator.noise(sr, color=color) / 3, np.float32)
    random_silence_file = np.random.choice(LABEL_TO_FILE_NAMES['silence'])
    return file_to_sample(random_silence_file)

def pad_zeros(samples, target_len=None):
    """Zero-pad a 1-D clip on both sides up to ``target_len`` samples.

    Args:
        samples: 1-D array of audio samples.
        target_len: desired minimum length; defaults to the global ``sr``
            (i.e. one second of audio).

    Returns:
        ``samples`` unchanged (same object) when it is already at least
        ``target_len`` long -- longer clips are NOT truncated. Otherwise a new
        array of exactly ``target_len`` samples with the clip centred; when
        the missing amount is odd, the extra zero goes on the right.
    """
    if target_len is None:
        target_len = sr
    missing = target_len - len(samples)
    if missing > 0:
        left = missing // 2
        # np.pad is the public API; the np.lib.pad alias is not available in
        # NumPy 2.x.
        samples = np.pad(samples, (left, missing - left), 'constant', constant_values=(0, 0))
    return samples

def pitch_shift(samples, sr=sr):
    """Shift the pitch of ``samples`` up by a random whole number of steps.

    The step count is drawn uniformly from 1..5 (inclusive) and passed to
    ``librosa.effects.pitch_shift`` as ``n_steps``. ``sr`` defaults to the
    value the global ``sr`` had when this function was defined.
    """
    n_steps = random.randint(1, 5)
    return librosa.effects.pitch_shift(samples, sr=sr, n_steps=n_steps)

def get_shuffled_XY(X, Y):
    """Shuffle ``X`` and ``Y`` along axis 0 with one shared random permutation.

    Args:
        X: 4-D array; the first axis indexes examples.
        Y: 2-D array with one row per example of ``X``.

    Returns:
        ``(shuffled_X, shuffled_Y)`` as new arrays; row ``i`` of both outputs
        comes from the same original example.
    """
    order = np.random.permutation(X.shape[0]).tolist()
    return X[order, :, :, :], Y[order, :]

def time_shift(arr):
    """Shift ``arr`` by a random offset, zero-filling the vacated samples.

    The offset is ``int(np.random.uniform(0, 0.2) * len(arr))``. A positive
    offset delays the signal: leading zeros are inserted and the tail is
    dropped. A zero offset returns a copy of the input. The negative-offset
    branch is kept for completeness, though the draw above never produces a
    negative value.

    Returns:
        A new array with the same shape and dtype as ``arr``.
    """
    offset = int(np.random.uniform(0, 0.2) * len(arr))
    if offset == 0:
        return np.array(arr)

    shifted = np.zeros_like(arr)
    if offset > 0:
        shifted[offset:] = arr[:-offset]
    else:
        shifted[:offset] = arr[-offset:]
    return np.array(shifted)
        
def flip_transform(wave):
    """Negate ``wave`` (invert its polarity) with probability 0.5.

    Returns:
        ``-wave`` on a heads draw, otherwise ``wave`` itself. The unflipped
        case used to fall off the end of the function and return ``None``.
    """
    if np.random.choice([0, 1]):
        return -wave
    return wave

def noise_mix(wave):
    """Blend a random background clip into ``wave``.

    With probability 0.10 the input is returned untouched. Otherwise a clip
    from ``get_silence()`` is mixed in with a weight ``w`` drawn uniformly
    from [0, 0.1]: ``(1 - w) * wave + w * noise``.
    """
    keep_clean = np.random.random() < 0.10
    if keep_clean:
        return wave
    background = get_silence()
    weight = random.uniform(0, 0.1)
    return (1 - weight) * wave + weight * background

# 72 * 72
def get_melspectrogram(samples):
    """Compute a 72-band mel spectrogram in dB as a 3-channel, batch-of-one array.

    Args:
        samples: 1-D audio signal sampled at the global ``sr``.

    Returns:
        Array of shape ``(1, 72, frames, 3)``: the dB-scaled mel spectrogram
        (relative to its own maximum) with a leading batch axis, copied into
        three identical channels. Per the note above this function, a
        16000-sample clip gives a 72 x 72 spectrogram.
    """
    # The signal must be passed as ``y=``: recent librosa releases accept the
    # audio as a keyword argument only, and older ones accept the keyword too.
    S = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=72, hop_length=223, n_fft=512)
    spec = librosa.power_to_db(S, ref=np.max)
    spec = np.repeat(spec[np.newaxis, :, :, np.newaxis], 3, axis=3)
    return spec

# 150 * 150
def get_mel_of_150_150(samples):
    """Compute a 150-band mel spectrogram in dB as a 3-channel, batch-of-one array.

    Args:
        samples: 1-D audio signal sampled at the global ``sr``.

    Returns:
        Array of shape ``(1, 150, frames, 3)``: the dB-scaled mel spectrogram
        (relative to its own maximum) with a leading batch axis, copied into
        three identical channels. Per the note above this function, a
        16000-sample clip gives a 150 x 150 spectrogram.
    """
    # The signal must be passed as ``y=``: recent librosa releases accept the
    # audio as a keyword argument only, and older ones accept the keyword too.
    S = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=150, hop_length=107, n_fft=512)
    spec = librosa.power_to_db(S, ref=np.max)
    spec = np.repeat(spec[np.newaxis, :, :, np.newaxis], 3, axis=3)
    return spec

# 197 * 161
def log_specgram(audio, sample_rate=16000, window_size=20,
                 step_size=15, eps=1e-10):
    """Log-power spectrogram as a 3-channel, batch-of-one array.

    Args:
        audio: 1-D signal.
        sample_rate: sampling rate in Hz.
        window_size: Hann window length in milliseconds.
        step_size: milliseconds handed to ``signal.spectrogram`` as
            ``noverlap`` (so, despite the name, this is the overlap between
            consecutive windows).
        eps: added before taking the log so that zero power stays finite.

    Returns:
        float32 array of shape ``(1, time_frames, freq_bins, 3)`` with three
        identical channels. With the defaults and a 16000-sample input this
        is ``(1, 197, 161, 3)``.
    """
    def _ms_to_samples(ms):
        return int(round(ms * sample_rate / 1e3))

    _, _, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=_ms_to_samples(window_size),
        noverlap=_ms_to_samples(step_size),
        detrend=False,
    )
    # Transpose to (time, frequency) before taking the log.
    log_power = np.log(power.T.astype(np.float32) + eps)
    return np.repeat(log_power[np.newaxis, :, :, np.newaxis], 3, axis=3)

def get_transformed_samples(samples):
    """Apply the random augmentation chain to one clip.

    Steps, in order: zero-pad up to ``sr`` samples, random time shift, random
    background-noise mix, polarity flip with probability 0.5, then scaling to
    unit standard deviation (skipped when the clip is constant).

    Returns:
        The augmented clip as a 1-D array.
    """
    samples = pad_zeros(samples)
    samples = time_shift(samples)
    samples = noise_mix(samples)
    # The polarity flip is done inline rather than through a helper.
    if np.random.choice([0, 1]):
        samples = -samples
    stdx = np.std(samples)
    if stdx:
        # The result used to be assigned to a misspelled name ("sampels"), so
        # the normalisation was computed and then thrown away.
        samples = samples / stdx
    return samples

In [3]:
# Show the class order; a label's position in this list is its class index.
print(labels)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']


In [4]:
# Per-label file counts for each split, to eyeball class balance.
for heading, split_index in (
    ('printing train label to filename lengths', TRAIN_LABEL_TO_FILE_NAMES),
    ('printing validation label to filename lengths', VALIDATION_LABEL_TO_FILE_NAMES),
):
    print(heading)
    for label, filenames in split_index.items():
        print(label, len(filenames))

printing train label to filename lengths
stop 6733
unknown 70315
down 5355
on 6561
left 6552
right 6811
go 5331
no 6248
yes 6221
off 6456
up 5944
silence 6591
printing validation label to filename lengths
stop 748
unknown 7812
down 594
on 729
left 728
right 756
go 592
no 694
yes 691
off 717
up 660
silence 732


In [5]:
# Report elapsed time so far; calling time_taken() also resets its
# "time_from_previous_call" mark.
print(time_taken())

 overall_time: 0m14s time_from_previous_call: 0m14s


In [7]:
batch_size = 128


def _batch_generator(label_to_file_names):
    """Yield an endless stream of ``(X, Y)`` batches drawn from one split.

    For every example a label is drawn uniformly from ``labels`` (so classes
    are balanced regardless of how many files each has). 'silence' examples
    come from ``get_silence()``; any other label picks a random file from
    ``label_to_file_names[label]`` and runs it through the random augmentation
    chain. Each clip is converted with ``get_mel_of_150_150``.

    Args:
        label_to_file_names: dict mapping label -> list of audio file names.

    Yields:
        ``X`` of shape ``(batch_size, 150, 150, 3)`` and one-hot ``Y`` of shape
        ``(batch_size, num_classes)``.
    """
    while True:
        # Allocate fresh arrays for every batch. Keras prefetches batches from
        # a background thread into a queue, so re-filling arrays that were
        # already yielded would overwrite batches still waiting to be used.
        X = np.zeros((batch_size, 150, 150, 3))
        Y = np.zeros((batch_size, num_classes))
        for idx in range(batch_size):
            random_label = np.random.choice(labels)
            if random_label == 'silence':
                samples = get_silence()
            else:
                random_filename = np.random.choice(label_to_file_names[random_label])
                samples = get_transformed_samples(file_to_sample(random_filename))
            # The spectrogram carries a leading batch axis of size 1, which
            # numpy drops when assigning into one row of X.
            X[idx, :, :] = get_mel_of_150_150(samples)
            Y[idx, labels.index(random_label)] = 1.0
        yield X, Y


def validation_data_generator():
    """Endless batch generator over the validation split.

    Note that validation clips go through the same random augmentation as
    training clips.
    """
    return _batch_generator(VALIDATION_LABEL_TO_FILE_NAMES)


def train_data_generator():
    """Endless batch generator over the training split."""
    return _batch_generator(TRAIN_LABEL_TO_FILE_NAMES)

In [None]:
# Continue training a previously saved model on the generated batches.
epochs = 200

# From-scratch alternatives, kept for reference:
#model = keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=True, weights=None, input_tensor=None, input_shape=None, pooling=None, classes=num_classes)
#model = keras.applications.xception.Xception(include_top=True, weights=None, input_tensor=None, input_shape=None, pooling=None, classes=num_classes)
# model.compile(loss=keras.losses.categorical_crossentropy,
#               optimizer=keras.optimizers.Adam(),
#               metrics=['accuracy'])
model = load_model('model-pseudo-88-melspec-inception-resnet-v2-0-114-0.99-0.07.h5')

# Save the full model whenever validation accuracy improves.
checkpoint = ModelCheckpoint(
    'model-pseudo-88-melspec-inception-resnet-v2-1-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)
# NOTE: this callback is created but is not part of callback_list below, so
# training runs for all `epochs` unless interrupted.
earlystopping = EarlyStopping(monitor='val_acc', patience=5)
# Multiply the learning rate by 0.2 after 4 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=4,
    min_lr=1e-8,
    verbose=1,
)
callback_list = [checkpoint, reduce_lr]

train_generator = train_data_generator()
validation_generator = validation_data_generator()
# Both generators are endless, so an "epoch" is defined by the step counts.
model.fit_generator(
    train_generator,
    steps_per_epoch=200,
    epochs=epochs,
    callbacks=callback_list,
    validation_data=validation_generator,
    validation_steps=20,
    verbose=1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
 24/200 [==>...........................] - ETA: 2:42 - loss: 0.0427 - acc: 0.9928

In [None]:
# Report total elapsed time and the time since the previous time_taken() call.
print(time_taken())