In [1]:
from __future__ import print_function
import time
from sklearn.utils import class_weight
import numpy as np # linear algebra
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, Convolution2D, Activation
from keras import backend as K
import numpy as np
from scipy import signal
import os
import random
from scipy.io import wavfile
from subprocess import check_output
import matplotlib.pyplot as plt
import librosa
import acoustics
import gc

Using TensorFlow backend.


In [2]:
import time
import numpy as np
import librosa
import os
import random

def timer():
    """Build a stopwatch closure.

    The returned callable produces a formatted string with the whole-second
    time elapsed since ``timer()`` was called and since the callable itself
    was last invoked (or since creation, on its first call).
    """
    # A mutable container lets the closure move its reference point without
    # needing ``nonlocal``.
    marks = {
        'previous': int(time.time()),
        'overall': int(time.time()),
    }

    def time_taken():
        """Return the elapsed-time report and restart the per-call clock."""
        now = int(time.time())
        total_min, total_sec = divmod(now - marks['overall'], 60)
        lap_min, lap_sec = divmod(now - marks['previous'], 60)
        marks['previous'] = now
        return (' overall_time: ' + str(total_min) + 'm' + str(total_sec) + 's'
                + ' time_from_previous_call: ' + str(lap_min) + 'm' + str(lap_sec) + 's')

    return time_taken


# Stopwatch shared by the rest of the notebook.
time_taken = timer()

# Sample rate (Hz) used whenever audio is loaded or generated in this notebook.
sr = 16000

# Words outside the command vocabulary; presumably these are the ones grouped
# under the 'unknown' label in the manifests (not verifiable from this cell).
unknowns = [
    'bed', 'bird', 'cat', 'dog', 'eight', 'five', 'four', 'happy', 'house',
    'marvin', 'nine', 'one', 'seven', 'sheila', 'six', 'three', 'tree', 'two',
    'wow', 'zero',
]
# Command words that each get their own class.
knowns = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
silence = ['silence']

# Class list; a label's position in this list is its class index.
labels = knowns + silence + ['unknown']
num_classes = len(labels)

# Lookup tables filled from the split manifests. Each manifest row is read as
# "<file name>,<label>".
LABEL_TO_FILE_NAMES = {}             # label -> files from both manifests
VALIDATION_LABEL_TO_FILE_NAMES = {}  # label -> files from validation-88.csv
TRAIN_LABEL_TO_FILE_NAMES = {}       # label -> files from train-88.csv
FILE_TO_LABEL = {}                   # file name -> label


def _index_manifest(csv_path, split_index):
    """Record every row of ``csv_path`` in the combined tables and in ``split_index``."""
    with open(csv_path) as manifest:
        for row in manifest:
            fields = row.strip().split(',')
            file_name, label = fields[0], fields[1]
            LABEL_TO_FILE_NAMES.setdefault(label, []).append(file_name)
            split_index.setdefault(label, []).append(file_name)
            FILE_TO_LABEL[file_name] = label


_index_manifest('train-88.csv', TRAIN_LABEL_TO_FILE_NAMES)
_index_manifest('validation-88.csv', VALIDATION_LABEL_TO_FILE_NAMES)


def file_to_sample(filename):
    """Load ``filename`` with librosa at the module sample rate ``sr`` and return only the samples."""
    return librosa.load(filename, sr=sr)[0]
    

# Background-noise recordings, loaded at `sr` and joined end to end into one
# flat Python list of samples.
_NOISE_FILES = (
    '../train/audio/_background_noise_/doing_the_dishes.wav',
    '../train/audio/_background_noise_/dude_miaowing.wav',
    '../train/audio/_background_noise_/exercise_bike.wav',
    '../train/audio/_background_noise_/running_tap.wav',
)
UNCOLORED_NOISES = []
for _noise_path in _NOISE_FILES:
    UNCOLORED_NOISES.extend(librosa.load(_noise_path, sr=sr)[0].tolist())


def get_silence():
    """Return one clip to be used as a 'silence' example.

    The source is picked at random:

    * 1%  - all zeros (``sr`` samples);
    * 10% - a random ``sr``-sample window of ``UNCOLORED_NOISES``;
    * 70% - a randomly chosen file listed under ``'silence'`` in
      ``LABEL_TO_FILE_NAMES``;
    * 19% - pink or white noise generated by ``acoustics``, divided by 3.
    """
    # The drawn values must line up with the branch tests below: drawing
    # from [0, 1, 2, 4] made the `choice == 3` branch unreachable.
    choice = np.random.choice([0, 1, 2, 3], p=[0.01, 0.10, 0.70, 0.19])
    if choice == 0:
        return np.zeros(sr)
    if choice == 1:
        # randint is inclusive on both ends, so the last full window is allowed.
        idx = random.randint(0, len(UNCOLORED_NOISES) - sr)
        return np.array(UNCOLORED_NOISES[idx:idx + sr], dtype=np.float32)
    if choice == 3:
        colour = np.random.choice(['pink', 'white'])
        return np.array(acoustics.generator.noise(sr, color=colour) / 3, np.float32)
    random_silence_file = np.random.choice(LABEL_TO_FILE_NAMES['silence'])
    return file_to_sample(random_silence_file)

def pad_zeros(samples):
    """Centre-pad ``samples`` with zeros up to ``sr`` samples.

    Input that already holds ``sr`` or more samples is returned unchanged.
    When the missing amount is odd, the extra zero goes at the end.
    """
    missing = sr - len(samples)
    if missing <= 0:
        return samples
    before = missing // 2
    # np.pad is the public entry point; the np.lib.pad alias is not exposed
    # by the numpy.lib namespace in NumPy 2.x.
    return np.pad(samples, (before, missing - before), 'constant', constant_values=(0, 0))

def pitch_shift(samples, sr=sr):
    """Pitch-shift ``samples`` by a random whole number of steps drawn from 1 to 5 inclusive."""
    steps = random.randint(1, 5)
    return librosa.effects.pitch_shift(samples, sr=sr, n_steps=steps)

def get_shuffled_XY(X, Y):
    """Shuffle 4-D ``X`` and 2-D ``Y`` along axis 0 with one shared random permutation.

    Returns the reordered ``(X, Y)`` pair; row ``i`` of both outputs comes
    from the same input row.
    """
    order = np.random.permutation(X.shape[0])
    return X[order, :, :, :], Y[order, :]

def time_shift(arr):
    """Delay ``arr`` by a random amount, zero-filling the vacated leading samples.

    The shift is drawn uniformly from [0, 20%) of the length and truncated to
    an integer; samples pushed past the end are dropped. A new array is
    returned in every case.
    """
    shift = int(np.random.uniform(0, 0.2) * len(arr))
    if shift == 0:
        return np.array(arr)

    shifted = np.empty_like(arr)
    if shift > 0:
        shifted[:shift] = 0
        shifted[shift:] = arr[:-shift]
    else:
        # Not produced by the draw above, but a negative shift moves the
        # signal the other way and zero-fills the tail.
        shifted[shift:] = 0
        shifted[:shift] = arr[-shift:]
    return np.array(shifted)
        
def flip_transform(wave):
    """Invert the polarity of ``wave`` with probability 0.5.

    Returns ``-wave`` when a flip is drawn and ``wave`` itself otherwise.
    """
    if np.random.choice([0, 1]):
        return -wave
    # Hand the input back untouched so callers never receive None.
    return wave

def noise_mix(wave):
    """Blend ``wave`` with a clip obtained from ``get_silence()``.

    The noise weight is drawn uniformly from [0, 0.1]; the signal is weighted
    by one minus that amount.
    """
    background = get_silence()
    weight = random.uniform(0, 0.1)
    return (1 - weight) * wave + weight * background

# 72 * 72
def get_melspectrogram(samples):
    """Turn a waveform into a 3-channel dB-scaled mel spectrogram with a leading batch axis.

    The spectrogram uses 72 mel bands, ``hop_length=223`` and ``n_fft=512`` at
    the module sample rate ``sr``; dB values are relative to the clip maximum.
    The 2-D result is wrapped to ``(1, mels, frames, 3)`` by copying it into
    three identical channels.
    """
    # `y` is passed by keyword: librosa >= 0.10 no longer accepts the
    # waveform as a positional argument.
    S = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=72, hop_length=223, n_fft=512)
    spec = librosa.power_to_db(S, ref=np.max)
    return np.repeat(spec[np.newaxis, :, :, np.newaxis], 3, axis=3)

# 150 * 150
def get_mel_of_150_150(samples):
    """Turn a waveform into a 3-channel dB-scaled mel spectrogram with a leading batch axis.

    The spectrogram uses 150 mel bands, ``hop_length=107`` and ``n_fft=512``
    at the module sample rate ``sr``; dB values are relative to the clip
    maximum. The 2-D result is wrapped to ``(1, mels, frames, 3)`` by copying
    it into three identical channels.
    """
    # `y` is passed by keyword: librosa >= 0.10 no longer accepts the
    # waveform as a positional argument.
    S = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=150, hop_length=107, n_fft=512)
    spec = librosa.power_to_db(S, ref=np.max)
    return np.repeat(spec[np.newaxis, :, :, np.newaxis], 3, axis=3)

# 197 * 161
def log_specgram(audio, sample_rate=16000, window_size=20,
                 step_size=15, eps=1e-10):
    """Log spectrogram of ``audio`` as a 3-channel image with a leading batch axis.

    ``window_size`` and ``step_size`` are given in milliseconds and are
    converted to sample counts; they become the segment length and the
    segment overlap passed to ``scipy.signal.spectrogram``. ``eps`` is added
    before taking the log so that empty bins stay finite. The output has
    shape ``(1, frames, freq_bins, 3)`` with three identical channels.
    """
    segment_len = int(round(window_size * sample_rate / 1e3))
    overlap_len = int(round(step_size * sample_rate / 1e3))
    # Only the spectrogram itself is needed; frequency and time axes are dropped.
    spectrum = signal.spectrogram(audio,
                                  fs=sample_rate,
                                  window='hann',
                                  nperseg=segment_len,
                                  noverlap=overlap_len,
                                  detrend=False)[2]
    log_spec = np.log(spectrum.T.astype(np.float32) + eps)
    return np.repeat(log_spec[np.newaxis, :, :, np.newaxis], 3, axis=3)

def get_transformed_samples(samples):
    """Augment one clip and scale it to unit standard deviation.

    Steps, in order: zero-pad to ``sr`` samples, random time shift, mix in
    background noise, random polarity flip, then divide by the standard
    deviation (skipped when the standard deviation is zero).
    """
    samples = pad_zeros(samples)
    samples = time_shift(samples)
    samples = noise_mix(samples)
    if np.random.choice([0, 1]):
        samples = -samples
    stdx = np.std(samples)
    if stdx:
        # The scaled signal has to be bound back to `samples`; assigning it
        # to a misspelled name left the returned clip un-normalised.
        samples = samples / stdx
    return samples

In [3]:
# Show the class list; a label's position in it is the class index used for the one-hot targets.
print(labels)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']


In [4]:
# Per-label file counts, first for the training split and then for validation.
_SPLIT_REPORTS = (
    ('printing train label to filename lengths', TRAIN_LABEL_TO_FILE_NAMES),
    ('printing validation label to filename lengths', VALIDATION_LABEL_TO_FILE_NAMES),
)
for _heading, _split_index in _SPLIT_REPORTS:
    print(_heading)
    for label, filenames in _split_index.items():
        print(label, len(filenames))

printing train label to filename lengths
no 6248
up 5944
right 6811
down 5355
off 6456
go 5331
on 6561
yes 6221
unknown 70315
stop 6733
left 6552
silence 6591
printing validation label to filename lengths
no 694
up 660
right 756
down 594
off 717
go 592
on 729
yes 691
unknown 7812
stop 748
left 728
silence 732


In [5]:
# Report the time elapsed since the stopwatch was created and since its previous report.
print(time_taken())

 overall_time: 0m23s time_from_previous_call: 0m23s


In [6]:
batch_size = 128


def _random_batches(label_to_files):
    """Yield an endless stream of ``(X, Y)`` batches sampled from ``label_to_files``.

    For every row a label is drawn uniformly from ``labels``. 'silence' rows
    come from ``get_silence()``; every other row is a random file of that
    label, loaded and passed through ``get_transformed_samples``. ``X`` has
    shape ``(batch_size, 150, 150, 3)`` and ``Y`` is the one-hot target of
    shape ``(batch_size, num_classes)``.
    """
    while True:
        # Allocate new arrays for every batch. Yielding the same two arrays
        # each time lets a consumer that buffers batches (such as Keras'
        # background generator queue) see them overwritten while they wait.
        X = np.zeros((batch_size, 150, 150, 3))
        Y = np.zeros((batch_size, num_classes))
        for idx in range(batch_size):
            random_label = np.random.choice(labels)
            if random_label == 'silence':
                samples = get_silence()
            else:
                random_filename = np.random.choice(label_to_files[random_label])
                samples = file_to_sample(random_filename)
                samples = get_transformed_samples(samples)
            # The spectrogram carries a leading batch axis of size 1.
            X[idx] = get_mel_of_150_150(samples)[0]
            # Y starts as zeros, so setting one entry gives the one-hot row.
            Y[idx, labels.index(random_label)] = 1.0
        yield X, Y


def validation_data_generator():
    """Endless generator of ``(X, Y)`` batches drawn from the validation files."""
    for batch in _random_batches(VALIDATION_LABEL_TO_FILE_NAMES):
        yield batch


def train_data_generator():
    """Endless generator of ``(X, Y)`` batches drawn from the training files."""
    for batch in _random_batches(TRAIN_LABEL_TO_FILE_NAMES):
        yield batch

In [7]:
epochs = 200
# input_shape is spelled out to match the 150x150x3 batches produced by the
# data generators, instead of depending on the application's default size.
model = keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=True, weights=None, input_tensor=None, input_shape=(150, 150, 3), pooling=None, classes=num_classes)
#model = keras.applications.xception.Xception(include_top=True, weights=None, input_tensor=None, input_shape=None, pooling=None, classes=num_classes)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])
#model = load_model('model-xception-0-19-0.98-0.08.h5')
# Save a full model file whenever validation accuracy improves.
checkpoint = ModelCheckpoint('model-pseudo-88-melspec-inception-resnet-v2-0-{epoch:02d}-{val_acc:.2f}-{val_loss:.2f}.h5',
                             monitor='val_acc',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=False,
                             mode='max')
# NOTE(review): `earlystopping` is created but is not part of `callback_list`,
# so it has no effect on the run below. Confirm whether that is intended.
earlystopping = EarlyStopping(monitor='val_acc', patience=5)
# Multiply the learning rate by 0.2 after 4 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, min_lr=1e-8, verbose=1)
callback_list = [checkpoint, reduce_lr]

train_generator = train_data_generator()
validation_generator = validation_data_generator()
# Both generators are endless, so the epoch length is set by the step counts.
model.fit_generator(train_generator,
                    steps_per_epoch=200,
                    epochs=epochs,
                    callbacks=callback_list,
                    validation_data=validation_generator,
                    validation_steps=20,
                    verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200

Epoch 00027: reducing learning rate to 0.00020000000949949026.
Epoch 28/200
Epoch 29/200


Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200

Epoch 00038: reducing learning rate to 4.0000001899898055e-05.
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200

Epoch 00049: reducing learning rate to 8.000000525498762e-06.
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200

Epoch 00055: reducing learning rate to 1.6000001778593287e-06.
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200



Epoch 00059: reducing learning rate to 3.200000264769187e-07.
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200

Epoch 00065: reducing learning rate to 6.400000529538374e-08.
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200

Epoch 00069: reducing learning rate to 1.2800001059076749e-08.
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200

Epoch 00073: reducing learning rate to 1e-08.
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200


Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200


Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200

KeyboardInterrupt: 

In [None]:
# Report the time elapsed since the stopwatch was created and since its previous report.
print(time_taken())