In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
import glob

import os
print(os.listdir('.'))
import gc
import python_speech_features as psfs

['train.7z', 'submission.csv', 'test.7z', '.ipynb_checkpoints', 'sample_submission.7z', 'sample_submission.csv', 'sc.sh', 'link_to_gcp_credits_form.txt', 'train', 'model1.h5', 'data_work.ipynb', 'test', 'EDA.ipynb']


In [2]:
# Dataset locations, relative to the notebook's working directory.
# t_list is the root passed to list_wavs_fname() (one sub-folder per label);
# v_list is the flat folder of clips read by test_data_generator().
t_list = "train/audio/"
v_list = "test/audio/"
# Target class names. label_transform() maps the '_background_noise_' folder to
# 'silence' and every folder name not listed here to 'unknown'.
cats = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']

In [3]:
# !sudo python3 -m pip install python_speech_features

In [4]:
def custom_fft(y, fs):
    """Return the single-sided amplitude spectrum of a 1-D signal.

    Parameters
    ----------
    y : array-like, shape (N,)
        Time-domain samples.
    fs : float
        Sampling rate of ``y`` in Hz.

    Returns
    -------
    xf : numpy.ndarray, shape (N // 2,)
        Frequency axis, evenly spaced from 0 to ``fs / 2``.
    vals : numpy.ndarray, shape (N // 2,)
        Magnitude of the first ``N // 2`` FFT bins, scaled by ``2 / N``.
    """
    y = np.asarray(y)
    T = 1.0 / fs
    N = y.shape[0]
    # The notebook never imports a bare `fft`, so the original call raised
    # NameError; NumPy's FFT is already in scope and returns the same transform.
    yf = np.fft.fft(y)
    xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)
    vals = 2.0 / N * np.abs(yf[0:N // 2])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram with time on the first axis.

    Parameters
    ----------
    audio : array-like, shape (n_samples,)
        Time-domain signal.
    sample_rate : int or float
        Sampling rate of ``audio`` in Hz.
    window_size : float, default 20
        Length of each analysis window, in milliseconds.
    step_size : float, default 10
        Hop between the starts of consecutive windows, in milliseconds.
    eps : float, default 1e-10
        Added to the power values before the logarithm so that exact zeros
        do not produce ``-inf``.

    Returns
    -------
    freqs : numpy.ndarray
        Frequency of each spectrogram bin (Hz).
    times : numpy.ndarray
        Centre time of each window (seconds).
    log_spec : numpy.ndarray, shape (n_windows, n_freqs)
        Natural log of the (float32) power spectrogram, transposed so that
        rows are time frames.

    Raises
    ------
    ValueError
        If ``step_size`` rounds to less than one sample.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    if step < 1:
        raise ValueError("step_size must correspond to at least one sample")
    # scipy expects the number of *overlapping* samples, not the hop. The
    # original passed the hop directly, which is only right when the step is
    # exactly half the window (true for the defaults, so default output is
    # unchanged).
    noverlap = nperseg - step
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [5]:
def list_wavs_fname(train_audio_path):
    """List every ``.wav`` file found one level below ``train_audio_path``.

    Each immediate sub-directory is treated as a label; its name is used
    verbatim (no mapping to 'unknown'/'silence' happens here).

    Parameters
    ----------
    train_audio_path : str
        Root directory containing one sub-directory per label. A trailing
        path separator is optional.

    Returns
    -------
    labels : list of str
        Sub-directory name for each file, aligned with ``fnames``.
    fnames : list of str
        Bare file names (no directory part) ending in ``.wav``.

    Raises
    ------
    FileNotFoundError
        If ``train_audio_path`` does not exist.
    """
    labels = []
    fnames = []
    # Sort directories and files so the returned order does not depend on the
    # filesystem's arbitrary listing order.
    class_dirs = sorted(
        entry for entry in os.listdir(train_audio_path)
        if os.path.isdir(os.path.join(train_audio_path, entry))
    )
    for class_dir in class_dirs:
        # os.path.join instead of string concatenation: the original only
        # worked when the root path ended with a separator.
        dir_path = os.path.join(train_audio_path, class_dir)
        wavs = sorted(name for name in os.listdir(dir_path) if name.endswith(".wav"))
        fnames += wavs
        labels += [class_dir] * len(wavs)
    return labels, fnames

In [6]:
def pad_audio(samples, L=16000):
    """Left-pad a 1-D sample array with zeros up to ``L`` samples.

    Parameters
    ----------
    samples : numpy.ndarray, shape (n,)
        Audio samples.
    L : int, default 16000
        Minimum length of the returned array.

    Returns
    -------
    numpy.ndarray
        ``samples`` itself when it already has at least ``L`` elements;
        otherwise a new array of length ``L`` whose leading
        ``L - len(samples)`` entries are zero and whose tail is ``samples``.
    """
    missing = L - len(samples)
    if missing <= 0:
        return samples
    # Zeros go in front of the signal (pad_width=(missing, 0)).
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    """Yield ``num`` randomly positioned windows of length ``L`` from ``samples``.

    Window starts are drawn with ``np.random.randint``, so results depend on
    NumPy's global random state.

    Parameters
    ----------
    samples : numpy.ndarray, shape (n,)
        Source signal; must contain at least ``L`` samples.
    L : int, default 16000
        Length of each yielded window.
    num : int, default 20
        Number of windows to yield.

    Yields
    ------
    numpy.ndarray
        A slice ``samples[beg:beg + L]``.

    Raises
    ------
    ValueError
        On first iteration, if ``samples`` is shorter than ``L``.
    """
    max_start = len(samples) - L
    if max_start < 0:
        raise ValueError(
            "samples has %d elements, fewer than the window length %d" % (len(samples), L)
        )
    for _ in range(num):
        # randint's upper bound is exclusive: the +1 makes the last valid
        # start reachable and keeps len(samples) == L from raising.
        beg = np.random.randint(0, max_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    """One-hot encode raw folder labels after collapsing them to target classes.

    '_background_noise_' becomes 'silence', any label not present in the
    module-level ``cats`` list becomes 'unknown', and all other labels are
    kept unchanged.

    Parameters
    ----------
    labels : iterable of str
        Raw labels, one per sample.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.get_dummies`` on the mapped labels: one row per input
        label and one column per class that actually occurs.
    """
    def _to_class(raw):
        # The background-noise check comes first, before the membership test.
        if raw == '_background_noise_':
            return 'silence'
        return raw if raw in cats else 'unknown'

    mapped = [_to_class(raw) for raw in labels]
    return pd.get_dummies(pd.Series(mapped))

In [7]:
# Build the training tensors: read every clip under t_list, resample it to
# new_sample_rate and turn it into a log spectrogram.
new_sample_rate = 8000
labels, fnames = list_wavs_fname(t_list)

x_train, y_train = [], []
for label, fname in zip(labels, fnames):
    sample_rate, samples = wavfile.read(os.path.join(t_list, label, fname))
    samples = pad_audio(samples)
    # Recordings longer than 16000 samples are replaced by the random crops
    # produced by chop_audio(); everything else is used as a single clip.
    clips = chop_audio(samples) if len(samples) > 16000 else [samples]
    for clip in clips:
        target_len = int(new_sample_rate / sample_rate * clip.shape[0])
        resampled = signal.resample(clip, target_len)
        specgram = log_specgram(resampled, sample_rate=new_sample_rate)[2]
        x_train.append(specgram)
        y_train.append(label)

# Stack the spectrograms and append a trailing axis of size 1.
x_train = np.array(x_train)
x_train = x_train.reshape(x_train.shape + (1,))

# One-hot encode the labels; label_index keeps the column order so that
# predicted column indices can be mapped back to class names later.
one_hot = label_transform(y_train)
label_index = one_hot.columns.values
y_train = np.array(one_hot.values)

del labels, fnames, one_hot
gc.collect()



36

In [8]:
from tensorflow.keras.layers import Input, BatchNormalization, Convolution2D, MaxPooling2D, Dropout, Dense, Flatten
from tensorflow.keras import optimizers, losses, activations, models

In [9]:
# --- Model definition -------------------------------------------------------
# (99, 81) is the spectrogram shape log_specgram() produces with its default
# window/step for a 16000-sample clip resampled to 8 kHz; the trailing 1 is
# the channel axis added when x_train is built.
input_shape = (99, 81, 1)
nclass = 12  # one output unit per entry in `cats`
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)

# Convolutional feature extractor: conv blocks, each followed by 2x2 max
# pooling and dropout.
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(16, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

# Classifier head: two batch-normalised dense layers, then a softmax over the
# classes.
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

# The targets are one-hot rows over mutually exclusive classes and the output
# layer is a softmax, so the matching loss is categorical cross-entropy.
# The original used binary cross-entropy, which scores each of the 12 outputs
# as an independent yes/no problem. Accuracy is reported so the training log
# shows classification quality, not only the loss.
model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['accuracy'])
model.summary()

# --- Train / validation split ----------------------------------------------
# NOTE: this rebinds x_train / y_train to the 90% training portion, so
# re-running this cell without rebuilding the data splits the already-split set.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=2017)


# --- Training ---------------------------------------------------------------
model.fit(x_train, y_train, batch_size=16, validation_data=(x_valid, y_valid), epochs=3, shuffle=True, verbose=1,
         callbacks=[tf.keras.callbacks.TensorBoard(log_dir="/tmp/model1/")])

model.save('model1.model')

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 99, 81, 1)]       0         
_________________________________________________________________
batch_normalization_v2 (Batc (None, 99, 81, 1)         4         
_________________________________________________________________
conv2d (Conv2D)              (None, 98, 80, 8)         40        
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 97, 79, 8)         264       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 48, 39, 8)         0         
_________________________________________________________________
dropout (Dropout)            (None, 48, 39, 8)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 46, 37, 16)        1168  

In [10]:
# Persist only the model's weights to 'model1.h5', in addition to the full
# model written by model.save() in the training cell.
model.save_weights('model1.h5')

In [11]:
def test_data_generator(batch=16):
    """Yield the test set as batches of file names and log spectrograms.

    Every ``.wav`` file directly inside the module-level ``v_list`` directory
    is read, padded with ``pad_audio``, resampled to ``new_sample_rate`` and
    converted with ``log_specgram``. Files are processed in sorted name order.

    Parameters
    ----------
    batch : int, default 16
        Maximum number of clips per yielded batch. The final batch may be
        smaller.

    Yields
    ------
    fnames : list of str
        File names (without directory) of the clips in the batch.
    imgs : numpy.ndarray
        Stacked spectrograms with a trailing axis of size 1 appended.
    """
    def _stack(specs):
        # (n, time, freq) -> (n, time, freq, 1)
        stacked = np.array(specs)
        return stacked.reshape(stacked.shape + (1,))

    fpaths = sorted(name for name in os.listdir(v_list) if name.endswith(".wav"))
    imgs = []
    fnames = []
    for path in fpaths:
        rate, samples = wavfile.read(os.path.join(v_list, path))
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        # os.listdir() already returns bare file names, so no path stripping
        # is needed.
        fnames.append(path)
        if len(imgs) == batch:
            yield fnames, _stack(imgs)
            imgs = []
            fnames = []
    # Emit the remainder only when there is one. The original condition
    # (`i < batch`) was also true right after a full batch, which re-yielded
    # that batch with an extra axis, and failed with NameError when the
    # directory held no .wav files.
    if imgs:
        yield fnames, _stack(imgs)
    # No explicit `raise StopIteration()`: inside a generator it is turned
    # into RuntimeError (PEP 479); simply returning ends the iteration.

In [12]:
# exit() #delete this
# del x_train, y_train
# gc.collect()

# Predict a class for every test clip, batch by batch, and collect the
# file names alongside the predicted class names.
index = []
results = []
for fnames, imgs in test_data_generator(batch=32):
    # argmax over the class axis gives a column index into label_index.
    class_ids = np.argmax(model.predict(imgs), axis=1)
    index.extend(fnames)
    results.extend(label_index[class_id] for class_id in class_ids)

# Write the two-column submission file.
df = pd.DataFrame({'fname': index, 'label': results})
df.to_csv('submission.csv', index=False)