In [1]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras.callbacks import EarlyStopping
from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

# Target clip length in samples; pad_audio pads shorter clips up to this length.
L = 16000

# The twelve class names the model is allowed to predict.
legal_labels = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]

# Source / output folders (relative to the notebook's working directory).
root_path = '..'
out_path = '.'
model_path = '.'
train_audio_path = '../train/audio/'
test_audio_path = '../test/audio/'

Using TensorFlow backend.


In [2]:
def custom_fft(y, fs):
    """Return a one-sided, scaled magnitude spectrum of a 1-D signal.

    Parameters
    ----------
    y : array-like with ``.shape``
        Input samples; ``N = y.shape[0]``.
    fs : number
        Sampling rate of ``y``.

    Returns
    -------
    (xf, vals)
        ``xf`` is ``N // 2`` evenly spaced points from 0 to ``fs / 2``
        (inclusive); ``vals`` is ``2 / N * |FFT(y)|`` for the first
        ``N // 2`` bins.
    """
    n_samples = y.shape[0]
    half = n_samples // 2
    sample_period = 1.0 / fs

    freqs = np.linspace(0.0, 1.0 / (2.0 * sample_period), half)

    # Only the first half of the bins is kept (the spectrum of a real signal
    # is symmetric), and the complex values are reduced to their magnitude.
    spectrum = fft(y)
    magnitudes = 2.0 / n_samples * np.abs(spectrum[:half])
    return freqs, magnitudes

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-scaled spectrogram with ``scipy.signal.spectrogram``.

    Parameters
    ----------
    audio : 1-D array
        Waveform samples.
    sample_rate : number
        Sampling rate of ``audio`` in Hz.
    window_size : number
        Window length in milliseconds (converted to ``nperseg``).
    step_size : number
        Length in milliseconds that is converted to samples and passed to
        scipy as ``noverlap``.
    eps : float
        Added to the spectrogram before the logarithm to avoid ``log(0)``.

    Returns
    -------
    (freqs, times, log_spec)
        ``freqs`` and ``times`` as returned by scipy; ``log_spec`` is the
        transposed spectrogram (time along the first axis), cast to float32,
        after ``log(. + eps)``.
    """
    def _ms_to_samples(duration_ms):
        return int(round(duration_ms * sample_rate / 1e3))

    freqs, times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=_ms_to_samples(window_size),
        noverlap=_ms_to_samples(step_size),
        detrend=False,
    )
    log_power = np.log(power.T.astype(np.float32) + eps)
    return freqs, times, log_power

In [3]:
def pad_audio(samples, target_len=None):
    """Left-pad ``samples`` with zeros so it has at least ``target_len`` items.

    Parameters
    ----------
    samples : 1-D array
        Waveform samples.
    target_len : int, optional
        Minimum length of the result. Defaults to the module-level ``L``,
        which keeps existing ``pad_audio(samples)`` calls unchanged.

    Returns
    -------
    array
        ``samples`` itself when it is already long enough, otherwise a new
        array with the zeros inserted *before* the original samples.
    """
    if target_len is None:
        target_len = L
    missing = target_len - len(samples)
    if missing <= 0:
        return samples
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=1000):
    """Yield ``num`` random windows of ``L`` consecutive samples.

    Parameters
    ----------
    samples : 1-D sequence/array
        Source waveform; must contain at least ``L`` samples.
    L : int
        Window length in samples.
    num : int
        Number of windows to yield.

    Yields
    ------
    Slice of ``samples`` of length ``L`` starting at a uniformly random
    offset.

    Raises
    ------
    ValueError
        On first iteration, if ``samples`` is shorter than ``L``.
    """
    max_start = len(samples) - L
    if max_start < 0:
        raise ValueError(
            'samples has %d items, fewer than the window length %d'
            % (len(samples), L))
    for _ in range(num):
        # randint's upper bound is exclusive, so add 1 to make the last
        # valid offset reachable (and to accept len(samples) == L).
        beg = np.random.randint(0, max_start + 1)
        yield samples[beg: beg + L]

def label_transform(labels):
    """One-hot encode folder labels after collapsing them to model classes.

    ``'_background_noise_'`` becomes ``'silence'``, any label not listed in
    the module-level ``legal_labels`` becomes ``'unknown'``, and every other
    label is kept. The result is ``pd.get_dummies`` of the mapped labels, so
    it only has columns for classes that actually occur in ``labels``.
    """
    def _canonical(label):
        if label == '_background_noise_':
            return 'silence'
        if label in legal_labels:
            return label
        return 'unknown'

    mapped = [_canonical(label) for label in labels]
    return pd.get_dummies(pd.Series(mapped))

In [4]:
path = train_audio_path

# Every entry of the training folder is treated as a raw label.
subdir_list = list(os.listdir(path))
labels_list = subdir_list

# The ten command words, the special noise folder, and everything else.
target_word_list = [
    'yes', 'no', 'up', 'down', 'left',
    'right', 'on', 'off', 'stop', 'go',
]
silence_list = ['_background_noise_']
unknown_label_list = [
    label for label in labels_list
    if not (label in target_word_list or label in silence_list)
]

In [5]:
# Keep only the entries of the training folder that are directories.
subfolderlist = [
    entry for entry in os.listdir(train_audio_path)
    if os.path.isdir(train_audio_path + '/' + entry)
]

In [6]:
# Build a (path, label) table of every training clip; the label is the name
# of the folder the clip lives in.
file_target = [
    [train_audio_path + folder + '/' + fname, folder]
    for folder in subfolderlist
    for fname in os.listdir(train_audio_path + folder)
    if '.wav' in fname
]

train_file_map = pd.DataFrame(file_target, columns=['path', 'label'])

In [7]:
# Display the last rows of the path/label table as a sanity check.
train_file_map.tail()

Unnamed: 0,path,label
64722,../train/audio/zero/2ad772d6_nohash_1.wav,zero
64723,../train/audio/zero/9ff2d2f4_nohash_0.wav,zero
64724,../train/audio/zero/30f31e42_nohash_0.wav,zero
64725,../train/audio/zero/37dca74f_nohash_1.wav,zero
64726,../train/audio/zero/0d393936_nohash_0.wav,zero


In [8]:
# Add an empty string column that will hold the normalised label.
train_file_map["nlabel"] = ''

In [9]:
# Collapse the raw folder label into: one of the ten target words,
# 'silence' (background-noise folder) or 'unknown' (any other word).
#
# The column is assigned in one vectorised step. Writing to the `row`
# objects produced by DataFrame.iterrows() is not reliable: pandas may hand
# out copies, in which case the assignments never reach the frame.
train_file_map["nlabel"] = train_file_map["label"].map(
    lambda label: label if label in target_word_list
    else ('silence' if label == '_background_noise_' else 'unknown')
)

In [10]:
# Display the first rows to check the new `nlabel` column.
train_file_map.head()

Unnamed: 0,path,label,nlabel
0,../train/audio/bed/c245d3d7_nohash_0.wav,bed,unknown
1,../train/audio/bed/a1a59b72_nohash_0.wav,bed,unknown
2,../train/audio/bed/89f3ab7d_nohash_1.wav,bed,unknown
3,../train/audio/bed/35c8fa78_nohash_1.wav,bed,unknown
4,../train/audio/bed/1706c35f_nohash_0.wav,bed,unknown


In [11]:
# Classes that keep their own name; anything else is folded into 'unknown'.
targets_to_keep = ['yes', 'no', 'up', 'down', 'left',
                   'right', 'on', 'off', 'stop', 'go', 'silence']
train_file_map['target'] = train_file_map['nlabel'].apply(
    lambda name: name if name in targets_to_keep else 'unknown')

# Class name <-> integer index. The position in this list is the index.
label_to_ix = {
    name: ix
    for ix, name in enumerate([
        'unknown', 'silence', 'down', 'go', 'left', 'no',
        'off', 'on', 'right', 'stop', 'up', 'yes',
    ])
}
ix_to_label = {ix: name for name, ix in label_to_ix.items()}

# NOTE: this overwrites the folder-name `label` column with the class index.
train_file_map['label'] = train_file_map['target'].apply(lambda name: label_to_ix[name])

In [12]:
# The helper column is no longer needed once `target` and `label` exist.
train_file_map = train_file_map.drop(columns=["nlabel"])

In [13]:
# Display the last rows to check the integer `label` and the `target` name.
train_file_map.tail()

Unnamed: 0,path,label,target
64722,../train/audio/zero/2ad772d6_nohash_1.wav,0,unknown
64723,../train/audio/zero/9ff2d2f4_nohash_0.wav,0,unknown
64724,../train/audio/zero/30f31e42_nohash_0.wav,0,unknown
64725,../train/audio/zero/37dca74f_nohash_1.wav,0,unknown
64726,../train/audio/zero/0d393936_nohash_0.wav,0,unknown


In [14]:
# List the distinct class names present in the `target` column.
train_file_map["target"].unique()

array(['unknown', 'right', 'on', 'stop', 'no', 'off', 'silence', 'up',
       'yes', 'go', 'left', 'down'], dtype=object)

In [15]:
# Seed passed to train_test_split so the split is repeatable.
random_state = 2017
# Fractions held out for test / validation.
test_size = 0.1
val_size = 0.1
# Mini-batch size.
BATCH = 16

In [16]:
import os
import numpy as np
import pandas as pd
import random
    
from glob import glob

from scipy.io import wavfile
from scipy.signal import stft

from sklearn.model_selection import train_test_split

from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

In [17]:
def generator(df, batch_size, mode):
    """Yield endless batches of log-spectrograms (plus one-hot labels) for Keras.

    Parameters
    ----------
    df : pandas.DataFrame
        File map with a ``path`` column and, for 'train'/'val', an integer
        ``label`` column.
    batch_size : int
        Number of *files* read per batch. A file longer than ``L`` samples is
        expanded by ``chop_audio`` into many random crops, so a batch that
        contains such a file holds more than ``batch_size`` spectrograms.
    mode : {'train', 'val', 'test'}
        'train' / 'val' use the corresponding part of
        ``train_test_split(df, test_size=val_size, random_state=random_state)``
        and reshuffle it on every pass; 'test' walks all of ``df`` in order.

    Yields
    ------
    ``(X_batch, y_batch)`` for 'train' and 'val', ``X_batch`` alone for
    'test'. ``X_batch`` has a trailing channel axis of size 1 and ``y_batch``
    is one-hot with ``len(label_to_ix)`` classes.

    Raises
    ------
    ValueError
        On first iteration, if ``mode`` is not one of the three values above
        or the selected frame is empty.
    """
    new_sample_rate = 16000

    if mode == 'test':
        frame = df
    elif mode in ('train', 'val'):
        df_train, df_val = train_test_split(df, test_size=val_size, random_state=random_state)
        frame = df_train if mode == 'train' else df_val
    else:
        raise ValueError('The mode should be either train, val or test.')

    # Read paths/labels from the *selected* frame by position. Indexing the
    # global, unsplit frame with these positions (as done previously) made
    # 'train' and 'val' read overlapping rows and ignored the `df` argument.
    paths = frame["path"].values
    labels = None if mode == 'test' else frame["label"].values
    if len(paths) == 0:
        raise ValueError('No rows available for mode %r.' % (mode,))

    while True:
        if mode == 'test':
            ids = list(range(len(paths)))
        else:
            # New random order on every pass over the data.
            ids = random.sample(range(len(paths)), len(paths))

        for start in range(0, len(ids), batch_size):
            X_batch = []
            y_batch = []
            for i in ids[start:start + batch_size]:
                sample_rate, samples = wavfile.read(paths[i])
                samples = pad_audio(samples)
                if len(samples) > L:
                    # Long recording: draw random L-sample crops from it.
                    n_samples = chop_audio(samples, L=L)
                else:
                    n_samples = [samples]
                for clip in n_samples:
                    resampled = signal.resample(clip, int(new_sample_rate / sample_rate * clip.shape[0]))
                    _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
                    X_batch.append(specgram)
                    if labels is not None:
                        y_batch.append(labels[i])

            X_batch = np.array(X_batch)
            # Add the trailing channel axis expected by the 2-D convolutions.
            X_batch = X_batch.reshape(tuple(list(X_batch.shape) + [1]))

            if mode == 'test':
                yield X_batch
            else:
                yield (X_batch, to_categorical(y_batch, num_classes=len(label_to_ix)))

In [18]:
# Same split parameters as inside `generator`; the resulting frame lengths
# are used to compute the number of steps per epoch.
sample_map_df_train, sample_map_df_val = train_test_split(
    train_file_map,
    test_size=val_size,
    random_state=random_state,
)

In [19]:
# --- Model definition -------------------------------------------------------
# Input is one log-spectrogram with a single channel.
input_shape = (99, 161, 1)
# One output unit per class index produced by the label encoding.
nclass = len(label_to_ix)

inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Convolution2D(16, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(16, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(64, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(2, 2))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = BatchNormalization()(Dense(256, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(256, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()
# The output is a softmax over mutually exclusive one-hot classes, so the
# matching loss is categorical cross-entropy (not binary cross-entropy).
# Accuracy is tracked so training progress is visible per epoch.
model.compile(optimizer=opt,
              loss=losses.categorical_crossentropy,
              metrics=['accuracy'])
model.summary()

# --- Training ---------------------------------------------------------------
# Steps are derived from the train/val frame sizes and the shared BATCH size
# so that one epoch corresponds to one pass over each split.
history = model.fit_generator(
    generator=generator(train_file_map, batch_size=BATCH, mode='train'),
    steps_per_epoch=int(np.ceil(len(sample_map_df_train) / BATCH)),
    validation_data=generator(train_file_map, batch_size=BATCH, mode='val'),
    validation_steps=int(np.ceil(len(sample_map_df_val) / BATCH)),
    epochs=5)

model.save(os.path.join(model_path, 'light_cnn.model'))

W0917 03:46:34.414534 139871869134592 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0917 03:46:34.447576 139871869134592 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0917 03:46:34.465629 139871869134592 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0917 03:46:34.466262 139871869134592 deprecation_wrapper.py:119] From /home/ubuntu/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_ba

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 99, 161, 1)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 99, 161, 1)        4         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 160, 16)       80        
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 97, 159, 16)       1040      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 79, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 79, 16)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 46, 77, 32)        4640      
__________



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
def test_data_generator(batch=16):
    """Yield ``(fnames, imgs)`` batches for every wav file in the test folder.

    Parameters
    ----------
    batch : int
        Maximum number of files per yielded batch.

    Yields
    ------
    (fnames, imgs)
        ``fnames`` is a list of file identifiers (one per clip) and ``imgs``
        the stacked log-spectrograms with a trailing channel axis of size 1.
        The last batch may be smaller than ``batch``; nothing is yielded when
        the folder contains no matching files.

    Notes
    -----
    Relies on the module-level ``test_audio_path`` and ``new_sample_rate``.
    """
    def _as_model_input(specgrams):
        stacked = np.array(specgrams)
        return stacked.reshape(tuple(list(stacked.shape) + [1]))

    fpaths = glob(os.path.join(test_audio_path, '*wav'))
    imgs = []
    fnames = []
    for path in fpaths:
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        # NOTE(review): splitting on a backslash only strips the folder on
        # Windows-style paths; kept as-is because downstream code
        # post-processes this value.
        fnames.append(path.split('\\')[-1])
        if len(imgs) == batch:
            yield fnames, _as_model_input(imgs)
            # Start fresh lists so the batch just handed out is not mutated.
            imgs = []
            fnames = []
    # Emit the remainder only if there is one. Previously a file count that
    # was an exact multiple of `batch` re-yielded the last batch (reshaped a
    # second time), and an empty folder hit an undefined variable.
    if imgs:
        yield fnames, _as_model_input(imgs)
    # A generator ends by returning; `raise StopIteration()` inside a
    # generator is turned into RuntimeError on Python 3.7+ (PEP 479).

In [21]:
# Sample rate the spectrograms are computed at (read by test_data_generator).
new_sample_rate = 16000

# Predict batch by batch and collect file names with their predicted class.
index = []
results = []
for fnames, imgs in test_data_generator(batch=32):
    predicts = model.predict(imgs)
    predicts = np.argmax(predicts, axis=1)
    predicts = [ix_to_label[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)

df = pd.DataFrame({'fname': index, 'label': results}, columns=['fname', 'label'])

# Reduce each entry to the bare file name. basename works whether or not the
# generator already stripped the folder, unlike slicing off a fixed number
# of leading characters.
df['fname'] = df['fname'].map(os.path.basename)

# index=False keeps the file to exactly the `fname,label` columns.
df.to_csv(os.path.join(out_path, 'sub4.csv'), index=False)

