In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import os
import librosa as lr
import shutil

from skimage.io import imread
import h5py
import glob
from sklearn.utils import shuffle

from keras.models import Model, load_model
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from keras.layers import Dropout, Input, BatchNormalization
from keras.optimizers import Nadam
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils


Using TensorFlow backend.


In [2]:
in_dim = (192,192,1)
out_dim = 176
batch_size = 32
wav_path = 'data/roller/'
tr_path = 'data/train_wav/'
va_path = 'data/valid_wav/'
te_path = 'data/test_wav/'
data_size = 66176
tr_size = 52800
va_size = 4576
te_size = 8800

In [3]:
import os
def process_audio(in_folder, out_folder, sub_folder):
    
    os.makedirs(os.path.join(out_folder, 'train_wav/' + sub_folder), exist_ok=True)
    os.makedirs(os.path.join(out_folder, 'valid_wav/' + sub_folder), exist_ok=True)
    os.makedirs(os.path.join(out_folder, 'test_wav/unknown'), exist_ok=True)
    files = shuffle(glob.glob(in_folder+'*.wav'))
    numb_files = len(files)
    num_train =int( 0.6*numb_files)
    num_test = int(0.2*numb_files)
    num_valid = int(0.2*numb_files)
    train, test, valid = files[:num_train],files[num_train:num_train+num_test], files[num_train+num_test:]
    start = len(in_folder)
    for file in train:
        img = wav_to_img(file)
        sp.misc.imsave(os.path.join(out_folder, 'train_wav/' + sub_folder + "/") + file[start:] + '.jpg', img)
    for file in test:
        img = wav_to_img(file)
        sp.misc.imsave(os.path.join(out_folder, 'test_wav/unknown/') + file[start:] + '.jpg', img)
        
    for file in valid:
        img = wav_to_img(file)
        sp.misc.imsave(os.path.join(out_folder, 'valid_wav/', sub_folder+'/') + file[start:] + '.jpg', img)
        
def process_audio_with_classes(in_folder, out_folder, labels):
    os.makedirs(out_folder, exist_ok=True)
    for i in range(len(labels['Sample Filename'])):
        file = labels['Sample Filename'][i]
        lang = labels['Language'][i]
        os.makedirs(out_folder + lang, exist_ok=True)
        img = mp3_to_img(in_folder+file)
        sp.misc.imsave(out_folder + lang + '/' + file + '.jpg', img)

In [28]:
process_audio('data/roller/fast/', 'data', 'fast')

In [29]:
process_audio('data/roller/slow/', 'data', 'slow'),
process_audio('data/roller/very_fast/', 'data', 'very_fast')

In [57]:
import scipy
import os
def load_dataset(base_dir):
    x=[]
    y=[]
    files = glob.glob(base_dir + '/fast/*.jpg')
    yy=[1,0,0]
    for file in files:
        x += [np.reshape(scipy.ndimage.imread(file),in_dim )]
        y += [yy]
    yy = [0,1,0]
    files = glob.glob(base_dir + '/slow/*.jpg')
    for file in files:
        x += [np.reshape(scipy.ndimage.imread(file),in_dim )]
        y += [yy]
    yy = [0,0,1]
    files = glob.glob(base_dir + '/very_fast/*.jpg')
    for file in files:
        x += [np.reshape(scipy.ndimage.imread(file),in_dim )]
        y += [yy]

    x,y = shuffle(x,y)
    x = np.array(x)/255.0
    y = np.array(y)
    return x,y



In [65]:
i = Input(shape=in_dim)
m = Conv2D(16, (3, 3), activation='elu', padding='same')(i)
m = MaxPooling2D()(m)
m = Conv2D(32, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
m = Conv2D(64, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
m = Conv2D(128, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
m = Conv2D(256, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
hidden = Flatten()(m)
hidden_1 = Dense(512, activation='elu')(hidden)
m = Dropout(0.5)(hidden_1)
o = Dense(out_dim, activation='softmax')(m)

model = Model(inputs=i, outputs=o)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 192, 192, 1)       0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 192, 192, 16)      160       
_________________________________________________________________
max_pooling2d_21 (MaxPooling (None, 96, 96, 16)        0         
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 96, 96, 32)        4640      
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 48, 48, 32)        0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 48, 48, 64)        18496     
_________________________________________________________________
max_pooling2d_23 (MaxPooling (None, 24, 24, 64)        0         
__________

In [66]:
model.compile(loss='categorical_crossentropy', optimizer=Nadam(lr=1e-3), metrics=['accuracy'])
model = load_model('speech_v9.h5')


In [76]:
model2 = Model(inputs=i, outputs=hidden)
x, y_tr = load_dataset("data/train_wav")

x_tr =model2.predict(x)
N, D = x_tr.shape

x, y_val = load_dataset('data/valid_wav')
x_val = model2.predict(x)

print (y_tr.shape, y_val.shape)



(393, 3) (135, 3)


In [114]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
print(D)
i1 =  Input(shape=[D])
m1 = Dense(512, activation='elu')(i1)
m1 = Dropout(0.5)(m1)
m1 = Dense(200)(m1)
m1 = Dropout(0.2)(m1)
o1 = Dense(M, activation='softmax')(m1)
model_audio = Model(inputs=i1, outputs=o1)
model_audio.summary()

9216
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        (None, 9216)              0         
_________________________________________________________________
dense_40 (Dense)             (None, 512)               4719104   
_________________________________________________________________
dropout_21 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 200)               102600    
_________________________________________________________________
dropout_22 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 3)                 603       
Total params: 4,822,307
Trainable params: 4,822,307
Non-trainable params: 0
_____________________________________________________________

In [122]:
import keras as K
optmiser = K.optimizers.Adam(lr=1e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_audio.compile(loss='categorical_crossentropy', optimizer=optmiser, 
                  metrics=['accuracy'])

print(y_tr.shape, y_val.shape)

(393, 3) (135, 3)


In [123]:
model_audio_wait = 'model_adio.weights.best.hdf5'

In [127]:
from keras.callbacks import ModelCheckpoint   
model_audio = load_model(model_audio_wait)
checkpointer = ModelCheckpoint(filepath=model_audio_wait, verbose=1, 
                               save_best_only=True)
hist = model_audio.fit(x_tr, y_tr, batch_size=100, epochs=20,
          validation_data=(x_val, y_val), callbacks=[checkpointer], 
          verbose=2, shuffle=True)

Train on 393 samples, validate on 135 samples
Epoch 1/20
Epoch 00000: val_loss improved from inf to 0.69417, saving model to model_adio.weights.best.hdf5
1s - loss: 0.6102 - acc: 0.7455 - val_loss: 0.6942 - val_acc: 0.7111
Epoch 2/20
Epoch 00001: val_loss improved from 0.69417 to 0.67934, saving model to model_adio.weights.best.hdf5
0s - loss: 0.6071 - acc: 0.7532 - val_loss: 0.6793 - val_acc: 0.6593
Epoch 3/20
Epoch 00002: val_loss improved from 0.67934 to 0.67296, saving model to model_adio.weights.best.hdf5
0s - loss: 0.5970 - acc: 0.7532 - val_loss: 0.6730 - val_acc: 0.6667
Epoch 4/20
Epoch 00003: val_loss did not improve
0s - loss: 0.6066 - acc: 0.7277 - val_loss: 0.6864 - val_acc: 0.6889
Epoch 5/20
Epoch 00004: val_loss improved from 0.67296 to 0.67007, saving model to model_adio.weights.best.hdf5
0s - loss: 0.6095 - acc: 0.7481 - val_loss: 0.6701 - val_acc: 0.6667
Epoch 6/20
Epoch 00005: val_loss did not improve
0s - loss: 0.5934 - acc: 0.7405 - val_loss: 0.6703 - val_acc: 0.659