In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import os
import librosa as lr
import shutil

from skimage.io import imread
import h5py
import glob
from sklearn.utils import shuffle

from keras.models import Model, load_model
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from keras.layers import Dropout, Input, BatchNormalization
from keras.optimizers import Nadam
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils


Using TensorFlow backend.


In [2]:
in_dim = (192,192,1)
out_dim = 176
batch_size = 32
wav_path = 'data/roller/'
tr_path = 'data/train_wav/'
va_path = 'data/valid_wav/'
te_path = 'data/test_wav/'
data_size = 66176
tr_size = 52800
va_size = 4576
te_size = 8800

In [3]:
import os
def wav_to_img(path, height=192, width=192):
    signal, sr = lr.load(path, res_type='kaiser_fast')
    hl = signal.shape[0]//(width*1.1) #this will cut away 5% from start and end
    spec = lr.feature.melspectrogram(signal, n_mels=height, hop_length=int(hl))
    img = lr.logamplitude(spec)**2
    start = (img.shape[1] - width) // 2
    return img[:, start:start+width]

def process_audio(in_folder, out_folder, sub_folder):
    
    os.makedirs(os.path.join(out_folder, 'train_wav/' + sub_folder), exist_ok=True)
    os.makedirs(os.path.join(out_folder, 'valid_wav/' + sub_folder), exist_ok=True)
    os.makedirs(os.path.join(out_folder, 'test_wav/' + sub_folder), exist_ok=True)
    files = shuffle(glob.glob(in_folder+'*.wav'))
    numb_files = len(files)
    num_train =int( 0.6*numb_files)
    num_test = int(0.2*numb_files)
    num_valid = int(0.2*numb_files)
    train, test, valid = files[:num_train],files[num_train:num_train+num_test], files[num_train+num_test:]
    start = len(in_folder)
    for file in train:
        img = wav_to_img(file)
        sp.misc.imsave(os.path.join(out_folder, 'train_wav/' + sub_folder + "/") + file[start:] + '.jpg', img)
    for file in test:
        img = wav_to_img(file)
        sp.misc.imsave(os.path.join(out_folder, 'test_wav/' + sub_folder + "/")+ file[start:] + '.jpg', img)
        
    for file in valid:
        img = wav_to_img(file)
        sp.misc.imsave(os.path.join(out_folder, 'valid_wav/', sub_folder+'/') + file[start:] + '.jpg', img)
        
    
def process_audio_with_classes(in_folder, out_folder, labels):
    os.makedirs(out_folder, exist_ok=True)
    for i in range(len(labels['Sample Filename'])):
        file = labels['Sample Filename'][i]
        lang = labels['Language'][i]
        os.makedirs(out_folder + lang, exist_ok=True)
        img = mp3_to_img(in_folder+file)
        sp.misc.imsave(out_folder + lang + '/' + file + '.jpg', img)

In [11]:
process_audio('data/liners/with/', 'data', 'with')

In [12]:
process_audio('data/liners/without/', 'data', 'without'),


(None,)

In [37]:
import scipy
import os
def load_dataset(base_dir):
    x=[]
    y=[]
    files = glob.glob(base_dir + '/with/*.jpg')
    yy=[1,0]
    for file in files:
        x += [np.reshape(scipy.ndimage.imread(file),in_dim )]
        y += [yy]
    yy = [0,1]
    files = glob.glob(base_dir + '/without/*.jpg')
    for file in files:
        x += [np.reshape(scipy.ndimage.imread(file),in_dim )]
        y += [yy]
    

    x,y = shuffle(x,y)
    x = np.array(x)/255.0
    y = np.array(y)
    return x,y

def process_audio_in_place(in_folder):
    x=[]
    y=[]
    yy = [1,0]
    files = shuffle(glob.glob(in_folder+'/with/*.wav'))
    for file in files:
        img = wav_to_img(file)
        x += [np.reshape(img,in_dim )]
        y += [yy]
    yy = [0,1]
    files = shuffle(glob.glob(in_folder+'/without/*.wav'))
    for file in files:
        img = wav_to_img(file)
        x += [np.reshape(img,in_dim )]
        y += [yy]
    x,y = shuffle(x,y)
    x = np.array(x)/255.0
    y = np.array(y)
    return x,y

In [5]:
i = Input(shape=in_dim)
m = Conv2D(16, (3, 3), activation='elu', padding='same')(i)
m = MaxPooling2D()(m)
m = Conv2D(32, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
m = Conv2D(64, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
m = Conv2D(128, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
m = Conv2D(256, (3, 3), activation='elu', padding='same')(m)
m = MaxPooling2D()(m)
hidden = Flatten()(m)
hidden_1 = Dense(512, activation='elu')(hidden)
m = Dropout(0.5)(hidden_1)
o = Dense(out_dim, activation='softmax')(m)

model = Model(inputs=i, outputs=o)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 192, 192, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 192, 192, 16)      160       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 96, 96, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 96, 96, 32)        4640      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 48, 48, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 48, 48, 64)        18496     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 24, 24, 64)        0         
__________

In [6]:
model.compile(loss='categorical_crossentropy', optimizer=Nadam(lr=1e-3), metrics=['accuracy'])
model = load_model('speech_v9.h5')


In [21]:
model2 = Model(inputs=i, outputs=hidden)
x, y_tr = load_dataset("data/train_wav")

x_tr =model2.predict(x)
N, D = x_tr.shape

x, y_val = load_dataset('data/valid_wav')
x_val = model2.predict(x)

x, y_te = load_dataset('data/test_wav')
x_te = model2.predict(x)



print (y_tr.shape, y_val.shape)

_,M = y_tr.shape


(785, 2) (263, 2)


In [17]:
print(M)
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
print(D)
i1 =  Input(shape=[D])
m1 = Dense(512, activation='elu')(i1)
m1 = Dropout(0.5)(m1)
m1 = Dense(200)(m1)
m1 = Dropout(0.2)(m1)
o1 = Dense(M, activation='softmax')(m1)
model_audio = Model(inputs=i1, outputs=o1)
model_audio.summary()

2
9216
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 9216)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               4719104   
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 200)               102600    
_________________________________________________________________
dropout_7 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 402       
Total params: 4,822,106
Trainable params: 4,822,106
Non-trainable params: 0
___________________________________________________________

In [18]:
import keras as K
optmiser = K.optimizers.Adam(lr=1e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model_audio.compile(loss='categorical_crossentropy', optimizer=optmiser, 
                  metrics=['accuracy'])

print(y_tr.shape, y_val.shape)

(785, 2) (263, 2)


In [19]:
model_audio_wait = 'model_audio_with_without.weights.best.hdf5'

In [46]:
from keras.callbacks import ModelCheckpoint   
#model_audio = load_model(model_audio_wait)
checkpointer = ModelCheckpoint(filepath=model_audio_wait, verbose=1, 
                               save_best_only=True)
hist = model_audio.fit(x_tr, y_tr, batch_size=100, epochs=20,
          validation_data=(x_val, y_val), callbacks=[checkpointer], 
          verbose=2, shuffle=True)

Train on 785 samples, validate on 263 samples
Epoch 1/20
Epoch 00000: val_loss improved from inf to 0.16758, saving model to model_audio_with_without.weights.best.hdf5
0s - loss: 0.1850 - acc: 0.9962 - val_loss: 0.1676 - val_acc: 1.0000
Epoch 2/20
Epoch 00001: val_loss improved from 0.16758 to 0.15805, saving model to model_audio_with_without.weights.best.hdf5
0s - loss: 0.1730 - acc: 0.9975 - val_loss: 0.1580 - val_acc: 1.0000
Epoch 3/20
Epoch 00002: val_loss improved from 0.15805 to 0.14928, saving model to model_audio_with_without.weights.best.hdf5
0s - loss: 0.1646 - acc: 0.9975 - val_loss: 0.1493 - val_acc: 1.0000
Epoch 4/20
Epoch 00003: val_loss improved from 0.14928 to 0.14120, saving model to model_audio_with_without.weights.best.hdf5
0s - loss: 0.1568 - acc: 0.9975 - val_loss: 0.1412 - val_acc: 1.0000
Epoch 5/20
Epoch 00004: val_loss improved from 0.14120 to 0.13375, saving model to model_audio_with_without.weights.best.hdf5
0s - loss: 0.1476 - acc: 0.9975 - val_loss: 0.1337 -

In [47]:
model_audio = load_model(model_audio_wait)


In [48]:
model_audio.evaluate(x_te,y_te)

 32/261 [==>...........................] - ETA: 0s

[0.074981760554786392, 1.0]

In [49]:
np.min(y_te)

0

In [50]:
np.max(y_te)

1

In [51]:
y=np.argmax(model_audio.predict(x_te), axis=1)

In [52]:
print(y)

[1 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1
 0 0 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1
 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 1
 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0
 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 1 0 1 0
 0 1 0 1 1 0 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0
 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 0
 1 1]


# x, y_te1 = process_audio_in_place('data/roller_test')

#x, y_te1 = process_audio_in_place('data/liners')
x_te1 = model2.predict(x)
print(x_te1.shape)

model_audio.evaluate(x_te1,y_te1)

In [55]:
t =model_audio.predict(x_te1)
y = np.argmax(t,axis=1)
print(y)

y = np.argmax(y_te1,axis=1)
print(y)

[0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
