In [1]:
import os
import librosa
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

In [2]:
traindir='train/'
samples,sample_rate=librosa.load(traindir+'audio/bird/00b01445_nohash_0.wav',sr=16000)
ipd.Audio(samples,rate=sample_rate)
#print(sample_rate)

In [4]:
samples=librosa.resample(samples,sample_rate,8000)
ipd.Audio(samples,rate=sample_rate)


In [5]:
labels=os.listdir(traindir+'audio')
print(labels)

['bed', 'bird', 'cat', 'dog', 'one', '_background_noise_']


In [6]:
no_of_recording=[]

for label in labels:
    waves=[f for f in os.listdir(traindir+'audio'+'/'+label) if f.endswith('.wav')]
    no_of_recording.append(len(waves))

In [7]:
label=['bed', 'bird', 'cat', 'dog', 'one', '_background_noise_']

In [None]:
duration_of_recording=[]

for label in labels:
    
    waves=[f for f in os.listdir(traindir+'audio'+'/'+label) if f.endswith('.wav')]
    
    for wav in waves:
        sample_rate,samples=wavfile.read(traindir+'audio'+'/'+label+'/'+wav)
        duration_of_recording.append(float(len(samples)/sample_rate))
        
plt.hist(np.array(duration_of_recording))

In [9]:
all_wave=[]
all_label=[]
for label in labels:
    
    print(label)
    
    waves=[ f for f in os.listdir(traindir+'audio'+'/'+label) if f.endswith('.wav')]
    
    for wav in waves:
        
        samples,sample_rate=librosa.load(traindir+'audio'+'/'+label+'/'+wav,sr=16000)
        samples=librosa.resample(samples,sample_rate,8000)
        
        if (len(samples)==8000):
            
            all_wave.append(samples)
            all_label.append(label)
        

bed
bird
cat
dog
one
_background_noise_


In [10]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()









y=le.fit_transform(all_label)
classes=list(le.classes_)
print(classes)





['bed', 'bird', 'cat', 'dog', 'one']


In [11]:
from keras.utils import np_utils
y=np_utils.to_categorical(y,num_classes=len(labels))


In [12]:
all_wave=np.array(all_wave).reshape(-1,8000,1)

In [13]:
from sklearn.model_selection import train_test_split

x_train,x_val,y_train,y_val=train_test_split(np.array(all_wave),np.array(y),stratify=y,test_size=0.2,random_state=777,shuffle=True)


In [14]:
from keras.layers import Dense,Dropout,Flatten,Conv1D,Input,MaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras import backend as k
k.clear_session()


inputs=Input(shape=(8000,1))

conv=Conv1D(8,13,padding='valid',activation='relu',strides=1)(inputs)
conv=MaxPooling1D(3)(conv)
conv=Dropout(0.3)(conv)


conv=Conv1D(16,11,padding='valid',activation='relu',strides=1)(inputs)
conv=MaxPooling1D(3)(conv)
conv=Dropout(0.3)(conv)



conv=Conv1D(32,9,padding='valid',activation='relu',strides=1)(inputs)
conv=MaxPooling1D(3)(conv)
conv=Dropout(0.3)(conv)


conv=Flatten()(conv)


conv=Dense(256,activation='relu')(conv)
conv=Dropout(0.3)(conv)



conv=Dense(128,activation='relu')(conv)
conv=Dropout(0.3)(conv)

outputs=Dense(len(labels),activation='relu')(conv)

model=Model(inputs,outputs)
model.summary()




Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 8000, 1)]         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 7992, 32)          320       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 2664, 32)          0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 2664, 32)          0         
_________________________________________________________________
flatten (Flatten)            (None, 85248)             0         
_________________________________________________________________
dense (Dense)                (None, 256)               21823744  
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0     

In [15]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [16]:
es=EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=10,min_delta=0.0001)
mc=ModelCheckpoint('voiceRecModel.hdf5',monitor='val_acc',verbose=1,save_best_only=True,mode='max')

print(x_train.shape,x_val.shape,y_train.shape,y_val.shape)


(6536, 8000, 1) (1634, 8000, 1) (6536, 6) (1634, 6)


In [17]:
history=model.fit(x_train,y_train,epochs=100,callbacks=[es,mc],batch_size=32,validation_data=(x_val,y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 00014: early stopping


In [18]:
model.save('voiceRecModel.hdf5')

# test the model

In [19]:
from keras.models import load_model

model=load_model('voiceRecModel.hdf5')
print("model has loadded")

model has loadded


In [21]:
def predict(audio):
    prob=model.predict(audio.reshape(1,8000,1))
    index=np.argmax(prob[0])
    return classes[index]

import random
index=random.randint(0,len(x_val)-1)

samples=x_val[index].ravel()

print("AUdio:",classes[np.argmax(y_val[index])])

ipd.Audio(samples,rate=8000)

AUdio: dog


In [22]:
print ("text :",predict(samples))

text : bed
