In [35]:
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
from scipy.fftpack import fft
import os
import numpy as np
import random
#To find the duration of wave file in seconds
import wave
import contextlib
#Keras imports
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D

In [2]:
# Pixel size (width, height) of each spectrogram image fed to the network.
imwidth, imheight = 54, 36

# Dataset layout: overall number of recordings, number of speakers, and the
# size of each consecutive group of sorted files used when sampling the split.
total_examples = 2000
speakers = 4
examples_per_speaker = 50

# Fraction of every group that is held out for testing.
tt_split = 0.1

# Length of the one-hot label vectors and width of the softmax output layer.
num_classes = 10

In [3]:
def findDuration(fname):
    """Return the playing time of the wave file `fname` in seconds.

    The duration is the frame count divided by the frame rate, both as
    reported by the standard-library `wave` reader.
    """
    wav_reader = wave.open(fname, 'r')
    # closing() guarantees the reader is released once the header values are read.
    with contextlib.closing(wav_reader):
        n_frames = wav_reader.getnframes()
        frame_rate = wav_reader.getframerate()
    return n_frames / float(frame_rate)

In [9]:
def graph_spectrogram(wav_file, nfft=512, noverlap=511, figsize=(0.75, 0.5), dpi=72):
    """Render the spectrogram of a wave file into an RGB pixel array.

    Parameters
    ----------
    wav_file : str
        Path of the wave file to read with `scipy.io.wavfile.read`.
    nfft : int
        FFT window length forwarded to `Axes.specgram` as `NFFT`.
    noverlap : int
        Number of overlapping samples between windows, forwarded to
        `Axes.specgram`.
    figsize : tuple of float
        Figure size in inches as (width, height).
    dpi : int
        Figure resolution. With the defaults the canvas is
        0.75 in * 72 dpi = 54 px wide and 0.5 in * 72 dpi = 36 px high,
        i.e. the `imwidth` x `imheight` configured for this notebook.

    Returns
    -------
    numpy.ndarray
        `uint8` array of shape (height, width, 3) holding the rendered,
        axis-free spectrogram.
    """
    rate, data = wavfile.read(wav_file)
    # Size the figure when it is created. Setting rcParams['figure.figsize']
    # afterwards only affects *later* figures, so the first call on a fresh
    # kernel would otherwise render at the default size; it would also leak a
    # global setting into every other plot in the session.
    fig, ax = plt.subplots(1, figsize=figsize, dpi=dpi)
    try:
        # Let the axes fill the whole canvas so no margin ends up in the image.
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        ax.specgram(x=data, Fs=rate, noverlap=noverlap, NFFT=nfft)
        ax.axis('off')
        fig.canvas.draw()
        # Integer canvas size in pixels; avoids truncating a float product.
        width, height = fig.canvas.get_width_height()
        # buffer_rgba() replaces the deprecated tostring_rgb(); drop the alpha
        # channel and copy so the result does not alias the canvas buffer.
        rgba = np.frombuffer(fig.canvas.buffer_rgba(), dtype=np.uint8)
        imarray = rgba.reshape(height, width, 4)[..., :3].copy()
    finally:
        # Always release the figure, even if reading or drawing failed,
        # so that looping over many files does not accumulate open figures.
        plt.close(fig)
    return imarray

In [10]:
def rgb2gray(rgb):
    """Collapse the colour channels of an image into one luminance value.

    Only the first three entries of the last axis are used, weighted
    0.299 / 0.587 / 0.114; any further channel (e.g. alpha) is ignored.
    The result has the input's shape without its last axis.
    """
    luma_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, luma_weights)

In [11]:
def normalize_gray(array):
    """Linearly rescale `array` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    array : numpy.ndarray
        Values to rescale (any shape).

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape. A constant input (max == min)
        yields all zeros rather than the NaNs a 0/0 division would produce.
    """
    lowest = array.min()
    span = array.max() - lowest
    shifted = array - lowest
    if span == 0:
        # Every element equals the minimum, so `shifted` is already all zeros;
        # multiplying by 1.0 only promotes it to the float dtype that the
        # division below would have produced.
        return shifted * 1.0
    return shifted / span

In [12]:
def _load_split(audio_dir, names):
    """Build the image tensor and label vector for the given wave file names."""
    images = np.zeros((len(names), imheight, imwidth))
    labels = np.zeros(len(names))
    for idx, name in enumerate(names):
        # The first character of the file name is parsed as the integer label.
        labels[idx] = int(name[0])
        spectrogram = graph_spectrogram(os.path.join(audio_dir, name))
        images[idx, :, :] = normalize_gray(rgb2gray(spectrogram))
    return images, labels


def create_train_test(audio_dir, seed=None):
    """Split the recordings in `audio_dir` into train/test spectrogram arrays.

    The `.wav` file names are sorted and cut into consecutive groups of
    `examples_per_speaker` files; from each of the
    `total_examples // examples_per_speaker` groups,
    `int(examples_per_speaker * tt_split)` files are drawn at random for the
    test set. Every remaining file goes to the training set.

    Parameters
    ----------
    audio_dir : str
        Directory containing the wave files.
    seed : int, optional
        When given, the test files are drawn from a private
        `random.Random(seed)` so the split is reproducible. When omitted the
        module-level `random` generator is used.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        Image arrays of shape (n, imheight, imwidth) holding normalised
        grayscale spectrograms, and label arrays of shape (n,).

    Raises
    ------
    ValueError
        From `random.sample` if a group holds fewer files than the number of
        test files requested per group.
    """
    file_names = sorted(f for f in os.listdir(audio_dir) if f.endswith('.wav'))
    rng = random.Random(seed) if seed is not None else random
    test_per_group = int(examples_per_speaker * tt_split)

    test_list = []
    for group in range(total_examples // examples_per_speaker):
        start = group * examples_per_speaker
        # Slice the whole group. Starting at `start + 1` would silently make
        # the first file of every group ineligible for the test set.
        candidates = file_names[start:start + examples_per_speaker]
        test_list.extend(rng.sample(candidates, test_per_group))

    # Set lookup keeps the filtering linear in the number of files.
    held_out = set(test_list)
    train_list = [name for name in file_names if name not in held_out]

    x_test, y_test = _load_split(audio_dir, test_list)
    x_train, y_train = _load_split(audio_dir, train_list)

    return x_train, y_train, x_test, y_test

In [13]:
# Render every recording under ./recordings/ to a spectrogram and split the
# results into training and test arrays.
x_train, y_train, x_test, y_test = create_train_test('./recordings/')

54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0


54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0


54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0
54.0 36.0


In [14]:
# One-hot encode the integer labels into vectors of length num_classes.
# NOTE(review): the names are rebound to the encoded arrays, so re-running this
# cell would encode the already-encoded labels again — restart and run all.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [15]:
# Add a trailing axis of size 1 so every image is (imheight, imwidth, 1),
# matching input_shape below.
x_train = x_train.reshape(x_train.shape[0], imheight, imwidth, 1)
x_test = x_test.reshape(x_test.shape[0], imheight, imwidth, 1)
input_shape = (imheight, imwidth, 1)
# Training hyper-parameters passed to model.fit later in the notebook.
batch_size = 4
epochs = 10

In [19]:
# Show the shape of the reshaped test tensor.
x_test.shape

(200, 36, 54, 1)

In [20]:
# Two stacked 3x3 convolutions and one 2x2 max-pool, followed by a dense
# classifier head; dropout is applied after the pooling and the hidden layer.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One softmax output per class.
    Dense(num_classes, activation='softmax'),
])

In [21]:
# Categorical cross-entropy pairs with the one-hot labels and softmax output.
# The optimizer is built from the canonical `Adam` class: the lowercase `adam`
# alias is not available in every Keras release.
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(), metrics=['accuracy'])

In [22]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 34, 52, 32)        320       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 50, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 16, 25, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16, 25, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 25600)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               3276928   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
__________

In [25]:
# Train the network with the batch size and epoch count configured above.
# NOTE(review): the test split is passed as validation_data, so it doubles as
# the validation set and no separate, untouched evaluation data remains.
# The returned History object is not stored; it only appears as the cell output.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(x_test, y_test))

Train on 1800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e04c442400>