# Audio Recognition

In [9]:
#import pyaudio
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
import os
import numpy as np
import pandas as pd
import random
from skimage.measure import block_reduce

#To find the duration of wave file in seconds
import wave
import contextlib
import librosa
import soundfile as sf

#Keras imports
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
from keras.models import model_from_json

import time
import datetime

In [10]:
def save_model_to_disk(model, json_path="model.json", weights_path="model.h5"):
    """
    Persist a model as two files: its architecture (JSON) and its weights (HDF5).

    Parameters
    ----------
    model : object
        Anything exposing ``to_json()`` (returning a string) and
        ``save_weights(path)``, e.g. a Keras model.
    json_path : str, optional
        Destination of the JSON architecture file. Defaults to "model.json".
    weights_path : str, optional
        Destination of the weights file. Defaults to "model.h5".
        NOTE(review): newer Keras releases may insist on a ``.weights.h5``
        suffix here - confirm against the installed version.

    Returns
    -------
    None
    """
    # serialize architecture to JSON
    with open(json_path, "w") as json_file:
        json_file.write(model.to_json())
    # serialize weights to HDF5
    model.save_weights(weights_path)
    print("Saved model to disk")


def rgb2gray(rgb):
    """
    Collapse the colour channels of an image into a single gray channel.

    The first three entries along the last axis are combined as
    0.299*R + 0.587*G + 0.114*B; any further entries (e.g. alpha) are ignored.
    The result has the input's shape without its last axis.
    """
    luma_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, luma_weights)

def normalize_gray(array):
    """
    Min-max scale a grayscale image into the range [0, 1].

    Parameters
    ----------
    array : array-like
        Pixel intensities of any shape; must contain at least one element.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. The smallest input value maps to 0 and
        the largest to 1. A constant image (max == min) maps to all zeros
        instead of the NaNs a 0/0 division would produce.
    """
    # Work in float64 so that max - min cannot overflow for narrow integer dtypes.
    array = np.asarray(array, dtype=np.float64)
    lowest = array.min()
    span = array.max() - lowest
    if span == 0:
        # Flat image: there is no contrast to stretch.
        return np.zeros_like(array)
    return (array - lowest) / span

def findDuration(fname):
    """
    Function to find the duration of the wave file in seconds

    Parameters
    ----------
    fname : str or file-like
        Path to (or binary file object of) a wave file readable by the
        standard-library ``wave`` module.

    Returns
    -------
    float
        Number of audio frames divided by the frame rate.

    Notes
    -----
    The file is only read; nothing is written to disk. ``wave.open`` raises
    ``wave.Error`` when the file is not a wave file it understands.
    """
    # closing() guarantees the reader is released even if a getter raises.
    with contextlib.closing(wave.open(fname, 'rb')) as wav_reader:
        frames = wav_reader.getnframes()
        rate = wav_reader.getframerate()
    return frames / float(rate)
        
def graph_spectrogram(wav_file, nfft=512, noverlap=256):
    """
    Converts the wav file to a spectrogram for the NN to interpret

    The spectrogram is drawn on an off-screen, axis-free Matplotlib figure and
    the rendered pixels are returned.

    Parameters
    ----------
    wav_file : str or file-like
        Wave file readable by ``scipy.io.wavfile.read``.
    nfft : int, optional
        FFT window length passed to ``Axes.specgram`` as ``NFFT``.
    noverlap : int, optional
        Number of overlapping samples between windows.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape (height, width, 3) holding the RGB pixels of
        the rendered figure.
    """
    rate, data = wavfile.read(wav_file)
    if data.ndim > 1:
        # specgram only accepts a 1-D signal: average the channels down to mono.
        data = data.mean(axis=1)

    # Size the figure when it is created. Assigning plt.rcParams['figure.figsize']
    # afterwards only affects figures created later, so the first call used to
    # come out at the default size, and the global default was altered as a
    # side effect.
    fig, ax = plt.subplots(1, figsize=(0.75, 0.5))
    try:
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        ax.specgram(x=data, Fs=rate, noverlap=noverlap, NFFT=nfft)
        ax.axis('off')
        fig.canvas.draw()
        # buffer_rgba() replaces the deprecated tostring_rgb() and already
        # carries the (height, width, 4) shape of the rendered canvas.
        rgba = np.asarray(fig.canvas.buffer_rgba())
        # Drop alpha; copy so the pixels outlive the figure.
        imarray = rgba[..., :3].copy()
    finally:
        # Always release the figure so repeated calls do not accumulate memory.
        plt.close(fig)
    return imarray

## Building an Index of the Audio Files

In [11]:
# NOTE(review): presumably set to stop the OpenMP runtime from aborting when it
# is loaded twice - confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# One sub-folder of train/audio per spoken phrase. Entries whose name contains
# 'background' are skipped, and so is anything that is not a directory (stray
# files such as .DS_Store or a README would break the per-folder listing later).
folder_names = sorted(
    f for f in os.listdir('train/audio')
    if 'background' not in f and os.path.isdir(os.path.join('train/audio', f))
)

# Height and width of the reduced spectrogram images fed to the network.
imheight = 17
imwidth = 25
k = 0

In [12]:
# Read the official test / validation splits: one relative wav path per line.
# `with` closes both files (the validation file used to stay open), and
# splitlines() avoids the trailing empty entry and stray '\r' characters that
# split('\n') leaves behind.
with open("train/testing_list.txt", "r") as list_file:
    testing_list = list_file.read().splitlines()
with open("train/validation_list.txt", "r") as list_file:
    validation_list = list_file.read().splitlines()

In [13]:
# Sets give constant-time membership tests; looking every file up in the raw
# lists made this loop scale with (number of files) x (length of the lists).
testing_files = set(testing_list)
validation_files = set(validation_list)

master_file_list = []
master_target_list = []
train_val_test_list = []
for phrase in folder_names:
    for wav_name in os.listdir('train/audio/' + phrase):
        # Only genuine .wav files (a substring test would also accept e.g. 'a.wav.bak').
        if not wav_name.endswith('.wav'):
            continue
        # The split lists identify files as '<phrase>/<file name>'.
        relative_path = phrase + '/' + wav_name
        if relative_path in testing_files:
            split = 'test'
        elif relative_path in validation_files:
            split = 'val'
        else:
            split = 'train'
        train_val_test_list.append(split)
        master_target_list.append(phrase)
        master_file_list.append(relative_path)

# One row per audio file: its relative path, its label and the split it belongs to.
master_df = pd.DataFrame({
    'filename': master_file_list,
    'target': master_target_list,
    'train_val_test': train_val_test_list,
})
master_df.head()

Unnamed: 0,filename,target,train_val_test
0,bed/6a1908f8_nohash_0.wav,bed,train
1,bed/7ff085e1_nohash_0.wav,bed,train
2,bed/8ec6dab6_nohash_1.wav,bed,test
3,bed/2a89ad5c_nohash_0.wav,bed,val
4,bed/cfde27ba_nohash_1.wav,bed,test


## Creating the Keras Model

In [16]:
# One output unit per distinct target phrase.
num_classes = master_df['target'].nunique()
# Single-channel (grayscale) spectrogram images.
input_shape = (imheight, imwidth, 1)
batch_size = 4
epochs = 1

# Two convolution layers, max pooling, then a dense classifier with dropout
# after the pooling stage and after the hidden dense layer.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 15, 23, 32)        320       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 13, 21, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 10, 64)         0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 6, 10, 64)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3840)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               491648    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)              

In [21]:
# Compute a dB-scaled STFT magnitude for a couple of training clips.
# NOTE(review): the recorded run of this cell ended in NoBackendError, which
# presumably means librosa could not decode the file - verify that the wav
# files on disk are valid audio.
training_df = master_df[master_df['train_val_test'] == 'train'].copy()
sound_clips = []
for index, train_entry in training_df.iloc[2:4].iterrows():
    fpath = 'train/audio/' + train_entry['filename']
    x, sr = librosa.load(fpath, sr=22050)
    X = librosa.stft(x)
    Xdb = librosa.amplitude_to_db(np.abs(X))
    # Keep the result; previously it was overwritten on every iteration and
    # sound_clips stayed empty.
    sound_clips.append(Xdb)



NoBackendError: 

In [19]:
# Selectors for the three splits, kept for quick inspection:
# master_df[master_df['train_val_test'] == 'train']
# master_df[master_df['train_val_test'] == 'val']
# master_df[master_df['train_val_test'] == 'test']
training_df = master_df[master_df['train_val_test'] == 'train'].copy()
sound_clips = []
for index, train_entry in training_df.iloc[:2].iterrows():
    fpath = 'train/audio/' + train_entry['filename']
    # Alternative image-based features (currently disabled):
    # graph_spectrogram -> rgb2gray -> normalize_gray -> block_reduce(block_size=(3,3), func=np.mean)
    #
    # Read the clip once and compute its spectrogram directly. The previous
    # round trip through 'tmp.wav' wrote an undefined variable `t` and then
    # recomputed the same spectrogram.
    # NOTE(review): the recorded run failed inside wavfile.read because the file
    # did not start with a RIFF/RIFX header - check the audio files themselves.
    sample_rate, samples = wavfile.read(fpath)
    frequencies, times, spectogram = signal.spectrogram(samples, sample_rate)
    sound_clips.append(spectogram)

# Show the spectrogram of the last clip processed (nothing if none were selected).
if sound_clips:
    print(sound_clips[-1])

# training_df['audio'] = sound_clips
# training_df

ValueError: File format b'R\x9b\x8f\x8c' not understood. Only 'RIFF' and 'RIFX' supported.

In [None]:
# Fill the train / test arrays with reduced spectrogram images, one worksheet
# column per audio folder and one row per file.
# NOTE(review): this cell relies on `ws`, `main_directory`, `train_num`,
# `test_num`, `x_train`, `y_train`, `x_test` and `y_test`, none of which are
# defined in the visible part of the notebook - confirm they exist before running.
n_train = int(train_num)
n_test = int(test_num)
for i in range(len(folder_names)):
    # goes to the column for the respective audio folder
    folder_name = ws.cell(row=1, column=i+1).value
    for j in range(train_num+test_num):
        # iterates through the rows
        file_name = ws.cell(row=j+2, column=i+1).value
        # os.path.join instead of hard-coded "\\" so the path is valid on every OS.
        file_path = os.path.join(str(main_directory), str(folder_name), str(file_name))
        spectrogram = graph_spectrogram(file_path)
        graygram = rgb2gray(spectrogram)
        normgram = normalize_gray(graygram)
        # Skip images taller than 150 pixels.
        if normgram.shape[0] > 150:
            continue
        redgram = block_reduce(normgram, block_size = (3,3), func = np.mean)
        if j < n_train:
            # check to make sure the split is right
            # NOTE(review): rows are indexed by j only, so every folder i writes
            # to the same rows - confirm whether an offset per folder is intended.
            x_train[j,:,:] = redgram
            y_train[j] = k
            # end="\r" belongs to print(); it used to be passed to str.format(),
            # where it was silently ignored.
            print(folder_name + ": Training Data Progress = {:2.1%}".format(float(j+1) / n_train), end="\r")
        else:
            # NOTE(review): the test arrays are indexed with j, not j - train_num;
            # verify against how x_test / y_test are allocated.
            x_test[j,:,:] = redgram
            y_test[j] = k
            print(folder_name + ": Testing Data Progress = {:2.1%}".format(float(j+1-n_train) / n_test), end="\r")
        # NOTE(review): k grows by one per processed file and is used as the
        # label; a class label would presumably be the folder index i - confirm.
        if j == train_num:
            k = 0
        k = k + 1

In [None]:
# Must equal the number of units in the model's final softmax layer.
num_classes = 30

# One-hot encode the integer class labels: each label becomes a vector of
# length num_classes, as categorical cross-entropy expects.
y_train = keras.utils.to_categorical(y_train, num_classes)
# Add the trailing channel axis the Conv2D input layer expects.
x_train = x_train.reshape(x_train.shape[0], imheight, imwidth, 1)

y_test = keras.utils.to_categorical(y_test, num_classes)
# Reshape the test images themselves; this used to reshape x_train, which
# replaced the test set with training data (or failed on a size mismatch).
x_test = x_test.reshape(x_test.shape[0], imheight, imwidth, 1)

print("x and y training/testing data done and formatted, starting keras sequential model function now")

model.fit(x_train, y_train, batch_size=4, epochs=10, verbose=1, validation_data=(x_test, y_test))

save_model_to_disk(model)

In [None]:
# for i in range(int(len(folder_names))):
#     # goes to the column for the respective audio folder
#     folder_name = ws.cell(row=1, column=i+1).value
#     for j in range(train_num+test_num):
#         # iterates through the rows
#         file_name = ws.cell(row=j+2, column=i+1).value
#         file_path = str(main_directory) + "\\" + str(folder_name) + "\\" +str(file_name) 
#         print(str(j))
#         spectrogram = graph_spectrogram(file_path)
#         graygram = rgb2gray(spectrogram)
#         normgram = normalize_gray(graygram)
#         norm_shape = normgram.shape
#         if(norm_shape[0]>150):
#             continue
#         redgram = block_reduce(normgram, block_size = (3,3), func = np.mean)
#         if j < int(train_num):
#             # check to make sure the split is right
#             x_train[j,:,:] = redgram
#             y_train[j] = k
#             print(folder_name + ": Training Data Progress = {:2.1%}".format(float(j+1) / int(train_num), end="\r"))
#         else:
#             x_test[j,:,:] = redgram
#             y_test[j] = k
#             print(folder_name + ": Testing Data Progress = {:2.1%}".format(float(j+1-int(train_num)) / int(test_num), end="\r"))
#         if j == train_num:
#             k = 0
#         k = k + 1