Imports:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from brian2 import *
from brian2hears import *
import librosa
import librosa.display
from pydub import AudioSegment
import soundfile as sf

Convert Stereo Audio Files to Mono 16-bit PCM WAV and Pad to 4 Seconds:

In [None]:
def absoluteFilePaths(directory):
    """Recursively list the absolute path of every file below ``directory``.

    The tree is traversed with ``os.walk``. Each file path is made absolute
    with ``os.path.abspath`` and any ``'//'`` sequence in it is replaced by a
    single ``'/'``.

    Returns:
        list of str: one path per file, in ``os.walk`` traversal order.
    """
    return [
        os.path.abspath(os.path.join(root, name)).replace('//', '/')
        for root, _subdirs, names in os.walk(directory)
        for name in names
    ]

# Convert every clip under data/urbansound8k/ to a mono, 16-bit PCM WAV that is
# exactly 4 seconds long, mirroring the fold sub-directories under
# data/urbansound8k_mono/.
paths = absoluteFilePaths('data/urbansound8k/')

# Reuse the listing above instead of walking the directory tree a second time.
for path in paths:
    # The dataset tree may contain non-audio files (e.g. .DS_Store or metadata);
    # those cannot be opened as audio, so skip anything that is not a .wav file.
    if not path.lower().endswith('.wav'):
        continue

    fold = os.path.basename(os.path.dirname(path))
    slice_name = os.path.basename(path)

    # Query the file's own sample rate first so that librosa.load keeps it
    # rather than resampling to its default rate.
    sr = librosa.get_samplerate(path)
    y, sr = librosa.load(path, sr=sr, mono=True)

    # 4 seconds worth of samples at the native rate; fix_length brings the
    # signal to exactly this length.
    total_samples_with_pad = sr*4
    y = librosa.util.fix_length(y, size=total_samples_with_pad)

    # Make sure the destination fold directory exists before writing into it.
    export_dir = 'data/urbansound8k_mono/' + fold
    os.makedirs(export_dir, exist_ok=True)

    export_path = export_dir + '/' + slice_name
    print(export_path)
    sf.write(export_path, y, sr, 'PCM_16')

Cochleagram

In [None]:
def generate_cgram(filepath,cfN = 224, num_frames = 224):
    """Compute a gammatone cochleagram for the sound file at ``filepath``.

    The sound is passed through a ``Gammatone`` filterbank whose centre
    frequencies are ERB-spaced between 20 Hz and 20 kHz, half-wave rectified
    and cube-root compressed. The filterbank output is cut into 50%-overlapping
    frames, the absolute values inside each frame are summed, and the result is
    multiplied by a Hamming window along the frame (last) axis.

    Args:
        filepath: path of the sound file, handed to ``Sound``.
        cfN: number of centre frequencies requested from ``erbspace``.
        num_frames: number of frames kept along the last axis of the result.

    Returns:
        2-D array whose last axis has ``num_frames`` entries.
        NOTE(review): the first axis is presumably one row per centre
        frequency (``cfN``) - confirm against the brian2hears output layout.
    """
    sound = Sound(filepath)

    cf = erbspace(20*Hz,20000*Hz,cfN)
    gammatone = Gammatone(sound,cf)
    # Half-wave rectification followed by cube-root compression. np.inf is used
    # instead of the star-imported `Inf` alias, which NumPy 2.0 removed.
    cochlea = FunctionFilterbank(gammatone, lambda x: np.clip(x,0,np.inf)**(1/3))
    output = cochlea.process()

    N = np.shape(output)[0]

    # Frame size such that num_frames half-overlapping frames span N samples:
    # N = frame_length * (0.5*num_frames + 0.5)
    frame_length = int(N / (0.5*num_frames + 0.5))
    hop_length = int(frame_length/2)

    frames = librosa.util.frame(output.T,frame_length=frame_length,hop_length=hop_length)

    # Integer truncation above can leave a few surplus frames; keep the first
    # num_frames in one slice rather than dropping them one at a time.
    frames = frames[:,:,:num_frames]

    summed_energy = np.sum(np.absolute(frames),axis=1)
    # The small offset keeps every bin strictly positive before tapering.
    cochleagram_data = np.absolute(summed_energy + 1e-12)*np.hamming(num_frames)
    return cochleagram_data

Linear Gammachirp

In [None]:
def generate_LinGC(filepath,cfN=224,num_frames=224):
    """Compute a linear-gammachirp representation of the sound file at ``filepath``.

    The sound is passed through a ``LinearGammachirp`` filterbank (glide slope
    0, time constants spaced linearly from 3 ms down to 0.3 ms across the
    channels) with centre frequencies ERB-spaced between 20 Hz and 20 kHz. The
    output is half-wave rectified and cube-root compressed, cut into
    50%-overlapping frames, summed in absolute value inside each frame, and
    multiplied by a Hamming window along the frame (last) axis.

    Args:
        filepath: path of the sound file, handed to ``Sound``.
        cfN: number of centre frequencies (and time constants).
        num_frames: number of frames kept along the last axis of the result.

    Returns:
        2-D array whose last axis has ``num_frames`` entries.
        NOTE(review): the first axis is presumably one row per centre
        frequency (``cfN``) - confirm against the brian2hears output layout.
    """
    sound = Sound(filepath)

    #center frequencies with a spacing following an ERB scale
    center_frequencies = erbspace(20*Hz, 20000*Hz, cfN)

    c = 0 #glide slope
    time_constant = linspace(3, 0.3, cfN)*ms

    gamma_chirp = LinearGammachirp(sound, center_frequencies, time_constant, c)
    # Half-wave rectification followed by cube-root compression. np.inf is used
    # instead of the star-imported `Inf` alias, which NumPy 2.0 removed.
    gamma_chirp = FunctionFilterbank(gamma_chirp, lambda x: np.clip(x,0,np.inf)**(1/3))

    output = gamma_chirp.process()

    N = np.shape(output)[0]

    # Frame size such that num_frames half-overlapping frames span N samples:
    # N = frame_length * (0.5*num_frames + 0.5)
    frame_length = int(N / (0.5*num_frames + 0.5))
    hop_length = int(frame_length/2)

    frames = librosa.util.frame(output.T,frame_length=frame_length,hop_length=hop_length)

    # Integer truncation above can leave a few surplus frames; keep the first
    # num_frames in one slice rather than dropping them one at a time.
    frames = frames[:,:,:num_frames]

    summed_energy = np.sum(np.absolute(frames),axis=1)
    # The small offset keeps every bin strictly positive before tapering.
    LinGC_data = np.absolute(summed_energy + 1e-12)*np.hamming(num_frames)

    return LinGC_data

Logarithmic Gammachirp

In [None]:
def generate_LogGC(filepath, cfN = 224, num_frames = 224, c1 = -2.96, b1 = 1.81):
    """Compute a logarithmic-gammachirp representation of the sound file at ``filepath``.

    The sound is passed through a ``LogGammachirp`` filterbank with centre
    frequencies ERB-spaced between 20 Hz and 20 kHz. The output is half-wave
    rectified and cube-root compressed, cut into 50%-overlapping frames, summed
    in absolute value inside each frame, and multiplied by a Hamming window
    along the frame (last) axis.

    Args:
        filepath: path of the sound file, handed to ``Sound``.
        cfN: number of centre frequencies requested from ``erbspace``.
        num_frames: number of frames kept along the last axis of the result.
        c1: glide slope, passed to ``LogGammachirp`` as ``c``.
        b1: factor determining the time constant of the filters, passed to
            ``LogGammachirp`` as ``b``.

    Returns:
        2-D array whose last axis has ``num_frames`` entries.
        NOTE(review): the first axis is presumably one row per centre
        frequency (``cfN``) - confirm against the brian2hears output layout.
    """
    sound = Sound(filepath)

    cf = erbspace(20*Hz, 20000*Hz, cfN) # centre frequencies

    fb = LogGammachirp(sound, cf, c=c1, b=b1)
    # Half-wave rectification followed by cube-root compression. np.inf is used
    # instead of the star-imported `Inf` alias, which NumPy 2.0 removed.
    cochlea = FunctionFilterbank(fb, lambda x: np.clip(x,0,np.inf)**(1/3))
    output = cochlea.process()

    N = np.shape(output)[0]

    # Frame size such that num_frames half-overlapping frames span N samples:
    # N = frame_length * (0.5*num_frames + 0.5)
    frame_length = int(N / (0.5*num_frames + 0.5))
    hop_length = int(frame_length/2)

    frames = librosa.util.frame(output.T,frame_length=frame_length,hop_length=hop_length)

    # Integer truncation above can leave a few surplus frames; keep the first
    # num_frames in one slice rather than dropping them one at a time.
    frames = frames[:,:,:num_frames]

    summed_energy = np.sum(np.absolute(frames),axis=1)
    # The small offset keeps every bin strictly positive before tapering.
    LogGC_data = np.absolute(summed_energy + 1e-12)*np.hamming(num_frames)

    return LogGC_data

Approximate Gammatone Cochleagram

In [None]:
def generate_approxGT(filepath, cfN = 224, num_frames = 224):
    """Compute an approximate-gammatone representation of the sound file at ``filepath``.

    The sound is passed through a 4th-order ``ApproximateGammatone`` filterbank
    with centre frequencies ERB-spaced between 20 Hz and 20 kHz and bandwidths
    ``10**(0.037 + 0.785*log10(cf))`` (``cf`` in Hz). The output is half-wave
    rectified and cube-root compressed, cut into 50%-overlapping frames, summed
    in absolute value inside each frame, and multiplied by a Hamming window
    along the frame (last) axis.

    Args:
        filepath: path of the sound file, handed to ``Sound``.
        cfN: number of centre frequencies requested from ``erbspace``.
        num_frames: number of frames kept along the last axis of the result.

    Returns:
        2-D array whose last axis has ``num_frames`` entries.
        NOTE(review): the first axis is presumably one row per centre
        frequency (``cfN``) - confirm against the brian2hears output layout.
    """
    sound = Sound(filepath)

    cf = erbspace(20*Hz, 20000*Hz, cfN) # centre frequencies

    # Bandwidth of each filter, computed from its (unit-stripped) centre frequency.
    bw = 10**(0.037+0.785*log10(cf/Hz))

    fb = ApproximateGammatone(sound, cf, bw, order=4)
    # Half-wave rectification followed by cube-root compression. np.inf is used
    # instead of the star-imported `Inf` alias, which NumPy 2.0 removed.
    cochlea = FunctionFilterbank(fb, lambda x: np.clip(x,0,np.inf)**(1/3))
    output = cochlea.process()

    N = np.shape(output)[0]

    # Frame size such that num_frames half-overlapping frames span N samples:
    # N = frame_length * (0.5*num_frames + 0.5)
    frame_length = int(N / (0.5*num_frames + 0.5))
    hop_length = int(frame_length/2)

    frames = librosa.util.frame(output.T,frame_length=frame_length,hop_length=hop_length)

    # Integer truncation above can leave a few surplus frames; keep the first
    # num_frames in one slice rather than dropping them one at a time.
    frames = frames[:,:,:num_frames]

    summed_energy = np.sum(np.absolute(frames),axis=1)
    # The small offset keeps every bin strictly positive before tapering.
    approxGT_data = np.absolute(summed_energy + 1e-12)*np.hamming(num_frames)

    return approxGT_data

Example of each representation

In [None]:
# The 2x2 figure in the final paper is built from two separate 2x1 columns:
# matplotlib would not remove the excess horizontal spacing when it was done
# the intended way -> f, axarr = plt.subplots(2,2), whatever pad arguments
# were tried.

soundpath = "data/urbansound8k_mono/fold7/99812-1-2-0.wav"

cochleagram =  generate_cgram(soundpath)
lin_gc = generate_LinGC(soundpath)
log_gc = generate_LogGC(soundpath)
approx_gt = generate_approxGT(soundpath)


def _save_representation_column(panels, out_path):
    """Draw a vertical stack of titled images without axes and save it as one file.

    Args:
        panels: sequence of ``(title, image)`` pairs, drawn top to bottom.
        out_path: file name handed to ``Figure.savefig``.

    Returns:
        tuple: the ``(figure, axes)`` pair created by ``plt.subplots``.
    """
    fig, axes = plt.subplots(len(panels),1)
    # plt.subplots returns a bare Axes for a single panel; normalise to an array.
    axes = np.atleast_1d(axes)

    for ax, (title, image) in zip(axes, panels):
        ax.imshow(image, origin='lower', vmin=0)
        ax.set_title(title)
        ax.title.set_size(9)

    fig.subplots_adjust(hspace=0.0)
    fig.tight_layout()
    for ax in axes.flat:
        ax.axis('off')

    # NOTE(review): matplotlib documents pad_inches as applying only together
    # with bbox_inches='tight' - confirm whether that was intended here.
    fig.savefig(out_path, pad_inches=0, transparent=True)
    return fig, axes


# Left column of the paper figure.
f, axarr = _save_representation_column(
    [('Cochleagram', cochleagram), ('Linear Gammachirp', lin_gc)],
    'DataRepCol1.png')

# Right column of the paper figure.
g, axarr = _save_representation_column(
    [('Approximate Gammatone', approx_gt), ('Logarithmic Gammachirp', log_gc)],
    'DataRepCol2.png')

Blacklisted Data Points - Incorrect PCM Encoding

In [None]:
# Load the dataset metadata and list the slice files flagged as blacklisted
# (incorrect PCM encoding).
meta_df = pd.read_csv("data/UrbanSound8K.csv")
blacklist = [
    '19007-4-0-0.wav',
    '36429-2-0-6.wav',
    '36429-2-0-7.wav',
    '36429-2-0-13.wav',
    '36429-2-0-14.wav',
    '36429-2-0-15.wav',
    '36429-2-0-18.wav',
    '36429-2-0-23.wav',
    '88466-7-0-0.wav',
]
# Display the metadata rows that belong to the blacklisted slices.
meta_df.loc[meta_df['slice_file_name'].isin(blacklist)]

Metadata Preparation

In [None]:
# Read the metadata table and point at the directory holding the converted mono sounds.
meta_df = pd.read_csv("data/UrbanSound8K.csv")
file_path = 'data/urbansound8k_mono'

# classID (as a string) -> human-readable class name
class_map = {
    '0' : 'air_conditioner',
    '1' : 'car_horn',
    '2' : 'children_playing',
    '3' : 'dog_bark',
    '4' : 'drilling',
    '5' : 'engine_idling',
    '6' : 'gun_shot',
    '7' : 'jackhammer',
    '8' : 'siren',
    '9' : 'street_music',
}

# Drop every slice that belongs to a corrupt fsID - 8723 data points remain
meta_df = meta_df[~meta_df.fsID.isin([19007, 36429, 88466])]

Create Data Representations

In [None]:
# Representations are created one type per run because of the large runtime.
# Set REPRESENTATION to one of the keys of `generators` before running this
# cell; it selects both the generator and the name of the pickle written below.
# Previously the generator had to be chosen by un-commenting a line, so a fresh
# Run All failed on the undefined `data`.
REPRESENTATION = 'LogGC'

generators = {
    'cgram': lambda path: generate_cgram(path, cfN = 224, num_frames = 224),
    'approxGT': lambda path: generate_approxGT(path, cfN = 224, num_frames = 224),
    'LogGC': lambda path: generate_LogGC(path, cfN = 224, num_frames = 224, c1 = -2.96, b1 = 1.81),
    'LinGC': lambda path: generate_LinGC(path, cfN = 224, num_frames = 224),
}
generate = generators[REPRESENTATION]

features = []
count_records = len(meta_df.slice_file_name)

# j is the 1-based progress counter shown next to each file name.
for j, (index, row) in enumerate(meta_df.iterrows(), start=1):

    file_name = os.path.join(os.path.abspath(file_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    print(file_name + ' ' + str(j) + '/' + str(count_records))
    class_label = row["classID"]
    fold = row["fold"]
    data = generate(file_name)
    features.append([data, class_label, fold])

# Convert into a Pandas dataframe, save off into pickle file with label and fold.
# The file name follows the selected representation (e.g. 'LogGC_224_comp3.pkl')
# so a run of one type cannot be saved under another type's name.
featuresdf = pd.DataFrame(features, columns=['feature','class_label', 'fold'])
featuresdf.to_pickle(REPRESENTATION + '_224_comp3.pkl')