The feature extraction process is very loosely based on example code posted by Aaqib Saeed: http://aqibsaeed.github.io/2016-09-03-urban-sound-classification-part-1/

In [2]:
import glob
import os
import librosa
import numpy as np
import pandas as pd
from librosa.feature import melspectrogram

This code implements the feature extraction for environmental sound classification described in Salamon and Bello's paper (Salamon, Justin, and Juan Pablo Bello. "Deep convolutional neural networks and data augmentation for environmental sound classification." IEEE Signal Processing Letters 24.3 (2017): 279-283). The data set is converted into log-mel spectrograms with dimensions 128 × 128, as required in the paper.

In [3]:
def LOG_MEL_SPEC(parent_dir,sub_dirs):
    """Build fixed-length log-mel spectrograms and class labels for audio folds.

    Every ``*.wav`` file found in ``parent_dir/<sub_dir>`` is loaded at
    44.1 kHz and reduced to exactly 3 seconds: longer clips are truncated to
    their first 3 seconds, shorter clips are repeated end-to-end until they
    are long enough. The resulting clip is turned into a mel power
    spectrogram and converted to decibels.

    Parameters
    ----------
    parent_dir : str
        Directory that contains the fold sub-directories.
    sub_dirs : iterable of str
        Names of the sub-directories of ``parent_dir`` to scan.

    Returns
    -------
    tuple of numpy.ndarray
        ``(spectrograms, labels)``. ``spectrograms`` stacks one log-mel
        spectrogram per processed file; ``labels`` holds the integer found in
        the second dash-separated field of each file name, in the same order.
        Both arrays are empty when no file is found.

    Raises
    ------
    ValueError, IndexError
        If a file name has no integer second dash-separated field.

    Notes
    -----
    * Files are processed in sorted order within each sub-directory so the
      output does not depend on the directory order reported by the OS.
    * Files that decode to zero samples are skipped with a warning, because
      they cannot be padded by repetition.
    * NOTE(review): with librosa's default mel-band count and centred framing
      each spectrogram is expected to be 128 x 128, the size quoted in the
      notebook text -- confirm against the installed librosa version.
    """
    import warnings  # local import so this cell does not depend on another edit

    sample_rate = 44100
    three_sec_samples = 3 * sample_rate
    exten = "*.wav"

    labels = []
    log_mel_spectrogram = []
    for sub_dir in sub_dirs:
        pattern = os.path.join(parent_dir, sub_dir, exten)
        for filename in sorted(glob.glob(pattern)):
            # Take the label from the file name itself rather than from the
            # full path, so a parent directory containing "fold" or "-" cannot
            # corrupt the parsing.
            label = int(os.path.basename(filename).split('-')[1])

            f, sr = librosa.load(filename, sr=sample_rate)
            if len(f) == 0:
                warnings.warn("Skipping empty audio file: " + filename)
                continue

            # np.resize repeats the signal cyclically when it is too short and
            # truncates it when it is too long, i.e. "repeat-pad, then keep
            # the first 3 seconds" in a single step.
            clip = np.resize(f, three_sec_samples)

            # n_fft equals hop_length, so the analysis windows do not overlap.
            # The audio is passed by keyword because recent librosa releases
            # no longer accept it positionally.
            mel_power = melspectrogram(y=clip, sr=sr, n_fft=1034, hop_length=1034)
            log_mel_spectrogram.append(librosa.power_to_db(mel_power))
            # Append the label only after the features exist so both lists
            # stay aligned.
            labels.append(label)

    # The builtin int replaces the np.int alias that NumPy has removed.
    return np.array(log_mel_spectrogram), np.array(labels, dtype=int)


def encode(labels, num_classes=None):
    """One-hot encode a 1-D sequence of non-negative integer class ids.

    Parameters
    ----------
    labels : array-like of int
        Class id of every sample.
    num_classes : int, optional
        Number of columns of the result. Defaults to ``max(labels) + 1`` so
        that every id has a column even when some ids are absent from
        ``labels`` (for example when only a subset of folds is processed).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(labels), num_classes)`` in which row ``i``
        is 1 in column ``labels[i]`` and 0 elsewhere.

    Raises
    ------
    ValueError
        If ``labels`` is not one-dimensional, contains a negative id, or
        contains an id that does not fit into ``num_classes`` columns.
    """
    label_ids = np.asarray(labels, dtype=int)
    if label_ids.ndim != 1:
        raise ValueError("labels must be one-dimensional")

    labels_total = label_ids.shape[0]
    if labels_total and label_ids.min() < 0:
        raise ValueError("labels must be non-negative class ids")

    # Size the matrix from the largest id, not from the number of distinct
    # ids: counting distinct ids gives too few columns when an id is missing.
    required_columns = int(label_ids.max()) + 1 if labels_total else 0
    if num_classes is None:
        num_classes = required_columns
    elif num_classes < required_columns:
        raise ValueError("num_classes is smaller than the largest label + 1")

    one_hot_encoded = np.zeros((labels_total, num_classes))
    one_hot_encoded[np.arange(labels_total), label_ids] = 1
    return one_hot_encoded

def file_Creater(final_path,filename):
    """Make sure the output directory exists and return the path of a file in it.

    Parameters
    ----------
    final_path : str
        Output directory; a relative path is resolved against the current
        working directory when the directory is created.
    filename : str
        Name of the file that will be stored in ``final_path``.

    Returns
    -------
    str
        ``os.path.join(final_path, filename)``. The path is returned exactly
        as given (not made absolute) and the file itself is not created.

    Raises
    ------
    FileExistsError
        If ``final_path`` already exists but is not a directory.
    """
    new_path = os.path.join(os.getcwd(), final_path)
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the same directory in between.
    os.makedirs(new_path, exist_ok=True)
    return os.path.join(final_path, filename)
    

# Saving all folds in one file

We save all the folds to one file of features and one file of labels.

In [4]:
# Location of the raw UrbanSound8K audio and of the processed output arrays.
parent_directory = 'UrbanSound8K/audio'
final_dir = "UrbanSound8K/UrbanSound8K_Processed"

# Resolve both output paths first; file_Creater also creates the output folder.
feature_file = file_Creater(final_dir, 'allfolds_features_x.npy')
labels_file = file_Creater(final_dir, 'allfolds_labels_y.npy')

# All ten folds are extracted in one pass.
fold_names = [
    'fold1', 'fold2', 'fold3', 'fold4', 'fold5',
    'fold6', 'fold7', 'fold8', 'fold9', 'fold10',
]
features, labels = LOG_MEL_SPEC(parent_directory, fold_names)
labels_encoded = encode(labels)

# Write the features first, then the one-hot labels, reporting each file.
for target_path, payload in ((feature_file, features), (labels_file, labels_encoded)):
    np.save(target_path, payload)
    print("DONE... " + target_path)

DONE... UrbanSound8K/UrbanSound8K_Processed\allfolds_features_x.npy
DONE...UrbanSound8K/UrbanSound8K_Processed\allfolds_labels_y.npy
