# Feature Engineering 

Import the necessary libraries/modules:

In [1]:
import numpy as np
import librosa
import librosa.display
import os
from os.path import isdir, join
import csv
%matplotlib inline
import matplotlib.pyplot as plt
import random
import soundfile as sf
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import webrtcvad
import IPython.display as ipd
from scipy import stats

In [54]:
# The code was removed by Watson Studio for sharing.

In [None]:
from ibm_botocore.client import Config
import ibm_boto3
cos = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=credentials_1['IBM_API_KEY_ID'],
    ibm_service_instance_id=credentials_1['IAM_SERVICE_ID'],
    ibm_auth_endpoint=credentials_1['IBM_AUTH_ENDPOINT'],
    config=Config(signature_version='oauth'),
    endpoint_url=credentials_1['ENDPOINT'])

Set up variables for labels, directories and the supplied validation and training lists to split dataset

In [None]:
!ls data/train/audio

In [48]:
classes = ['yes', 'no', 
           'up', 'down', 
           'left', 'right', 
           'on', 'off', 
           'stop', 'go', 
           'unknown']

folders = os.listdir('./data/processed/')
# put folders in same order as in the classes list, used when making sets
all_classes = [x for x in classes[:11]]
for ind, cl in enumerate(folders):
    if cl not in classes:
        all_classes.append(cl)
print(all_classes)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'unknown', 'eight', 'one', 'silence', 'happy', 'cat', 'sheila', 'three', 'nine', 'two', 'marvin', 'house', 'dog', 'bird', 'bed', 'tree', 'seven', 'wow', 'six', 'five', 'four', 'zero']


In [46]:
processed_audio_path = 'data/processed/'
train_audio_path = 'data/train/audio/'
dirs = [d for d in os.listdir(train_audio_path) if d in all_classes]

In [49]:
with open('data/train/validation_list.txt') as val_list:
    validation_list = [row[0] for row in csv.reader(val_list)]
assert len(validation_list) == 6798, 'file not loaded'

#add silence files to validation_list
for i, file in enumerate(os.listdir(processed_audio_path + 'silence/')):
    if i%10==0:
        validation_list.append('silence/'+file)

training_list = []
all_files_list = []
class_counts = {}

for direct in dirs:
    files = os.listdir(processed_audio_path + direct)
    for i, f in enumerate(files):
        all_files_list.append(direct + '/' + f)
        path = direct + '/' + f
        if path not in validation_list:
            training_list.append(direct + '/' + f)        
        class_counts[direct] = i

#remove filenames from validation_list that don't exist anymore (due to eda)
validation_list = list(set(validation_list).intersection(all_files_list))

## Feature Creation

In order for our models to process the data we have to extract some information from the wav files. The two best ways to do this that I found were using Spectrograms (essentially a STFT of the signal) and FFTs of the signal. 

## Spectrogram
For spectrograms, mel-power, log-scaled spectrograms yielded the best results. Raw log-specs of the signal were also tested but performed more poorly.

In [None]:
def make_spec(file, file_dir = processed_audio_path, flip = False, ps = False, st = 4):
    """
    create a melspectrogram from the amplitude of the sound
    
    Args:
        file (str): filename
        file_dir (str): directory path
        flip (bool): reverse time axis
        ps (bool): pitch shift
        st (int): half-note steps for pitch shift
    Returns:
        np.array with shape (122,85) (time, freq)
    """
    sig, rate = sf.read(file_dir + file)
    if len(sig) < 16000: # pad shorter than 1 sec audio with ramp to zero
        sig = np.pad(sig, (0,16000-len(sig)), 'linear_ramp')
    if ps:
        sig = librosa.effects.pitch_shift(sig, rate, st)
    D = librosa.amplitude_to_db(librosa.stft(sig[:16000], n_fft = 512, 
                                             hop_length = 128, 
                                             center = False), ref = np.max)
    S = librosa.feature.melspectrogram(S=D, n_mels = 85).T
    if flip:
        S = np.flipud(S)
    return S.astype(np.float32)

In [7]:
'''def make_logspec(file, file_dir = processed_audio_path, flip = False, ps = False, st = 4):
   
    sig, rate = sf.read(file_dir + file)
    if len(sig) < 16000: # pad shorter than 1 sec audio with ramp to zero
        sig = np.pad(sig, (0,16000-len(sig)), 'linear_ramp')
    if ps:
        sig = librosa.effects.pitch_shift(sig, rate, st)
    D = librosa.amplitude_to_db(librosa.stft(sig[:16000], n_fft = 512, 
                                             hop_length = 128, 
                                             center = False), ref = np.max).T
    #S = librosa.feature.melspectrogram(S=D, n_mels = 85).T
    if flip:
        S = np.flipud(S)
    return D.astype(np.float32)'''

In [None]:
def create_sets(file_list = training_list):
    X_array = np.zeros([len(file_list),122,85])
    Y_array = np.zeros([len(file_list)])    
    for ind, file in enumerate(file_list):
        if ind%2000 == 0:
            print(ind, file)    
        try:
            X_array[ind] = make_spec(file)
        except (ValueError):
            print(ind, file, ValueError)
        Y_array[ind] = classes.index(file.rsplit('/')[0])
        
    return X_array, Y_array   

In [6]:
'''def create_sets_log(file_list = training_list):
    X_array = np.zeros([len(file_list),122,257])
    Y_array = np.zeros([len(file_list)])    
    for ind, file in enumerate(file_list):
        if ind%2000 == 0:
            print(ind, file) 

       # try:    
        X_array[ind] = make_logspec(file)
       # except ValueError:
           # print(ind, file, ValueError)
        Y_array[ind] = all_classes.index(file.rsplit('/')[0])
        
    return X_array, Y_array  '''

Create and save the training and validation sets of spectrograms:

In [8]:
X_train, Y_train_all = create_sets()

0 no/8830e17f_nohash_1.wav




2000 no/28497c5b_nohash_1.wav
4000 eight/6af4aa07_nohash_1.wav
6000 one/4c4d2526_nohash_1.wav
8000 up/0137b3f4_nohash_4.wav
10000 happy/48a8a69d_nohash_0.wav
12000 off/f8f60f59_nohash_4.wav
14000 on/1a673010_nohash_0.wav
16000 cat/8f811bbc_nohash_0.wav
18000 stop/4c77947d_nohash_1.wav
20000 three/3389305e_nohash_0.wav
22000 yes/190821dc_nohash_2.wav
24000 yes/61d3e51e_nohash_0.wav
26000 nine/3bc21161_nohash_2.wav
28000 down/23abe1c9_nohash_0.wav
30000 two/14587ff0_nohash_0.wav
32000 marvin/a84dee7b_nohash_1.wav
34000 dog/d103dd6e_nohash_0.wav
38000 bed/f174517e_nohash_1.wav
40000 left/c71e3acc_nohash_0.wav
42000 right/2da58b32_nohash_2.wav
44000 right/652b3da7_nohash_0.wav
46000 seven/e53139ad_nohash_3.wav
48000 six/f4386675_nohash_1.wav
50000 five/caf9fceb_nohash_0.wav
52000 four/66276b0e_nohash_1.wav
54000 four/66aa0f29_nohash_0.wav
56000 zero/aba19127_nohash_1.wav
58000 go/f19c1390_nohash_1.wav


In [None]:
Y_train = np.where(Y_train_all < 11, Y_train_all, 11)

In [27]:
X_train.shape

(58299, 122, 257)

In [36]:

np.save('data/X_train_log2.npy', np.expand_dims(X_train, -1)+1.3)
np.save('data/Y_train_log.npy', Y_train.astype(np.int))
np.save('data/Y_train_all_log.npy', Y_train_all.astype(np.int))

In [None]:

np.save('data/X_train_mfcc_norm.npy', np.expand_dims(X_train_mfcc, -1)+1.3)
np.save('data/Y_train_mfcc_norm.npy', Y_train_mfcc_sorted.astype(np.int))
np.save('data/Y_train_mfcc_all_norm.npy', Y_train_mfcc.astype(np.int))

In [None]:
X_val, Y_val_all = create_sets(file_list = validation_list)

In [40]:
Y_val_log = np.where(Y_val_log_all < 11, Y_val_log_all, 11)

In [41]:
np.save('data/X_val_log.npy', np.expand_dims(X_val_log, -1)+1.3)
np.save('data/Y_val_log.npy', Y_val_log.astype(np.int))
np.save('data/Y_val_log_all.npy', Y_val_log_all.astype(np.int))

## FFT
FFTs were neccessary for inputs into our non-deep MLP and SVM model given that they only take 2D matrices as input. 

In [None]:
def make_fft(file, file_dir = processed_audio_path):
   

    y, fs = librosa.load(file_dir + file, sr = 16000)
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    vals = 2.0/N * np.abs(yf[0:N//2])  # FFT is symmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    return vals.astype(np.float32)
    

In [None]:
def create_ffts(file_list = training_list):
    X_array = np.zeros([len(file_list),8000])
    Y_array = np.zeros([len(file_list)])    
    for ind, file in enumerate(file_list):
        if ind%2000 == 0:
            print(ind, file)    
        #try:
        X_array[ind] = make_fft(file)
       # except ValueError:
        #    print(ind, file, ValueError)'''
        Y_array[ind] = all_classes.index(file.rsplit('/')[0])
        
    return X_array, Y_array     

In [None]:
X_train_fft, Y_train_all_fft = create_ffts()

In [None]:
np.save('data/X_train_fft.npy', X_train_fft)
np.save('data/Y_train_fft.npy', Y_train_fft.astype(np.int))
np.save('data/Y_train_all_fft.npy', Y_train_all_fft.astype(np.int))

In [None]:
X_val_fft, Y_val_all_fft = create_ffts(file_list = validation_list)

In [None]:
Y_val_fft = np.where(Y_val_all_fft < 11, Y_val_all_fft, 11)

In [None]:

np.save('data/X_val_fft.npy', X_val_fft)
np.save('data/Y_val_fft.npy', Y_val_fft.astype(np.int))
np.save('data/Y_val_all_fft.npy', Y_val_all_fft.astype(np.int))

## Feature Transformation

Normalised sets were created for both spectrograms and ffts where each set was normalised agaisnst the mean and sd for every word individually. These also performed poorly and thus were not included in this final notebook

## Iterative Steps (Additional Feature for Evaluation Step)

MFCC spectrograms were created to test if they improved accuracy in our network. However these again yielded poorer results as you'll see in the Evaluation stage.

In [52]:
def create_sets_mfcc(file_list = training_list):
    X_array = np.zeros([len(file_list),32,13])
    Y_array = np.zeros([len(file_list)])    
    for ind, file in enumerate(file_list):
        if ind%2000 == 0:
            print(ind, file)  
        try:    
            X_array[ind] = make_mfccspec(file)
        except ValueError:
            print(ind, file, ValueError)
        Y_array[ind] = all_classes.index(file.rsplit('/')[0])
        
    return X_array, Y_array 

In [51]:
def make_mfccspec(file, file_dir = processed_audio_path, flip = False, ps = False, st = 4):
    """
    create a mfcc spectrogram using a pre-existing log-mel spec
    
    Args:
        file (str): filename
        file_dir (str): directory path
        flip (bool): reverse time axis
        ps (bool): pitch shift
        st (int): half-note steps for pitch shift
    Returns:
        np.array with shape (122,85) (time, freq)
    """
    sig, rate = sf.read(file_dir + file)
    if len(sig) < 16000: # pad shorter than 1 sec audio with ramp to zero
        sig = np.pad(sig, (0,16000-len(sig)), 'linear_ramp')
    
    mfcc = librosa.feature.mfcc(sig, sr = 16000, n_mfcc=13).T

    # Let's pad on the first and second deltas while we're at it
    delta2_mfcc = librosa.feature.delta(mfcc, order=2, mode='nearest')

    return delta2_mfcc.astype(np.float32)

In [53]:
X_train_mfcc, Y_train_mfcc_all = create_sets_mfcc()


0 no/8830e17f_nohash_1.wav
2000 no/28497c5b_nohash_1.wav
4000 eight/6af4aa07_nohash_1.wav
6000 one/4c4d2526_nohash_1.wav
8000 up/0137b3f4_nohash_4.wav
10000 happy/48a8a69d_nohash_0.wav
12000 off/f8f60f59_nohash_4.wav
14000 on/1a673010_nohash_0.wav
16000 cat/8f811bbc_nohash_0.wav
18000 stop/4c77947d_nohash_1.wav
20000 three/3389305e_nohash_0.wav
22000 yes/190821dc_nohash_2.wav
24000 yes/61d3e51e_nohash_0.wav
26000 nine/3bc21161_nohash_2.wav
28000 down/23abe1c9_nohash_0.wav
30000 two/14587ff0_nohash_0.wav
32000 marvin/a84dee7b_nohash_1.wav
34000 dog/d103dd6e_nohash_0.wav
36000 bird/e0344f60_nohash_1.wav
38000 bed/f174517e_nohash_1.wav
40000 left/c71e3acc_nohash_0.wav
42000 right/2da58b32_nohash_2.wav
44000 right/652b3da7_nohash_0.wav
46000 seven/e53139ad_nohash_3.wav
48000 six/f4386675_nohash_1.wav
50000 five/caf9fceb_nohash_0.wav
52000 four/66276b0e_nohash_1.wav
54000 four/66aa0f29_nohash_0.wav
56000 zero/aba19127_nohash_1.wav
58000 go/f19c1390_nohash_1.wav


In [56]:
Y_train_mfcc = np.where(Y_train_mfcc_all < 11, Y_train_mfcc_all, 11)

In [58]:
np.save('data/X_train_mfcc.npy', np.expand_dims(X_train_mfcc, -1)+1.3)
np.save('data/Y_train_mfcc.npy', Y_train_mfcc.astype(np.int))
np.save('data/Y_train_mfcc_all.npy', Y_train_mfcc_all.astype(np.int))

In [59]:
X_val_mfcc, Y_val_mfcc_all = create_sets_mfcc(file_list = validation_list)


0 bed/f84762e5_nohash_0.wav
2000 three/a6d586b7_nohash_4.wav
4000 bird/8910e5ef_nohash_0.wav
6000 go/471a0925_nohash_4.wav


In [60]:
Y_val_mfcc = np.where(Y_val_mfcc_all < 11, Y_val_mfcc_all, 11)

In [61]:
np.save('data/X_val_mfcc.npy', np.expand_dims(X_val_mfcc, -1)+1.3)
np.save('data/Y_val_mfcc.npy', Y_val_mfcc.astype(np.int))
np.save('data/Y_val_mfcc_all.npy', Y_val_mfcc_all.astype(np.int))

Save all training and validation sets to IBM COS.

In [None]:
# The code was removed by Watson Studio for sharing.