In [17]:
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile
import librosa


from sklearn.model_selection import train_test_split

# Number of samples every clip is padded / chopped to.
L = 16000
# The twelve class names used by the classifier (ten command words plus
# the two catch-all classes).
legal_labels = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]

# Source / output folders, all relative to the working directory.
root_path = '..'
out_path = '.'
model_path = '.'
train_audio_path = '../../train/audio/'
test_audio_path = '../../test/audio/'

In [30]:
"""
Define utility functions to generate log spectrogram, mel spectrogram and mfcc which will serve as image representations of 
the audio files provided to us. 
We will use these different inputs to train our deep learning models. 
The expectation is that these different types of input may help us capture different aspects of the audio.
"""
def custom_fft(y, fs):
    """Return the single-sided amplitude spectrum of a 1-D signal.

    Parameters
    ----------
    y : array of samples; its length N is taken from ``y.shape[0]``.
    fs : sampling rate in Hz.

    Returns
    -------
    xf : frequency in Hz of each of the first ``N // 2`` FFT bins.
    vals : ``2 / N * |FFT(y)|`` for those same bins.
    """
    N = y.shape[0]
    half = N // 2
    yf = fft(y)
    # Bin k of an N-point FFT sits at k * fs / N.  (A linspace from 0 to fs/2
    # that *includes* its endpoint would stretch the axis by half/(half - 1).)
    xf = np.arange(half) * (fs / N)
    # For real input the spectrum is symmetric, so only the first half is kept.
    # The FFT is complex-valued; np.abs takes its magnitude (not the real part).
    vals = 2.0 / N * np.abs(yf[0:half])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram with a Hann window.

    Parameters
    ----------
    audio : 1-D array of samples.
    sample_rate : sampling rate of ``audio`` in Hz.
    window_size : analysis window length in milliseconds.
    step_size : hop between consecutive windows in milliseconds.
    eps : small constant added before the log to avoid ``log(0)``.

    Returns
    -------
    float32 array of shape (n_frames, n_freq_bins) holding
    ``log(power + eps)``.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    step = int(round(step_size * sample_rate / 1e3))
    # scipy expects the number of *overlapping* samples, not the hop length.
    # With the defaults (20 ms window, 10 ms step) both are 160 samples, so
    # the default output is unchanged; for any other ratio the hop is now
    # really `step_size` milliseconds.
    noverlap = nperseg - step
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    # Transpose so that time is the first axis.
    return np.log(spec.T.astype(np.float32) + eps)

def mel_specgram(samples, sample_rate):
    """Return a 128-band mel spectrogram of ``samples`` in decibels.

    Parameters
    ----------
    samples : 1-D audio time series.
    sample_rate : sampling rate of ``samples`` in Hz.

    Returns
    -------
    The mel power spectrogram converted to dB, using the peak power of the
    clip as the 0 dB reference.
    """
    # The audio must be passed as `y=`: librosa >= 0.10 made it keyword-only,
    # and the keyword form is also accepted by older releases.
    S = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=128)
    # Convert to log scale (dB). We'll use the peak power (max) as reference.
    log_S = librosa.power_to_db(S, ref=np.max)
    return log_S

def mfcc(samples,sample_rate):
    """Return the first 13 MFCCs of ``samples``.

    Parameters
    ----------
    samples : 1-D audio time series.
    sample_rate : sampling rate of ``samples`` in Hz.

    Returns
    -------
    The array produced by ``librosa.feature.mfcc`` with ``n_mfcc=13``,
    computed from a 128-band log-mel spectrogram (dB relative to the clip's
    peak power).
    """
    # The audio must be passed as `y=`: librosa >= 0.10 made it keyword-only,
    # and the keyword form is also accepted by older releases.
    S_spec = librosa.feature.melspectrogram(y=samples, sr=sample_rate, n_mels=128)
    log_S = librosa.power_to_db(S_spec, ref=np.max)
    # Named `coeffs` rather than `mfcc` so the local does not shadow this function.
    coeffs = librosa.feature.mfcc(S=log_S, n_mfcc=13)
    return coeffs

In [3]:
"""
Some extra utilities to deal with differences in the length of the audio files provided. pad_audio zero-pads clips that are
shorter than 1 second so that they are exactly 1 second long. chop_audio generates 1-second silence clips
from the long silence clips provided to us in the training set.
"""
def pad_audio(samples):
    """Left-pad ``samples`` with zeros up to the module-level length ``L``.

    Inputs that already have ``L`` samples or more are returned unchanged.
    """
    missing = L - len(samples)
    if missing <= 0:
        return samples
    # All padding goes in front of the signal; nothing is appended after it.
    return np.pad(samples, pad_width=(missing, 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=1000):
    """Yield ``num`` randomly positioned windows of ``L`` samples.

    Parameters
    ----------
    samples : sequence / array to cut windows from.
    L : window length in samples.
    num : number of windows to yield.

    Yields
    ------
    ``samples[beg:beg + L]`` for a uniformly drawn start ``beg``.

    Raises
    ------
    ValueError
        On first iteration, if ``samples`` is shorter than ``L``.
    """
    # Valid starts are 0 .. len - L inclusive.  np.random.randint excludes its
    # upper bound, so the bound must be len - L + 1; otherwise the last window
    # is never drawn and an input of exactly L samples raises.
    n_starts = len(samples) - L + 1
    if n_starts <= 0:
        raise ValueError(
            "samples has %d elements, fewer than the window length %d"
            % (len(samples), L))
    for _ in range(num):
        beg = np.random.randint(0, n_starts)
        yield samples[beg: beg + L]

def label_transform(labels):
    """One-hot encode folder labels after mapping them to the legal classes.

    '_background_noise_' becomes 'silence', any label missing from the
    module-level ``legal_labels`` becomes 'unknown', and everything else is
    kept as is.  Returns the ``pd.get_dummies`` frame of the mapped labels.
    """
    def _to_class(label):
        if label == '_background_noise_':
            return 'silence'
        return label if label in legal_labels else 'unknown'

    return pd.get_dummies(pd.Series([_to_class(label) for label in labels]))

In [4]:
path = train_audio_path
# Every entry of the training folder is treated as a candidate label.
subdir_list = list(os.listdir(path))
labels_list = subdir_list
target_word_list = 'yes no up down left right on off stop go'.split()
silence_list = ['_background_noise_']
# Whatever is neither a target word nor the background-noise folder is "unknown".
unknown_label_list = [
    label
    for label in labels_list
    if not (label in target_word_list or label in silence_list)
]

In [5]:
# Keep only the entries of the training folder that are directories.
subfolderlist = [
    entry
    for entry in os.listdir(train_audio_path)
    if os.path.isdir(train_audio_path + '/' + entry)
]

In [6]:
""" obtain a file map with path and label for training iterators """
# One [path, label] pair per .wav file, where the label is the name of the
# folder that contains the file.
file_target = []
for folder in subfolderlist:
    folder_path = train_audio_path + folder
    file_target.extend(
        [folder_path + '/' + fname, folder]
        for fname in os.listdir(folder_path)
        if '.wav' in fname
    )

train_file_map = pd.DataFrame(file_target, columns=['path', 'label'])

In [7]:
""" Create a dataframe containing the path to the audio files provided to us with the corresponding label """
# Preview the first rows of the path / folder-label map built above.
train_file_map.head()

Unnamed: 0,path,label
0,../../train/audio/bed/c245d3d7_nohash_0.wav,bed
1,../../train/audio/bed/a1a59b72_nohash_0.wav,bed
2,../../train/audio/bed/89f3ab7d_nohash_1.wav,bed
3,../../train/audio/bed/35c8fa78_nohash_1.wav,bed
4,../../train/audio/bed/1706c35f_nohash_0.wav,bed


In [8]:
# Add an empty-string placeholder column; the next cell fills it in.
train_file_map["nlabel"]=''    

In [9]:
# Map each folder label to its class name: target words keep their name,
# '_background_noise_' becomes 'silence', everything else 'unknown'.
# The column is assigned in one step because writing to the row objects
# yielded by DataFrame.iterrows() is not guaranteed to reach the frame
# (pandas may hand out copies); a silent failure there would leave the
# column blank and every clip would later be treated as 'unknown'.
train_file_map["nlabel"] = train_file_map["label"].map(
    lambda label: label if label in target_word_list
    else ('silence' if label == '_background_noise_' else 'unknown')
)

In [10]:
# Spot-check the nlabel column: folders that are not target words should read 'unknown'.
train_file_map.head()

Unnamed: 0,path,label,nlabel
0,../../train/audio/bed/c245d3d7_nohash_0.wav,bed,unknown
1,../../train/audio/bed/a1a59b72_nohash_0.wav,bed,unknown
2,../../train/audio/bed/89f3ab7d_nohash_1.wav,bed,unknown
3,../../train/audio/bed/35c8fa78_nohash_1.wav,bed,unknown
4,../../train/audio/bed/1706c35f_nohash_0.wav,bed,unknown


In [11]:
# Class names that keep their own identity; anything else is folded into 'unknown'.
targets_to_keep = 'yes no up down left right on off stop go silence'.split()
train_file_map['target'] = train_file_map['nlabel'].apply(
    lambda name: name if name in targets_to_keep else 'unknown')

# Class name -> integer id.
label_to_ix = dict(
    unknown=11,
    silence=0,
    down=1, go=2, left=3, no=4, off=5,
    on=6, right=7, stop=8, up=9, yes=10,
)
# Integer id -> class name (the exact inverse of label_to_ix).
ix_to_label = {ix: name for name, ix in label_to_ix.items()}

# From here on the 'label' column holds the integer id instead of the folder name.
train_file_map['label'] = train_file_map['target'].apply(label_to_ix.__getitem__)

In [12]:
# The helper column is no longer needed once 'target' has been derived from it.
train_file_map = train_file_map.drop("nlabel", axis="columns")

In [13]:
""" The map file now has numerical values associated with the labels """
# 'label' now holds the integer class id; 'target' keeps the class name.
train_file_map.tail()

Unnamed: 0,path,label,target
64722,../../train/audio/zero/2ad772d6_nohash_1.wav,11,unknown
64723,../../train/audio/zero/9ff2d2f4_nohash_0.wav,11,unknown
64724,../../train/audio/zero/30f31e42_nohash_0.wav,11,unknown
64725,../../train/audio/zero/37dca74f_nohash_1.wav,11,unknown
64726,../../train/audio/zero/0d393936_nohash_0.wav,11,unknown


In [14]:
# Persist the file map in the current working directory; index=True also
# writes the row index as the first CSV column.
train_file_map.to_csv("train_file_map_csv.csv", index=True)

In [15]:
# Sanity check: list the distinct class names present in the map.
train_file_map["target"].unique()

array(['unknown', 'right', 'on', 'stop', 'no', 'off', 'silence', 'up',
       'yes', 'go', 'left', 'down'], dtype=object)

In [21]:
""" Generating the log spectrogram input for the training data """
# Target sampling rate every clip is resampled to before the spectrogram.
new_sample_rate=16000
y_train = []
x_train = []
# NOTE(review): chop_audio draws its windows with np.random and no seed is set
# in the visible cells, so the clips cut from long files differ between runs.
for i in range(train_file_map.shape[0]):
    sample_rate, samples = wavfile.read(train_file_map["path"][i])
    # Left-pad clips shorter than L samples with zeros.
    samples = pad_audio(samples)
    # Files longer than 16000 samples are cut into random 16000-sample windows
    # (chop_audio yields 1000 of them by default); every window inherits the
    # label of its source file.
    # NOTE(review): padding / chopping counts raw samples *before* resampling,
    # so a file that is not already 16 kHz would produce clips of a different
    # length after resampling - confirm that all inputs are 16 kHz.
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
    # The loop variable rebinds `samples`; chop_audio was already handed the
    # full array as its argument, so the clips it yields are not affected.
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        y_train.append(train_file_map["label"][i])
        x_train.append(specgram)
x_train = np.array(x_train)
# Append a trailing axis of size 1 to the stacked spectrograms.
x_train = x_train.reshape(tuple(list(x_train.shape)+[1]))
y_train= np.array(y_train)



In [20]:
# Summary statistics of x_train, printed as one caption/value pair per line.
print(*(
    piece
    for caption, stat in (
        ('min: ', np.min),
        ('\nmax: ', np.max),
        ('\nmean: ', np.mean),
        ('\nmedian: ', np.median),
        ('\nvariance: ', np.var),
    )
    for piece in (caption, stat(x_train))
))

min:  -23.02585 
max:  16.476927 
mean:  -3.101561 
median:  -3.2602808 
variance:  36.329456


In [22]:
""" Saving the log spectrograms and the labels as numpy arrays """
np.save('../../train/x_train.npy', x_train)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
np.save('../../train/y_train.npy', y_train.astype(int))

In [None]:
""" Generating the Mel spectrogram input for the training data """
# Only referenced by the commented-out resampling line below; librosa.load
# is asked for 16 kHz directly via sr=16000.
new_sample_rate=16000
y_train = []
x_train = []
# NOTE(review): chop_audio draws its windows with np.random and no seed is set
# in the visible cells, so the clips cut from long files differ between runs.
for i in range(train_file_map.shape[0]):
    # Progress message every 500 files.
    if (i + 1) % 500 == 0:
        print(">>> Generating %ith spectrogram..." % (i + 1))
    samples,sample_rate=librosa.load(train_file_map["path"][i],sr=16000)
    # Left-pad clips shorter than L samples with zeros.
    samples = pad_audio(samples)
    # Files longer than 16000 samples are cut into random 16000-sample windows
    # (chop_audio yields 1000 of them by default); every window inherits the
    # label of its source file.
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
    # The loop variable rebinds `samples`; chop_audio was already handed the
    # full array as its argument, so the clips it yields are not affected.
    for samples in n_samples:
        #resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        specgram = mel_specgram(samples, sample_rate)
        y_train.append(train_file_map["label"][i])
        x_train.append(specgram)
x_train = np.array(x_train)
# Unlike the log-spectrogram cell, no trailing axis is added here (the reshape is disabled).
#x_train = x_train.reshape(tuple(list(x_train.shape)+[1]))
y_train= np.array(y_train)

In [None]:
""" Saving the mel spectrograms and the labels as numpy arrays """

np.save('../../train/x_train_melspec.npy', x_train)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
np.save('../../train/y_train_melspec.npy', y_train.astype(int))

In [31]:
""" Generating the MFCC representation for the training data """
# Only referenced by the commented-out resampling line below; librosa.load
# is asked for 16 kHz directly via sr=16000.
new_sample_rate=16000
# Both stay plain Python lists in this cell; a later cell converts them to arrays.
y_train = []
x_train = []
# NOTE(review): chop_audio draws its windows with np.random and no seed is set
# in the visible cells, so the clips cut from long files differ between runs.
for i in range(train_file_map.shape[0]):
    # Progress message every 500 files.
    if (i + 1) % 500 == 0:
        print(">>> Generating %ith spectrogram..." % (i + 1))
    samples,sample_rate=librosa.load(train_file_map["path"][i],sr=16000)
    # Left-pad clips shorter than L samples with zeros.
    samples = pad_audio(samples)
    # Files longer than 16000 samples are cut into random 16000-sample windows
    # (chop_audio yields 1000 of them by default); every window inherits the
    # label of its source file.
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
    # The loop variable rebinds `samples`; chop_audio was already handed the
    # full array as its argument, so the clips it yields are not affected.
    for samples in n_samples:
        #resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        
        y_train.append(train_file_map["label"][i])
        x_train.append(mfcc(samples, sample_rate))

>>> Generating 500th spectrogram...
>>> Generating 1000th spectrogram...
>>> Generating 1500th spectrogram...
>>> Generating 2000th spectrogram...
>>> Generating 2500th spectrogram...
>>> Generating 3000th spectrogram...
>>> Generating 3500th spectrogram...
>>> Generating 4000th spectrogram...
>>> Generating 4500th spectrogram...
>>> Generating 5000th spectrogram...
>>> Generating 5500th spectrogram...
>>> Generating 6000th spectrogram...
>>> Generating 6500th spectrogram...
>>> Generating 7000th spectrogram...
>>> Generating 7500th spectrogram...
>>> Generating 8000th spectrogram...
>>> Generating 8500th spectrogram...
>>> Generating 9000th spectrogram...
>>> Generating 9500th spectrogram...
>>> Generating 10000th spectrogram...
>>> Generating 10500th spectrogram...
>>> Generating 11000th spectrogram...
>>> Generating 11500th spectrogram...
>>> Generating 12000th spectrogram...
>>> Generating 12500th spectrogram...
>>> Generating 13000th spectrogram...
>>> Generating 13500th spectrogr

[array([[-6.64221802e+02, -6.55702881e+02, -6.52751038e+02,
         -6.49370850e+02, -6.49632202e+02, -6.55987122e+02,
         -6.64617188e+02, -6.72303101e+02, -6.74798950e+02,
         -6.68269226e+02, -6.72569946e+02, -6.76649414e+02,
         -6.76298401e+02, -6.64679993e+02, -6.45385498e+02,
         -6.34434998e+02, -5.94776733e+02, -4.38888489e+02,
         -3.59091248e+02, -3.41102753e+02, -3.52265137e+02,
         -3.62383331e+02, -3.61234100e+02, -3.65657898e+02,
         -3.79975342e+02, -4.08692535e+02, -4.53523193e+02,
         -5.18944397e+02, -5.18999329e+02, -5.24747986e+02,
         -5.65577454e+02, -6.04207947e+02],
        [ 6.88884888e+01,  7.48949738e+01,  7.48137970e+01,
          7.41489410e+01,  7.14021149e+01,  6.70969315e+01,
          6.23771210e+01,  5.95197754e+01,  5.66406136e+01,
          5.65572739e+01,  5.66603584e+01,  5.51428070e+01,
          5.77845001e+01,  7.23310165e+01,  8.70577698e+01,
          9.18888855e+01,  8.77468719e+01,  7.54693604e+

In [33]:
# First- and second-order delta features, one entry per MFCC matrix in x_train.
x_train_delta, x_train_delta2 = [], []
for instance in x_train:
    x_train_delta.append(librosa.feature.delta(instance))
    x_train_delta2.append(librosa.feature.delta(instance, order=2))

In [34]:
# Turn the four Python lists into numpy arrays in one go.
x_train, y_train, x_train_delta, x_train_delta2 = (
    np.array(collected)
    for collected in (x_train, y_train, x_train_delta, x_train_delta2)
)

In [35]:
""" Saving the MFCC and the labels as numpy arrays """

np.save('../../train/x_train_mfcc.npy', x_train)
np.save('../../train/x_train_mfcc_delta.npy', x_train_delta)
np.save('../../train/x_train_mfcc_delta2.npy', x_train_delta2)
# `np.int` was only an alias of the builtin `int` and was removed in
# NumPy 1.24; the builtin gives the same dtype on every NumPy version.
np.save('../../train/y_train_mfcc.npy', y_train.astype(int))

### We now have three different forms of input (in the form of numpy arrays). 
1. Log Spectrogram
2. Mel Spectrogram
3. MFCC Coefficients 

In [1]:
## Concludes Input Generation ## 