### Speech Command Classification - ETL - V2.0

Restructure data
Perform VAD and ensure all data is correct length


In [108]:
!ls data/processed

bed   dog    five  happy  marvin  off  right   silence	three  up   zero
bird  down   four  house  nine	  on   seven   six	tree   wow
cat   eight  go    left   no	  one  sheila  stop	two    yes


In [80]:
import numpy as np
import librosa
import librosa.display
import os
from os.path import isdir, join
import csv
%matplotlib inline
import matplotlib.pyplot as plt
import random
import soundfile as sf
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import webrtcvad
import IPython.display as ipd

create vars for directory with input data and for processed data

In [81]:
train_audio_path = "data/train/audio/"
processed_audio_path = "data/processed/"

!ls data/processed


bed   dog    five  happy  marvin  off  right   silence	three  up   zero
bird  down   four  house  nine	  on   seven   six	tree   wow
cat   eight  go    left   no	  one  sheila  stop	two    yes


Move the background noise data out of the audio folder

In [None]:
'''%%bash
mv data/train/audio/_background_noise_ data/train
ls data/train'''

Create silence dir and create audio files using bg_noise, splitting it into 1s 'chunks' for use in training

In [21]:
def create_silence():
    """
    reads wav files in background noises folder, 
    splits them and saves to silence folder in train_dir
    """
    for file in os.listdir('data/train/_background_noise_/'):
        if 'wav' in file:
            sig, rate = librosa.load('data/train/_background_noise_/' + file, sr = 16000)        
            sig_arr = split_arr(sig)
            if not os.path.exists(train_audio_path+'silence/'):
                os.makedirs(train_audio_path+'silence/')
            for ind, arr in enumerate(sig_arr):
                filename = 'frag%d' %ind + '_%s' %file # example: frag0_running_tap.wav
                librosa.output.write_wav(train_audio_path+'silence/'+filename, arr, 16000)
  

In [23]:
def split_arr(arr):
    """
    split an array into chunks of length 16000
    Returns:
        list of arrays
    """
    return np.split(arr, np.arange(16000, len(arr), 16000))

In [26]:
create_silence()

Create all_classes labels list

In [4]:
commands = 'yes no up down left right on off stop go silence unknown'.split()
folders = os.listdir(train_audio_path)
# put folders in same order as in the classes list, used when making sets
all_classes = [x for x in commands[:11]]
for ind, cl in enumerate(folders):
    if cl not in commands:
        all_classes.append(cl)
        
##all_classes.remove('processed')
print(all_classes)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'eight', 'one', 'happy', 'cat', 'sheila', 'three', 'nine', 'two', 'marvin', 'house', 'dog', 'bird', 'bed', 'tree', 'seven', 'wow', 'six', 'five', 'four', 'zero']


Process the audio files, applying VAD & padding any samples that are < 1 sec. Output them to new processed directory

In [86]:
  


dirs = [d for d in os.listdir(train_audio_path) if d in vad_classes]
for direct in dirs:
    waves = [f for f in os.listdir(join(train_audio_path, direct)) if f.endswith('.wav')]
    for wav in waves:
        sample_rate, samples = wavfile.read(join(train_audio_path, direct)+'/'+wav)
        if len(samples) < 16000: # pad shorter than 1 sec audio with ramp to zero
                samples = np.pad(samples, (0,16000-len(samples)), 'linear_ramp')
        vad_samples = doVad(samples, sample_rate)
        if len(vad_samples) < 16000: # pad shorter than 1 sec audio with ramp to zero
                vad_samples = np.pad(vad_samples, (0,16000-len(vad_samples)), 'linear_ramp')
        if not os.path.exists(processed_audio_path+ direct + '/'):
                os.makedirs(processed_audio_path+ direct + '/')
        wavfile.write(processed_audio_path+direct+'/'+wav, 16000, vad_samples)


Create the Vad_classes list from all_classes - remove silence as if silence is processed then VAD encounters an error

In [85]:
vad_classes = all_classes.copy()
vad_classes.remove("silence")

len(training_list)==(len(all_files_list))-(len(validation_list))
print(len(training_list))
print(len(validation_list))
print(len(all_files_list))
print(class_counts)


57938
6798
64736
{'eight': 2351, 'four': 2371, 'happy': 1741, 'go': 2371, 'off': 2356, 'marvin': 1745, 'right': 2366, 'sheila': 1733, 'seven': 2376, 'five': 2356, 'wow': 1744, 'dog': 1745, 'yes': 2376, 'down': 2358, 'bed': 1712, 'bird': 1730, 'house': 1749, 'cat': 1732, 'two': 2372, 'stop': 2379, 'nine': 2363, 'tree': 1732, 'up': 2374, 'left': 2352, 'on': 2366, 'zero': 2375, 'one': 2369, 'no': 2374, 'three': 2370, 'six': 2368}


In [96]:
!ls data/train/audio/processed
!ls data/train/audio
##!cp -avr data/train/audio/silence data/train/audio/processed


bed   dog    five  happy  marvin  off  right   silence	three  up   zero
bird  down   four  house  nine	  on   seven   six	tree   wow
cat   eight  go    left   no	  one  sheila  stop	two    yes
bed   dog    five  happy  marvin  off  processed  sheila   stop   two  yes
bird  down   four  house  nine	  on   right	  silence  three  up   zero
cat   eight  go    left   no	  one  seven	  six	   tree   wow


Check processed Audio is equal to init audio after VAD:

In [100]:
#!ls data/train/audio/processed/unknown
print(len(waves))
i=0
for f in os.listdir(join(processed_audio_path,direct)):
    i=i+1
print(str(i))
ipd.Audio(processed_audio_path+direct+'/'+waves[21])

2372
2372


Count all files for a class and create a list of validation and training files:

In [9]:
with open('data/train/validation_list.txt') as val_list:
    validation_list = [row[0] for row in csv.reader(val_list)]
assert len(validation_list) == 6798, 'file not loaded'

#add silence files to validation_list
for i, file in enumerate(os.listdir(processed_audio_path + 'silence/')):
    if i%10==0:
        validation_list.append('silence/'+file)

training_list = []
all_files_list = []
class_counts = {}

for direct in dirs:
    files = os.listdir(processed_audio_path + direct)
    for i, f in enumerate(files):
        all_files_list.append(direct + '/' + f)
        path = direct + '/' + f
        if path not in validation_list:
            training_list.append(direct + '/' + f)        
        class_counts[direct] = i

#remove filenames from validation_list that don't exist anymore (due to eda)
validation_list = list(set(validation_list).intersection(all_files_list))

The VAD Function:

In [84]:
##As a Function
def doVad(samples, sample_rate):
    
    vad = webrtcvad.Vad()

    # set aggressiveness from 0 to 3
    vad.set_mode(0)
    
    import struct
    raw_samples = struct.pack("%dh" % len(samples), *samples)
    
    window_duration = 0.03 # duration in seconds

    samples_per_window = int(window_duration * sample_rate + 0.5)

    bytes_per_sample = 2
    
    segments = []

    for start in np.arange(0, len(samples), samples_per_window):
        stop = min(start + samples_per_window, len(samples))
        
        is_speech = vad.is_speech(raw_samples[start * bytes_per_sample: stop * bytes_per_sample], 
                              sample_rate = sample_rate)

        segments.append(dict(
           start = start,
           stop = stop,
           is_speech = is_speech))
        
    if segments.count('is_speech')>0:
        speech_samples = np.concatenate([samples[segment['start']:segment['stop']] for segment in segments if segment['is_speech']])
    else:
        speech_samples = samples
    return speech_samples

In [272]:
print(waves[2200])
ipd.Audio('data/train/audio/processed/yes/'+waves[2000])

b4aa9fef_nohash_3.wav
