# Trigger Word Detection (TWD) Part 1

In [1]:
import glob
import os
import random

import numpy as np

# Pydub for audio processing
from pydub import AudioSegment as AS
from pydub.playback import play



In [2]:
# DIRECTORY : audio --> multiple folders --> multiple .wav files
#
# glob gives no ordering guarantee, but later cells address entries by index
# (12, 29 and 0-28), so the order is fixed explicitly: names are sorted
# alphabetically and underscore-prefixed entries (e.g. _background_noise_)
# are placed last.
audio_dirs = sorted(
    glob.glob(os.path.join('audio', '*')),
    key=lambda path: (os.path.basename(path).startswith('_'), path),
)

# Map a running integer index (0, 1, 2, ...) to each path found under audio.
file_dict = dict(enumerate(audio_dirs))

In [3]:
# Display the index -> path mapping to check which index each entry received.
file_dict

{0: 'audio\\bed',
 1: 'audio\\bird',
 2: 'audio\\cat',
 3: 'audio\\dog',
 4: 'audio\\down',
 5: 'audio\\eight',
 6: 'audio\\five',
 7: 'audio\\four',
 8: 'audio\\go',
 9: 'audio\\happy',
 10: 'audio\\house',
 11: 'audio\\left',
 12: 'audio\\marvin',
 13: 'audio\\nine',
 14: 'audio\\no',
 15: 'audio\\off',
 16: 'audio\\on',
 17: 'audio\\one',
 18: 'audio\\right',
 19: 'audio\\seven',
 20: 'audio\\sheila',
 21: 'audio\\six',
 22: 'audio\\stop',
 23: 'audio\\three',
 24: 'audio\\tree',
 25: 'audio\\two',
 26: 'audio\\up',
 27: 'audio\\wow',
 28: 'audio\\yes',
 29: 'audio\\zero',
 30: 'audio\\_background_noise_'}

These are all the kinds of sounds. Inside each folder there are multiple wav files. I took the word 'marvin' to be the trigger word. For convenience I swapped 'marvin' (index 12) with the last word 'zero' (index 29), so that after the swap the negative words occupy the contiguous indices 0 - 28. <br><br>
**marvin**   (12 before the swap, 29 after)        => Positive word (triggers) <br>
**29 words** (0 - 11, 13 - 29 before the swap; 0 - 28 after)    => Negative words (do not trigger) <br>
**\_background\_noise\_** (30) => Background noise to replicate real-life situations

In [4]:
# Move the trigger word to index 29 so that indices 0-28 hold only negative words.
# The trigger is located by name instead of by a fixed index, which makes the
# cell idempotent: running it again when 'marvin' is already at 29 changes nothing
# (a blind 12 <-> 29 swap would move it back among the negatives).
TRIGGER_WORD = 'marvin'
TRIGGER_INDEX = 29

trigger_at = next(
    (idx for idx, path in file_dict.items() if os.path.basename(path) == TRIGGER_WORD),
    None,
)
if trigger_at is None:
    raise ValueError("No '" + TRIGGER_WORD + "' entry found in file_dict")

file_dict[trigger_at], file_dict[TRIGGER_INDEX] = file_dict[TRIGGER_INDEX], file_dict[trigger_at]

## About Data

The audio is sampled at 16,000 Hz. This means that <br>
1 second is represented by 16,000 integers (or) <br>
1 millisecond is represented by 16 integers. <br> <br>

One training input is a 10-second sample. Hence it will be 16,000 * 10 = 160,000 integers long. <br>
Similarly, the training output will be a sparse array of size 160,000. <br>

I will use a 10-second background noise clip on which 6 words will be overlaid: 3 positive words and 3 negative words.<br>
Note that in `merge`, on the line that assigns into `ty`, 1,600 zeros (100 ms of samples) are replaced with ones. These 1s are inserted right after the occurrence of a trigger word.

In [5]:
def merge(bg, ls1, ls2, samples_per_ms=16, label_ms=100):
    """Overlay positive and negative clips on a background and build its labels.

    The clips of ``ls1`` and ``ls2`` are overlaid on ``bg`` in a random order,
    at increasing and randomly spaced positions. For every clip taken from
    ``ls1`` the ``label_ms`` milliseconds that follow the end of the clip are
    marked with 1 in the label vector; everything else stays 0.

    Parameters
    ----------
    bg : pydub.AudioSegment
        Background audio. ``len(bg)`` is used as its duration in milliseconds.
    ls1 : list
        Trigger-word (positive) clips.
    ls2 : list
        Non-trigger (negative) clips.
    samples_per_ms : int, optional
        Number of label entries per millisecond (16 corresponds to 16 kHz).
    label_ms : int, optional
        Length, in milliseconds, of the run of 1s written after a positive clip.

    Returns
    -------
    bg_array : numpy.ndarray
        Samples of the mixed audio, from ``get_array_of_samples()``.
    ty : numpy.ndarray
        Integer label vector of length ``len(bg) * samples_per_ms``.
    bg : pydub.AudioSegment
        The mixed audio segment.
    """
    clips = ls1 + ls2
    n_pos = len(ls1)
    n_clips = len(clips)

    ty = np.zeros((len(bg) * samples_per_ms,), dtype=int)

    # Nothing to overlay: return the untouched background with all-zero labels.
    if n_clips == 0:
        return np.array(bg.get_array_of_samples()), ty, bg

    # Gaps are drawn as a percentage of the background length. The largest
    # gap is an equal share of the background per clip (16 % for 6 clips);
    # the smallest is 10 %, lowered when there are too many clips for that.
    max_gap = 100 // n_clips
    min_gap = min(10, max_gap)

    # Every clip index exactly once, in random order.
    order = random.sample(range(n_clips), n_clips)

    # loc1 -> position (ms) at which the next clip is overlaid
    loc1 = len(bg) * random.randint(0, max_gap) / 100

    for idx in order:
        clip = clips[idx]

        # Overlay the clip on the background
        bg = bg.overlay(clip, position=loc1)

        # Indices below n_pos come from ls1, i.e. they are trigger words:
        # mark the label_ms milliseconds right after the clip ends.
        # A slice reaching past the end of ty is clipped by numpy, so a
        # trigger word too close to the end gets a shortened (or no) label.
        if idx < n_pos:
            start = int(loc1 + len(clip)) * samples_per_ms
            stop = int(loc1 + len(clip) + label_ms) * samples_per_ms
            ty[start:stop] = 1

        # Advance to the position of the next clip
        loc1 = (len(bg) * random.randint(min_gap, max_gap) / 100) + loc1

    # Convert the final mix to an array once, after all overlays are done.
    # NOTE(review): tx and ty only have equal length when the audio is mono
    # with samples_per_ms * 1000 samples per second - confirm for new data.
    bg_array = np.array(bg.get_array_of_samples())

    return bg_array, ty, bg

In [6]:
tx_list = []
ty_list = []

pos_neg_count = 3  # 6 sounds = 3 positive + 3 negative
bg_time = 10  # Total background duration in seconds
n_samples = 100  # Number of training examples to generate

# Seed the generator so that Restart & Run All reproduces the same dataset.
random.seed(42)

# The wav files and the .npy arrays are written here; create it if missing.
os.makedirs('generated', exist_ok=True)

# List the candidate files once instead of globbing again for every clip.
background_files = glob.glob(os.path.join('audio', '_background_noise_', '*.wav'))
trigger_files = glob.glob(os.path.join('audio', 'marvin', '*.wav'))
if not background_files or not trigger_files:
    raise FileNotFoundError('No .wav files found for the background noise or the trigger word')

negative_files = {}  # file_dict index -> wav files in that folder, filled lazily

for samp in range(n_samples):
    pos = []
    neg = []

    rand = random.randint(7, 15)

    # Randomly select a background, keep bg_time seconds starting at `rand`
    # seconds, and subtract `rand` from the segment (pydub treats this as
    # lowering the gain by that many dB).
    bg = AS.from_wav(random.choice(background_files))[rand * 1000 : (rand + bg_time) * 1000] - rand

    # Randomly select positive (trigger) words
    for _ in range(pos_neg_count):
        pos.append(AS.from_wav(random.choice(trigger_files)))

    # Randomly select negative words: indices 0-28 of file_dict
    for _ in range(pos_neg_count):
        choice = random.randint(0, 28)
        if choice not in negative_files:
            negative_files[choice] = glob.glob(os.path.join(file_dict[choice], '*.wav'))
        neg.append(AS.from_wav(random.choice(negative_files[choice])))

    tx, ty, wav_file = merge(bg, pos, neg)

    tx_list.append(tx)
    ty_list.append(ty)
    wav_file.export(os.path.join('generated', 'inp' + str(samp) + '.wav'), format="wav")


In [7]:
# Stack the per-sample arrays into 2-D arrays with one row per generated sample.
trainy = np.stack(ty_list)
trainx = np.stack(tx_list)

# NOTE(review): the file names start with a space (' trainx.npy'). The original
# literal was 'generated\ trainx.npy', whose backslash-space is an invalid
# escape sequence. The resulting name is kept unchanged here because code that
# loads these files may rely on it - confirm before renaming.
np.save(os.path.join('generated', ' trainx.npy'), trainx)
np.save(os.path.join('generated', ' trainy.npy'), trainy)

print(trainx.shape,trainy.shape)

(100, 160000) (100, 160000)
