In [None]:
import os
import numpy as np
import keras
import librosa
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, Flatten, GlobalMaxPooling2D , GlobalMaxPooling1D, MaxPooling2D, MaxPooling1D, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPool2D, concatenate, Dense, Dropout
from keras.layers import  Conv2D, MaxPooling2D, UpSampling2D, Lambda, Reshape
from keras.layers import Input, GRU, RepeatVector, BatchNormalization, TimeDistributed, Conv1D
from keras.layers import GlobalAveragePooling1D, LSTM
from keras.optimizers import Adam
from tensorflow.python.keras.utils import to_categorical

from sklearn.metrics import classification_report,accuracy_score

from scipy.io import wavfile
from tqdm import tqdm
from pydub import AudioSegment


# Root folder of the training audio; the cells below read one sub-folder per
# label from it (path + '/' + label).
path = "data/train/audio"


In [None]:
# Twelve class names. Not referenced by any other cell in this section;
# presumably the target classes of the final task -- TODO confirm.
labels_actual = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]

In [None]:
# Sub-folder names that are turned into MFCC arrays: 30 spoken words plus
# 'silence' (which the preprocessing functions handle separately).
labels = (
    'right eight cat tree bed happy go silence '
    'dog no wow nine left stop three sheila one '
    'bird zero seven up marvin two '
    'house down six yes on five off four'
).split()

In [None]:
def wav2mfcc(file, n_fft=960, hop_length=320):
    """Load an audio file, force it to exactly one second and return its MFCCs.

    Parameters
    ----------
    file : str
        Path of the audio file. It is read at its native sampling rate.
    n_fft : int
        FFT window length (in samples) forwarded to ``librosa.feature.mfcc``.
    hop_length : int
        Hop between successive frames (in samples) forwarded to
        ``librosa.feature.mfcc``.

    Returns
    -------
    The value returned by ``librosa.feature.mfcc`` for 12 coefficients.
    """
    wave, sr = librosa.load(file, sr=None)
    # Every clip must have the same length so that the MFCC matrices can later
    # be stacked into a single numpy array: keep one second (``sr`` samples),
    # truncating longer clips and zero-padding shorter ones.
    if len(wave) > sr:
        wave = wave[:sr]
    else:
        wave = np.pad(wave, (0, sr - len(wave)), mode='constant')
    # Keyword arguments only: recent librosa releases reject a positional
    # signal. The caller's n_fft / hop_length and the file's real sampling
    # rate are forwarded instead of hard-coded literals.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=12,
                                n_fft=n_fft, hop_length=hop_length)

    return mfcc

## Data generation Without Data Augmentation 

In [None]:
# Split Background noise audio files into chunks of silence files

noise_dir = "data/train/audio/_background_noise_/"
silence_dir = "data/train/audio/silence/"
os.makedirs(silence_dir, exist_ok=True)  # wavfile.write does not create folders

# Keep only .wav files: this neither fails when '.DS_Store' is absent nor tries
# to parse other non-audio entries with AudioSegment.from_wav.
list_files = [f for f in os.listdir(noise_dir) if f.lower().endswith('.wav')]

time_steps = 200   # hop between two consecutive chunk starts, in milliseconds
chunk_ms = 1000    # chunk duration in milliseconds (pydub slices by ms, not by sample)
sr = 16000         # sample rate written to the chunk files
for file in list_files:
    wav = AudioSegment.from_wav(noise_dir + file)
    # len(wav) is the duration in milliseconds.
    for i in range(len(wav)//time_steps):
        start = i*time_steps
        chunk = wav[start:start + chunk_ms]
        if len(chunk) < 200:
            continue
        chunk = np.array(chunk.get_array_of_samples()).astype(np.int16)
        # The chunk index is appended to the source file's base name.
        wavfile.write(silence_dir + file.split('.')[0] + str(i) + ".wav", data=chunk, rate=sr)

In [None]:
## NO AUGMENTATION - Generate numpy files for every label except silence label
def preprocessing_data(path=path):
    """Compute MFCCs for every clip of every word label and save one .npy per label.

    For each entry of ``labels`` other than 'silence', every ``.wav`` file in
    ``path/<label>`` is converted with ``wav2mfcc``; the stacked result is
    written to ``data_np_mfcc/<label>.npy``.

    Parameters
    ----------
    path : str
        Folder containing one sub-folder per label.
    """
    out_dir = "data_np_mfcc"
    os.makedirs(out_dir, exist_ok=True)  # np.save does not create folders

    labels_ = [label for label in labels if label != 'silence']
    for data in labels_:
        filepath = path + '/' + data
        # Skip non-audio entries (e.g. '.DS_Store'), which would otherwise make
        # the audio loader fail.
        mfcc_vectors = np.array([
            wav2mfcc(filepath + '/' + fname)
            for fname in os.listdir(filepath)
            if fname.lower().endswith('.wav')
        ])
        print(mfcc_vectors.shape)
        np.save(out_dir + '/' + data + '.npy', mfcc_vectors)
        print(data + ".npy  filesaved")

In [None]:
# NO AUGMENTATION - Create MFCCs for silence chunks -
# `data` must be set here: it was previously read without ever being assigned,
# so the cell failed on a fresh kernel or silently used a stale value.
data = 'silence'
mfcc_vectors = []
filepath = path + '/' + data
print(filepath)
os.makedirs("data_np_mfcc", exist_ok=True)  # np.save does not create folders
for allfiles in os.listdir(filepath):
     if allfiles !='.DS_Store':
         # Load once just to detect empty files, which are skipped.
         wave, sr = librosa.load(filepath+ '/' +allfiles, mono=True, sr=None)
         if len(wave)>0:
             mfcc = wav2mfcc(filepath+ '/' +allfiles)
             mfcc_vectors.append(mfcc)
mfcc_vectors = np.array(mfcc_vectors)
np.save("data_np_mfcc" + '/' + data + '.npy', mfcc_vectors)
print(data + ".npy  filesaved")
# Reload the file to confirm what was actually written.
x = np.load("data_np_mfcc"+ '/' + data + ".npy")
print(x.shape)

In [None]:
# Build and save the un-augmented MFCC arrays for every label except 'silence'.
preprocessing_data()

## Data generation With Data Augmentation

In [None]:
def read_audio(filepath):
    """Return the samples of ``filepath`` cut or zero-padded to one second.

    The file is read at its native sampling rate; the result has exactly as
    many samples as that rate.
    """
    samples, rate = librosa.load(filepath, sr=None)
    if len(samples) > rate:
        # Longer than one second: keep the first second only.
        return samples[:rate]
    # One second or shorter: append zeros up to one second.
    missing = max(0, rate - len(samples))
    return np.pad(samples, (0, missing), "constant")

In [None]:
# NOTE(review): this cell re-defines read_audio exactly as the cell before it;
# one of the two definitions is redundant.
def read_audio(filepath):
    """Load ``filepath`` at its native rate and return exactly one second of audio."""
    wav, sr = librosa.load(filepath, sr = None)
    shortfall = sr - len(wav)
    # Negative shortfall means the clip is too long and is truncated;
    # otherwise zeros are appended (none when the length already matches).
    wav = wav[:sr] if shortfall < 0 else np.pad(wav, (0, shortfall), "constant")
    return wav

In [None]:
def normalize_audio(sound, target_dBFS):
    """Return ``sound`` with a gain applied so that its dBFS equals ``target_dBFS``.

    ``sound`` must expose a ``dBFS`` attribute and an ``apply_gain`` method.
    """
    # The gain to apply is the distance between the target and current level.
    return sound.apply_gain(target_dBFS - sound.dBFS)

In [None]:
def join_audio(audio1, audio2):
    """Return ``audio1 + audio2`` (the operands' own ``+`` defines the result)."""
    return audio1 + audio2

In [None]:
def stretch(wav, rate=1):
    """Time-stretch ``wav`` by ``rate`` and force the result to 16000 samples.

    Parameters
    ----------
    wav : array of audio samples.
    rate : stretch factor forwarded to ``librosa.effects.time_stretch``.

    Returns
    -------
    The stretched signal, truncated or zero-padded to exactly 16000 samples.
    """
    sr = 16000
    # `rate` is passed by keyword: recent librosa releases make it keyword-only,
    # and older releases accept the keyword as well.
    wav = librosa.effects.time_stretch(wav, rate=rate)
    if len(wav)>sr:
        wav = wav[:sr]
    else:
        wav = np.pad(wav, (0, max(0, sr - len(wav))), "constant")
    return wav

In [None]:
## Creating one big normalized background noise file

noise_dir = 'data/train/audio/_background_noise_/'

# Keep only .wav files (no failure when '.DS_Store' is absent, no attempt to
# parse other non-audio entries); sorted so the concatenation order is stable.
backgr_noise_files = sorted(file for file in os.listdir(noise_dir)
                            if file.lower().endswith('.wav'))
if not backgr_noise_files:
    raise FileNotFoundError("no .wav files found in " + noise_dir)

masterfile_noise = AudioSegment.from_wav(noise_dir + backgr_noise_files[0])
# Normalise the first file itself; `noise` is not defined at this point.
masterfile_noise = normalize_audio(masterfile_noise, -15)

# Append every remaining file after bringing it to the same -15 dBFS level.
for noise_file in backgr_noise_files[1:]:
    noise = AudioSegment.from_wav(noise_dir + noise_file)
    noise = normalize_audio(noise, -15)
    masterfile_noise = join_audio(masterfile_noise, noise)

masterfile_noise_wav = np.array(masterfile_noise.get_array_of_samples()).astype(np.int16)

In [None]:
# Creating chunks of silence files on normalized background noise :

silence_dir = "data/train/audio/silence/"
os.makedirs(silence_dir, exist_ok=True)  # wavfile.write does not create folders

wav = masterfile_noise
time_steps = 200   # hop between two consecutive chunk starts, in milliseconds
chunk_ms = 1000    # chunk duration in milliseconds (pydub slices by ms, not by sample)
sr = 16000         # sample rate written to the chunk files
# len(wav) is the duration in milliseconds.
for i in range(len(wav)//time_steps):
    start = i*time_steps
    chunk = wav[start:start + chunk_ms]
    if len(chunk)<200:
        continue
    chunk = np.array(chunk.get_array_of_samples()).astype(np.int16)
    wavfile.write(silence_dir+'masterfile_noise'+str(i)+".wav",data=chunk,rate=sr)

In [None]:
## AUGMENTATION - TRAIN DATA NUMPY ARRAYS CREATION
def preprocessing_data_augmentation(path=path):
    """Save, per word label, the MFCCs of every clip plus three augmented variants.

    For each entry of ``labels`` other than 'silence' and each ``.wav`` file in
    ``path/<label>``, four MFCC matrices are appended in this order: the
    original clip, a time-stretched copy (rate 0.8), a pitch-shifted copy
    (+4 steps) and a copy with added Gaussian noise (scale 0.005). The stacked
    result is written to ``data_np_mfcc_aug/<label>.npy``.

    Parameters
    ----------
    path : str
        Folder containing one sub-folder per label.
    """
    # Defined locally: the pitch shift previously read a global `sr` left
    # behind by whichever cell happened to run before this function.
    sample_rate = 16000
    out_dir = "data_np_mfcc_aug"
    os.makedirs(out_dir, exist_ok=True)  # np.save does not create folders

    def _mfcc(signal):
        # MFCC settings shared by the three augmented variants. The signal is
        # passed by keyword, as recent librosa releases require.
        return librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=12,
                                    n_fft=960, hop_length=320)

    labels_ = [label for label in labels if label != 'silence']
    for data in labels_:
        filepath = path + '/' + data
        mfcc_vectors = []
        for fname in os.listdir(filepath):
            # Skip non-audio entries (e.g. '.DS_Store').
            if not fname.lower().endswith('.wav'):
                continue
            clip_path = filepath + '/' + fname
            # 1) original clip
            mfcc_vectors.append(wav2mfcc(clip_path))
            wav = read_audio(clip_path)
            # 2) time stretch
            mfcc_vectors.append(_mfcc(stretch(wav, rate=0.8)))
            # 3) pitch shift
            wav_pitch_shift = librosa.effects.pitch_shift(wav, sr=sample_rate, n_steps=4)
            mfcc_vectors.append(_mfcc(wav_pitch_shift))
            # 4) additive white noise (unseeded, so it differs between runs)
            wav_white_noise = wav + 0.005*(np.random.randn(len(wav)))
            mfcc_vectors.append(_mfcc(wav_white_noise))
        mfcc_vectors = np.array(mfcc_vectors)

        np.save(out_dir + '/' + data + '.npy', mfcc_vectors)
        print(data + ".npy  filesaved")
        print(mfcc_vectors.shape)

In [None]:
# Build and save the augmented MFCC arrays for every label except 'silence'.
preprocessing_data_augmentation()

In [None]:
# DATA AUGMENTATION - Create MFCCs for silence chunks - silence files
# The silence clips get plain MFCCs (no augmentation is applied in this cell);
# the result is stored next to the augmented word arrays.
data = 'silence'
filepath = f"{path}/{data}"
print(filepath)

mfcc_vectors = []
for allfiles in os.listdir(filepath):
    if allfiles == '.DS_Store':
        continue
    clip_path = f"{filepath}/{allfiles}"
    print(clip_path)
    # Load once only to detect empty files, which are skipped.
    wave, sr = librosa.load(clip_path, mono=True, sr=None)
    if len(wave) == 0:
        continue
    mfcc = wav2mfcc(clip_path)
    print(mfcc.shape)
    mfcc_vectors.append(mfcc)

mfcc_vectors = np.array(mfcc_vectors)
npy_path = "data_np_mfcc_aug" + '/' + data + '.npy'
np.save(npy_path, mfcc_vectors)
print(data + ".npy  filesaved")
# Read the file back and show the shape of what was written.
x = np.load(npy_path)
print(x.shape)

## Create test numpy arrays

In [None]:
# Compute the MFCCs of every entry of the test folder, in os.listdir order.
# NOTE(review): the file-name list saved further down comes from a separate
# os.listdir call on the same folder; rows and names line up only if both
# calls return the same order -- confirm.
test_path = "data/test/audio/"
mfcc_test = []
for data in tqdm(os.listdir(test_path)):
    # test_path already ends with '/', so the path built here contains '//'.
    mfcc = wav2mfcc(test_path+'/'+data)
    mfcc_test.append(mfcc)

In [None]:
# Stack the per-file MFCC matrices into one array (rebinds mfcc_test from a
# list to an ndarray) and write it to disk.
mfcc_test = np.array(mfcc_test)

np.save('mfcc_test.npy', mfcc_test)

In [None]:
# Names of the entries of the test folder, in os.listdir order, saved as an array.
test_files = np.array(os.listdir('data/test/audio/'))

# The stray leading space that made this line an IndentationError is removed.
np.save('test_files.npy', test_files)