In [1]:
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd
import random
from pydub import AudioSegment



In [2]:
# Dataset locations, relative to this notebook.
spotter_data = "../../spotter_data/"
background_data = spotter_data + "_background_noise_/"  # background-noise clips live under the dataset root

# Length of the background slice handed to the sample builders
# (used as a pydub slice length, which is expressed in milliseconds).
noise_len = 10000

# Feature dimensions -- not referenced in the visible cells;
# presumably the model's input shapes (TODO confirm against the model definition).
Tx_spectogram = 5511
Tx_mfcc = 216
n_freq = 101

# Length of the per-sample label vector built by make_sample_1.
Ty = 1375

In [3]:
# Load the two word tables; later cells read their 'path' and 'word' columns.
df_target, df_unknown = (
    pd.read_csv(csv_path)
    for csv_path in ('../../Target_words_dataframe', '../../Unknown_words_dataframe')
)

In [4]:
# Drop the 'Unnamed: 0' column (presumably the index that was written out with the CSV).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_target = df_target.drop(columns=['Unnamed: 0'], errors='ignore')
df_unknown = df_unknown.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# File names of the background-noise clips.
# Keep only .wav entries (these names are later passed to AudioSegment.from_wav, which
# would fail on anything else) and sort them, because os.listdir returns entries in
# arbitrary order.
background_dir = sorted(
    entry
    for entry in os.listdir(background_data)
    if entry.lower().endswith(".wav")
)

In [6]:
# Display the background-noise file names collected in the previous cell.
background_dir

['doing_the_dishes.wav',
 'dude_miaowing.wav',
 'exercise_bike.wav',
 'pink_noise.wav',
 'running_tap.wav',
 'white_noise.wav']

In [7]:
def pick_random_background(background_dir):
    """Return one element of ``background_dir`` picked uniformly at random.

    background_dir: non-empty sequence of candidates (the notebook passes the
    list of background-noise file names).
    """
    chosen = random.choice(background_dir)
    return chosen

In [8]:
def pick_random_target_word():
    """Return a random row position in the module-level ``df_target`` table.

    The result is an integer in ``[0, number_of_rows)``.
    """
    row_count = len(df_target.index)
    return random.choice(range(row_count))

In [9]:
def pick_random_unknown_word():
    """Return a random row position in the module-level ``df_unknown`` table.

    The result is an integer in ``[0, number_of_rows)``.
    """
    row_count = len(df_unknown.index)
    return random.choice(range(row_count))

In [10]:
def get_background_segment(audio_clip, noise_len):
    """Return a random contiguous slice of ``audio_clip`` that is ``noise_len`` long.

    audio_clip: any sliceable object with a length (the notebook passes a pydub
        AudioSegment, whose length and slice positions are in milliseconds).
    noise_len: length of the slice to cut, in the same unit as ``len(audio_clip)``.

    Returns ``audio_clip[start:start + noise_len]`` for a uniformly random valid start.

    Raises ValueError when ``noise_len`` is negative or longer than the clip. The
    previous retry loop never terminated when the clip was not strictly longer
    than ``noise_len``.
    """
    clip_len = len(audio_clip)

    if noise_len < 0 or noise_len > clip_len:
        raise ValueError(
            "noise_len (%s) must be between 0 and the clip length (%s)" % (noise_len, clip_len)
        )

    # Draw the start directly from the valid range instead of rejection sampling;
    # the last valid start (clip_len - noise_len) is included.
    start = random.randrange(clip_len - noise_len + 1)

    return audio_clip[start: start + noise_len]

In [11]:
def get_random_time_segment(background_len, audio_clip_len):
    """Pick a random position for a clip of ``audio_clip_len`` inside ``background_len``.

    Returns an inclusive ``(start, end)`` tuple with ``end = start + audio_clip_len - 1``
    and ``0 <= start <= background_len - audio_clip_len``, so the clip always fits.

    Raises ValueError when the clip is longer than the background. Previously a clip
    of exactly the background's length failed with an IndexError from an empty range,
    and the last valid start position could never be drawn.
    """
    latest_start = background_len - audio_clip_len

    if latest_start < 0:
        raise ValueError(
            "audio clip (%s) is longer than the background (%s)" % (audio_clip_len, background_len)
        )

    # randint is inclusive on both ends, so latest_start itself can be chosen.
    start_segment = random.randint(0, latest_start)

    return (start_segment, start_segment + audio_clip_len - 1)

In [12]:
def is_intersect(segments, new_segment):
    """Tell whether ``new_segment`` overlaps any segment in ``segments``.

    segments: iterable of ``(start, end)`` pairs with inclusive bounds.
    new_segment: the ``(start, end)`` pair to check.

    Returns True if at least one existing segment shares a position with
    ``new_segment`` (touching end points count as overlap), else False.
    """
    new_start, new_end = new_segment

    return any(
        new_start <= old_end and new_end >= old_start
        for old_start, old_end in segments
    )

In [13]:
def graph_spectogram(path):
    """Load the audio file at ``path`` and return its spectrogram array.

    The audio is loaded with librosa at 44100 Hz and passed to ``plt.specgram``
    with a 200-sample window and 120 samples of overlap. Only the first element
    of ``plt.specgram``'s result (the spectrum array) is returned.

    NOTE(review): ``Fs`` is given as 8000 although the audio is loaded at 44100 Hz;
    confirm this is intentional before changing it, since downstream shapes and
    scaling presumably depend on the current values.
    """
    window_len = 200      # NFFT: samples per spectrogram window
    sampling_freq = 8000  # Fs value handed to specgram
    window_overlap = 120  # samples shared by consecutive windows

    samples, _ = librosa.load(path, sr=44100)
    spectrum, _, _, _ = plt.specgram(
        samples, NFFT=window_len, Fs=sampling_freq, noverlap=window_overlap
    )

    return spectrum

In [14]:
def match_target_amplitude(sound, target_dBFS):
    """Return ``sound`` with gain applied so that its level equals ``target_dBFS``.

    sound: object exposing ``dBFS`` and ``apply_gain`` (a pydub AudioSegment here).
    target_dBFS: desired level.
    """
    gain_needed = target_dBFS - sound.dBFS
    adjusted = sound.apply_gain(gain_needed)
    return adjusted

In [15]:
def _overlay_random_words(clip, y, segments, pick_word_index, words_df, n_words):
    """Overlay ``n_words`` random, mutually non-overlapping word clips onto ``clip``.

    clip: background AudioSegment to overlay onto.
    y: label array of shape (1, Ty); updated in place.
    segments: list of already occupied (start, end) segments; extended in place.
    pick_word_index: zero-argument callable returning a row position in ``words_df``.
    words_df: table whose 'path' column holds the wav file of each word.
    n_words: number of words to place.

    Returns the clip with the words overlaid.
    """
    # overlay() keeps the base clip's length, so this does not change inside the loop.
    clip_len = len(clip)
    placed = 0

    while placed < n_words:

        word_index = pick_word_index()
        word_audio = AudioSegment.from_wav(words_df['path'][word_index])
        candidate = get_random_time_segment(clip_len, len(word_audio))

        if is_intersect(segments, candidate):
            continue  # collides with an earlier word: draw a new word and position

        placed += 1
        segments.append(candidate)

        start_x, end_x = candidate
        # Map the word's end position onto the label axis using the real clip length
        # (this was hard-coded to 10000.0, which is only right for a 10000-long clip).
        end_y = int(end_x * Ty / float(clip_len))

        # Mark the 50 label steps after the word ends; slicing clips at Ty.
        y[0, end_y + 1: end_y + 51] = 1

        clip = clip.overlay(word_audio, position=start_x)

    return clip


def make_sample_1(data_type, noise_len):
    """Build one synthetic sample: background noise with 3 target and 2 unknown words.

    data_type: "spectogram" or "mfcc" -- which feature representation to return.
    noise_len: length of the background slice to use (pydub slice length).

    Returns ``(features, y)`` where ``y`` has shape (1, Ty) and holds 1 for the
    50 steps following the end of every inserted word.

    Raises TypeError for an unknown ``data_type`` (now checked before any audio
    work is done).

    Side effect: the mixed audio is written to "train.wav" in the working directory.

    NOTE(review): unknown words are labelled 1 exactly like target words. That is
    the existing behaviour and is kept as is -- confirm it is intended.
    """
    if data_type not in ("spectogram", "mfcc"):
        raise TypeError("Unknown audio type")

    noise = pick_random_background(background_dir)

    data = AudioSegment.from_wav(background_data + noise)
    data = data - 20  # attenuate the background by 20 dB

    X = get_background_segment(data, noise_len)
    y = np.zeros((1, Ty))

    segments = []

    X = _overlay_random_words(X, y, segments, pick_random_target_word, df_target, 3)
    X = _overlay_random_words(X, y, segments, pick_random_unknown_word, df_unknown, 2)

    X = match_target_amplitude(X, -20.0)
    save_name = "train.wav"
    # export() returns the open output file handle; close it so it is not leaked.
    X.export(save_name, format="wav").close()

    if data_type == "spectogram":

        return graph_spectogram(save_name), y

    X, sample_rate = librosa.load(save_name, res_type='kaiser_fast',duration=10,sr=22050*2,offset=0.5)
    mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

    return mfcc, y

In [16]:
def make_batch_sample_1(count, data_type):
    """Generate ``count`` samples with make_sample_1 and stack them into batch arrays.

    count: number of samples to generate (must be at least 1).
    data_type: forwarded to make_sample_1 ("spectogram" or "mfcc").

    Each sample's feature and label arrays are transposed and stacked along a new
    leading batch axis, so the result is ``(X, Y)`` with ``X.shape[0] == Y.shape[0]
    == count``.

    The background length is the notebook-level ``noise_len`` constant.

    Raises ValueError when ``count`` is smaller than 1.
    """
    if count < 1:
        raise ValueError("count must be at least 1")

    features = []
    labels = []

    for _ in range(count):

        # The previous code called an undefined ``make_sample`` and passed no length.
        sample_x, sample_y = make_sample_1(data_type, noise_len)
        features.append(sample_x.T)
        labels.append(sample_y.T)

    # Stack once at the end instead of re-concatenating the growing batch every step.
    X = np.stack(features, axis=0)
    Y = np.stack(labels, axis=0)

    return X, Y

In [17]:
def make_sample_2(noise_len, type_word):
    """Build one single-word sample: background noise with one word overlaid.

    noise_len: length of the background slice to use (pydub slice length).
    type_word: "TARGET" to draw the word from df_target, "UNKNOWN" for df_unknown.

    Returns ``(mfcc, Y)`` where ``mfcc`` is the 13-coefficient MFCC matrix of the
    mixed audio and ``Y`` is the word's entry in the table's 'word' column.

    Raises TypeError for an unknown ``type_word`` (now checked before any audio
    work is done).

    Side effect: the mixed audio is written to "train.wav" in the working directory.

    NOTE(review): only the window from 0.5 s to 1.6 s of the saved file is loaded,
    while the word is placed at a random position within the whole background
    slice -- confirm ``noise_len`` is chosen so the word falls inside that window.
    """
    # Resolve the word source first so a bad argument fails immediately.
    if type_word == "TARGET":
        pick_word_index, words_df = pick_random_target_word, df_target
    elif type_word == "UNKNOWN":
        pick_word_index, words_df = pick_random_unknown_word, df_unknown
    else:
        raise TypeError("Unknown word type")

    noise = pick_random_background(background_dir)

    data = AudioSegment.from_wav(background_data + noise)
    data = data - 20  # attenuate the background by 20 dB

    X = get_background_segment(data, noise_len)

    word_index = pick_word_index()
    word_data = AudioSegment.from_wav(words_df['path'][word_index])
    start_x, _ = get_random_time_segment(len(X), len(word_data))

    X = X.overlay(word_data, position = start_x)
    Y = words_df['word'][word_index]

    X = match_target_amplitude(X, -20.0)
    save_name = "train.wav"
    # export() returns the open output file handle; close it so it is not leaked.
    X.export(save_name, format="wav").close()
    X, sample_rate = librosa.load(save_name, res_type='kaiser_fast',duration=1.1,sr=22050*2,offset=0.5)
    mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

    return mfcc, Y
    