In [1]:
# basic library imports
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import random
from tqdm import tqdm

# pandas display settings: show every column, keep wide frames on a single
# line, never truncate cell text, and allow up to 5000 rows to be rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None means "no width limit"; the legacy -1 sentinel has been deprecated
# since pandas 1.0 in favour of None
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 5000

# encoding the classes
from sklearn.preprocessing import LabelEncoder

# librosa for audio feature extraction
import librosa
import gc
import pickle
import random
from multiprocessing import Pool
from PIL import Image
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# plotly libraries
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly_express as px


# keras components for constructing the model
import tensorflow as tf
from tensorflow.keras import optimizers, losses, activations, models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler
from tensorflow.keras.layers import (Dense, Input, Dropout, Convolution2D, 
MaxPool2D, GlobalMaxPool2D, GlobalAveragePooling2D, concatenate)
from tensorflow.keras.applications.xception import Xception

In [2]:
# clip geometry and generator batch size
# sampling_freq * duration gives the number of samples in one fixed-length clip
sampling_freq, duration = 16000, 4
input_length = duration * sampling_freq
batch_size = 32

### Input target mapping

In [3]:
def input_to_target(base_data_path = 'data/'):
    """Pair every training clip with its class label.

    Globs ``<base_data_path>/train/Train/*.wav``, derives an integer ``ID``
    from each file name (the text before the first dot), inner-joins the
    result with ``<base_data_path>/train.csv`` on ``ID`` and adds an integer
    encoding of the ``Class`` column.

    Parameters
    ----------
    base_data_path : str
        Root data directory; a trailing separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per clip that has both a wav file and a row in ``train.csv``,
        with columns ``train_file_paths``, ``ID``, the remaining columns of
        ``train.csv`` (``Class`` is required) and ``class_int_encode``.
    """
    # os.path keeps path handling independent of the separator and of whether
    # base_data_path ends with one
    train_glob = os.path.join(base_data_path, "train", "Train", "*.wav")
    train_label_path = os.path.join(base_data_path, "train.csv")

    # input: one row per wav file, keyed by the numeric file stem
    train_files = pd.DataFrame({'train_file_paths': glob.glob(train_glob)})
    train_files['ID'] = train_files['train_file_paths'].apply(
        lambda path: os.path.basename(path).split('.')[0])
    train_files['ID'] = train_files['ID'].astype(int)
    train_files = train_files.sort_values(by='ID')

    # target: labels from the csv, joined on the shared ID
    train_labels = pd.read_csv(train_label_path)
    train_file_to_label = train_files.merge(train_labels, on="ID", how='inner')

    # encoding the classes as integers
    int_encode = LabelEncoder()
    train_file_to_label['class_int_encode'] = int_encode.fit_transform(train_file_to_label['Class'])

    return train_file_to_label


def audio_normalization(data):
    """Min-max scale ``data`` and shift the result down by 0.5.

    The value range ``max - min`` is padded with 0.0001 before dividing, so a
    constant input does not divide by zero (it maps to -0.5 everywhere).
    """
    lowest = np.min(data)
    highest = np.max(data)
    scaled = (data - lowest) / (highest - lowest + 0.0001)
    return scaled - 0.5

# feature-extraction settings; input_length and batch_size repeat the values
# assigned in the configuration cell
n_mels = 320
batch_size = 32
input_length = 4 * 16000

def mel_spectrum_db(audio, sample_rate=16000, window_size=20, #log_specgram
                 step_size=10, eps=1e-10):
    """Return the rescaled, transposed log-mel spectrogram of ``audio``.

    The mel power spectrogram is computed with the module-level ``n_mels``
    bands, converted to dB relative to its own maximum, and then mapped
    through ``(dB + 40) / 40``.

    ``window_size``, ``step_size`` and ``eps`` are accepted but not used by
    the body.

    Returns the spectrogram transposed (``.T``) relative to what librosa
    returns.
    """
    power_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)

    # dB relative to the loudest bin, then shift and scale by 40
    log_spec = librosa.power_to_db(power_spec, ref=np.max)
    log_spec = (log_spec + 40) / 40

    return log_spec.T


def stretch(data, rate=1):
    """Time-stretch ``data`` by ``rate`` and force it back to ``input_length``.

    Parameters
    ----------
    data : 1-D audio signal.
    rate : stretch factor handed to ``librosa.effects.time_stretch``.

    Returns
    -------
    The stretched signal, truncated or right-padded with zeros so that it has
    exactly ``input_length`` (module-level) samples.
    """
    # `rate` must be passed by keyword: current librosa declares it keyword-only
    data = librosa.effects.time_stretch(data, rate=rate)

    if len(data) > input_length:
        return data[:input_length]

    # here len(data) <= input_length, so the pad width is never negative
    return np.pad(data, (0, input_length - len(data)), "constant")

def pitch_shift(data, n_steps=3.0, sr=16000):
    """Pitch-shift ``data`` by ``n_steps`` and force it back to ``input_length``.

    Parameters
    ----------
    data : 1-D audio signal.
    n_steps : shift amount handed to ``librosa.effects.pitch_shift``.
    sr : sampling rate of ``data``; defaults to the 16000 used when the clips
        are loaded.

    Returns
    -------
    The shifted signal, truncated or right-padded with zeros so that it has
    exactly ``input_length`` (module-level) samples.
    """
    # `sr` is the sampling rate, not the clip length in samples that was
    # passed here before
    data = librosa.effects.pitch_shift(data, sr=sr, n_steps=n_steps)

    if len(data) > input_length:
        return data[:input_length]

    # here len(data) <= input_length, so the pad width is never negative
    return np.pad(data, (0, input_length - len(data)), "constant")




def loguniform(low=0.00000001, high=0.01):
    """Draw one value whose logarithm is uniform between log(low) and log(high)."""
    log_low, log_high = np.log(low), np.log(high)
    exponent = np.random.uniform(log_low, log_high)
    return np.exp(exponent)


def white(N, state=None):
    """
    White noise.
    :param N: Amount of samples.
    :param state: State of PRNG; a fresh, unseeded one is created when omitted.
    :type state: :class:`np.random.RandomState`
    :returns: Array of ``N`` samples drawn with ``state.randn``.
    White noise has a constant power density. Its narrowband spectrum is therefore flat.
    The power in white noise will increase by a factor of two for each octave band,
    and therefore increases with 3 dB per octave.
    """
    if state is None:
        state = np.random.RandomState()
    return state.randn(N)

def augment(data):
    """Randomly perturb an audio signal.

    Three independent draws from U(0, 1) are made in order; each
    transformation is applied only when its draw exceeds 0.95:

    1. additive white noise scaled by a log-uniform amplitude,
    2. time stretch by a factor drawn from U(0.9, 1.1),
    3. pitch shift by a step count drawn from U(-6, 6).

    Returns the (possibly unchanged) signal.
    """
    add_noise = np.random.uniform(0, 1) > 0.95
    if add_noise:
        noise_level = loguniform()
        data = data + noise_level * white(len(data))

    do_stretch = np.random.uniform(0, 1) > 0.95
    if do_stretch:
        factor = np.random.uniform(0.9, 1.1)
        data = stretch(data, factor)

    do_shift = np.random.uniform(0, 1) > 0.95
    if do_shift:
        steps = np.random.uniform(-6, 6)
        data = pitch_shift(data, n_steps=steps)

    return data

def load_audio_file(file_path, input_length=input_length):
    """Load one clip and turn it into a fixed-size log-mel spectrogram.

    The file is read at 16000 Hz. Clips longer than ``input_length`` are
    cropped to a random window; shorter clips are zero-padded with the signal
    placed at a random position. The result is passed through ``augment`` and
    ``mel_spectrum_db``.

    Parameters
    ----------
    file_path : path of the audio file.
    input_length : target number of samples before augmentation. Note that
        ``stretch`` / ``pitch_shift`` (reached through ``augment``) resize to
        the module-level ``input_length``, not to this argument.

    Returns
    -------
    Whatever ``mel_spectrum_db`` returns for the prepared signal.
    """
    data = librosa.load(file_path, sr=16000)[0]
    n_samples = len(data)

    # randint's upper bound is exclusive; the +1 makes the last valid offset
    # reachable and also covers the equal-length case (offset is then 0)
    offset = np.random.randint(abs(n_samples - input_length) + 1)

    if n_samples > input_length:
        # random crop
        data = data[offset:offset + input_length]
    else:
        # random placement inside a zero-padded buffer
        data = np.pad(data, (offset, input_length - n_samples - offset), "constant")

    data = augment(data)
    return mel_spectrum_db(data)

def chunker(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items."""
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def generator(file_paths, target_labels, batch_size=32):
    """Endlessly yield ``(batch_data, batch_labels)`` pairs.

    Every pass reshuffles files and labels together, walks them in chunks of
    ``batch_size`` and loads each file with ``load_audio_file``. The stacked
    features get one extra trailing axis before being yielded.
    """
    while True:
        file_paths, target_labels = shuffle(file_paths, target_labels)

        file_batches = chunker(file_paths, size=batch_size)
        label_batches = chunker(target_labels, size=batch_size)

        for batch_files, batch_labels in zip(file_batches, label_batches):
            features = np.array([load_audio_file(fpath) for fpath in batch_files])
            # append the trailing axis
            yield features[:, :, :, np.newaxis], batch_labels

In [5]:
# build the file-to-label table from the default data directory and preview
# its first rows
train_file_to_label = input_to_target()
train_file_to_label.head()


Unnamed: 0,train_file_paths,ID,Class,class_int_encode
0,data/train/Train/0.wav,0,siren,8
1,data/train/Train/1.wav,1,street_music,9
2,data/train/Train/2.wav,2,drilling,4
3,data/train/Train/3.wav,3,siren,8
4,data/train/Train/4.wav,4,dog_bark,3


### 2-d conv

In [54]:
def build_2d_model(input_shape = (126, 320, 1), nclass = 10):
    """Build and compile an Xception-based classifier.

    The network is an untrained Xception backbone (``weights=None``,
    ``include_top=False``) followed by global max pooling, dropout (0.1), a
    128-unit ReLU layer and an ``nclass``-way softmax. It is compiled with
    Adam, sparse categorical cross-entropy and the ``acc`` metric, and its
    summary is printed.

    Parameters
    ----------
    input_shape : shape of one input sample.
    nclass : number of output classes.

    Returns
    -------
    The compiled Keras model.
    """
    input_wave = Input(shape=input_shape)

    # the backbone must follow the requested shape; it used to be pinned to
    # (126, 320, 1) regardless of the argument
    xception = Xception(input_shape=input_shape, weights=None, include_top=False)

    x = xception(input_wave)
    x = GlobalMaxPool2D()(x)
    x = Dropout(rate=0.1)(x)

    x = Dense(128, activation=activations.relu)(x)
    x = Dense(nclass, activation=activations.softmax)(x)

    model = models.Model(inputs=input_wave, outputs=x)
    opt = optimizers.Adam()

    # sparse loss: targets are integer class codes, not one-hot vectors
    model.compile(optimizer=opt, loss=losses.sparse_categorical_crossentropy, metrics=['acc'])
    model.summary()
    return model

### Generator

In [55]:
def chunker(seq, size):
    """Split ``seq`` into successive slices of length ``size`` (the last may be shorter).

    Same helper as the ``chunker`` defined in an earlier cell; returns a lazy
    generator of slices.
    """
    total = len(seq)
    return (seq[begin:begin + size] for begin in range(0, total, size))

def generator(file_paths, target_labels, batch_size=batch_size):
    """Infinite batch generator for Keras training and validation.

    Re-declares the ``generator`` from an earlier cell, with the default batch
    size taken from the module-level ``batch_size``. On every pass the files
    and labels are shuffled jointly and then served chunk by chunk as
    ``(features, labels)``, where the features carry one extra trailing axis.
    """
    while True:
        file_paths, target_labels = shuffle(file_paths, target_labels)

        paired_chunks = zip(chunker(file_paths, size=batch_size),
                            chunker(target_labels, size=batch_size))

        for chunk_files, chunk_labels in paired_chunks:
            stacked = np.array(list(map(load_audio_file, chunk_files)))
            # append the trailing axis
            stacked = stacked[:, :, :, np.newaxis]

            yield stacked, chunk_labels

### Train

In [56]:
def train_audio(train_files, train_labels,
                val_files, val_labels, input_shape = (126,320,1),nclass = 10, epochs = 20):
    """Build the 2-D model and train it from the batch generators.

    Parameters
    ----------
    train_files, train_labels : training file paths and their integer labels.
    val_files, val_labels : validation file paths and their integer labels.
    input_shape : shape of one model input sample.
    nclass : number of output classes.
    epochs : maximum number of epochs (early stopping may end training sooner).

    Returns
    -------
    The trained Keras model. The model with the best ``val_acc`` is also
    written to ``models/baseline_cnn.h5`` by the checkpoint callback.
    """
    # use this function's own arguments instead of a hard-coded shape and the
    # notebook-level ``n_class`` variable
    model = build_2d_model(input_shape=input_shape, nclass=nclass)

    checkpoint_path = "models/baseline_cnn.h5"
    # the checkpoint callback needs the target directory to exist
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    model.fit_generator(generator(train_files, train_labels),
                        steps_per_epoch=len(train_files)//batch_size, epochs=epochs,

                        validation_data=generator(val_files, val_labels),
                        validation_steps=len(val_files)//batch_size,
                        use_multiprocessing=True, max_queue_size=1,
                        callbacks=[ModelCheckpoint(checkpoint_path,
                                                   monitor="val_acc", save_best_only=True),
                                   EarlyStopping(patience=5, monitor="val_acc")])

    # Deliberately no final save_weights() to checkpoint_path: that call used
    # to replace the best checkpoint with the last-epoch weights.
    return model
    

In [57]:
input_files = train_file_to_label['train_file_paths']
target_labels = train_file_to_label['class_int_encode']

# hold out 15% of the clips for validation; the fixed seed keeps the split stable
train_files, val_files, train_labels, val_labels = train_test_split(input_files, target_labels,
                                                                    test_size=0.15, random_state=42)
# the generators slice and shuffle plain arrays
train_files = train_files.values
val_files = val_files.values
train_labels = train_labels.values
val_labels = val_labels.values
n_class= len(train_file_to_label['Class'].unique())

# pass the class count derived from the data instead of a hard-coded 10
train_audio(train_files, train_labels,
                val_files, val_labels, nclass = n_class, epochs = 20)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 126, 320, 1)]     0         
_________________________________________________________________
xception (Model)             (None, 4, 10, 2048)       20860904  
_________________________________________________________________
global_max_pooling2d_4 (Glob (None, 2048)              0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               262272    
_________________________________________________________________
dense_9 (Dense)              (None, 10)                1290      
Total params: 21,124,466
Trainable params: 21,069,938
Non-trainable params: 54,528
__________________________________________

UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[{{node xception_4/block1_conv1/Conv2D}}]]
	 [[loss_4/dense_9_loss/sparse_categorical_crossentropy/weighted_loss/broadcast_weights/assert_broadcastable/is_valid_shape/else/_2523/has_valid_nonscalar_shape/then/_5254/has_invalid_dims/concat/_348]] [Op:__inference_keras_scratch_graph_76975]

### Test



In [None]:
def test_audio(test_files, list_labels, model=None, weights_path="models/baseline_cnn.h5"):
    """Predict the top-5 labels for each test file, averaged over several passes.

    Every file is loaded ``bag`` (5) times through ``load_audio_file``, which
    applies random cropping/padding and augmentation, and the predicted
    probabilities are averaged across the passes.

    Parameters
    ----------
    test_files : sequence of audio file paths.
    list_labels : class names; position ``i`` must be the name of the class
        the model emits at output index ``i``.
    model : trained Keras model. When omitted, a model is rebuilt with
        ``build_2d_model`` and its weights are loaded from ``weights_path``.
    weights_path : weights/checkpoint file used when ``model`` is omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_name`` (base name of each path) and ``label`` (the five
        most probable class names, best first, joined by spaces).
    """
    bag = 5

    # nothing to score: return an empty frame with the usual columns
    if len(test_files) == 0:
        return pd.DataFrame({"file_name": [], "label": []})

    # no model exists at notebook scope (train_audio keeps it local), so
    # rebuild one from the saved weights unless the caller supplies it
    if model is None:
        model = build_2d_model(nclass=len(list_labels))
        model.load_weights(weights_path)

    # ceil division so the progress bar also counts the final partial batch
    n_batches = -(-len(test_files) // batch_size)
    array_preds = 0

    for _ in tqdm(range(bag)):

        list_preds = []

        for batch_files in tqdm(chunker(test_files, size=batch_size), total=n_batches):
            batch_data = np.array([load_audio_file(fpath) for fpath in batch_files])
            # channel axis goes last, exactly as in the training generator
            batch_data = batch_data[:, :, :, np.newaxis]
            list_preds += model.predict(batch_data).tolist()

        # running mean over the passes
        array_preds += np.array(list_preds)/bag

    list_labels = np.array(list_labels)

    # negate so that argsort ranks the highest probability first
    top_5 = list_labels[np.argsort(-array_preds, axis=1)[:, :5]]
    pred_labels = [' '.join(list(x)) for x in top_5]

    df = pd.DataFrame(test_files, columns=["file_name"])
    df['label'] = pred_labels
    df['file_name'] = df['file_name'].apply(os.path.basename)

    return df


In [None]:
base_data_path = 'data/'
test_path =  base_data_path + "test/Test/*.wav"
# only the first three matched files are scored
test_files = glob.glob(test_path)[0:3]
# Order the class names by their integer code so that list_labels[i] is the
# class the model emits at output index i. unique() alone returns
# first-appearance order, which does not match the encoded labels.
list_labels = (train_file_to_label[['Class', 'class_int_encode']]
               .drop_duplicates()
               .sort_values('class_int_encode')['Class']
               .tolist())
test_audio(test_files, list_labels)