In [12]:
# basic library imports

import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import random
from tqdm import tqdm

# encoding the classes
from sklearn.preprocessing import LabelEncoder

# librosa for audio feature extraction
import librosa
import gc
import pickle
import random
from multiprocessing import Pool
from PIL import Image
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# plotly libraries
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly_express as px


# keras components for constructing the model
import tensorflow as tf
from tensorflow.keras import optimizers, losses, activations, models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler
from tensorflow.keras.layers import (Dense, Input, Dropout, Convolution1D, 
MaxPool1D, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate)

In [13]:
# Clip geometry and generator batch size shared by the loader, model and training cells.
sampling_freq = 16000                      # samples per second used when loading audio
duration = 4                               # clip length, in seconds at `sampling_freq`
input_length = duration * sampling_freq    # number of samples in one fixed-length clip
batch_size = 32                            # clips per batch produced by the generator

### Input-to-target mapping

In [14]:
def input_to_target(base_data_path = 'data/'):
    """Build a table that maps every training audio file to its class label.

    Parameters
    ----------
    base_data_path : str
        Directory holding ``train/Train/*.wav`` and ``train.csv``. A trailing
        separator is optional because paths are assembled with ``os.path.join``.

    Returns
    -------
    pandas.DataFrame
        One row per audio file whose numeric ID also appears in ``train.csv``
        (inner join on ``ID``), with the columns of ``train.csv`` plus
        ``train_file_paths`` and ``class_int_encode`` (integer-encoded ``Class``).

    Raises
    ------
    ValueError
        If a ``.wav`` file name does not start with an integer ID.
    """
    # audio files and their corresponding labels
    train_pattern = os.path.join(base_data_path, "train", "Train", "*.wav")
    train_label_path = os.path.join(base_data_path, "train.csv")

    # input: one row per file, keyed by the integer at the start of the file name.
    # os.path.basename is used instead of splitting on '/' so that the ID is
    # extracted correctly whatever path separator glob returns.
    train_files = pd.DataFrame({'train_file_paths': glob.glob(train_pattern)})
    train_files['ID'] = train_files['train_file_paths'].apply(
        lambda path: os.path.basename(path).split('.')[0]
    ).astype(int)
    train_files = train_files.sort_values(by='ID')

    # target: keep only files that have a label row
    train_labels = pd.read_csv(train_label_path)
    train_file_to_label = train_files.merge(train_labels, on="ID", how='inner')

    # encoding the classes as integers
    int_encode = LabelEncoder()
    train_file_to_label['class_int_encode'] = int_encode.fit_transform(train_file_to_label['Class'])

    return train_file_to_label


In [15]:
# Build the file-to-label table from the default 'data/' directory and
# preview its first rows.
train_file_to_label = input_to_target()
train_file_to_label.head()

Unnamed: 0,train_file_paths,ID,Class,class_int_encode
0,data/train/Train/0.wav,0,siren,8
1,data/train/Train/1.wav,1,street_music,9
2,data/train/Train/2.wav,2,drilling,4
3,data/train/Train/3.wav,3,siren,8
4,data/train/Train/4.wav,4,dog_bark,3


### Normalization and loading audio files

In [16]:
def audio_normalization(data):
    """Min-max scale `data` and shift it so the result is centred on zero.

    The minimum maps to exactly -0.5 and the maximum to just under 0.5.

    Parameters
    ----------
    data : array-like of numbers
        Signal to rescale.

    Returns
    -------
    The rescaled signal, same shape as `data`.
    """
    lowest = np.min(data)
    highest = np.max(data)
    # the 0.0001 term keeps the denominator non-zero for a constant signal
    value_range = highest - lowest + 0.0001
    return (data - lowest) / value_range - 0.5

def mel_spectrum_db(audio, sample_rate=16000, window_size=20,
                    step_size=10, eps=1e-10, n_mels=500):
    """Return a rescaled, transposed mel spectrogram of `audio` in decibels.

    Parameters
    ----------
    audio : waveform passed to librosa as ``y``.
    sample_rate : sampling rate passed to librosa as ``sr``.
    n_mels : number of mel bands requested from librosa.
    window_size, step_size, eps : accepted but not used by this function.

    Returns
    -------
    ``(power_to_db(mel, ref=np.max) + 40) / 40``, transposed.
    NOTE(review): librosa documents the mel spectrogram as (n_mels, frames),
    which would make the result (frames, n_mels) -- confirm against callers.
    """
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
    # decibels relative to the spectrogram's own peak value
    decibels = librosa.power_to_db(spectrogram, ref=np.max)
    rescaled = (decibels + 40) / 40
    return rescaled.T

def load_audio_file(file_path, input_length=input_length, mel_frequency=False,
                    sample_rate=sampling_freq):
    """Load an audio file and return it as a fixed-length feature array.

    Clips longer than `input_length` are randomly cropped; shorter clips are
    zero-padded with the signal placed at a random position. Both use
    ``np.random``, so results vary between calls unless NumPy is seeded.

    Parameters
    ----------
    file_path : path of the audio file to read.
    input_length : number of samples in the returned waveform window.
    mel_frequency : if True, return ``mel_spectrum_db`` of the window;
        otherwise return the ``audio_normalization``-scaled waveform.
    sample_rate : rate the audio is resampled to on load; also forwarded to
        ``mel_spectrum_db`` so both steps agree.

    Returns
    -------
    numpy.ndarray
        The normalized waveform of length `input_length`, or the mel
        spectrogram computed from it.
    """
    data, _ = librosa.load(file_path, sr=sample_rate)

    surplus = len(data) - input_length
    if surplus > 0:
        # Random crop. randint's upper bound is exclusive, so +1 is needed for
        # the last valid window (offset == surplus) to be reachable.
        offset = np.random.randint(surplus + 1)
        data = data[offset:offset + input_length]
    else:
        # Random placement inside a zero-padded window. When the clip already
        # has the right length the deficit is 0 and the padding is (0, 0).
        deficit = -surplus
        offset = np.random.randint(deficit + 1)
        data = np.pad(data, (offset, deficit - offset), "constant")

    if mel_frequency:
        data = mel_spectrum_db(data, sample_rate=sample_rate)
    else:
        data = audio_normalization(data)

    return data

### Plotting time vs. amplitude

In [17]:
def plot_audio(wave1, wave2, plot_range= 20000, labels=('drilling', 'siren'), sample_rate=16000):
    """Plot the start of two waveforms as stacked time-vs-amplitude line charts.

    Parameters
    ----------
    wave1, wave2 : 1-D sequences of amplitude samples.
    plot_range : maximum number of leading samples to draw from each wave.
    labels : pair of class names used for the facet rows and legend, in the
        order (wave1, wave2).
    sample_rate : samples per second, used to convert sample index to milliseconds.

    The figure is rendered with ``iplot``; nothing is returned.
    """
    def _to_frame(wave, label):
        # Size the time axis from the slice actually taken, so a wave shorter
        # than `plot_range` does not cause a length mismatch.
        segment = np.asarray(wave[0:plot_range])
        frame = pd.DataFrame({'amplitude': segment})
        frame['time(ms)'] = (np.arange(len(segment)) / sample_rate) / (1e-3)
        frame['class'] = label
        return frame

    data = pd.concat([_to_frame(wave1, labels[0]), _to_frame(wave2, labels[1])])
    fig = px.line(data, x='time(ms)', y='amplitude', facet_row='class', color='class',
                  template='ggplot2', title='audio wave (time vs amplitude)',
                  color_discrete_sequence=px.colors.qualitative.D3)
    iplot(fig)

In [18]:
# Load two example clips as 16000*4-sample waveforms and plot them.
# Rows 2 and 3 of the table are a 'drilling' and a 'siren' clip (see the
# preview output above), matching plot_audio's 'drilling' / 'siren' labels
# for its first and second wave.
wave1= load_audio_file(train_file_to_label['train_file_paths'][2], input_length=16000*4)
wave2= load_audio_file(train_file_to_label['train_file_paths'][3], input_length=16000*4)
plot_audio(wave1,wave2)

### 1-D convolutional model

In [38]:
def build_model(input_length, nclass):
    """Build and compile the 1-D convolutional classifier for raw waveforms.

    Parameters
    ----------
    input_length : number of samples per input clip; the model input shape is
        ``(input_length, 1)``.
    nclass : number of output classes (size of the softmax layer).

    Returns
    -------
    A compiled Keras model (Adam, sparse categorical cross-entropy, 'acc'
    metric). The model summary is printed as a side effect.
    """
    relu = activations.relu

    def conv_pair(tensor, filters, kernel_size):
        # two stacked ReLU convolutions with "valid" padding and shared settings
        for _ in range(2):
            tensor = Convolution1D(filters, kernel_size=kernel_size,
                                   activation=relu, padding="valid")(tensor)
        return tensor

    input_wave = Input(shape=(input_length, 1))

    # convolution blocks 1-3: conv pair -> max pooling -> dropout(0.1)
    features = input_wave
    for filters, kernel_size, pool_size in ((16, 9, 16), (32, 3, 8), (32, 3, 4)):
        features = conv_pair(features, filters, kernel_size)
        features = MaxPool1D(pool_size=pool_size)(features)
        features = Dropout(rate=0.1)(features)

    # convolution block 4: conv pair -> global max pooling -> dropout(0.2)
    features = conv_pair(features, 256, 3)
    features = GlobalMaxPool1D()(features)
    features = Dropout(rate=0.2)(features)

    # dense head
    hidden = Dense(64, activation=relu)(features)
    # NOTE(review): 1028 units is kept as-is; confirm it is not a typo for 1024.
    hidden = Dense(1028, activation=relu)(hidden)
    class_probs = Dense(nclass, activation=activations.softmax)(hidden)

    model = models.Model(inputs=input_wave, outputs=class_probs)
    model.compile(optimizer=optimizers.Adam(0.00001),
                  loss=losses.sparse_categorical_crossentropy,
                  metrics=['acc'])
    model.summary()
    return model

### Generator

In [39]:
def chunker(seq, size):
    """Lazily yield consecutive slices of `seq`, each at most `size` items long.

    The final slice is shorter when ``len(seq)`` is not a multiple of `size`.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def generator(file_paths, target_labels, batch_size=batch_size):
    """Endlessly yield ``(batch_data, batch_labels)`` pairs for training.

    Each pass reshuffles `file_paths` and `target_labels` together, walks them
    in chunks of `batch_size`, loads every file with ``load_audio_file`` and
    appends a trailing channel axis to the stacked waveforms.
    """
    while True:
        # new joint ordering of paths and labels for every pass over the data
        file_paths, target_labels = shuffle(file_paths, target_labels)
        path_batches = chunker(file_paths, size=batch_size)
        label_batches = chunker(target_labels, size=batch_size)

        for batch_files, batch_labels in zip(path_batches, label_batches):
            waves = np.array([load_audio_file(fpath) for fpath in batch_files])
            # (batch, samples) -> (batch, samples, 1)
            yield waves[:, :, np.newaxis], batch_labels


### Training

In [40]:
# Preview the first ten rows of the file-to-label table.
train_file_to_label.head(10)

Unnamed: 0,train_file_paths,ID,Class,class_int_encode
0,data/train/Train/0.wav,0,siren,8
1,data/train/Train/1.wav,1,street_music,9
2,data/train/Train/2.wav,2,drilling,4
3,data/train/Train/3.wav,3,siren,8
4,data/train/Train/4.wav,4,dog_bark,3
5,data/train/Train/6.wav,6,children_playing,2
6,data/train/Train/10.wav,10,street_music,9
7,data/train/Train/11.wav,11,drilling,4
8,data/train/Train/12.wav,12,gun_shot,6
9,data/train/Train/15.wav,15,dog_bark,3


In [None]:
# Inputs (file paths) and integer-encoded targets for the split.
input_files = train_file_to_label['train_file_paths']
target_labels = train_file_to_label['class_int_encode']

# Hold out 15% of the labelled clips for validation; fixed seed for a repeatable split.
train_files, val_files, train_labels, val_labels = train_test_split(input_files, target_labels,
                                                                    test_size=0.15, random_state=42)

# Plain arrays, so slicing in the generator is positional.
train_files = train_files.values
val_files = val_files.values
train_labels = train_labels.values
val_labels = val_labels.values


model = build_model(input_length=input_length,
                    nclass=train_file_to_label['class_int_encode'].nunique())

# `Model.fit` accepts Python generators directly; `fit_generator` is deprecated.
# `use_multiprocessing` is left at its default (False): a plain Python generator
# is not safe to share across worker processes.
model.fit(generator(train_files, train_labels),
          steps_per_epoch=len(train_files)//batch_size,
          epochs=20,
          validation_data=generator(val_files, val_labels),
          validation_steps=len(val_files)//batch_size,
          max_queue_size=20,
          callbacks=[ModelCheckpoint("baseline_cnn.h5",
                                     monitor="val_acc", save_best_only=True),
                     EarlyStopping(patience=5, monitor="val_acc")])


# Save the final-epoch weights under their own name so the best-val_acc
# checkpoint written by ModelCheckpoint to "baseline_cnn.h5" is not overwritten.
model.save_weights("baseline_cnn_final.weights.h5")
#model.load_weights("baseline_cnn_mel_bigger.h5")


Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 64000, 1)]        0         
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 63992, 16)         160       
_________________________________________________________________
conv1d_55 (Conv1D)           (None, 63984, 16)         2320      
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 3999, 16)          0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 3999, 16)          0         
_________________________________________________________________
conv1d_56 (Conv1D)           (None, 3997, 32)          1568      
_________________________________________________________________
conv1d_57 (Conv1D)           (None, 3995, 32)          3104

In [None]:
# Scratch check: load a single training clip at librosa's default sample rate.
# The path is relative to the project root (same 'data/' directory used by
# input_to_target) rather than an absolute path into one user's home directory.
librosa.load('data/train/Train/1036.wav')

In [None]:
# Scratch check of `shuffle` on two parallel lists -- the same call that
# `generator` makes on its file paths and labels.
a=[1,2,3,5]
b = [6,7,8,9]

shuffle(a,b)

