In [2]:
import os
import shutil
import urllib
import zipfile
import glob
from __future__ import print_function

In [3]:
import numpy as np
import pydub
import librosa

class Clip:
    """A single 5-sec long recording."""
    
    RATE = 44100   # All recordings in ESC are 44.1 kHz
    FRAME = 512    # Frame size in samples
    
    class Audio:
        """The actual audio data of the clip.
        
            Uses a context manager to load/unload the raw audio data. This way clips
            can be processed sequentially with reasonable memory usage.
        """
        
        def __init__(self, path):
            self.path = path
        
        def __enter__(self):
            # Actual recordings are sometimes not frame accurate, so we trim/overlay to exactly 5 seconds
            self.data = pydub.AudioSegment.silent(duration=5000)
            self.data = self.data.overlay(pydub.AudioSegment.from_file(self.path)[0:5000])
            self.raw = (np.fromstring(self.data._data, dtype="int16") + 0.5) / (0x7FFF + 0.5)   # convert to float
            return(self)
        
        def __exit__(self, exception_type, exception_value, traceback):
            if exception_type is not None:
                print(exception_type, exception_value, traceback)
            del self.data
            del self.raw
        
    def __init__(self, filename):
        self.filename = os.path.basename(filename)
        self.path = os.path.abspath(filename)        
        self.directory = os.path.dirname(self.path)
        self.category = self.directory.split('/')[-1]
        
        self.audio = Clip.Audio(self.path)
        
        with self.audio as audio:
            self._compute_mfcc(audio)    
            self._compute_zcr(audio)
            
    def _compute_mfcc(self, audio):
        # MFCC computation with default settings (2048 FFT window length, 512 hop length, 128 bands)
        self.melspectrogram = librosa.feature.melspectrogram(audio.raw, sr=Clip.RATE, hop_length=Clip.FRAME)
        self.logamplitude = librosa.logamplitude(self.melspectrogram)
        self.mfcc = librosa.feature.mfcc(S=self.logamplitude, n_mfcc=13).transpose()
            
    def _compute_zcr(self, audio):
        # Zero-crossing rate
        self.zcr = []
        frames = int(np.ceil(len(audio.data) / 1000.0 * Clip.RATE / Clip.FRAME))
        
        for i in range(0, frames):
            frame = Clip._get_frame(audio, i)
            self.zcr.append(np.mean(0.5 * np.abs(np.diff(np.sign(frame)))))

        self.zcr = np.asarray(self.zcr)
            
    @classmethod
    def _get_frame(cls, audio, index):
        if index < 0:
            return None
        return audio.raw[(index * Clip.FRAME):(index+1) * Clip.FRAME]
    
    def __repr__(self):
        return '<{0}/{1}>'.format(self.category, self.filename)

In [4]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import IPython.display
import librosa.display

import seaborn as sb
sb.set(style="white", palette="muted")

import pandas as pd

In [5]:
def load_dataset(name):
    """Load all dataset recordings into a nested list."""
    clips = []
    
    for directory in sorted(os.listdir('{0}/'.format(name))):
        directory = '{0}/{1}'.format(name, directory)
        if os.path.isdir(directory) and os.path.basename(directory)[0:3].isdigit():
            print('Parsing ' + directory)
            category = []
            for clip in sorted(os.listdir(directory)):
                if clip[-3:] == 'ogg':
                    category.append(Clip('{0}/{1}'.format(directory, clip)))
            clips.append(category)
            
    IPython.display.clear_output()
    print('All {0} recordings loaded.'.format(name))            
    
    return clips

clips_10 = load_dataset('ESC-10')

All ESC-10 recordings loaded.


In [8]:
categories = 10
clips_shown = 40
Y_data = np.zeros((categories,clips_shown))
for c in range(1, categories):
    Y_data[c] =np.asarray([c]*clips_shown)

In [9]:
clips_400 = np.array(clips_10).flatten()
Y_data_400= Y_data.flatten()

In [10]:
Y_data_400.shape

(400,)

In [12]:
# You can reload this cell to get a different clip at every try

import random
all_recordings = glob.glob('ESC-10/*/*.ogg')
clip = Clip(all_recordings[random.randint(0, len(all_recordings) - 1)])    

# with clip.audio as audio:
#     plt.subplot(2, 1, 1)
#     plt.title('{0} : {1}'.format(clip.category, clip.filename))
#     plt.plot(np.arange(0, len(audio.raw)) / 44100.0, audio.raw)
#     print(len(audio.raw))
   
#     plt.subplot(2, 1, 2)
#     librosa.display.specshow(clip.logamplitude, sr=44100, x_axis='frames', y_axis='linear', cmap='RdBu_r')
    
# IPython.display.Audio(filename=clip.path, rate=Clip.RATE)    

In [13]:
mfcc_inp = []
for recording in all_recordings:
    clip = Clip(recording)
    mfcc_inp.append(clip.mfcc)

In [22]:
mfcc_np2D = np.array(mfcc_inp)
mfcc_np = mfcc_np2D.reshape(mfcc_np2D.shape[0],mfcc_np2D.shape[1],mfcc_np2D.shape[2],1)

In [15]:
# Import required libraries
# Add whatever you want
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

#print "TensorFlow Version {}".format(tf.__version__)

In [16]:
class BaseModel(object):
    def __init__(self):
        self.num_epoch = 5
        self.batch_size = 128
        self.log_step = 50
        self._build_model()

In [25]:
# Final Working Model with 70% accuracy
class YourModel(BaseModel):
    def __init__(self):
        super(YourModel, self).__init__()
        self.num_epoch = 20
        self.batch_size = 50
    
    # Define max pooling and conv layers
    def conv2d(self, input, kernel_size, stride, num_filter):
        stride_shape = [1, stride, stride, 1]
        filter_shape = [kernel_size, kernel_size, input.get_shape()[3], num_filter]
        W = tf.get_variable('w', filter_shape, tf.float32, tf.contrib.layers.xavier_initializer())
        b = tf.get_variable('b', [1, 1, 1, num_filter], initializer=tf.constant_initializer(0.0))
        
        weight_decay = tf.multiply(tf.nn.l2_loss(W), 0.0, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
        
        return tf.nn.conv2d(input, W, stride_shape, padding='SAME') + b

    def max_pool(self, input, kernel_size, stride):
        ksize = [1, kernel_size, kernel_size, 1]
        strides = [1, stride, stride, 1]
        return tf.nn.max_pool(input, ksize=ksize, strides=strides, padding='SAME')

    #######################self######################################################
    # TODO: You can add any layers (fully-connected, normalization)             #
    #############################################################################
    # def FC(input, out_neurons):
    #     return tf.contrib.layers.fully_connected(input, out_neurons)
    def FC(self, input, inp_neurons, out_neurons):
        #W = tf.get_variable('w',[inp_neurons,out_neurons], tf.float32, tf.random_normal_initializer(0.0, 0.02))
        W = tf.get_variable('w',[inp_neurons,out_neurons], tf.float32, tf.contrib.layers.xavier_initializer())
        b = tf.get_variable('b', [out_neurons], initializer=tf.constant_initializer(0.0))
        
        weight_decay = tf.multiply(tf.nn.l2_loss(W), 0.004, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
        
        return tf.matmul(input, W) + b

    def Normalization(self, input):
        return tf.nn.local_response_normalization(input,
                                                  alpha=0.001 / 9.0,
                                                  beta=0.75,
                                                  depth_radius=4,
                                                  bias=1.0)

    #############################################################################
    #                             END OF YOUR CODE                              #
    #############################################################################  
    def _model(self):
        print('-' * 5 + '  Sample model  ' + '-' * 5)

        print('intput layer: ' + str(self.X.get_shape()))

        with tf.variable_scope('conv1'):
            self.conv1 = self.conv2d(self.X, 3, 1, 4)
            self.relu1 = tf.nn.relu(self.conv1)
            #self.lnorm1 = self.Normalization(self.relu1)
            #self.pool1 = self.max_pool(self.lnorm1, 2, 2)            
            print('conv1 layer: ' + str(self.relu1.get_shape()))

        with tf.variable_scope('conv2'):
            #############################################################################
            # TODO: Complete the following functions                                    #
            #############################################################################
            self.conv2 = self.conv2d(self.relu1, 3, 1, 8)
            self.relu2 = tf.nn.relu(self.conv2)
            #self.lnorm2 = self.Normalization(self.relu2)
            #self.pool2 = self.max_pool(self.lnorm2, 2, 2)            
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            print('conv2 layer: ' + str(self.relu2.get_shape()))


        with tf.variable_scope('conv3'):
            #############################################################################
            # TODO: Complete the following functions                                    #
            #############################################################################
            self.conv3 = self.conv2d(self.relu2, 3, 1, 16)
            self.relu3 = tf.nn.relu(self.conv3)
#             self.lnorm3 = self.Normalization(self.relu3)
#             self.pool3 = self.max_pool(self.lnorm3, 2, 2)   

            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            print('conv3 layer: ' + str(self.relu3.get_shape()))
            
        with tf.variable_scope('conv4'):
            #############################################################################
            # TODO: Complete the following functions                                    #
            #############################################################################
            self.conv4 = self.conv2d(self.relu3, 3, 1, 16)
            self.relu4 = tf.nn.relu(self.conv4)
#             self.lnorm3 = self.Normalization(self.relu3)
#             self.pool3 = self.max_pool(self.lnorm3, 2, 2)   

            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            print('conv4 layer: ' + str(self.relu4.get_shape()))   
        
        #############################################################################
        # TODO: Flatten the output tensor from conv2 layer                          #
        #############################################################################
        self.flat = tf.contrib.layers.flatten(self.relu4)
        #############################################################################
        #                             END OF YOUR CODE                           #
        #############################################################################      
        print('flat layer: ' + str(self.flat.get_shape()))

        with tf.variable_scope('fc5'):
            #############################################################################
            # TODO: Complete the following functions                                    #
            #############################################################################
            self.fc5 = self.FC(self.flat, self.flat.get_shape()[1], 1024)
            self.relu5 = tf.nn.relu(self.fc5)
            if self.is_train:
                self.drop_out5 = tf.nn.dropout(self.relu5, self.keep_prob_fc5)
            else:
                self.drop_out5 = self.relu5
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            print('fc5 layer: ' + str(self.drop_out5.get_shape()))
            
        
        with tf.variable_scope('fc6'):
            #############################################################################
            # TODO: Complete the following functions                                    #
            #############################################################################
            self.fc6 = self.FC(self.drop_out5, self.drop_out5.get_shape()[1], 128)
            self.relu6 = tf.nn.relu(self.fc6)
            if self.is_train:
                self.drop_out6 = tf.nn.dropout(self.relu6, self.keep_prob_fc6)
            else:
                self.drop_out6 = self.relu6
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            print('fc6 layer: ' + str(self.drop_out6.get_shape()))
        

        with tf.variable_scope('fc7'):
            #############################################################################
            # TODO: Complete the following functions                                    #
            #############################################################################
            self.fc7 = self.FC(self.drop_out6, self.drop_out6.get_shape()[1], 10)            
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################
            print('fc7 layer: ' + str(self.fc7.get_shape()))
        
        # Return the last layer
        return self.fc7

    def _input_ops(self):
        # Placeholders
        self.X = tf.placeholder(tf.float32, [None, 431, 13, 1])
        self.Y = tf.placeholder(tf.int64, [None])
        
        #############################################################################
        # TODO: You can add any placeholders                                        #
        #############################################################################
        self.is_train = True
        self.keep_prob_fc5 = tf.placeholder(tf.float32)
        self.keep_prob_fc6 = tf.placeholder(tf.float32)
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################

    def _build_optimizer(self):
        # Adam optimizer 'self.train_op' that minimizes 'self.loss_op'
        #############################################################################
        # TODO: Complete the following functions                                    #
        #############################################################################
        self.global_step = tf.Variable(0, trainable=False)
        self.initial_lr = 1e-3
        #self.exp_decay = tf.train.exponential_decay(self.initial_lr, self.global_step, 500, 0.96)
        self.train_op = tf.train.AdamOptimizer(learning_rate=self.initial_lr).minimize(self.loss_op, global_step=self.global_step)

        
    def _loss(self, labels, logits):
        # Softmax cross entropy loss 'self.loss_op'
        #############################################################################
        # TODO: Complete the following functions                                    #
        #############################################################################
       
        cross_entropy_mean = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))     
        tf.add_to_collection('losses', cross_entropy_mean)
        self.loss_op = tf.add_n(tf.get_collection('losses'), name='total_loss')
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################

    def _build_model(self):
        # Define input variables
        self._input_ops()

        # Convert Y to one-hot vector
        labels = tf.one_hot(self.Y, 10)

        # Build a model and get logits
        logits = self._model()

        # Compute loss
        self._loss(labels, logits)
        
        # Build optimizer
        self._build_optimizer()

        # Compute accuracy
        predict = tf.argmax(logits, 1)
        correct = tf.equal(predict, self.Y)
        self.accuracy_op = tf.reduce_mean(tf.cast(correct, tf.float32))
        
    def train(self, sess, X_train, Y_train, X_val, Y_val):
        sess.run(tf.global_variables_initializer())

        step = 0
        losses = []
        accuracies = []
        print('-' * 5 + '  Start training  ' + '-' * 5)
        
        self.is_train = True
        for epoch in range(self.num_epoch):
            print('train for epoch %d' % epoch)
            for i in range(num_training // self.batch_size):
                X_ = X_train[i * self.batch_size:(i + 1) * self.batch_size][:]
                Y_ = Y_train[i * self.batch_size:(i + 1) * self.batch_size]

                #############################################################################
                # TODO: You can change feed data as you want                                #
                #############################################################################
                feed_dict = {self.X: X_, self.Y:Y_, self.keep_prob_fc5: 0.7, self.keep_prob_fc6: 0.8}                
                fetches = [self.train_op, self.loss_op, self.accuracy_op]

                _, loss, accuracy = sess.run(fetches, feed_dict=feed_dict)
                losses.append(loss)
                accuracies.append(accuracy)
                if step % self.log_step == 0:
                    print('iteration (%d): loss = %.3f, accuracy = %.3f' %
                        (step, loss, accuracy))
                step += 1

            #############################################################################
            # TODO: Plot training curves                                                #
            #############################################################################
            # Graph 1. X: epoch, Y: training loss

            # Graph 2. X: epoch, Y: training accuracy
            
            #############################################################################
            #                             END OF YOUR CODE                              #
            #############################################################################

            # Print validation results
            self.is_train = False
            print('validation for epoch %d' % epoch)
            val_accuracy = self.evaluate(sess, X_val, Y_val)
            print('-  epoch %d: validation accuracy = %.3f' % (epoch, val_accuracy))
            self.is_train = True

    def evaluate(self, sess, X_eval, Y_eval):
        eval_accuracy = 0.0
        eval_iter = 0
        for i in range(X_eval.shape[0] // self.batch_size):
            X_ = X_eval[i * self.batch_size:(i + 1) * self.batch_size][:]
            Y_ = Y_eval[i * self.batch_size:(i + 1) * self.batch_size]

            feed_dict = {self.X:X_, self.Y:Y_, self.keep_prob_fc5: 0.7, self.keep_prob_fc6: 0.8}  
            accuracy = sess.run(self.accuracy_op, feed_dict=feed_dict)
            eval_accuracy += accuracy
            eval_iter += 1
        return eval_accuracy / eval_iter

In [27]:
num_training = 300
num_validation = 50
num_test = 50
# Clear old computation graphs
tf.reset_default_graph()

sess = tf.Session()

#############################################################################
# TODO: Preprocessing                                                       #
#############################################################################
X_train_ = mfcc_np[:300]
X_val_ = mfcc_np[300:350]
X_test_ = mfcc_np[350:400]

Y_train = Y_data_400[:300]
Y_val = Y_data_400[300:350]
Y_test = Y_data_400[350:400]



#############################################################################
#                             END OF YOUR CODE                              #
#############################################################################

model = YourModel()
model.train(sess, X_train_, Y_train, X_val_, Y_val)
model.is_train = False
accuracy = model.evaluate(sess, X_test_, Y_test)
print('***** test accuracy: %.3f' % accuracy)

# Save your model
saver = tf.train.Saver()
model_path = saver.save(sess, "./S2I.ckpt")
print("Model saved in %s" % model_path)

sess.close()

-----  Sample model  -----
intput layer: (?, 431, 13, 1)
conv1 layer: (?, 431, 13, 4)
conv2 layer: (?, 431, 13, 8)
conv3 layer: (?, 431, 13, 16)
conv4 layer: (?, 431, 13, 16)
flat layer: (?, 89648)
fc5 layer: (?, 1024)
fc6 layer: (?, 128)
fc7 layer: (?, 10)
-----  Start training  -----
train for epoch 0
iteration (0): loss = 9.155, accuracy = 0.300
validation for epoch 0
-  epoch 0: validation accuracy = 0.040
train for epoch 1
validation for epoch 1
-  epoch 1: validation accuracy = 0.000
train for epoch 2
validation for epoch 2
-  epoch 2: validation accuracy = 0.020
train for epoch 3
validation for epoch 3
-  epoch 3: validation accuracy = 0.040
train for epoch 4
validation for epoch 4
-  epoch 4: validation accuracy = 0.080
train for epoch 5
validation for epoch 5
-  epoch 5: validation accuracy = 0.080
train for epoch 6
validation for epoch 6
-  epoch 6: validation accuracy = 0.100
train for epoch 7
validation for epoch 7
-  epoch 7: validation accuracy = 0.060
train for epoch 8
i