In [1]:
# If you run this notebook on Google Colab:
# Please use a GPU runtime, as the CPU runtime would make the training process extremely slow.
# It can be found in Notebook settings or Runtime > Change runtime type;
# select GPU as the hardware accelerator.
# There is a limit on free GPU usage on Google Colab within a period of time,
# but the quota is enough to try out this notebook.

%tensorflow_version 1.x
import pandas as pd
import h5py
import random
import numpy as np
import keras
from keras.optimizers import Adam
from keras.applications import VGG19
from keras.utils import to_categorical, plot_model
# from tensorflow.keras.callbacks import TensorBoard
from keras.models import Model, Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Activation, Flatten, Dropout, Input, concatenate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import cv2
import tensorflow as tf

TensorFlow 1.x selected.


Using TensorFlow backend.


In [2]:
# if using google colab, run the following lines
# (this cell is Colab-specific: it imports the google.colab package)
from google.colab import drive
import os

# give colab the right to access your file:
# mount Google Drive at /content/drive
drive.mount('/content/drive')
# make the mount point the working directory so that the relative
# 'My Drive/...' paths used by the following cells resolve
os.chdir('/content/drive/')

Mounted at /content/drive


# Read the dataset

In [3]:
# the data are stored in hdf5 format,
# organized as five arrays: specs, sonotypes,
# start and end times, min and max frequencies, and groups

# open read-only inside a context manager so the file handle is released
# even if one of the reads below raises
with h5py.File('My Drive/CNN-example/samples.hdf5', "r") as f:
  specs_h5 = np.array(f["specs"]).astype("float32") # each element is an array of 224 * 224 * 3 floats
  sonotypes_h5 = np.array(f["sonotypes"]).astype("float32") # each element is a float
  times_h5 = np.array(f["times"]).astype("float32") # each element is an array of two floats
  freqs_h5 = np.array(f["freqs"]).astype("float32") # each element is an array of two floats
  groups_h5 = np.array(f["groups"]) # each element is a string

In [4]:
# join the time columns and the frequency columns side by side;
# the result is the auxiliary input fed to the model next to each spectrogram
aux_input_h5 = np.concatenate((times_h5, freqs_h5), axis=1)

# report how many samples there are and show one of them
print("Number of samples:", len(aux_input_h5))
print("The auxiliary input of one sample:", aux_input_h5[10])

Number of samples: 808
The auxiliary input of one sample: [21411.273 21412.98  21411.273 21412.98 ]


In [5]:
# map every sonotype to its group (a later sample of the same sonotype
# overwrites an earlier one)
sono2group = {sono: group for sono, group in zip(sonotypes_h5, groups_h5)}

# distinct sonotypes with their sample counts, most frequent first
s_unique, s_freq = np.unique(sonotypes_h5, return_counts=True)
s_freq_order = np.argsort(s_freq)[::-1]
s_freq_desc = s_freq[s_freq_order]

# summary: number of sonotypes and the (up to) 20 largest classes
print("Number of sonotypes:", len(s_unique))
print("Sonotype in decending order of sample size:", s_unique[s_freq_order][:20])
print("Sample size:", s_freq_desc[:20])

Number of sonotypes: 13
Sonotype in decending order of sample size: [ 52. 138. 283. 463.  86. 175.  -3.  -4.  -7.  -6.  -2.  -8.  -5.]
Sample size: [231 153 109  91  62  49  35  28  24  13   6   4   3]


In [8]:
# choose which sonotypes the model will be trained on

# how many sonotypes to use; the slice below caps this at the number of
# positive sonotypes available (6 in the sample dataset)
numUsed = 6
# the group to use, the sample dataset only contains birds
groupUsed = b'b'

# positive sonotypes are bird sounds,
# negative sonotypes are noises used in data augmentation;
# keep the positive ones, put them in random order and keep the first numUsed
typeUsed = list(s_unique[s_unique > 0])
random.shuffle(typeUsed)
del typeUsed[numUsed:]

print("Sonotype used:", typeUsed)

Sonotype used: [175.0, 283.0, 463.0, 86.0, 138.0, 52.0]


In [9]:
# get the samples, balanced samples

# sample size to use for each sonotype
num_pick = 49

# positions inside each shuffled per-class index that separate
# train | test | validation (8:1:1 of num_pick)
train_end = int(num_pick * 0.8)
test_end = int(num_pick * 0.9)

# per-class pieces; each array is built with a single concatenate after the
# loop instead of re-copying the growing arrays with np.append for every class
specs_parts, aux_parts, sono_parts = [], [], []
spec_test_parts, aux_test_parts, y_test_parts = [], [], []
spec_val_parts, aux_val_parts, y_val_parts = [], [], []

for i in range(len(typeUsed)):
  # get index of the current type of spec
  cur_index = np.argwhere(sonotypes_h5 == typeUsed[i]).flatten()

  # categorize into train, test, and validation in 8:1:1 ratio
  random.shuffle(cur_index)
  cur_index_resized = cur_index[:train_end]
  test_index = cur_index[train_end:test_end]
  val_index = cur_index[test_end:num_pick]

  # labels are the position of the sonotype in typeUsed (starting from 0).
  # Every label array is sized from the indices actually selected, so the
  # spectrograms and labels stay aligned even when a sonotype has fewer
  # than num_pick samples.
  specs_parts.append(specs_h5[cur_index_resized])
  aux_parts.append(aux_input_h5[cur_index_resized])
  sono_parts.append(np.repeat(i, len(cur_index_resized)))

  spec_test_parts.append(specs_h5[test_index])
  aux_test_parts.append(aux_input_h5[test_index])
  y_test_parts.append(np.repeat(i, len(test_index)))

  spec_val_parts.append(specs_h5[val_index])
  aux_val_parts.append(aux_input_h5[val_index])
  y_val_parts.append(np.repeat(i, len(val_index)))

# train
specs = np.concatenate(specs_parts, axis=0)
aux_input = np.concatenate(aux_parts, axis=0)
sonotypes = np.concatenate(sono_parts, axis=0)

# test and validation
spec_test = np.concatenate(spec_test_parts, axis=0)
aux_test = np.concatenate(aux_test_parts, axis=0)
y_test = np.concatenate(y_test_parts, axis=0)
spec_val = np.concatenate(spec_val_parts, axis=0)
aux_val = np.concatenate(aux_val_parts, axis=0)
y_val = np.concatenate(y_val_parts, axis=0)

# format the input for test and validation into the required format
x_test = [spec_test, aux_test]
x_val = [spec_val, aux_val]

# one-hot encode the test and validation labels.
# The labels are already class indices, so they are encoded directly:
# pd.factorize would renumber them by order of first appearance, which only
# matches the class index when every class contributes at least one sample.
cat_y_test = to_categorical(y_test, num_classes=len(typeUsed))
cat_y_val = to_categorical(y_val, num_classes=len(typeUsed))

# just make a copy of specs for training
specs_keep = np.copy(specs)

# just print some stats
print("Number of sample for tests or validation:", len(y_test))
print("Number of samples for training:", specs.shape[0])

Number of sample for tests or validation: 30
Number of samples for training: 234


# Normalization

In [11]:
def normalize(specs):
  '''
  Linear (min-max) normalization of the data: every spectrogram is rescaled
  on its own so that its smallest value maps to 0 and its largest to 255.
  The inputs are not modified.

  A spectrogram whose values are all equal has no range to stretch; it is
  returned as all zeros instead of dividing by zero (which would give NaN).

  @param: specs is the list of spectrograms to normalize
  @return: the list of normalized spectrograms, in the same order
  '''

  return_specs = []
  for spec in specs:
    s_min = np.amin(spec)
    s_max = np.amax(spec)
    # the subtraction creates a new array, so the original is left untouched
    shifted = spec - s_min
    span = s_max - s_min
    if span == 0:
      # constant spectrogram: `shifted` is already all zeros;
      # multiplying by 0.0 only promotes integer input to float
      return_specs.append(shifted * 0.0)
    else:
      return_specs.append(shifted / span * 255)

  return return_specs

# Augmentation Methods

For each function, make a copy of the original spectrogram
to ensure that we do not change the original one

Return all the augmented spectrograms in lists for consistency

In [12]:
def time_chop(spec, rand_start):
  '''
  chop the spectrogram on x axis (time) from the right:
  the last rand_start columns of a copy of the spectrogram are set to 0.
  The width is read from the spectrogram itself, so any size is supported.

  @param: spec, the spectrogram to chop, indexed as [row, column, channel]
  @param: rand_start: the randomed index to start chopping
          (number of columns to blank)
  @return: the list of augmented spectrograms (a single element)
  '''

  time_chopped_spec = np.copy(spec)
  n_cols = time_chopped_spec.shape[1]
  # clamp at 0 so an offset larger than the width blanks everything
  # instead of wrapping around to a negative start
  first_blank = max(n_cols - rand_start, 0)
  time_chopped_spec[:, first_blank:, :] = 0

  return [time_chopped_spec]

def freq_chop(spec, rand_start):
  '''
  blank the top of the spectrogram along the y axis (frequency):
  the first rand_start rows of a copy are set to 0, the input is untouched

  @param: spec, the spectrogram to chop
  @param: rand_start: how many rows, counted from row 0, to blank
  @return: a one-element list holding the chopped copy
  '''

  chopped = np.array(spec, copy=True)
  chopped[:rand_start, :, :] = 0
  return [chopped]

def four_chop(spec, rand_start):
  '''
  chop the spectrogram on four sides: a border of rand_start rows/columns
  on every side of a copy of the spectrogram is set to 0.
  The height and width are read from the spectrogram itself, so any size
  is supported.

  @param: spec, the spectrogram to chop, indexed as [row, column, channel]
  @param: rand_start: the randomed index to start chopping
          (thickness of the blanked border)
  @return: the list of augmented spectrograms (a single element)
  '''

  four_chopped_spec = np.copy(spec)
  n_rows, n_cols = four_chopped_spec.shape[:2]
  # clamp at 0 so a border thicker than the image blanks everything
  # instead of wrapping around to a negative start
  first_blank_row = max(n_rows - rand_start, 0)
  first_blank_col = max(n_cols - rand_start, 0)

  four_chopped_spec[:rand_start, :, :] = 0  # top
  four_chopped_spec[:, first_blank_col:, :] = 0  # right
  four_chopped_spec[first_blank_row:, :, :] = 0  # bottom
  four_chopped_spec[:, :rand_start, :] = 0  # left

  return [four_chopped_spec]

In [13]:
def add_noises(spec):
  '''
  add noise to the spectrogram with 1/3 ratio

  For each noise sonotype one noise sample is drawn at random from the
  dataset (module-level sonotypes_h5 / specs_h5). The spectrogram and the
  noise are each normalized as a whole, the noise is scaled to 1/3 and the
  two are added.

  The scaling is applied AFTER normalization: min-max normalization removes
  any constant factor applied before it, so dividing first would have no
  effect. The noise is also wrapped in a list before being passed to
  normalize(), which iterates over its argument; passing the bare
  spectrogram would normalize every row separately.

  @param: spec, the spectrogram to add noise to (not modified)
  @return: the list of augmented spectrograms, one per noise sonotype
  '''
  # add noise from light rain -2, rain -3, heavy rain -4, thunder -5, aircraft -6, chainsaw -7, and car/truck -8
  noise_sonos = [-2, -3, -4, -5, -6, -7, -8]

  # the normalized signal is the same for every noise type, compute it once
  spec_normal = normalize([spec])[0]

  return_specs = []
  for noise_sono in noise_sonos:
    noises_index = np.argwhere(sonotypes_h5 == noise_sono).flatten()
    # randomly pick a noise sample of this sonotype
    # (raises ValueError if the dataset holds no sample of it)
    index = random.randint(0, len(noises_index) - 1)
    # normalize the whole noise spectrogram, then scale it to the 1/3 ratio
    noise = normalize([specs_h5[noises_index[index]]])[0] / 3
    return_specs.append(np.add(spec_normal, noise))

  return return_specs

In [14]:
def translate(spec, roll_start):
  '''
  shift the spectrogram along axis 0 with wrap-around, once in each direction

  @param: spec, the spectrogram to roll
  @param: roll_start, how many rows to roll by
  @return: a two-element list: the spectrogram rolled by -roll_start,
           then rolled by +roll_start
  '''

  return [np.roll(spec, shift, axis = 0) for shift in (-roll_start, roll_start)]

In [15]:
def widen(spec, widen_index):
  '''
  widen the spectrogram: stretch it by widen_index pixels along one axis
  and crop the central part back to the original size. This is done once
  along the x axis (columns) and once along the y axis (rows).

  The original size is read from the spectrogram itself, and the crop uses
  explicit start/stop positions, so widen_index = 0 returns full-size
  copies (a `start : -stop` slice would be empty in that case).

  @param: spec, the spectrogram to widen, indexed as [row, column, channel]
  @param: widen_index, the number of pixels to stretch by; it decides the
          start and end of the crop taken from the stretched spectrogram
  @return: the list of augmented spectrograms: [widened along x, widened along y]
  '''
  spec32 = spec.astype('float32')
  height, width = spec32.shape[:2]
  # plain Python int so the size tuples handed to cv2 contain no numpy scalars
  extra = int(widen_index)
  start = extra // 2

  # cv2.resize takes the target size as (width, height)
  widen_time_spec = cv2.resize(spec32, (width + extra, height))
  widen_freq_spec = cv2.resize(spec32, (width, height + extra))

  return_specs = []
  return_specs.append(widen_time_spec[:, start:start + width, :])
  return_specs.append(widen_freq_spec[start:start + height, :, :])

  return return_specs

def squeeze(spec, squeeze_index):
  '''
  squeeze the spectrogram: shrink it by squeeze_index pixels in both
  directions and place the result on an all-zero canvas of the original
  size, offset by squeeze_index // 2 from the top-left corner.

  The original size is read from the spectrogram itself, and the target
  region uses explicit start/stop positions, so squeeze_index = 0 works
  (a `start : -stop` slice would select an empty region in that case).

  @param: spec, the spectrogram to squeeze, indexed as [row, column, channel]
  @param: squeeze_index, the number of pixels to shrink by; it decides
          where the shrunken spectrogram starts and ends on the canvas
  @return: the list of augmented spectrograms (a single float64 element)
  '''

  spec32 = spec.astype('float32')
  height, width = spec32.shape[:2]
  # plain Python int so the size tuple handed to cv2 contains no numpy scalars
  margin = int(squeeze_index)
  start = margin // 2
  new_height = height - margin
  new_width = width - margin

  # cv2.resize takes the target size as (width, height)
  squeezed = cv2.resize(spec32, (new_width, new_height))
  squeeze_spec = np.zeros(spec32.shape)
  squeeze_spec[start:start + new_height, start:start + new_width, :] = squeezed

  return [squeeze_spec]

In [16]:
def augment(specs, aux_input, sonotypes, aug_num, augment_range = 0.1):
  '''
  call all the augment methods on the spectrograms

  For every input spectrogram the output holds, in this order: a copy of the
  spectrogram itself, then for each randomly drawn offset the results of
  time_chop, freq_chop, four_chop, squeeze, widen, add_noises and translate
  (15 spectrograms per offset with the functions of this notebook).

  The auxiliary inputs and sonotypes are repeated by the number of
  spectrograms actually produced for each sample, so the three outputs stay
  aligned even when fewer than aug_num offsets are available.

  @param: specs is the list of spectrograms to augment from
  @param: aux_input is the list of auxiliary input corresponds to the spectrograms
  @param: sonotypes is the list of sonotypes corresponds to the spectrograms
  @param: aug_num is the number of sets of augmented spectrograms
          (returned number of samples per input will be 1 + 15*aug_num,
          fewer if the offset range holds less than aug_num values)
  @param: augment_range is the threshold used for augmentations, default to 0.1
  @return: augment_specs_func is the array of augmented spectrograms
  @return: augment_aux_func is the array of auxiliary input corresponds to the spectrograms
  @return: augment_sono_func is the array of sonotypes corresponds to the spectrograms
           (three empty lists are returned when specs is empty)
  '''

  # The pieces are collected in Python lists and turned into arrays once at
  # the end; calling np.append for every piece would copy the whole
  # accumulated array each time.
  spec_pieces = []
  aux_pieces = []
  sono_pieces = []

  for i in range(len(specs)):
    # generate random offsets for augmentation, between 2/3 * augment_range
    # and augment_range of the side of the spectrogram
    # 224 * 224 is the image size
    indices = np.arange(int(224 * augment_range / 3 * 2) , int(224 * augment_range))
    np.random.shuffle(indices)
    indices = indices[:aug_num]

    # copy so the returned data never aliases the caller's spectrogram
    cur_spec = np.copy(specs[i])
    # the sample itself comes first
    sample_specs = [cur_spec]

    for index in indices:
      # chop
      sample_specs.extend(time_chop(cur_spec, index))
      sample_specs.extend(freq_chop(cur_spec, index))
      sample_specs.extend(four_chop(cur_spec, index))

      # widen + squeeze
      sample_specs.extend(squeeze(cur_spec, index))
      sample_specs.extend(widen(cur_spec, index))

      # noise
      sample_specs.extend(add_noises(cur_spec))

      # translate
      sample_specs.extend(translate(cur_spec, index))

    # repeat the aux input and the sonotype once per produced spectrogram
    produced = len(sample_specs)
    spec_pieces.extend(sample_specs)
    aux_pieces.append(np.repeat([aux_input[i]], produced, axis = 0))
    sono_pieces.append(np.repeat(sonotypes[i], produced))

  if not spec_pieces:
    return [], [], []

  augment_specs_func = np.stack(spec_pieces, axis = 0)
  augment_aux_func = np.concatenate(aux_pieces, axis = 0)
  # labels are returned as float64
  augment_sono_func = np.concatenate(sono_pieces, axis = 0).astype('float64')

  return augment_specs_func, augment_aux_func, augment_sono_func

# Model and Training

In [17]:
# hyper-parameters read by the model-building and training cells
config = {
    "dropout": 0.5,       # dropout rate of each dropout layer in the new head
    "hidden": 1024,       # units of each hidden dense layer in the new head
    "learn_rate": 1e-5,   # learning rate given to the Adam optimizer
    "epochs": 30,         # NOTE(review): the training cell passes epochs = 300 directly and does not read this value
}

In [18]:
def build_finetune_model(base_model, dropouts, fc_layers, num_classes):
    '''
    Build the fine-tuning model on top of a pretrained base model.

    Every layer of base_model is frozen. Its output is flattened and
    concatenated with a second, 4-value input named 'aux_input'; the result
    goes through one Dense(relu) + Dropout pair per (fc_layers, dropouts)
    entry and a final softmax layer with num_classes outputs.

    @param: base_model, the pretrained model to build on
    @param: dropouts, the dropout rate of each hidden block
    @param: fc_layers, the number of units of each hidden block
            (paired with dropouts by zip, so extra entries in the longer
            list are ignored)
    @param: num_classes, the number of output classes
    @return: the model taking [base_model.input, aux_input] as inputs
    '''

    # the pretrained weights stay fixed; only the new head is trained
    for pretrained_layer in base_model.layers:
        pretrained_layer.trainable = False

    # flatten the feature maps of the base model
    features = Flatten()(base_model.output)

    # second input for the auxiliary values (time and frequency),
    # joined with the flattened features
    auxiliary_input = Input(shape=(4,), name='aux_input')
    head = concatenate([features, auxiliary_input])

    # hidden blocks: dense layer followed by dropout
    for units, rate in zip(fc_layers, dropouts):
        head = Dense(units, activation='relu')(head)
        head = Dropout(rate)(head)

    # output layer
    predictions = Dense(num_classes, activation='softmax')(head)

    return Model(inputs=[base_model.input, auxiliary_input], outputs=predictions)

In [19]:
class TestCallback(keras.callbacks.Callback):
    '''
    The class used to see the test result during training:
    at the end of every epoch the model is evaluated on the given test data
    and the loss and accuracy are printed.
    '''

    def __init__(self, test_data):
        '''
        @param: test_data, a pair (x, y) passed to model.evaluate
        '''
        # let the base class set up its own attributes before adding ours
        super().__init__()
        self.test_data = test_data

    def on_epoch_end(self, epoch, logs=None):
        '''
        Evaluate the model on the test data and print the result.
        Unpacking into (loss, acc) expects model.evaluate to return exactly
        two values, i.e. a model compiled with a single metric.

        @param: epoch, the index of the epoch that just ended (unused)
        @param: logs, the metrics of the epoch (unused); defaults to None
                rather than a shared mutable dict
        '''
        x, y = self.test_data
        loss, acc = self.model.evaluate(x, y, verbose=0)
        print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))

In [20]:
def gen(specs, aux_input, sonotypes, step_len=4):
  '''
  generator function for fit_generator
  augment the samples and yield the augmented samples to train the model

  As the data would be too large if we augment all the images before training,
  we need to use this function to augment the images during the training process

  The generator never ends: every pass reshuffles the samples and walks
  through them step_len at a time; samples left over when the number of
  samples is not a multiple of step_len are skipped for that pass.

  @param: specs, the array of spectrograms
  @param: aux_input, the array of auxiliary input corresponding to the spectrograms
  @param: sonotypes, the array of sonotypes corresponding to the spectrograms
  @param: step_len, the number of original samples augmented per yield;
          the default of 4 gives 64 samples per yield (4 * 16)
  @yield: a pair (inputs, labels): inputs is a dict with the normalized
          spectrograms under 'input_1' and the auxiliary inputs under
          'aux_input', labels are the one-hot encoded sonotypes
  '''

  if step_len < 1:
    raise ValueError("step_len must be at least 1")
  if len(specs) < step_len:
    # without this check the loop below would spin forever without yielding
    raise ValueError("need at least step_len samples to generate a batch")

  while True:
    # shuffle data
    indices = np.arange(len(sonotypes))
    np.random.shuffle(indices)

    for i in range(len(specs) // step_len):
      # take only the rows of this step; indexing the arrays with the whole
      # permutation first would copy the full training set on every step
      step_indices = indices[i * step_len: (i + 1) * step_len]

      augment_specs, augment_aux, augment_sono = augment(
          specs[step_indices], aux_input[step_indices], sonotypes[step_indices], 1)

      # normalize spectrograms and categorical outputs
      augment_specs_normal = normalize(augment_specs)
      cat_y_train = to_categorical(augment_sono, num_classes= len(typeUsed))

      # 'input_1' is presumably the automatically assigned name of the
      # base model's input layer - verify against model.summary()
      yield {'input_1': np.array(augment_specs_normal), 'aux_input': np.array(augment_aux)}, np.array(cat_y_train)

In [21]:
# drop any previous model and reset the Keras session before building a new one
model = None
keras.backend.clear_session()

# pretrained VGG19 without its classifier, wrapped with our own head:
# two hidden blocks that share the same width and dropout rate
model = build_finetune_model(
    VGG19(weights='imagenet', include_top=False, input_shape=(224,224,3)),
    dropouts=[config["dropout"]] * 2,
    fc_layers=[config["hidden"]] * 2,
    num_classes=len(typeUsed),
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# model stats, remove "#" in lines below to print
# model.summary()
# plot_model(model,to_file = 'My Drive/example/model.png')

In [1]:
# The model must be reset before starting a new training run; uncomment the lines below to do so
# (the same code as in the model-building cell above, copied here for easier use)

# model = None
# keras.backend.clear_session()
# # get the pretrained model
# model = VGG19(weights='imagenet', include_top=False, input_shape=(224,224,3))
# # finetune to our case
# model = build_finetune_model(model, 
#                              [config["dropout"], config["dropout"]], 
#                              [config["hidden"], config["hidden"]], 
#                              len(typeUsed))

# training
# where the best model (lowest validation loss) is saved; read again by the evaluation cell
filepath_loss = 'My Drive/CNN-example/model_loss.hdf5'

# remove the model file left by a previous run
if os.path.exists(filepath_loss):
  os.remove(filepath_loss)

# early stopping (patience of 15 epochs on val_loss) and a checkpoint that
# saves the full model whenever the validation loss improves
earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=15)
checkpoint = ModelCheckpoint(filepath_loss, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)

# optimization method for training
opt = Adam(lr=config["learn_rate"])


model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# if we want to see the test result along the training, add "TestCallback((x_test, cat_y_test))" to callbacks
# i.e. callbacks=[checkpoint, earlystop, TestCallback((x_test, cat_y_test))]

# with augmentation
# steps_per_epoch uses 4 because gen() augments 4 original samples per yield
# epochs is passed as 300 here; config["epochs"] is not read
# NOTE(review): gen() passes the training spectrograms through normalize(), while x_val is used
# as built by the split cell - confirm both are on the same scale
history = model.fit_generator(gen(specs, aux_input, sonotypes),
                    steps_per_epoch=len(specs) // 4, epochs = 300, validation_data = (x_val, cat_y_val), callbacks=[checkpoint, earlystop])

# without augmentation
# cat_y_train = to_categorical(sonotypes, num_classes=len(typeUsed))
# history = model.fit(x=[specs, aux_input], y=cat_y_train, validation_data=(x_val, cat_y_val), epochs=300, verbose=2, callbacks=[checkpoint, earlystop])

In [None]:
# Evaluate on the test set

# 1) the model as it is at the end of training
results = model.evaluate(x_test, cat_y_test)
print("cur test loss, test acc:", results)

# 2) the checkpoint with the best validation loss:
#    release the current model, reset the session and load the saved file
model = None
keras.backend.clear_session()
model = load_model(filepath_loss)

results = model.evaluate(x_test, cat_y_test)
print("best loss test loss, test acc:", results)