In [1]:
import warnings
warnings.filterwarnings('ignore')
from utils import general_utils
from utils import audio_utils
import importlib
importlib.reload(general_utils)
import pandas as pd
import models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np

Using TensorFlow backend.
W0928 20:11:46.598551 139828537165632 deprecation_wrapper.py:119] From /home/ec2-user/git/udacity_capstone/utils/general_utils.py:15: The name tf.FixedLenFeature is deprecated. Please use tf.io.FixedLenFeature instead.



In [3]:
importlib.reload(audio_utils)
importlib.reload(general_utils)

<module 'utils.general_utils' from '/home/ec2-user/git/udacity_capstone/utils/general_utils.py'>

# Using Deep Neural Network to Predict Musical Instrument Family

Goal: Use the NSynth Dataset by Google Inc. data to train a deep neural net to label the instrument playing a single note at any pitch or velocity.

Instrument Families: Bass, Brass, Flute, Guitar, Keyboard, Mallet, Organ, Reed, String, Synth Lead, Vocal

Dataset: https://magenta.tensorflow.org/datasets/nsynth

Benchmark Model: Naive Predictor: Given the distribution of instrument samples (seen below), the chances of predicting any one instrument correctly at random are approximately 9.09%
<pre>
Instrument       Samples      Proportion of Dataset
Bass              68,955             22.54%
Brass             13,830              4.52%
Flute              9,423              3.08%
Guitar            35,423             11.58%
Keyboard          54,991             17.97%
Mallet            35,066             11.46%
Organ             36,577             11.95%
Reed              14,866              4.86%
String            20,594              6.73%
Synth Lead         5,501              1.80%
Vocal             10,753              3.51%
</pre>

## Steps

- Extract all the sound samples from the NSynth dataset and generate spectrogram jpegs for each
- Load the the spectograms into three labeled sets (training, validation, testing)
- Train on a number of different models
- Test model

## Inspect a tfrecord file's data

In [5]:
general_utils.list_data_in_tfrecord("data/nsynth-test.tfrecord",2)


RECORD  1 

instrument_family_str
bytes_list {
  value: "bass"
}

velocity
int64_list {
  value: 100
}

pitch
int64_list {
  value: 100
}

instrument_family
int64_list {
  value: 0
}

note_str
bytes_list {
  value: "bass_synthetic_033-100-100"
}

sample_rate
int64_list {
  value: 16000
}

instrument_source
int64_list {
  value: 2
}

qualities
int64_list {
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
}

instrument_source_str
bytes_list {
  value: "synthetic"
}

audio
omitted

instrument
int64_list {
  value: 417
}

qualities_str
bytes_list {
}

note
int64_list {
  value: 149013
}

instrument_str
bytes_list {
  value: "bass_synthetic_033"
}


RECORD  2 

instrument_family
int64_list {
  value: 0
}

note_str
bytes_list {
  value: "bass_synthetic_033-100-127"
}

instrument_source
int64_list {
  value: 2
}

sample_rate
int64_list {
  value: 16000
}

qualities
int64_list {
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0

## Convert Sound Samples to Spectrogram Images

NOTE: Moved most of the logic for writing spectrograms is in the audio_utils library. Here's what it does:

1. Load the Tensorflow recordset (tfrecord file) with three specicific features we need: note_str, audio, and sample_rate).
 - note_str: name of the note (brass_acoustic_059-062-050)
 - audio: array of floats (samples)
 - sample_rate: samples per second
2. Use an asynchronous parallelized process to convert the sound files to spectograms in batches.
 - This function can:
 -  Write either mel scaled spectrograms or normally scaled spectrograms
 -  Overwrite files or not
 -  Use a regex filter to limit the dataset based on sound name patterns


In [47]:
# use only note '64'

test_spectrogram_folder = 'data/nsynth-test-mel-spectrograms-064'
valid_spectrogram_folder = 'data/nsynth-valid-mel-spectrograms-064'
train_spectrogram_folder = 'data/nsynth-train-mel-spectrograms-064'

#regex_filter = None
regex_filter = '.*-064-.*'

In [49]:
importlib.reload(audio_utils)

audio_utils.write_spectograms_parallelized('data/nsynth-test.tfrecord', test_spectrogram_folder, 
                                           batch_size=200, mel=True, overwrite=False, regex_filter=regex_filter)

audio_utils.write_spectograms_parallelized('data/nsynth-valid.tfrecord', valid_spectrogram_folder, 
                                           batch_size=200, mel=True, overwrite=False, regex_filter=regex_filter)

## following took about 4 or 5 hours with full dataset
audio_utils.write_spectograms_parallelized('data/nsynth-train.tfrecord', train_spectrogram_folder, 
                                           batch_size=200, mel=True, overwrite=False, regex_filter=regex_filter)

#output for full dataset was: "processed 289205 files out of 289205 in 1446 batches"

processed 50 files out of 61 in 1 batchesprocessed 61 files out of 61 in 1 batches

## Load the datasets (files, targets, and names)

In [50]:
train_files, train_targets, train_target_names = general_utils.load_dataset(train_spectrogram_folder)

print('info about training set')
print('number of samples: ', len(train_files))
print('categories are: ', sorted(set(train_target_names)))
#print(train_files[500:501])
#print(train_targets[500:501])
#print(train_target_names[500:501])

valid_files, valid_targets, valid_target_names = general_utils.load_dataset(valid_spectrogram_folder)

print('\ninfo about validation set')
print('number of samples: ', len(valid_files))
print('categories are: ', sorted(set(valid_target_names)))
#print(valid_files[500:501])
#print(valid_targets[500:501])
#print(valid_target_names[500:501])

# load the test data
test_files, test_targets, test_target_names = general_utils.load_dataset(test_spectrogram_folder)

print('\ninfo about test set')
print('number of samples: ', len(test_files))
print('categories are: ', sorted(set(test_target_names)))
#print(test_files[500:501])
#print(test_targets[500:501])
#print(test_target_names[500:501])

info about training set
number of samples:  4035
categories are:  ['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ', 'reed', 'string', 'vocal']

info about validation set
number of samples:  160
categories are:  ['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ', 'reed', 'string', 'vocal']

info about test set
number of samples:  61
categories are:  ['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ', 'reed', 'string', 'vocal']


## Build dataframes
build dataframes that will be part of the image generator construction

### PART 1: zip the data

In [51]:
# merge the three arrays into an array of tupples
train_data = list(zip(train_files, train_targets, train_target_names))
valid_data = list(zip(valid_files, valid_targets, valid_target_names))
test_data = list(zip(test_files, test_targets, test_target_names))

### PART 2: Limit the size of the datasets if needed

In [5]:
# uncomment / change the code here

#train_data = train_data[:50000]
#valid_data = valid_data[:2150]
#test_data = test_data[:4096]

### PART 3: Create the dataframes

In [52]:
train_df = pd.DataFrame(train_data, columns = ['file_paths', 'targets', 'target_names'])
valid_df = pd.DataFrame(valid_data, columns = ['file_paths', 'targets', 'target_names'])
test_df = pd.DataFrame(test_data, columns = ['file_paths', 'targets', 'target_names'])

In [53]:
print('### train shape', train_df.shape)
print('### valid shape', valid_df.shape)
print('### test shape', test_df.shape)

### train shape (4035, 3)
### valid shape (160, 3)
### test shape (61, 3)


## Create the ImageDataGenerators

This Keras tool generates batches of tensor image data. It can augment the images as well, but for this project I have not used that feature as spectograms are not 'in the wild' as photographs tend to be. I would be interesting to experiment with this in a further iteration, however.

One generator for each of the three datasets.

In [54]:
# create the data generator
from keras_preprocessing.image import ImageDataGenerator

datagen=ImageDataGenerator()

In [55]:
# make the training data generator
train_generator = datagen.flow_from_dataframe(
        dataframe=train_df,
        x_col='file_paths',
        y_col='target_names',
        batch_size=32,
        seed=69,
        shuffle=True,
        class_mode='categorical',
        target_size=(128,128)
    )

Found 4035 images belonging to 10 classes.


In [56]:
# make the validation data generator
valid_generator = datagen.flow_from_dataframe(
        dataframe=valid_df,
        x_col='file_paths',
        y_col='target_names',
        batch_size=32,
        seed=69,
        shuffle=True,
        class_mode='categorical',
        target_size=(128,128)
    )

Found 160 images belonging to 10 classes.


In [57]:
# make the test data generator
test_generator = datagen.flow_from_dataframe(
        dataframe=test_df,
        x_col='file_paths',
        y_col='target_names',
        batch_size=32,
        seed=69,
        shuffle=True,
        class_mode='categorical',
        target_size=(128,128)
    )

Found 61 images belonging to 10 classes.


In [12]:
# USEFULL THINGS

#to go through all of the files to make sure they're findable
#import os.path
#for f in valid_files:#valid_generator.filepaths:
#    if not os.path.exists(f):
#        print(f, 'does not exist')

# get all of the file paths in the data generator
#generator_files = set(valid_generator.filepaths)

# make sure all of the actual files are accounted for in the generator
#i = 0
#for f in valid_files:
#    if f not in generator_files:
#        print('N', f)
#        i+=1
    #else:
        #print('Y', f)
#print(i, 'files do not match')

### Find the step sizes for each of the sets.

In [13]:
steps_train=train_generator.n//train_generator.batch_size + 1
steps_valid=valid_generator.n//valid_generator.batch_size + 1
steps_test=test_generator.n//test_generator.batch_size + 1
#STEP_SIZE_TEST=test_generator.n//test_generator.batch_size
print('train_generator.n//train_generator.batch_size is {}//{}'.format(train_generator.n, train_generator.batch_size))
print('valid_generator.n//valid_generator.batch_size is {}//{}'.format(valid_generator.n, valid_generator.batch_size))
print('test_generator.n//test_generator.batch_size is {}//{}'.format(test_generator.n, test_generator.batch_size))
print('train step size: {}, validation step size: {}, test step size: {}'.format(steps_train, steps_valid, steps_test))

train_generator.n//train_generator.batch_size is 4035//32
valid_generator.n//valid_generator.batch_size is 160//32
test_generator.n//test_generator.batch_size is 59//32
train step size: 127, validation step size: 6, test step size: 2


## Run models

This cell is reused for each tested model. The models are all added to the 'models' library and labeled with a number. See the noted results of each model in the library.

In [58]:
importlib.reload(models)

# Save the checkpoints to here. Make sure the file name reflects the model version.
model_hdf5 = 'saved_models/weights.best.v10_1.hdf5'

# Use the same model for each of the steps below. See 'models.py' for details.
model_creator = models.create_model_v10


In [26]:
importlib.reload(models)

checkpointer = ModelCheckpoint(filepath=model_hdf5, 
                               verbose=1, save_best_only=True)

# End trining if there's no improvement in four epochs.
early_stopper = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6)

train_model = model_creator(show_summary=True)
train_model.fit_generator(generator=train_generator,
                          steps_per_epoch=steps_train,
                          validation_data=valid_generator,
                          validation_steps=steps_valid,
                          epochs=30,
                          callbacks=[checkpointer,early_stopper],
                          workers=4
)

r_trained_orig = train_model.evaluate_generator(generator=valid_generator, steps=steps_valid)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_14 (Conv2D)           (None, 128, 128, 32)      896       
_________________________________________________________________
activation_16 (Activation)   (None, 128, 128, 32)      0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 128, 128, 64)      18496     
_________________________________________________________________
activation_17 (Activation)   (None, 128, 128, 64)      0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 128, 128, 64)      36928     
_________________________________________________________________
activation_18 (Activation)   (None, 128, 128, 64)      0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 126, 126, 64)     

# Results

NOTE: The results below are ONLY for the most recently tested model.

<b> Please see the `models.py` file for all of the models and the results for each.</b>

## Loss and Accuracy of the training

In [27]:
#print(train_model.metrics_names)
#print(r_trained_orig)
print('## Loss: ', r_trained_orig[0], ' Accuracy: ', r_trained_orig[1])

## Loss:  2.185720960299174  Accuracy:  0.15104167


## Evaluation of Untrained Model
### Loss and Accuracy

In [95]:
# first test untrained model
untrained_test_model = model_creator()

score_untrained = untrained_test_model.evaluate_generator(test_generator, steps=steps_test)
print('## Loss: ', score_untrained[0], ' Accuracy: ', score_untrained[1])

#for i, n in enumerate(test_generator.filenames):
#    print('file:', n, ' score: ', scores[i][0])


## Loss:  5.309322357177734  Accuracy:  0.016393442


## Prediction of the untrained model

In [96]:
## Testing the untrained model
importlib.reload(general_utils)

results = general_utils.run_prediction(untrained_test_model, test_generator, steps_test)

num_untrained_test_correct = np.count_nonzero(results)

print('## Total test records:', len(results))
print('## Number of correct:', num_untrained_test_correct)
print('## Percent correct:', num_untrained_test_correct/len(results))

## Total test records: 61
## Number of correct: 4
## Percent correct: 0.06557377049180328


## Evaluation of Trained Model
### Loss and Accuracy

In [78]:
# next test the trained model
trained_test_model = model_creator()
trained_test_model.load_weights(model_hdf5)

score_trained = trained_test_model.evaluate_generator(test_generator, steps=steps_test)
print('## Loss: ', score_trained[0], ' Accuracy: ', score_trained[1])

## Loss:  1.866533100605011  Accuracy:  0.36065573


In [62]:
print(model_hdf5)

saved_models/weights.best.v10_1.hdf5


## Prediction of the Trained model

In [93]:
## Testing the trained model
importlib.reload(general_utils)

results = general_utils.run_prediction(trained_test_model, test_generator, steps_test)

num_trained_test_correct = np.count_nonzero(results)
print('## Total test records:', len(results))
print('## Number of correct:', num_trained_test_correct)
print('## Percent correct:', num_trained_test_correct/len(results))

## Total test records: 61
## Number of correct: 12
## Percent correct: 0.19672131147540983
