In [5]:
import warnings
# Silence everything raised through the `warnings` module for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
from utils import general_utils
from utils import audio_utils
import importlib
# Re-import general_utils so edits to the module on disk are picked up without restarting the kernel.
importlib.reload(general_utils)
import pandas as pd
import models
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np

In [16]:
# Reload both helper modules so edits made to them on disk take effect without a kernel restart.
# The value of the last call (the reloaded general_utils module) is what the cell displays.
importlib.reload(audio_utils)
importlib.reload(general_utils)

<module 'utils.general_utils' from '/home/ec2-user/git/udacity_capstone/utils/general_utils.py'>

# Using Deep Neural Network to Predict Musical Instrument Family

Goal: Use Google's NSynth dataset to train a deep neural net that labels the instrument family playing a single note, at any pitch or velocity.

Instrument Families: Bass, Brass, Flute, Guitar, Keyboard, Mallet, Organ, Reed, String, Synth Lead, Vocal

Dataset: https://magenta.tensorflow.org/datasets/nsynth

Benchmark Model: Naive Predictor: Guessing one of the 11 instrument families uniformly at random is correct approximately 1/11 ≈ 9.09% of the time. The distribution of instrument samples in the dataset is shown below.
<pre>
Instrument       Samples      Proportion of Dataset
Bass              68,955             22.54%
Brass             13,830              4.52%
Flute              9,423              3.08%
Guitar            35,423             11.58%
Keyboard          54,991             17.97%
Mallet            35,066             11.46%
Organ             36,577             11.95%
Reed              14,866              4.86%
String            20,594              6.73%
Synth Lead         5,501              1.80%
Vocal             10,753              3.51%
</pre>

## Steps

- Extract all the sound samples from the NSynth dataset and generate spectrogram jpegs for each
- Load the spectrograms into three labeled sets (training, validation, testing)
- Train on a number of different models

## Inspect a tfrecord file's data

In [17]:
# Print the contents of the first records of the NSynth test tfrecord file.
# NOTE(review): the second argument appears to be the number of records to list — confirm in general_utils.
tfrecord_path = "data/nsynth-test.tfrecord"
records_to_list = 2
general_utils.list_data_in_tfrecord(tfrecord_path, records_to_list)


RECORD  1 

qualities
int64_list {
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
  value: 0
}

instrument_source_str
bytes_list {
  value: "synthetic"
}

audio
omitted

instrument
int64_list {
  value: 417
}

qualities_str
bytes_list {
}

note
int64_list {
  value: 149013
}

instrument_str
bytes_list {
  value: "bass_synthetic_033"
}

instrument_family_str
bytes_list {
  value: "bass"
}

velocity
int64_list {
  value: 100
}

pitch
int64_list {
  value: 100
}

instrument_family
int64_list {
  value: 0
}

note_str
bytes_list {
  value: "bass_synthetic_033-100-100"
}

sample_rate
int64_list {
  value: 16000
}

instrument_source
int64_list {
  value: 2
}


RECORD  2 

velocity
int64_list {
  value: 127
}

pitch
int64_list {
  value: 100
}

instrument_family
int64_list {
  value: 0
}

note_str
bytes_list {
  value: "bass_synthetic_033-100-127"
}

instrument_source
int64_list {
  value: 2
}

sample_rate
int64_list {
  value: 16000
}

qual

## Convert Sound Samples to Spectrogram Images

NOTE: I moved most of the code that was in this cell to the audio_utils library to keep the code clean. Here's what it does:

1. Load the TensorFlow record set (tfrecord file) with the two specific features we need: note_str and audio.
    note_str contains the name of the note (e.g. brass_acoustic_059-062-050) and audio contains an array of floats.
2. Use an asynchronous parallelized process to convert the sound files to spectrograms in batches.

TODO: Because sklearn.datasets.load_files uses subdirectory names as labels, I had to move the image files after they'd been written to disk. For next iteration, have each file automatically get written into a subdirectory that matches the label name so the manual moving of files isn't necessary.

In [None]:
# Convert each NSynth split from its tfrecord file into spectrogram images on disk.
# The train split runs last; it took about 4 or 5 hours and its output was:
#   "processed 289200 files out of 289205 in 1445 batches"
# NOTE(review): the third argument (200) looks like the per-batch file count — confirm in audio_utils.
spectrogram_jobs = [
    ('data/nsynth-test.tfrecord', 'data/nsynth-test-spectrograms'),
    ('data/nsynth-valid.tfrecord', 'data/nsynth-valid-spectrograms'),
    ('data/nsynth-train.tfrecord', 'data/nsynth-train-spectrograms'),
]

for tfrecord_path, spectrogram_dir in spectrogram_jobs:
    audio_utils.write_spectograms_parallelized(tfrecord_path, spectrogram_dir, 200)

## Load the datasets (files, targets, and names)

In [20]:
def _load_and_describe(spectrogram_dir, heading):
    """Load one spectrogram split and print a short summary of it.

    Calls general_utils.load_dataset(spectrogram_dir), prints `heading`, the
    number of files and the sorted set of target names, then returns the
    (files, targets, target_names) triple unchanged.
    """
    files, targets, target_names = general_utils.load_dataset(spectrogram_dir)
    print(heading)
    print('number of samples: ', len(files))
    print('categories are: ', sorted(set(target_names)))
    return files, targets, target_names


# Each split is loaded and summarised before the next one is read.
train_files, train_targets, train_target_names = _load_and_describe(
    'data/nsynth-train-spectrograms', 'info about training set')

valid_files, valid_targets, valid_target_names = _load_and_describe(
    'data/nsynth-valid-spectrograms', '\ninfo about validation set')

# load the test data
test_files, test_targets, test_target_names = _load_and_describe(
    'data/nsynth-test-spectrograms', '\ninfo about test set')

info about training set
number of samples:  283704
categories are:  ['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ', 'reed', 'string', 'vocal']

info about validation set
number of samples:  12678
categories are:  ['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ', 'reed', 'string', 'vocal']

info about test set
number of samples:  4096
categories are:  ['bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet', 'organ', 'reed', 'string', 'vocal']


## Build dataframes
build dataframes that will be part of the image generator construction

### PART 1: zip the data

In [21]:
# Merge the three parallel arrays of every split into one list of
# (file_path, target, target_name) tuples, one tuple per sample.
train_data, valid_data, test_data = (
    list(zip(files, targets, target_names))
    for files, targets, target_names in (
        (train_files, train_targets, train_target_names),
        (valid_files, valid_targets, valid_target_names),
        (test_files, test_targets, test_target_names),
    )
)

### PART 2: Limit the size of the datasets if needed

In [22]:
# Optional: uncomment the lines below (and adjust the slice sizes) to work with
# only the first N samples of each split instead of the full datasets.

#train_data = train_data[:50000]
#valid_data = valid_data[:2150]
#test_data = test_data[:4096]

### PART 3: Create the dataframes

In [23]:
# One row per sample; the column names are the ones the image generators refer to
# through x_col / y_col.
dataframe_columns = ['file_paths', 'targets', 'target_names']

train_df = pd.DataFrame(train_data, columns=dataframe_columns)
valid_df = pd.DataFrame(valid_data, columns=dataframe_columns)
test_df = pd.DataFrame(test_data, columns=dataframe_columns)

In [24]:
# Sanity check: report (rows, columns) of each dataframe.
for shape_label, frame in (
    ('### train shape', train_df),
    ('### valid shape', valid_df),
    ('### test shape', test_df),
):
    print(shape_label, frame.shape)

### train shape (283704, 3)
### valid shape (12678, 3)
### test shape (4096, 3)


## Create the ImageDataGenerators

This Keras tool generates batches of tensor image data. It can augment the images as well, but for this project I have not used that feature, as spectrograms are not 'in the wild' the way photographs tend to be. It would be interesting to experiment with this in a further iteration, however.

One generator for each of the three datasets.

In [25]:
# create the data generator
from keras_preprocessing.image import ImageDataGenerator

# rescale multiplies every pixel value by 1/255.
# No validation_split is configured: the train/validation/test sets come from three
# separate dataframes, and none of the flow_from_dataframe calls passes `subset=`,
# so a split fraction here would never be applied.
datagen = ImageDataGenerator(rescale=1./255.)

In [26]:
# Training data generator: reads image paths from train_df['file_paths'] and
# labels from train_df['target_names'], resizing every image to 64x64.
train_flow_options = {
    'x_col': 'file_paths',
    'y_col': 'target_names',
    'batch_size': 32,
    'seed': 69,
    'shuffle': True,
    'class_mode': 'categorical',
    'target_size': (64, 64),
}
train_generator = datagen.flow_from_dataframe(dataframe=train_df, **train_flow_options)

Found 283704 images belonging to 10 classes.


In [27]:
# Validation data generator: same settings as the training generator, but fed
# from valid_df.
valid_flow_options = {
    'x_col': 'file_paths',
    'y_col': 'target_names',
    'batch_size': 32,
    'seed': 69,
    'shuffle': True,
    'class_mode': 'categorical',
    'target_size': (64, 64),
}
valid_generator = datagen.flow_from_dataframe(dataframe=valid_df, **valid_flow_options)

Found 12678 images belonging to 10 classes.


In [28]:
# make the test data generator
# shuffle=False keeps the batches in the same order as the rows of test_df, so
# per-sample predictions line up with test_generator.filenames / test_generator.classes.
# Shuffling gives no benefit for evaluation and breaks that alignment.
# NOTE(review): general_utils.run_prediction is not visible here — confirm it relies on this order.
test_generator = datagen.flow_from_dataframe(
        dataframe=test_df,
        x_col='file_paths',
        y_col='target_names',
        batch_size=32,
        shuffle=False,
        class_mode='categorical',
        target_size=(64,64)
    )

Found 4096 images belonging to 10 classes.


In [29]:
# USEFUL THINGS

#to go through all of the files to make sure they're findable
#import os.path
#for f in valid_files:#valid_generator.filepaths:
#    if not os.path.exists(f):
#        print(f, 'does not exist')

# get all of the file paths in the data generator
#generator_files = set(valid_generator.filepaths)

# make sure all of the actual files are accounted for in the generator
#i = 0
#for f in valid_files:
#    if f not in generator_files:
#        print('N', f)
#        i+=1
    #else:
        #print('Y', f)
#print(i, 'files do not match')

### Find the step sizes for each of the sets.

In [31]:
def _whole_batches(generator):
    """Return generator.n // generator.batch_size, i.e. the number of full batches."""
    return generator.n // generator.batch_size


# Floor division: a trailing partial batch is not counted, so a split whose size
# is not a multiple of the batch size has its last few samples left out of the step count.
steps_train = _whole_batches(train_generator)
steps_valid = _whole_batches(valid_generator)
steps_test = _whole_batches(test_generator)

for division_template, generator in (
    ('train_generator.n//train_generator.batch_size is {}//{}', train_generator),
    ('valid_generator.n//valid_generator.batch_size is {}//{}', valid_generator),
    ('test_generator.n//test_generator.batch_size is {}//{}', test_generator),
):
    print(division_template.format(generator.n, generator.batch_size))

print('train step size: {}, validation step size: {}, test step size: {}'.format(steps_train, steps_valid, steps_test))

train_generator.n//train_generator.batch_size is 283704//32
valid_generator.n//valid_generator.batch_size is 12678//32
test_generator.n//test_generator.batch_size is 4096//32
train step size: 8865, validation step size: 396, test step size: 128


## Run models

This cell is reused for each tested model. The models are all added to the 'models' library and labeled with a number. See the noted results of each model in the library.

In [64]:
# Pick up any edits made to the models module since it was last imported.
importlib.reload(models)

# Checkpoint file for this run. Make sure the file name reflects the model version.
model_hdf5 = 'saved_models/weights.best.v7_1.hdf5'

# Model factory used for every model built from here on. See 'models.py' for details.
model_creator = models.create_model_v7

# Write a checkpoint only when the monitored metric improves (save_best_only=True).
checkpointer = ModelCheckpoint(model_hdf5, verbose=1, save_best_only=True)

train_model = model_creator(show_summary=True)

fit_options = dict(
    generator=train_generator,
    steps_per_epoch=steps_train,
    validation_data=valid_generator,
    validation_steps=steps_valid,
    epochs=8,
    callbacks=[checkpointer],
    workers=4,
)
train_model.fit_generator(**fit_options)

# Score the model, with the weights it has after the final epoch, on the validation generator.
r_trained_orig = train_model.evaluate_generator(generator=valid_generator, steps=steps_valid)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_72 (Conv2D)           (None, 64, 64, 32)        896       
_________________________________________________________________
activation_84 (Activation)   (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_73 (Conv2D)           (None, 62, 62, 64)        18496     
_________________________________________________________________
activation_85 (Activation)   (None, 62, 62, 64)        0         
_________________________________________________________________
max_pooling2d_36 (MaxPooling (None, 31, 31, 64)        0         
_________________________________________________________________
dropout_48 (Dropout)         (None, 31, 31, 64)        0         
_________________________________________________________________
conv2d_74 (Conv2D)           (None, 31, 31, 64)      

# Results

NOTE: The results below are ONLY for the most recently tested model.

<b> Please see the `models.py` file for all of the models and the results for each.</b>

## Loss and Accuracy on the Validation Set after Training

In [66]:
# r_trained_orig is the evaluate_generator result on valid_generator;
# element 0 is reported as the loss and element 1 as the accuracy.
# (train_model.metrics_names lists what each element is.)
validation_loss = r_trained_orig[0]
validation_accuracy = r_trained_orig[1]
print('## Loss: ', validation_loss, ' Accuracy: ', validation_accuracy)

## Loss:  1.0000036166171835  Accuracy:  0.16414142


## Evaluation of Untrained Model
### Loss and Accuracy

In [67]:
# Baseline: build a fresh model with model_creator (no weights are loaded into it here)
# and score it on the test generator.
untrained_test_model = model_creator()

score_untrained = untrained_test_model.evaluate_generator(test_generator, steps=steps_test)
untrained_loss = score_untrained[0]
untrained_accuracy = score_untrained[1]
print('## Loss: ', untrained_loss, ' Accuracy: ', untrained_accuracy)

#for i, n in enumerate(test_generator.filenames):
#    print('file:', n, ' score: ', scores[i][0])


## Loss:  1.0031691053882241  Accuracy:  0.037597656


## Prediction of the untrained model

In [68]:
## Testing the untrained model
# Reload general_utils first so the latest version of run_prediction is used.
importlib.reload(general_utils)

# NOTE(review): results is treated as one entry per test record, non-zero meaning
# a correct prediction — confirm against general_utils.run_prediction.
results = general_utils.run_prediction(untrained_test_model, test_generator, steps_test)
num_untrained_test_correct = np.count_nonzero(results)
untrained_record_count = len(results)

print('## Total test records:', untrained_record_count)
print('## Number of correct:', num_untrained_test_correct)
# Printed as a fraction between 0 and 1, not multiplied by 100.
print('## Percent correct:', num_untrained_test_correct/untrained_record_count)

## Total test records: 4096
## Number of correct: 226
## Percent correct: 0.05517578125


## Evaluation of Trained Model
### Loss and Accuracy

In [69]:
# Build a fresh model with the same factory, restore the weights stored in the
# checkpoint file, and score it on the test generator.
trained_test_model = model_creator()
trained_test_model.load_weights(model_hdf5)

score_trained = trained_test_model.evaluate_generator(test_generator, steps=steps_test)
trained_loss = score_trained[0]
trained_accuracy = score_trained[1]
print('## Loss: ', trained_loss, ' Accuracy: ', trained_accuracy)

## Loss:  1.0000030007213354  Accuracy:  0.034423828


## Prediction of the Trained model

In [70]:
## Testing the trained model
# Reload general_utils first so the latest version of run_prediction is used.
importlib.reload(general_utils)

# NOTE(review): results is treated as one entry per test record, non-zero meaning
# a correct prediction — confirm against general_utils.run_prediction.
results = general_utils.run_prediction(trained_test_model, test_generator, steps_test)
num_trained_test_correct = np.count_nonzero(results)
trained_record_count = len(results)

print('## Total test records:', trained_record_count)
print('## Number of correct:', num_trained_test_correct)
# Printed as a fraction between 0 and 1, not multiplied by 100.
print('## Percent correct:', num_trained_test_correct/trained_record_count)

## Total test records: 4096
## Number of correct: 141
## Percent correct: 0.034423828125


## Using the NSynth Checkpoints

This is for the next iteration. I made an attempt to use the ckpt files, however, I was unable to use them properly. I believe I will have to apply transfer learning to their trained model, but since I was experimenting with my own and also unable to load their model, I haven't been able to test this yet.

In [None]:
# next test the model trained by NSynth (think I'll have to use a transfer learning approach)

###NOTE: Could not figure out how to use the ckpt checkpoint files to either evaluate or even use 
### . as the basis for transfer learning. I'd like to go back and do this, but could use some advice.

#trained_nsynth_test_model = models.create_model_v1()

#nsynth_checkpoint = tf.train.load_checkpoint('examples/model_files/model.ckpt-200000') #, latest_filename='model.ckpt-200000.index'
#print(nsynth_checkpoint)

### results of this was:
## NotImplementedError: Streaming restore not supported from name-based checkpoints when graph building. 
##   File a feature request if this limitation bothers you. 
##   As a workaround, consider either using tf.train.Checkpoint to load 
##   name-based checkpoints or enabling eager execution.

## I did find this. See link in the comment: https://github.com/tensorflow/magenta/issues/955

#path = 'examples/model_files/model.ckpt-200000'

#cp = tf.train.Checkpoint(path)


#trained_nsynth_test_model.load_weights(path)


#r_trained = trained_nsynth_test_model.evaluate_generator(test_generator, steps=STEP_SIZE_TEST)

