In [None]:
import inst_rec

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. inst_rec) are reloaded before each cell executes and edits to their
# source take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Make sure the model can overfit on a limited dataset

In [None]:
# Settings for the overfitting sanity check: a very small training-set limit,
# small batches and a single long epoch. The exact meaning of each key is
# defined by inst_rec.train_model, which consumes this dict.
OVERFIT_CONFIG = {
    'training_set_limit': 2,
    'validation_set_limit': 100,
    'learning_rate': 5e-3,
    'steps_per_epoch': 1000,
    'buffer_size': 8,
    'batch_size': 8,
    'epochs': 1,
}

In [None]:
# Run the overfitting sanity check. The config is passed by keyword, matching
# the fine-tuning call at the end of this notebook, so the call does not
# depend on the positional order of train_model's parameters.
inst_rec.train_model(config=OVERFIT_CONFIG)

## Train model

In [None]:
# Settings for the main training run. training_set_limit is -1 here
# (presumably "no limit" -- confirm in inst_rec.train_model).
DEFAULT_CONFIG = {
    'training_set_limit': -1,
    'validation_set_limit': 1000,
    'learning_rate': 1e-3,
    'steps_per_epoch': 100,
    'buffer_size': 1000,
    'batch_size': 32,
    'epochs': 10,
}

In [None]:
# Main training run. The config is passed by keyword, matching the fine-tuning
# call at the end of this notebook, so the call does not depend on the
# positional order of train_model's parameters.
inst_rec.train_model(config=DEFAULT_CONFIG)

## Load best model weights

In [None]:
# Build a fresh model with default arguments and restore the weights stored at
# inst_rec.MODEL_PATH.
# NOTE(review): presumably the checkpoint written by train_model -- confirm in inst_rec.
best_model = inst_rec.build_model()
best_model.load_weights(inst_rec.MODEL_PATH)

## Test on a random track from test set

In [None]:
import random

import mirdata, librosa
import numpy as np
import sklearn.metrics
from IPython.display import Audio, display
from tqdm import tqdm

In [None]:
# Open Medley-solos-DB through mirdata, using the data directory configured in inst_rec.
msdb = mirdata.initialize('medley_solos_db', data_home=inst_rec.MIRDATA_MDSB_PATH)

# Keep only the IDs of tracks whose subset is 'test'.
track_ids = list(filter(lambda t_id: msdb.track(t_id).subset == 'test', msdb.track_ids))

# Shuffle in place and take the first ID, i.e. a random test track.
# No seed is set, so the chosen track differs from run to run.
random.shuffle(track_ids)
track_id = track_ids[0]

In [None]:
# Fetch the track object for the randomly selected test ID.
track = msdb.track(track_id)

In [None]:
# Load the clip at the sample rate inst_rec.SR.
audio, sr = librosa.load(track.audio_path, sr=inst_rec.SR)
# Cut the signal into windows of inst_rec.SR samples (one second of audio) with
# a hop of half a window; the transpose puts the frame index on the first axis.
x = librosa.util.frame(audio, frame_length=inst_rec.SR, hop_length=inst_rec.SR // 2).T

In [None]:
# Run the model on all frames at once. np.newaxis inserts a singleton axis
# between the frame axis and the sample axis
# (presumably the channel axis the model expects -- TODO confirm in inst_rec.build_model).
l = best_model.predict(x[:,np.newaxis,:])

In [None]:
# Average the model outputs over the frame axis and take the index of the
# largest mean as the track-level prediction.
y_pred = np.argmax(np.mean(l, axis=0))

In [None]:
# Instrument names, looked up below by the predicted class index and by
# track.instrument_id.
INSTRUMENTS = [
    'clarinet',
    'distorted electric guitar',
    'female singer',
    'flute',
    'piano',
    'tenor saxophone',
    'trumpet',
    'violin',
]

In [None]:
# Render an audio player for the clip, then show the (predicted, ground-truth)
# instrument names as the cell output.
display(Audio(audio, rate=sr))
INSTRUMENTS[y_pred], INSTRUMENTS[track.instrument_id]

## Eval on test
(Note: this function is inefficient because it runs prediction on only one track at a time.)

In [None]:
def eval_on_test(model):
    """Evaluate ``model`` on every track of the Medley-solos-DB test subset.

    Each test track is loaded at ``inst_rec.SR``, cut into one-second frames
    with a half-second hop, and passed to ``model.predict`` in a single call.
    The track-level prediction is the argmax of the frame outputs averaged
    over the track.

    Prints scikit-learn's classification report, followed by the mean over
    classes of the confusion-matrix diagonal divided by its row sums.

    Args:
        model: Object exposing ``predict``; it is called with an array of
            shape ``(n_frames, 1, inst_rec.SR)``.

    Returns:
        Tuple ``(y_true, y_pred)`` of equal-length lists holding, per test
        track, ``track.instrument_id`` and the predicted class index.
    """
    # The dataset path and sample rate live in the inst_rec module; the
    # notebook only does `import inst_rec`, so they must be qualified.
    msdb = mirdata.initialize('medley_solos_db', data_home=inst_rec.MIRDATA_MDSB_PATH)
    track_ids = [t_id for t_id in msdb.track_ids if msdb.track(t_id).subset == 'test']

    y_pred = []
    y_true = []
    for track_id in tqdm(track_ids):
        track = msdb.track(track_id)
        audio, _ = librosa.load(track.audio_path, sr=inst_rec.SR)
        # Frames of inst_rec.SR samples with 50% overlap; transpose so the
        # frame index is the first axis.
        x = librosa.util.frame(audio, frame_length=inst_rec.SR, hop_length=inst_rec.SR // 2).T
        frame_outputs = model.predict(x[:, np.newaxis, :])
        # Average over frames, then pick the highest-scoring class.
        y_pred.append(np.argmax(np.mean(frame_outputs, axis=0)))
        y_true.append(track.instrument_id)

    print(sklearn.metrics.classification_report(y_true, y_pred))

    # Diagonal / row sum is the per-class recall (rows are the true labels);
    # the mean over classes is printed.
    matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)
    print(np.mean(matrix.diagonal() / matrix.sum(axis=1)))

    return y_true, y_pred

In [None]:
# Evaluate the restored model on the whole test subset.
# Note: this rebinds y_pred (previously the single-track prediction) to the
# list of per-track predictions returned by eval_on_test.
y_true, y_pred = eval_on_test(best_model)

## Fine tune model
Note: this has not been optimized and will likely overfit. To help reduce that, try:
1. Augmenting the dataset (e.g., small pitch shifts, time stretch/compression, added noise, compression, etc.)
1. Limiting fine-tuning to last layer (...though the last layer does have most of the weights)
1. Reducing learning rate (or try using lower learning rates for just earlier layers)

In [None]:
# Rebuild the model with ol3_trainable=True and reload the saved weights from
# inst_rec.MODEL_PATH as the starting point for fine-tuning.
# NOTE(review): presumably ol3_trainable unfreezes the OpenL3 layers -- confirm in inst_rec.build_model.
best_model = inst_rec.build_model(ol3_trainable=True)
best_model.load_weights(inst_rec.MODEL_PATH)

In [None]:
# Show the summary of the model rebuilt with ol3_trainable=True.
best_model.summary()

In [None]:
# Fine-tuning settings: identical to DEFAULT_CONFIG except for the learning
# rate, which is lowered from 1e-3 to 1e-5.
FINETUNE_CONFIG = {
    'training_set_limit': -1,
    'validation_set_limit': 1000,
    'learning_rate': 1e-5,
    'steps_per_epoch': 100,
    'buffer_size': 1000,
    'batch_size': 32,
    'epochs': 10,
}

# Continue training the already-built best_model (weights loaded above) using
# the fine-tuning config, rather than letting train_model start without a model argument.
inst_rec.train_model(model=best_model, config=FINETUNE_CONFIG)