In [1]:
# Load the modules we're going to need
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import os
import glob
import librosa
import pandas as pd# Added
from IPython.display import Audio
from sklearn.externals import joblib
import mir_eval

In [2]:
# We've previously preprocessed our data and converted all files to a sample rate of 44100.
# The value is handed to librosa.load as the target rate and also determines the
# `fmax` argument (samplerate/2) used during MFCC extraction further down.
samplerate = 44100

In [3]:
# Locations of the training and test audio
train_folder = './mir_class_train'
test_folder = './mir_class_test'

# Gather the audio files of each folder; only names matching '*.wav' are picked up
train_files, test_files = (
    glob.glob(os.path.join(folder, '*.wav'))
    for folder in (train_folder, test_folder)
)

# The two target classes; a class's position in `labels` is its integer id
labels = ['abscent', 'present']
label0, label1 = labels

In [4]:
# Analysis framing for feature extraction (passed on as n_fft / hop_length)
window_size, hop_size = 2048, 512

# Number of mel bands, and number of MFCCs kept per frame after the first
# coefficient has been discarded
n_bands, n_mfcc = 40, 13

In [5]:
# Restore the artefacts that were saved to disk earlier.
# joblib.load unpickles its input, so only load files from a trusted source.

# The classifier used for prediction
clf = joblib.load('finalized_model.sav')

# The feature scaler applied to the features before prediction
filename = 'scaler.sav'
scaler = joblib.load(filename)

In [6]:
# Build the evaluation set for one test track: per-frame MFCC features paired
# with a melody-presence class index read from the track's annotation CSV.

# Test file name
tf = "./mir_class_test/MusicDelta_Rockabilly_MIX.wav"
print("filename: {:s}".format(os.path.basename(tf)))

# Load audio (resampled to `samplerate` and mixed down to mono by librosa)
audio, sr = librosa.load(tf, sr=samplerate, mono=True)

# Extract mfcc coefficients (remember we will discard the first one)
# To see all the relevant kwarg arguments consult the documentation for
# librosa.feature.mfcc, librosa.feature.melspectrogram and librosa.filters.mel
# The signal is passed as `y=` because librosa >= 0.10 no longer accepts it
# positionally; the keyword form also works on older releases.
mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_fft=window_size, hop_length=hop_size,
                            fmax=samplerate/2, n_mels=n_bands, n_mfcc=(n_mfcc + 1))

# Discard the first coefficient
mfcc = mfcc[1:, :]
print("mfcc matrix shape: {}".format(mfcc.shape))

# Calculate the feature vectors: transpose to make each mfcc vector one sample
feature_vector = mfcc.transpose()
print("feature vector shape: {}".format(feature_vector.shape))

# Read labels for each annotation row. The CSV path is the audio path with
# the "MIX.wav" suffix swapped for "MELODY1.csv"; fail loudly on any other
# file name instead of silently building a wrong path.
mix_suffix = "MIX.wav"
if not tf.endswith(mix_suffix):
    raise ValueError("expected a file name ending in {!r}: {}".format(mix_suffix, tf))
d2 = pd.read_csv(tf[:-len(mix_suffix)] + "MELODY1.csv", index_col=None, header=None)
# Keep only the second column. `.values` is used because DataFrame.as_matrix
# was removed in pandas 1.0.
d2 = d2.values[:, 1]

# Adjust labels to our classes: positive values count as label1, everything
# else as label0. One extra label0 entry is appended at the end
# (NOTE(review): presumably padding so the final audio frame has a label - confirm).
tf_label = [label1 if x > 0 else label0 for x in d2] + [label0]

# Get labels index. The mapping is built from label0/label1 rather than the
# global `labels` list, because a later cell rebinds `labels` to the per-frame
# ground truth, which would make this cell fail when it is run again.
class_index = {label0: 0, label1: 1}
tf_label_ind = np.array([class_index[lbl] for lbl in tf_label])
print("file label size: {}".format(tf_label_ind.shape))

# Store the feature vectors and corresponding labels in integer format.
# Feature frame i is paired with annotation row i * label_stride, and the
# final feature frame is left out.
# NOTE(review): a stride of 2 assumes the annotation rows are spaced at half
# of hop_size - confirm against the annotation files if hop_size changes.
label_stride = 2
n_frames = max(len(feature_vector) - 1, 0)
if n_frames and (n_frames - 1) * label_stride >= len(tf_label_ind):
    raise ValueError(
        "annotation has {} rows but {} are needed to label {} frames".format(
            len(tf_label_ind), (n_frames - 1) * label_stride + 1, n_frames))
test_feat = list(feature_vector[:n_frames])
test_labels = list(tf_label_ind[:n_frames * label_stride:label_stride])
print(" ")

filename: MusicDelta_Rockabilly_MIX.wav
mfcc matrix shape: (13, 2236)
feature vector shape: (2236, 13)
file label size: (4472,)
 


In [None]:
# Scale the per-frame features with the loaded scaler, then predict a class
# for every frame with the loaded classifier.
feat_scaled = scaler.transform(test_feat)
output = clf.predict(feat_scaled)
# Ground truth for the evaluation cells below.
# NOTE(review): this rebinds `labels`, which until this point held the list of
# class names; cells that still need the class names must not rely on
# `labels` after this cell has run.
labels = test_labels

In [12]:
# Make sure ground truth and predictions are both NumPy arrays before scoring
labels, output = np.array(labels), np.array(output)

In [13]:
# Voicing recall and voicing false-alarm rate of the predictions, with the
# ground truth as the reference and the classifier output as the estimate
R, FA = mir_eval.melody.voicing_measures(ref_voicing=labels, est_voicing=output)
print("Recall", R, "False Alarme", FA)

Recall 0.5858395989974937 False Alarme 0.13145539906103287


In [14]:
# NOTE(review): project-local helper module. Notebook convention is to import
# it in the first cell together with the other imports.
import vad_cls

# EVALUATE
# evaluate_results is given the ground-truth labels and the predictions.
# Judging by the recorded output it prints the metric summary itself and
# returns a list, which is what the print below shows.
# NOTE(review): the meaning of the returned list's entries is not visible
# from this notebook - confirm in vad_cls.
evaluation = vad_cls.evaluate_results(labels, output)
print (evaluation)

Precision:    0.9175662414131501
Recall:       0.5858395989974937
F-measure:    0.7151051625239006
Voicing False Alarm Rate:  0.13145539906103287
Voicing Recall Rate:       0.5858395989974937
Overall Accuracy:          0.6666666666666666
[935.0, 84.0, 661.0, 555]
