## Utils

In [1]:
import librosa
import math
import numpy as np
import re
import os
from keras.callbacks import Callback
import utils

Using Theano backend.


In [2]:
# Create Network Input and Target Data

def create_net_data(list_of_audiofiles, hop_length=512):
    """Build the network input tensor and genre targets for a list of audio files.

    For every file four frame-level librosa features are extracted and laid
    out along the last axis in 27 columns:

    * columns 0-12  : 13 MFCCs (``n_mfcc`` is the number of coefficients kept)
    * column  13    : spectral centroid
    * columns 14-25 : 12 chroma bins
    * column  26    : spectral roll-off

    Every track is truncated to the frame count of the shortest track in the
    list so that all tracks fit into one dense array.

    Parameters
    ----------
    list_of_audiofiles : list of str
        Paths of the audio files. The genre label is taken from the file
        name: everything before the first ``.`` (``jazz.00001.au`` -> ``jazz``).
    hop_length : int
        Hop size in samples, forwarded to every librosa feature extractor.

    Returns
    -------
    data : numpy.ndarray, float64, shape (n_files, n_frames, 27)
    target : numpy.ndarray of str, shape (n_files, 1)

    Raises
    ------
    ValueError
        If ``list_of_audiofiles`` is empty.
    """
    n_mfcc = 13
    n_chroma = 12
    n_features = n_mfcc + 1 + n_chroma + 1  # MFCC + centroid + chroma + roll-off = 27

    total = len(list_of_audiofiles)
    if total == 0:
        raise ValueError("list_of_audiofiles is empty; nothing to analyze.")

    frame_counts = []
    features_per_track = []
    target = []
    # Single pass: each file is decoded once. Only the small feature matrices
    # are kept in memory, never the raw audio of all tracks.
    for i, file in enumerate(list_of_audiofiles):
        y, sr = librosa.load(file)
        # Integer ceil(len(y) / hop_length) without going through floats.
        frame_counts.append(-(-len(y) // hop_length))

        mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)
        spectral_center = librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
        spectral_roll = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=hop_length)

        # Stack to (n_features, frames) and transpose to (frames, n_features).
        stacked = np.concatenate([mfcc, spectral_center, chroma, spectral_roll], axis=0)
        features_per_track.append(stacked.T)

        # Take the genre from the file name itself so the result does not
        # depend on how many directories (or dots) precede it in the path.
        target.append(os.path.basename(file).split('.')[0])

        print("Analyzed Track %i of %i." %(i+1, total))

    timeseries_length = min(frame_counts)
    data = np.zeros((total, timeseries_length, n_features), dtype=np.float64)
    for i, track_features in enumerate(features_per_track):
        data[i, :, :] = track_features[0:timeseries_length, :]

    return data, np.expand_dims(np.asarray(target), axis=1)

In [3]:
def path_to_audiofiles(dir_folder):
    """Return the paths of all ``.au`` files located directly in ``dir_folder``.

    Each path is built as ``"<dir_folder>/<file name>"``. The result is sorted
    by file name: ``os.listdir`` returns entries in arbitrary order, and the
    row order of every dataset derived from this list would otherwise differ
    between machines and runs.

    Parameters
    ----------
    dir_folder : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        Sorted paths of the ``.au`` files; empty if there are none.
    """
    return [
        "%s/%s" % (dir_folder, name)
        for name in sorted(os.listdir(dir_folder))
        if name.endswith(".au")
    ]

In [4]:
def genreTag_toBinary(target_genreTag):
    """One-hot encode genre names into an (x, 8) array.

    ``target_genreTag`` is a numpy array of shape (x, 1) holding one genre
    name per row. Each known genre sets a single 1 in its column; an unknown
    name prints "Genre Not Found!" and leaves that row all zeros.
    """
    column_of = {
        'classical': 0,
        'country': 1,
        'disco': 2,
        'hiphop': 3,
        'jazz': 4,
        'metal': 5,
        'pop': 6,
        'reggae': 7,
    }
    one_hot = np.zeros((target_genreTag.shape[0], 8))
    for row, tag in enumerate(target_genreTag[:, 0]):
        column = column_of.get(tag)
        if column is None:
            print("Genre Not Found!")
            continue
        one_hot[row, column] = 1

    return one_hot

In [5]:
def netOut_toBinary_and_genreTag(net_output):
    """Turn per-class scores into one-hot predictions and genre names.

    Parameters
    ----------
    net_output : numpy.ndarray, shape (x, n_classes)
        One row of class scores per instance (e.g. the prediction of the
        LSTM RNN). Column k corresponds to the k-th genre in the order
        classical, country, disco, hiphop, jazz, metal, pop, reggae.

    Returns
    -------
    net_binary : numpy.ndarray, same shape as ``net_output``
        A single 1 per row at the position of the row maximum (the first
        one on ties), zeros elsewhere.
    genreTag : numpy.ndarray of str, shape (x, 1)
        Genre name of the winning column for every row. If the winning
        column has no genre name (more than 8 columns), an error line is
        printed and ``'unknown'`` is stored, so the labels always stay
        aligned row-by-row with ``net_output``.
    """
    genre_names = ('classical', 'country', 'disco', 'hiphop',
                   'jazz', 'metal', 'pop', 'reggae')

    net_binary = np.zeros(net_output.shape)
    genreTag = []
    for i in range(net_output.shape[0]):
        max_value_index = int(np.argmax(net_output[i, :]))
        net_binary[i, max_value_index] = 1

        if max_value_index < len(genre_names):
            genreTag.append(genre_names[max_value_index])
        else:
            print("Error! Index exceeds available genres.")
            # Keep one label per row; silently dropping it would shift every
            # following label against its instance.
            genreTag.append('unknown')

    return net_binary, np.expand_dims(np.asarray(genreTag), axis=1)

In [6]:
class LossHistory(Callback):
    """Keras callback that records the loss reported at the end of every batch.

    After training, ``self.losses`` holds one entry per processed batch, in
    order (``None`` for a batch whose logs carried no ``'loss'`` key).
    """

    def on_train_begin(self, logs=None):
        # Start with an empty history every time training starts.
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        # `logs` may be omitted or None; treat that as "no metrics reported"
        # instead of failing on `None.get`.
        self.losses.append((logs or {}).get('loss'))

## Data preprocessing

In [109]:
# Folders that are scanned for .au files, one per dataset split
# (paths are relative to the notebook's working directory).
dir_testfolder = "./test_minus_country"
dir_devfolder = "./dev_minus_country"
dir_trainfolder = "./training_minus_country"

In [110]:
# Hop size handed to create_net_data (as `hop_length`) for every dataset split,
# so all splits are framed identically.
hop_length = 512

In [111]:
# Collect the .au file paths of the test, dev and training folders (in that order).
path_to_testfiles, path_to_devfiles, path_to_trainfiles = (
    path_to_audiofiles(folder)
    for folder in (dir_testfolder, dir_devfolder, dir_trainfolder)
)

In [112]:
# Extract features and genre labels for the test split, report their
# shapes/dtypes, and cache both arrays on disk as .npy files.
test_input, test_target = create_net_data(path_to_testfiles, hop_length=hop_length)
print(test_input.shape, test_target.shape)
print(test_input.dtype, test_target.dtype)
np.save('data_test_input2.npy', test_input)
np.save('data_test_target2.npy', test_target)

Analyzed Track 1 of 80.
Analyzed Track 2 of 80.
Analyzed Track 3 of 80.
Analyzed Track 4 of 80.
Analyzed Track 5 of 80.
Analyzed Track 6 of 80.
Analyzed Track 7 of 80.
Analyzed Track 8 of 80.
Analyzed Track 9 of 80.
Analyzed Track 10 of 80.
Analyzed Track 11 of 80.
Analyzed Track 12 of 80.
Analyzed Track 13 of 80.
Analyzed Track 14 of 80.
Analyzed Track 15 of 80.
Analyzed Track 16 of 80.
Analyzed Track 17 of 80.
Analyzed Track 18 of 80.
Analyzed Track 19 of 80.
Analyzed Track 20 of 80.
Analyzed Track 21 of 80.
Analyzed Track 22 of 80.
Analyzed Track 23 of 80.
Analyzed Track 24 of 80.
Analyzed Track 25 of 80.
Analyzed Track 26 of 80.
Analyzed Track 27 of 80.
Analyzed Track 28 of 80.
Analyzed Track 29 of 80.
Analyzed Track 30 of 80.
Analyzed Track 31 of 80.
Analyzed Track 32 of 80.
Analyzed Track 33 of 80.
Analyzed Track 34 of 80.
Analyzed Track 35 of 80.
Analyzed Track 36 of 80.
Analyzed Track 37 of 80.
Analyzed Track 38 of 80.
Analyzed Track 39 of 80.
Analyzed Track 40 of 80.
Analyzed 

KeyboardInterrupt: 

In [None]:
# Extract features and genre labels for the dev split, report their
# shapes/dtypes, and cache both arrays on disk as .npy files.
dev_input, dev_target = create_net_data(path_to_devfiles, hop_length=hop_length)
print(dev_input.shape, dev_target.shape)
print(dev_input.dtype, dev_target.dtype)
np.save('data_dev_input2.npy', dev_input)
np.save('data_dev_target2.npy', dev_target)

In [None]:
# Extract features and genre labels for the training split and cache them.
# Uses the create_net_data defined in this notebook -- the same function the
# test and dev cells call -- instead of the copy in the external `utils`
# module, so all three splits are guaranteed to be built by identical code.
train_input, train_target = create_net_data(path_to_trainfiles, hop_length=hop_length)
print(train_input.shape, train_target.shape)
print(train_input.dtype, train_target.dtype)
np.save('data_train_input2.npy', train_input)
np.save('data_train_target2.npy', train_target)

In [None]:
# Progress marker: printed once execution reaches the end of the preprocessing cells.
print("Done!")

## Train neural network

In [7]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Input, Dense
import time
import csv
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import pickle

In [8]:
#Hyperparameters
# loss_function = 'binary_crossentropy' # 'mse' for more options see: http://keras.io/objectives/
# optimizer = Adam(lr=0.0067, beta_1=0.9, beta_2=0.999, epsilon=1e-08) # for more options see: http://keras.io/optimizers/
# activation_fct = 'sigmoid' # for more options see: http://keras.io/activations/
# num_epochs = 500
# batch_size = 128

In [9]:
# Reload the cached training features and genre labels from the .npy files
# written by the preprocessing section. The labels are still genre-name
# strings at this point; they are one-hot encoded in the next cell.
train_input = np.load('./data_train_input2.npy')
train_target = np.load('./data_train_target2.npy')
# train_target = utils.genreTag_toBinary(train_target)

In [10]:
# Load the cached dev split and one-hot encode the labels of both splits.
dev_input = np.load('./data_dev_input2.npy')
dev_target = genreTag_toBinary(np.load('./data_dev_target2.npy'))
# Encode the training labels straight from the file on disk rather than from
# the in-memory `train_target`: feeding an already-encoded array through
# genreTag_toBinary again (i.e. re-running this cell) would hit the
# "Genre Not Found!" branch for every row and zero out all targets.
train_target = genreTag_toBinary(np.load('./data_train_target2.npy'))
print(train_target)

[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]]


In [13]:
def train_lstm(num_hidden_layers,units,loss_function,optimizer,activation_fct,num_epochs,batch_size):
    """Build, compile and train a stacked-LSTM genre classifier.

    Architecture: one fixed LSTM layer with 32 units, followed by
    ``num_hidden_layers`` further LSTM layers and a softmax Dense output
    layer. Only the last LSTM in the stack collapses the sequence; every
    LSTM before it returns the full sequence so the next LSTM receives a
    3D input.

    The function reads these notebook globals: ``train_input`` /
    ``train_target`` (training data) and ``dev_input`` / ``dev_target``
    (used only for the feature and class counts).

    Parameters
    ----------
    num_hidden_layers : int
        Number of LSTM layers added after the fixed first one.
    units : sequence of int
        Unit count of each additional LSTM layer; needs at least
        ``num_hidden_layers`` entries (too few raises IndexError).
    loss_function, optimizer
        Forwarded to ``model.compile``.
    activation_fct : str
        Activation used by every LSTM layer.
    num_epochs, batch_size : int
        Forwarded to ``model.fit``.

    Returns
    -------
    The trained Keras ``Sequential`` model.

    Side effects: prints progress and writes the per-batch loss history to
    ``./history_csv/<timestamp>_<num_epochs>epochs.csv`` (one loss per row).
    """
    print('===========================================')
    print('Training LSTM with '+str(num_hidden_layers)+' hidden layers.')

    # Make sure the log directory exists *before* training, so a missing
    # folder cannot throw away a finished training run.
    history_dir = "./history_csv"
    if not os.path.isdir(history_dir):
        os.makedirs(history_dir)

    # Create LSTM RNN - every "model.add" adds one hidden-layer and the last "model.add" adds the output-layer
    model = Sequential()
    # Keep that as the first Layer. Change the number of Units in the first hidden layer by setting the output_dim value.
    # It only passes the whole sequence on when another LSTM layer follows.
    model.add(LSTM(input_dim=dev_input.shape[2], output_dim=32, activation=activation_fct,
                   return_sequences=num_hidden_layers > 0))

    # Add as many hidden layers as you want and set the number of units via `units`.
    for index in range(0,num_hidden_layers):
        is_last_lstm = index == num_hidden_layers - 1
        model.add(LSTM(output_dim=units[index], activation=activation_fct,
                       return_sequences=not is_last_lstm))

    # Keep that last layer as the output layer
    model.add(Dense(output_dim=dev_target.shape[1], activation='softmax'))

    print("Compiling...")
    model.compile(loss=loss_function, optimizer=optimizer, class_mode='binary')

    print("Training...")
    history = LossHistory()
    model.fit(train_input, train_target, batch_size=batch_size, nb_epoch=num_epochs, callbacks=[history])

    log_path = "%s/%s_%sepochs.csv" %(history_dir, time.strftime("%Y%m%d_%H_%M"), num_epochs)
    # `with` closes the file, guaranteeing the last rows are flushed to disk.
    with open(log_path, "w") as log_file:
        w = csv.writer(log_file)
        for loss in history.losses:
            w.writerow([loss])
    return model
    

In [14]:
# Model 1: one additional LSTM layer with 16 units, learning rate 0.0065.
# NOTE(review): the output layer is a softmax over one-hot targets;
# 'categorical_crossentropy' may be the better-matched loss -- confirm before changing.
units = [16]
loss_function1 = 'binary_crossentropy' # 'mse' for more options see: http://keras.io/objectives/
optimizer1 = Adam(lr=0.0065, beta_1=0.9, beta_2=0.999, epsilon=1e-08) # for more options see: http://keras.io/optimizers/
activation_fct1 = 'sigmoid' # for more options see: http://keras.io/activations/
num_epochs1 = 400
batch_size1 = 128
model1 = train_lstm(1,units,loss_function1,optimizer1,activation_fct1,num_epochs1,batch_size1)
net_output1 = model1.predict(dev_input)
# Cache the dev-set predictions; `with` closes the file so the pickle is
# fully written before anything tries to read it back.
with open("save_net_output1.p", "wb") as f:
    pickle.dump(net_output1, f)
print("Done!")

Training LSTM with 1 hidden layers.
Compiling...
Training...
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400

In [15]:
# Model 2: same architecture as model 1 (reuses `units` from the model-1 cell),
# learning rate 0.0066.
loss_function2 = 'binary_crossentropy' # 'mse' for more options see: http://keras.io/objectives/
optimizer2 = Adam(lr=0.0066, beta_1=0.9, beta_2=0.999, epsilon=1e-08) # for more options see: http://keras.io/optimizers/
activation_fct2 = 'sigmoid' # for more options see: http://keras.io/activations/
num_epochs2 = 400
batch_size2 = 128
model2 = train_lstm(1,units,loss_function2,optimizer2,activation_fct2,num_epochs2,batch_size2)
net_output2 = model2.predict(dev_input)
# Cache the dev-set predictions; `with` closes the file so the pickle is
# fully written before anything tries to read it back.
with open("save_net_output2.p", "wb") as f:
    pickle.dump(net_output2, f)
print("Done!")

Training LSTM with 1 hidden layers.
Compiling...
Training...
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400

In [16]:
# Model 3: same architecture as model 1 (reuses `units` from the model-1 cell),
# learning rate 0.0067.
loss_function3 = 'binary_crossentropy' # 'mse' for more options see: http://keras.io/objectives/
optimizer3 = Adam(lr=0.0067, beta_1=0.9, beta_2=0.999, epsilon=1e-08) # for more options see: http://keras.io/optimizers/
activation_fct3 = 'sigmoid' # for more options see: http://keras.io/activations/
num_epochs3 = 400
batch_size3 = 128
model3 = train_lstm(1,units,loss_function3,optimizer3,activation_fct3,num_epochs3,batch_size3)
net_output3 = model3.predict(dev_input)
# Cache the dev-set predictions; `with` closes the file so the pickle is
# fully written before anything tries to read it back.
with open("save_net_output3.p", "wb") as f:
    pickle.dump(net_output3, f)
print("Done!")

Training LSTM with 1 hidden layers.
Compiling...
Training...
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400

In [None]:
# net_output1 = pickle.load( open( "save_net_output1.p", "rb" ) )
# net_output2 = pickle.load( open( "save_net_output2.p", "rb" ) )
# net_output3 = pickle.load( open( "save_net_output3.p", "rb" ) )

In [23]:
# Convert every model's output into one-hot predictions plus genre labels,
# then score the one-hot predictions against the encoded dev targets.
net_binary1, net_predictions1 = netOut_toBinary_and_genreTag(net_output1)
net_binary2, net_predictions2 = netOut_toBinary_and_genreTag(net_output2)
net_binary3, net_predictions3 = netOut_toBinary_and_genreTag(net_output3)
accuracy1, accuracy2, accuracy3 = (
    accuracy_score(dev_target, predicted)
    for predicted in (net_binary1, net_binary2, net_binary3)
)
print(accuracy1)
print(accuracy2)
print(accuracy3)

0.65
0.725
0.708333333333


In [25]:
# Convert the one-hot dev_target into the same (binary, genre-label) pair that
# netOut_toBinary_and_genreTag produces for network outputs; devtarget_label is
# what the confusion-matrix cells compare the predictions against.
# Caveat: an all-zero row (a genre genreTag_toBinary did not recognise) has its
# argmax at column 0 and is therefore labelled 'classical' here.
devtarget_binary, devtarget_label = netOut_toBinary_and_genreTag(dev_target)

In [26]:
# Using average for blending: take the element-wise mean of the three models'
# outputs; netOut_toBinary_and_genreTag then picks the highest-scoring class
# per row, and the blended one-hot predictions are scored against dev_target.
net_output_avg = (net_output1+net_output2+net_output3)/3
net_binary_avg, net_predictions_avg = netOut_toBinary_and_genreTag(net_output_avg)
accuracy_avg = accuracy_score(dev_target, net_binary_avg)
print(accuracy_avg)

0.75


In [27]:
# Max blending: for every instance and class keep the highest score that any
# of the three models assigned, then pick the winning class per row.
net_output_max = np.maximum(np.maximum(net_output1, net_output2), net_output3)
net_binary_max, net_predictions_max = netOut_toBinary_and_genreTag(net_output_max)
accuracy_max = accuracy_score(dev_target, net_binary_max)
print(accuracy_max)

0.758333333333


In [28]:
# Compute confusion matrix for average blending.
# The class list is derived from the data (sorted unique labels of targets and
# predictions) and passed to confusion_matrix explicitly, so the matrix
# rows/columns and the tick labels below can never get out of step.
labels = np.unique(np.concatenate([devtarget_label, net_predictions_avg])).tolist()
cm_avg = confusion_matrix(devtarget_label, net_predictions_avg, labels=labels)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm_avg)

# Plotting confusion matrix (rows: true genre, columns: predicted genre)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.title('Confusion matrix of the classifier')
res = ax.imshow(cm_avg, cmap=plt.cm.Blues, interpolation='nearest')
for i, cas in enumerate(cm_avg):
    for j, c in enumerate(cas):
        if c>0:
            # Small offsets roughly centre the count inside its cell.
            plt.text(j-.2, i+.2, c, fontsize=14)

# Pin exactly one tick per class instead of relying on where the automatic
# tick locator happens to put them.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted genre')
ax.set_ylabel('True genre')
cb = fig.colorbar(res)
plt.savefig("confmat_avg_ensbl_minus_country_disco.png", format="png")

Confusion matrix, without normalization
[[20  0  0  0  0  0]
 [ 0  9  0  9  1  1]
 [ 4  0 16  0  0  0]
 [ 0  0  0 20  0  0]
 [ 0  1  2  0 16  1]
 [ 0  9  1  1  0  9]]


In [31]:
# Compute confusion matrix for max blending.
# The class list is derived from the data (sorted unique labels of targets and
# predictions) and passed to confusion_matrix explicitly, so the matrix
# rows/columns and the tick labels below can never get out of step.
labels = np.unique(np.concatenate([devtarget_label, net_predictions_max])).tolist()
cm_max = confusion_matrix(devtarget_label, net_predictions_max, labels=labels)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm_max)

# Plotting confusion matrix (rows: true genre, columns: predicted genre)
fig = plt.figure()
ax = fig.add_subplot(111)
plt.title('Confusion matrix of the classifier')
res = ax.imshow(cm_max, cmap=plt.cm.Blues, interpolation='nearest')
for i, cas in enumerate(cm_max):
    for j, c in enumerate(cas):
        if c>0:
            # Small offsets roughly centre the count inside its cell.
            plt.text(j-.2, i+.2, c, fontsize=14)

# Pin exactly one tick per class instead of relying on where the automatic
# tick locator happens to put them.
ax.set_xticks(range(len(labels)))
ax.set_yticks(range(len(labels)))
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted genre')
ax.set_ylabel('True genre')
cb = fig.colorbar(res)
plt.savefig("confmat_max_ensbl_minus_country_disco.png", format="png")

Confusion matrix, without normalization
[[20  0  0  0  0  0]
 [ 0 11  0  7  1  1]
 [ 6  0 14  0  0  0]
 [ 0  0  0 20  0  0]
 [ 0  0  2  0 17  1]
 [ 0  9  1  1  0  9]]
