In [118]:
import numpy as np
import pandas as pd
import kapre
import keras
import os
import soundfile as sf
import time
import progressbar



In [2]:
# Read the MusicNet metadata table; its composer column is the source of the labels
metadata = pd.read_csv(filepath_or_buffer='/data/music/musicnet/metadata.csv')

In [3]:
# Preview the first five rows of the metadata table
metadata.head(n=5)

Unnamed: 0,id,composer,composition,movement,ensemble,source,transcriber,catalog_name,seconds
0,1727,Schubert,Piano Quintet in A major,2. Andante,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,447
1,1728,Schubert,Piano Quintet in A major,3. Scherzo: Presto,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,251
2,1729,Schubert,Piano Quintet in A major,4. Andantino - Allegretto,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,444
3,1730,Schubert,Piano Quintet in A major,5. Allegro giusto,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,368
4,1733,Schubert,Piano Sonata in A major,2. Andantino,Solo Piano,Museopen,Segundo G. Yogore,D959,546


In [7]:
# Map every recording id to its composer; used later to label each sound snippet
composers = metadata.set_index('id')['composer'].to_dict()

# Spot-check a few recording ids
examples = [1755, 2211, 2368]
[composers.get(recording_id) for recording_id in examples]

['Schubert', 'Bach', 'Beethoven']

In [81]:
# Creating dataset
# First we define a function to load the audio snippets and to create the labels
def load_audio(path, duration_sec, composers, sample_rate=44100):
    '''Load fixed-length audio snippets from a directory together with their labels.

    Requires the soundfile package, imported as sf, and the progressbar package.

    Parameters
    ----------
    path : str
        Directory holding the audio files. Each file name must start with the
        numeric recording id followed by "-" (e.g. "1727-0.wav").
    duration_sec : int or float
        Number of seconds to keep from the start of every file.
    composers : dict
        Mapping from integer recording id to composer name.
    sample_rate : int, optional
        Sample rate used to size the output buffer. The default of 44100
        reproduces the previous fixed width of 88200 samples for 2 seconds.

    Returns
    -------
    labels : numpy.ndarray of str, shape (n_files,)
        Composer name of every file, in the same order as `data`.
    data : numpy.ndarray of float, shape (n_files, 1, int(sample_rate * duration_sec))
        Audio samples; clips shorter than the buffer are zero-padded at the end.

    Raises
    ------
    KeyError
        If a file's recording id is not present in `composers`.
    ValueError
        If a file name does not start with an integer id.

    Notes
    -----
    Files are expected to be mono: a multi-channel clip does not fit the
    single-channel row it is written into.
    '''
    # Sort so the dataset order is reproducible (os.listdir order is arbitrary)
    files = sorted(os.listdir(path))
    message = "Processing {0} audio files...".format(len(files))
    print(message)

    # Size the buffer from the requested duration instead of a fixed 88200
    n_samples = int(sample_rate * duration_sec)
    data = np.zeros((len(files), 1, n_samples))
    # Collected in a list so long composer names are never truncated by a
    # fixed-width string dtype
    labels = []

    with progressbar.ProgressBar(max_value=len(files)) as bar:

        for i, file in enumerate(files):
            # load the file and keep only the leading `duration_sec` seconds
            # (os.path.join works with or without a trailing slash on `path`)
            audio_clip, sr = sf.read(os.path.join(path, file))
            # never keep more samples than the buffer can hold, even if the
            # file's own sample rate is higher than `sample_rate`
            audio_clip = audio_clip[:min(int(sr * duration_sec), n_samples)]
            data[i, 0, :audio_clip.shape[0]] = audio_clip

            # look up the label from the id prefix of the file name
            file_id = file.split("-")[0]
            labels.append(composers[int(file_id)])
            bar.update(i + 1)

    return np.array(labels, dtype=str), data
        

In [82]:
# Build the dataset: read every chunk, keep its first 2 seconds and look up its label
labels, data = load_audio(
    path="/data/music/musicnet/data_chunks/",
    duration_sec=2,
    composers=composers,
)

  0% (210 of 61596) |                    | Elapsed Time: 0:00:00 ETA:   0:00:29

Processing 61596 audio files...


100% (61596 of 61596) |##################| Elapsed Time: 0:03:18 Time:  0:03:18


In [92]:
# Sanity check: one label per audio snippet
print(labels.shape, data.shape, sep="\n")

(61596,)
(61596, 1, 88200)


In [199]:
# Turn labels to categorical (one-hot encoding): one indicator column per distinct label
labels_pd = pd.DataFrame(labels)
onehot = pd.get_dummies(labels_pd)
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0; `.values`
# is the drop-in replacement. The explicit cast keeps an integer 0/1 matrix even on
# pandas versions where get_dummies returns booleans.
targets = onehot.values.astype("uint8")

In [201]:
# Check the one-hot matrix: (n_snippets, n_composers).
# The array built by the one-hot encoding cell is named `targets`, not `target`.
targets.shape

(61596, 10)

In [None]:
# Split dataset and labels into training, validation and test data

In [None]:
# [CONTINUE HERE!]

In [126]:
# Define a model whose first layer is a mel-spectrogram (from Kapre)
from keras.models import Sequential
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise

In [146]:
# Adapted from the Kapre README (https://github.com/keunwoochoi/kapre)
sr = 44100
input_shape = (1, 88200)  # per-example shape of `data`; 88200 samples = 2 s at 44100 Hz

# Audio front end: the mel-spectrogram is computed inside the network, so the
# model is fed raw waveforms. Further layers (e.g. convolutions and a classifier
# head) still have to be added on top of this before training.
model = Sequential([
    # Mel-spectrogram with non-trainable STFT kernels and mel filterbank, no decibel scaling
    Melspectrogram(n_dft=512, n_hop=256, input_shape=input_shape,
                   padding='same', sr=sr, n_mels=64,
                   fmin=0.0, fmax=sr/2, power_melgram=1.0,
                   return_decibel_melgram=False, trainable_fb=False,
                   trainable_kernel=False,
                   name='trainable_stft'),
    # Additive white noise
    AdditiveNoise(power=0.2),
    # Per-frequency normalisation; alternatives: 'channel', 'time', 'batch', 'data_sample'
    Normalization2D(str_axis='freq'),
])

# Single-label classification set-up: Adam optimiser with categorical cross-entropy
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Training then takes raw audio directly, e.g. model.fit(x, y) with
# x.shape = (n_examples, 1, 88200) and y.shape = (n_examples, 10)

In [147]:
# Inspect the architecture. The model is already compiled in the cell that builds it,
# so compiling again here was redundant; summary() is what prints the layer table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
trainable_stft (Melspectrogr (None, 64, 345, 1)        279616    
_________________________________________________________________
additive_noise_3 (AdditiveNo (None, 64, 345, 1)        0         
_________________________________________________________________
normalization2d_2 (Normaliza (None, 64, 345, 1)        0         
Total params: 279,616
Trainable params: 0
Non-trainable params: 279,616
_________________________________________________________________


In [148]:
# Turn labels to categorical (one-hot encoding)

In [None]:
# Split dataset in training and test