In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import kapre
import keras
import os
import soundfile as sf
import time
import progressbar
import 

Using TensorFlow backend.


In [2]:
# Load metadata to extract labels
metadata = pd.read_csv('/data/music/musicnet/metadata.csv')

In [3]:
metadata.head()

Unnamed: 0,id,composer,composition,movement,ensemble,source,transcriber,catalog_name,seconds
0,1727,Schubert,Piano Quintet in A major,2. Andante,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,447
1,1728,Schubert,Piano Quintet in A major,3. Scherzo: Presto,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,251
2,1729,Schubert,Piano Quintet in A major,4. Andantino - Allegretto,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,444
3,1730,Schubert,Piano Quintet in A major,5. Allegro giusto,Piano Quintet,European Archive,http://tirolmusic.blogspot.com/,OP114,368
4,1733,Schubert,Piano Sonata in A major,2. Andantino,Solo Piano,Museopen,Segundo G. Yogore,D959,546


In [4]:
# Construct a dictionary id --> composer, which will be used later to set the labels for all sound snippets
composers = pd.Series(metadata.composer.values,index=metadata.id).to_dict()
examples = [1755, 2211, 2368]
[composers.get(example) for example in examples]

['Schubert', 'Bach', 'Beethoven']

In [6]:
# Creating dataset
# First we define a function to load the audio snippets and to create the labels
def load_audio(path, downsampling_rate, duration_sec, composers):
        '''Requires the soundfile package, imported as sf '''
        files = os.listdir(path)
        message = "Processing {0} audio files...".format(len(files))
        print(message)
        message = "Downsampling dataset to {0}% (equivalent to {1} audio files).".format(downsampling_rate*100, int(downsampling_rate * len(files)) )
        print(message)
        
        ds = np.random.choice(files, int(len(files) * downsampling_rate), replace=False)
        
        # Initialise empty arrays
        data = np.zeros((len(ds), 1, 88200))
        labels = np.zeros(len(ds), dtype = "<U10")
        
        with progressbar.ProgressBar(max_value=len(ds)) as bar:
            
            for i, file in enumerate(ds):
                # load and process file, then add to array
                audio_clip, sr = sf.read(path + file)
                audio_clip = audio_clip[:int(sr*duration_sec)]
                audio_clip = audio_clip[np.newaxis, :]
                data[i, :audio_clip.shape[0],:audio_clip.shape[1]] = audio_clip
                audio_clip = None
            
                # look up label and add to array
                file_id = file.split("-")[0]
                label = composers[int(file_id)]
                labels[i] = label 
                label = None
                bar.update(i)
            
        return labels, data
        

In [7]:
# Load audio files, process them and create dataset and labels
labels, data = load_audio("/data/music/musicnet/data_chunks/", 1, 2, composers)

  0% (26 of 61596) |                     | Elapsed Time: 0:00:00 ETA:   0:03:58

Processing 61596 audio files...
Downsampling dataset to 100% (equivalent to 61596 audio files).


100% (61596 of 61596) |##################| Elapsed Time: 0:03:52 Time:  0:03:52


In [8]:
import h5py


In [12]:
h5f = h5py.File('/data/music/data.hdf5', 'w')

In [13]:
h5f.create_dataset('dataset', data=data)

<HDF5 dataset "dataset": shape (61596, 1, 88200), type "<f8">

In [21]:
import gc
gc.collect()

205

In [22]:
f = h5py.File('/data/music/data.hdf5', 'r')

In [24]:
data = f['dataset']

In [None]:
data[1:20000,:]

In [45]:
print(labels.shape)
print(data.shape)

(30798,)
(30798, 1, 88200)
The history saving thread hit an unexpected error (OperationalError('database or disk is full',)).History will not be written to the database.


In [40]:
# Turn labels to categorical (one-hot encoding)
labels_pd = pd.DataFrame(labels)
onehot = pd.get_dummies(labels_pd)
targets = onehot.as_matrix()
targets.shape

(30798, 10)

In [41]:
# Split dataset and labels into training, validation and test data
indices = np.random.permutation(data.shape[0])
training_test_split = 0.8
size = int(len(indices) * training_test_split)

In [17]:
test_data.shape

(12320, 1, 88200)

In [42]:
training_idx, test_idx = indices[:size], indices[size:]
training_data = data[training_idx,:]

In [43]:

test_data = data[training_idx,:], data[test_idx,:]
training_labels, test_labels = labels[training_idx,:], labels[test_ids,:]

MemoryError: 

In [20]:
data = None
data = np.zeros((61596, 1, 88200))

In [18]:
x = np.random.rand(100, 5)

In [21]:
indices = np.random.permutation(x.shape[0])

In [23]:
training_idx, test_idx = indices[:80], indices[80:]

In [27]:
training_idx.shape

(80,)

In [26]:
test.shape

(20, 5)

In [None]:
# [CONTINUE HERE!]

In [126]:
# Define a model whose first layer is a mel-spectrogram (from Kapre)
from keras.models import Sequential
from kapre.time_frequency import Melspectrogram
from kapre.utils import Normalization2D
from kapre.augmentation import AdditiveNoise

In [146]:
# Code inspired by https://github.com/keunwoochoi/kapre
input_shape = (1, 88200)
sr = 44100

model = Sequential()
# A mel-spectrogram layer
model.add(Melspectrogram(n_dft=512, n_hop=256, input_shape=input_shape,
                         padding='same', sr=sr, n_mels=64,
                         fmin=0.0, fmax=sr/2, power_melgram=1.0,
                         return_decibel_melgram=False, trainable_fb=False,
                         trainable_kernel=False,
                         name='trainable_stft'))
# Maybe some additive white noise.
model.add(AdditiveNoise(power=0.2))
# If you wanna normalise it per-frequency
model.add(Normalization2D(str_axis='freq')) # or 'channel', 'time', 'batch', 'data_sample'
# After this, it's just a usual keras workflow. For example..
# Add some layers, e.g., model.add(some convolution layers..)
# Compile the model
model.compile('adam', 'categorical_crossentropy') # if single-label classification
# train it with raw audio sample inputs
#x = load_x() # e.g., x.shape = (10000, 6, 44100)
#y = load_y() # e.g., y.shape = (10000, 10) if it's 10-class classification
# and train it
#model.fit(x, y)

In [147]:
# Compile model
model.compile('adam', 'categorical_crossentropy')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
trainable_stft (Melspectrogr (None, 64, 345, 1)        279616    
_________________________________________________________________
additive_noise_3 (AdditiveNo (None, 64, 345, 1)        0         
_________________________________________________________________
normalization2d_2 (Normaliza (None, 64, 345, 1)        0         
Total params: 279,616
Trainable params: 0
Non-trainable params: 279,616
_________________________________________________________________


In [148]:
# Turn labels to categorical (one-hot encoding)

In [None]:
# Split dataset in training and test