In [1]:
# Stretch the notebook container to the full browser width so long code lines
# and wide outputs are not clipped.
# `display` is imported from the public `IPython.display` module; importing it
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import gc
import glob
import math
import os
import random

import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras import backend as K
from keras import metrics
import mido
from mido import Message, MetaMessage, MidiFile, MidiTrack
import numpy
import sklearn.utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Seed Python's and NumPy's global RNGs so NumPy-driven randomness (such as
# the shuffling performed in `generator`) is repeatable.
# NOTE(review): the Keras/TensorFlow backend RNG is not seeded here, so weight
# initialisation, dropout and latent sampling presumably still vary — confirm.
random.seed(0)
numpy.random.seed(0)

In [4]:
# Directory that is scanned for input MIDI files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
midi_dir = '/home/santiago/Projects/ProjectEuterpe/data/midi/classical/'

In [5]:
# Gather every `*.mid` and `*.midi` file located directly inside `midi_dir`
# and keep the combined list of paths in sorted order.
files = sorted(
    path
    for pattern in ('*.mid', '*.midi')
    for path in glob.glob(os.path.join(midi_dir, pattern))
)

In [6]:
# Number of candidate MIDI files found on disk.
len(files)

1506

In [7]:
# Width of one encoded event: 1 note-on flag + 8 note bits + 8 velocity bits
# + 16 delta-time bits (the column layout written by `encode`).
INPUT_WIDTH = 1 + 8 + 8 + 16
# Number of consecutive events per training window (used by `prepare`).
LOOKBACK = 128
# Ticks-per-beat resolution that `load_midi` rescales delta times to.
DEFAULT_TICKS = 480
# Not referenced in the visible cells; presumably the MIDI default tempo in
# microseconds per beat — confirm.
DEFAULT_TEMPO = 500000
# Not referenced in the visible cells.
CORES = 4
# Samples per training batch; also used to derive the steps per epoch.
BATCH_SIZE = 128

In [8]:
def filter_files(midi_files):
    """Return the sorted subset of `midi_files` that are single-track MIDI files.

    Args:
        midi_files: iterable of paths to candidate MIDI files.

    Returns:
        Sorted list of the paths that `MidiFile` could open and that contain
        exactly one track. Files that fail to load are skipped.
    """
    keep = []
    for path in midi_files:
        try:
            midi = MidiFile(path)
        except Exception:
            # Best-effort filtering: an unreadable or corrupt file is simply
            # left out. `Exception` does not cover KeyboardInterrupt or
            # SystemExit, so interrupting the scan still works.
            continue
        if len(midi.tracks) == 1:
            keep.append(path)
    return sorted(keep)

In [9]:
# Keep only the files that load successfully and have exactly one track.
filtered = filter_files(files)

In [10]:
# Number of files that survived the single-track filter.
len(filtered)

282

In [11]:
def load_midi(midi_file):
    """Load the note events of a single-track MIDI file as an integer array.

    Every `note_on` / `note_off` message of the file's only track becomes one
    row `[is_note_on, note, velocity, delta_time]`, where `delta_time` is the
    message's `time` rescaled from the file's `ticks_per_beat` to
    `DEFAULT_TICKS` ticks per beat.

    Args:
        midi_file: path of the MIDI file to read.

    Returns:
        `numpy.uint16` array of shape `(n_events, 4)`.

    Raises:
        AssertionError: if the file does not have exactly one track, or if it
            contains no note messages.
    """
    # `encode` writes the delta time as a 16-bit field, so that is the widest
    # value worth keeping; anything longer is clamped to the 16-bit maximum.
    max_time = 0xFFFF
    data = []
    midi = mido.MidiFile(midi_file)
    assert len(midi.tracks) == 1
    for message in midi.tracks[0]:
        if message.type in ('note_on', 'note_off'):
            time = int(round(message.time * DEFAULT_TICKS / midi.ticks_per_beat))
            time = min(max(time, 0), max_time)
            data.append([1 if message.type == 'note_on' else 0, message.note, message.velocity, time])
    assert data
    # uint16 rather than uint8: an 8-bit row cannot hold delta times above
    # 255, which would be wrapped or rejected by the array conversion.
    return numpy.array(data, dtype=numpy.uint16)

In [12]:
def encode(data):
    """Expand integer event rows into fixed-width binary feature rows.

    Args:
        data: iterable of sequences; every sequence is an iterable of events
            that can be indexed at positions 0 to 3.

    Returns:
        List with one `numpy.uint8` array of shape
        `(len(sequence), INPUT_WIDTH)` per input sequence. Column 0 holds
        `event[0]`; columns 1-8, 9-16 and 17 onwards hold the binary digits
        (most significant first) of `event[1]`, `event[2]` and `event[3]`,
        zero-padded to 8, 8 and 16 digits respectively.
    """
    def to_bits(value, spec):
        # Zero-padded binary string -> list of 0/1 integers.
        return [int(digit) for digit in format(value, spec)]

    result = []
    for events in data:
        matrix = numpy.zeros((len(events), INPUT_WIDTH), dtype=numpy.uint8)
        for row_index, event in enumerate(events):
            matrix[row_index, 0] = event[0]
            matrix[row_index, 1:9] = to_bits(event[1], '08b')
            matrix[row_index, 9:17] = to_bits(event[2], '08b')
            matrix[row_index, 17:] = to_bits(event[3], '016b')
        result.append(matrix)
    return result

In [13]:
def prepare(data):
    """Cut every encoded sequence into overlapping windows of LOOKBACK rows.

    Args:
        data: iterable of 2-D arrays (rows are encoded events).

    Returns:
        `numpy.uint8` array stacking all windows of all sequences, in order.
        Window starts run from 0 up to `len(sequence) - LOOKBACK - 1`, so a
        sequence with at most LOOKBACK rows contributes nothing.
        NOTE(review): the window starting at `len(sequence) - LOOKBACK` would
        also be full-length but is not emitted — confirm this is intended.
    """
    def windows_of(events):
        # Slide a LOOKBACK-row window over one sequence, one row at a time.
        for start in range(len(events) - LOOKBACK):
            window = events[start:start + LOOKBACK, :]
            assert len(window) == LOOKBACK
            yield window

    stacked = [window for events in data for window in windows_of(events)]
    return numpy.array(stacked, dtype=numpy.uint8)

In [14]:
def load_all(midi_files):
    """Load, encode and window every MIDI file into one training array.

    Args:
        midi_files: iterable of MIDI file paths accepted by `load_midi`.

    Returns:
        `numpy.uint8` array holding the windows of all files, in file order.
    """
    windows = []
    for path in midi_files:
        events = load_midi(path)
        # `encode` works on a list of sequences, hence the one-element list.
        encoded = encode([events])
        windows.extend(prepare(encoded))
    return numpy.array(windows, dtype=numpy.uint8)

In [15]:
# Build the full training set: one row per LOOKBACK-event window.
data = load_all(filtered)

In [16]:
# Inspect the dimensions of the training array.
data.shape

(1222982, 128, 33)

In [17]:
class VAE(object):
    """Sequence variational autoencoder assembled from Keras layers.

    `create` builds three models and stores them on the instance:

    * `encoder`     -- input sequence -> sampled latent vector,
    * `decoder`     -- latent vector -> reconstructed sequence,
    * `autoencoder` -- input sequence -> reconstructed sequence, compiled
      with the VAE loss.

    `decoder` and `autoencoder` are built from the same decoder layer
    objects, so the weights trained through `autoencoder` are the weights
    used by `decoder`.
    """

    def create(self, vocab_size=INPUT_WIDTH, max_length=LOOKBACK, latent_rep_size=128, lr=0.001):
        """Build the encoder, the decoder and the compiled autoencoder.

        Args:
            vocab_size: number of features per time step.
            max_length: number of time steps per input sequence.
            latent_rep_size: dimensionality of the latent vector.
            lr: learning rate passed to the Adam optimizer.
        """
        self.encoder = None
        self.decoder = None
        self.autoencoder = None

        x = Input(shape=(max_length, vocab_size))

        vae_loss, encoded = self._build_encoder(x, latent_rep_size=latent_rep_size, max_length=max_length)
        self.encoder = Model(inputs=x, outputs=encoded)

        # Instantiate the decoder layers once and apply them to both the
        # standalone latent input and the encoder output. Building a second,
        # independent stack for the autoencoder would leave `self.decoder`
        # with weights that training never touches.
        decoder_layers = self._make_decoder_layers(vocab_size, max_length)

        encoded_input = Input(shape=(latent_rep_size,))
        decoded = self._build_decoder(encoded_input, vocab_size, max_length, layers=decoder_layers)
        self.decoder = Model(encoded_input, decoded)

        reconstructed = self._build_decoder(encoded, vocab_size, max_length, layers=decoder_layers)
        self.autoencoder = Model(inputs=x, outputs=reconstructed)
        self.autoencoder.compile(optimizer=Adam(lr=lr),
                                 loss=vae_loss,
                                 metrics=['accuracy'])

    def _build_encoder(self, x, latent_rep_size=128, max_length=None, epsilon_std=0.01):
        """Build the encoder graph on top of the input tensor `x`.

        Args:
            x: input tensor of the model.
            latent_rep_size: dimensionality of the latent vector.
            max_length: factor the reconstruction loss is multiplied by.
            epsilon_std: standard deviation of the noise used when sampling.

        Returns:
            Tuple `(vae_loss, z)`: the loss function to compile the
            autoencoder with, and the sampled latent tensor.
        """
        h = Bidirectional(LSTM(500, return_sequences=True), merge_mode='concat')(x)
        h = Dropout(0.5)(h)
        h = Bidirectional(LSTM(500, return_sequences=False), merge_mode='concat')(h)
        h = Dropout(0.5)(h)
        h = Dense(435, activation='relu')(h)

        def sampling(args):
            # Reparameterisation: z = mean + exp(log_var / 2) * noise.
            z_mean_, z_log_var_ = args
            batch_size = K.shape(z_mean_)[0]
            epsilon = K.random_normal(shape=(batch_size, latent_rep_size), mean=0., stddev=epsilon_std)
            return z_mean_ + K.exp(z_log_var_ / 2) * epsilon

        z_mean = Dense(latent_rep_size, name='z_mean', activation='linear')(h)
        z_log_var = Dense(latent_rep_size, name='z_log_var', activation='linear')(h)

        def vae_loss(x, x_decoded_mean):
            # Reconstruction term: binary cross-entropy over the flattened
            # tensors, scaled by `max_length`.
            x = K.flatten(x)
            x_decoded_mean = K.flatten(x_decoded_mean)
            xent_loss = max_length * metrics.binary_crossentropy(x, x_decoded_mean)
            # KL term computed from the z_mean / z_log_var tensors captured
            # from the enclosing scope, averaged over the last axis.
            kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
            return xent_loss + kl_loss

        return (vae_loss, Lambda(sampling, output_shape=(latent_rep_size,), name='lambda')([z_mean, z_log_var]))

    def _make_decoder_layers(self, vocab_size, max_length):
        """Instantiate a fresh, not yet connected stack of decoder layers."""
        return [
            RepeatVector(max_length),
            LSTM(500, return_sequences=True, name='dec_lstm_1'),
            LSTM(500, return_sequences=True, name='dec_lstm_2'),
            TimeDistributed(Dense(vocab_size, activation='sigmoid'), name='decoded_mean'),
        ]

    def _build_decoder(self, encoded, vocab_size, max_length, layers=None):
        """Apply the decoder stack to the latent tensor `encoded`.

        Args:
            encoded: latent tensor to decode.
            vocab_size: number of output features per time step.
            max_length: number of time steps to generate.
            layers: optional list of already instantiated decoder layers to
                reuse (this is how weights are shared between models). When
                omitted, a fresh stack is created.

        Returns:
            Output tensor of the last decoder layer.
        """
        if layers is None:
            layers = self._make_decoder_layers(vocab_size, max_length)
        h = encoded
        for layer in layers:
            h = layer(h)
        return h

In [18]:
# Directory the per-epoch weight checkpoints are written to.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
checkpoint_dir = '/home/santiago/Projects/ProjectEuterpe/checkpoints/vae_classical/'

In [19]:
def bits_to_int(bits):
    """Interpret a sequence of bits (most significant first) as an integer.

    Args:
        bits: iterable of 0/1 values, given as plain ints or NumPy scalars.

    Returns:
        The value as a Python `int`; 0 for an empty sequence.
    """
    out = 0
    for bit in bits:
        # Coerce each bit to a Python int so the accumulator has arbitrary
        # precision. With fixed-width NumPy bits (e.g. a uint8 row) the
        # accumulator could otherwise take on that width and overflow for
        # fields longer than the dtype, such as the 16-bit time field.
        out = (out << 1) | int(bit)
    return out

In [20]:
def generator(X, batch_size=32, shuffle=True):
    """Yield `(inputs, targets)` autoencoder batches from `X` forever.

    Inputs and targets are the same batch. Each pass over `X` yields
    consecutive chunks of `batch_size` samples; the last chunk of a pass may
    be smaller.

    Args:
        X: 3-D NumPy array of samples, indexed along the first axis.
        batch_size: number of samples per batch.
        shuffle: when true, samples are visited in a new random order on
            every pass. `X` itself is never reordered, so callers that index
            it directly keep seeing the original order.

    Yields:
        Tuple `(batch, batch)`.

    Raises:
        ValueError: on the first `next()` if `X` is empty (the generator
            would otherwise loop forever without yielding).
    """
    n_samples = len(X)
    if n_samples == 0:
        raise ValueError('generator() needs at least one sample in X')
    while True:
        if shuffle:
            # Shuffle an index permutation rather than the caller's array.
            order = numpy.random.permutation(n_samples)
        for i in range(0, n_samples, batch_size):
            if shuffle:
                batch = X[order[i:i + batch_size], :, :]
            else:
                batch = X[i:i + batch_size, :, :]
            yield batch, batch

In [21]:
def to_midi(data):
    """Turn a sequence of encoded event rows into a single-track MIDI file.

    Args:
        data: iterable of rows laid out as flag (column 0), note bits
            (columns 1-8), velocity bits (columns 9-16) and time bits
            (column 17 onwards).

    Returns:
        `MidiFile` containing one track with one message per row.
        NOTE(review): the 8-bit note and velocity fields can encode values
        above 127; presumably `Message` rejects those — confirm.
    """
    track = MidiTrack()
    for row in data:
        # A flag of exactly 1 marks a note-on; anything else is a note-off.
        if row[0] == 1:
            kind = 'note_on'
        else:
            kind = 'note_off'
        message = Message(
            kind,
            note=bits_to_int(row[1:9]),
            velocity=bits_to_int(row[9:17]),
            time=bits_to_int(row[17:]),
        )
        track.append(message)
    midi = MidiFile()
    midi.tracks.append(track)
    return midi

In [22]:
# Endless stream of shuffled (input, target) batches over the training set.
gen = generator(data, batch_size=BATCH_SIZE, shuffle=True)

In [23]:
# Build the VAE with the default input width and window length and a learning
# rate of 1e-4, then print the autoencoder's layer summary.
model = VAE()
model.create(lr=0.0001)
model.autoencoder.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 128, 33)       0                                            
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 128, 1000)     2136000     input_1[0][0]                    
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 128, 1000)     0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
bidirectional_2 (Bidirectional)  (None, 1000) 

In [24]:
# Train one epoch at a time and write a weights checkpoint after each one.
# The loop has no exit condition: it runs until the kernel is interrupted.
# The checkpoint directory is created up front so a finished epoch is not
# lost to a failing `save_weights` call.
os.makedirs(checkpoint_dir, exist_ok=True)
epoch = 0
while True:
    epoch += 1
    print('Epoch', epoch)
    gc.collect()
    # One Keras "epoch" per iteration, covering len(data) // BATCH_SIZE batches.
    model.autoencoder.fit_generator(gen, len(data) // BATCH_SIZE, epochs=1)
    model.autoencoder.save_weights(os.path.join(checkpoint_dir, 'epoch{}.hdf5'.format(epoch)))

Epoch 1
Epoch 1/1
Epoch 2
Epoch 1/1
Epoch 3
Epoch 1/1
1400/9554 [===>..........................] - ETA: 9615s - loss: 39.2542 - acc: 0.3184

KeyboardInterrupt: 

In [25]:
# Save the weights of the epoch that was interrupted part-way through. The
# epoch number comes from `epoch`; the previous file name had no `{}`
# placeholder, so `.format(epoch)` did nothing and the "3" was hard-coded.
model.autoencoder.save_weights(os.path.join(checkpoint_dir, 'epoch{}_partial.hdf5'.format(epoch)))

In [37]:
# Reconstruct the first 100 windows with the autoencoder and round the
# sigmoid outputs to integer 0/1 bits.
originals = data[:100]
pred = model.autoencoder.predict(originals).round().astype(int)

In [44]:
# Convert window 99 of `data` to MIDI and write it to disk.
# NOTE(review): this exports the original window, not the reconstruction
# `pred[99]` — confirm which one was intended.
midi = to_midi(data[99])
midi.save('/home/santiago/Projects/ProjectEuterpe/data/test/vae2_0.mid')