In [2]:
import os
import glob
from tqdm import tqdm
from music21 import converter, instrument, note, chord, stream, pitch
import numpy as np
from keras.utils import to_categorical
import tensorflow as tf
from keras import backend as K
from keras.metrics import Mean
from keras.layers import Input, LSTM, Dense, Lambda, Layer
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import KLDivergence, CategoricalCrossentropy

### Load and preprocess dataset

In [4]:
# Corrupt files:
test_midi = converter.parseFile('./Jazz_midi/Andy.mid')
print('Step 1')
test_notes_to_parse = None

test_parts = instrument.partitionByInstrument(test_midi)
print('Step 2')
if test_parts: 
    notes_to_parse = test_parts.parts[0].recurse()
else: 
    notes_to_parse = test_midi.flat.notes

KeyError: 2303681894240

In [4]:
def preprocess_midi_files(input_folder, output_file):
    notes = []

    for file in tqdm(glob.glob(os.path.join(input_folder, "*.mid"))):
        try:
            midi = converter.parse(file)
            notes_to_parse = None

            parts = instrument.partitionByInstrument(midi)
            if parts: 
                notes_to_parse = parts.parts[0].recurse()
            else: 
                notes_to_parse = midi.flat.notes

            for element in notes_to_parse:
                if isinstance(element, note.Note):
                    notes.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    notes.append('.'.join(str(n) for n in element.normalOrder))
        except Exception as e:
            print(f'Error loading {file}: {e}')

    with open(output_file, 'w') as f:
        f.write('\n'.join(notes))

input_folder = './Jazz_midi/'
output_file = "jazz.txt"
preprocess_midi_files(input_folder, output_file)

  5%|▍         | 46/934 [01:46<44:12,  2.99s/it]  

Error loading ./Jazz_midi\Andy.mid: 1606126296032


 18%|█▊        | 166/934 [06:00<22:03,  1.72s/it]

Error loading ./Jazz_midi\Chromazone.mid: 1606081379776


 30%|██▉       | 279/934 [10:20<18:06,  1.66s/it]

Error loading ./Jazz_midi\Fragile.mid: 1606163902816


 30%|███       | 281/934 [10:32<42:15,  3.88s/it]

Error loading ./Jazz_midi\FreedoomOfSpeech.mid: 1606056338912


 32%|███▏      | 298/934 [11:08<32:58,  3.11s/it]

Error loading ./Jazz_midi\GlasgowKiss.mid: 1606220636896


 34%|███▍      | 321/934 [12:13<22:09,  2.17s/it]

Error loading ./Jazz_midi\HelpMe.mid: 1606055012480


 65%|██████▌   | 609/934 [29:36<27:20,  5.05s/it]  

Error loading ./Jazz_midi\PlayingForTime.mid: 1606203842816


 93%|█████████▎| 864/934 [35:55<02:25,  2.07s/it]

Error loading ./Jazz_midi\WhatsNewInBaltimore.mid: 1605937139472


100%|██████████| 934/934 [37:27<00:00,  2.41s/it]


In [4]:
def prepare_sequences(notes, sequence_length, n_vocab):
    pitchnames = sorted(set(item for item in notes))
    note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

    network_input = []
    network_output = []

    for i in range(0, len(notes) - sequence_length, 1):
        sequence_in = notes[i:i + sequence_length]
        sequence_out = notes[i + sequence_length]
        network_input.append([note_to_int[char] for char in sequence_in])
        network_output.append(note_to_int[sequence_out])

    n_patterns = len(network_input)
    network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
    network_input = network_input / float(n_vocab)
    network_output = to_categorical(network_output)

    return network_input, network_output

with open("jazz.txt", 'r') as f:
    notes = f.read().split('\n')

sequence_length = 100
n_vocab = len(set(notes))
network_input, network_output = prepare_sequences(notes, sequence_length, n_vocab)


In [46]:
print(type(network_input[0][0]))
print(network_output[0])
network_input.shape, network_output.shape

<class 'numpy.ndarray'>
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.

((300693, 100, 1), (300693, 787))

### Define VAE architecture

In [36]:
class Sampling(Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [37]:
latent_dim = 64

encoder_inputs = Input(shape=(sequence_length, 1))
x = LSTM(256, return_sequences=True)(encoder_inputs)
x = LSTM(256)(x)
z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)
z = Sampling()([z_mean, z_log_var])

encoder = Model(encoder_inputs, [z_mean, z_log_var, z])
encoder.summary()

Model: "model_32"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_31 (InputLayer)          [(None, 100, 1)]     0           []                               
                                                                                                  
 lstm_66 (LSTM)                 (None, 100, 256)     264192      ['input_31[0][0]']               
                                                                                                  
 lstm_67 (LSTM)                 (None, 256)          525312      ['lstm_66[0][0]']                
                                                                                                  
 dense_67 (Dense)               (None, 64)           16448       ['lstm_67[0][0]']                
                                                                                           

In [38]:
latent_inputs = Input(shape=(latent_dim,))
x = Dense(sequence_length * 256, activation='relu')(latent_inputs)
x = tf.reshape(x, (-1, sequence_length, 256))
x = LSTM(256, return_sequences=True)(x)
x = LSTM(256)(x)
decoder_outputs = Dense(n_vocab, activation='softmax')(x)

decoder = Model(latent_inputs, decoder_outputs)
decoder.summary()

Model: "model_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_32 (InputLayer)       [(None, 64)]              0         
                                                                 
 dense_69 (Dense)            (None, 25600)             1664000   
                                                                 
 tf.reshape_9 (TFOpLambda)   (None, 100, 256)          0         
                                                                 
 lstm_68 (LSTM)              (None, 100, 256)          525312    
                                                                 
 lstm_69 (LSTM)              (None, 256)               525312    
                                                                 
 dense_70 (Dense)            (None, 787)               202259    
                                                                 
Total params: 2,916,883
Trainable params: 2,916,883
Non-tr

In [58]:
class VAE(Model):
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = Mean(name="total_loss")
        self.reconstruction_loss_tracker = Mean(name="reconstruction_loss")
        self.kl_loss_tracker =  Mean(name="kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        input, output = data
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(input)
            reconstruction = self.decoder(z)
            # print(f'Here: {input.shape}, {output.shape}, {reconstruction.shape}')
            # print(f'{tf.keras.losses.categorical_crossentropy(output, reconstruction)}')
            reconstruction_loss = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(output, reconstruction)
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [60]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=Adam())
vae.fit(network_input, network_output, epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x127357462e0>

In [62]:
vae.save_weights('jazz_vae.hdf5')

### Postprocess and save generated music

In [82]:
def generate_music(vae_model, network_input, int_to_note, n_vocab, sequence_length):
    start = np.random.randint(0, len(network_input) - 1)
    pattern = network_input[start]
    prediction_output = []

    for note_index in range(100):
        prediction_input = np.reshape(pattern, (1, sequence_length, 1))
        prediction = vae_model.predict([prediction_input, prediction_input], verbose=0)
        index = np.argmax(prediction)
        result = int_to_note[index]
        prediction_output.append(result)
        pattern = np.append(pattern[1:], [index / float(n_vocab)])
    return prediction_output

int_to_note = dict((number, note) for number, note in enumerate(sorted(set(item for item in notes))))
generated_notes = generate_music(vae_model, network_input, int_to_note, n_vocab, sequence_length)

shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)
shape: [(None, 100, 1)] vs (1, 100, 1) & (1, 787)


In [84]:
generated_notes

['8.10.0.2.3',
 '8.10.0.2.3',
 'F#-1',
 'F#-1',
 '8.10.0.2.3',
 '11.0.2.6',
 '11.0.2.6',
 'F#-1',
 'F#-1',
 '11.0.2.6',
 'F#-1',
 'F#-1',
 '3.5.6',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 '8.10',
 '1.2.4.5.6.9',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 '3.7',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 '8.10.0.2.3',
 'F#-1',
 '11.0.2.6',
 'F#-1',
 'F#-1',
 'F#-1',
 '8.10.0.2.3',
 'F#-1',
 '10.3',
 'F#-1',
 'F#-1',
 '2.5.6',
 '8.10.0.2.3',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 '6.7.9.11.2',
 'F#-1',
 '6.7.9.11.2',
 'F#-1',
 '2.5.6',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 '6.7.9.11.2',
 'F#-1',
 '8.10.0.2.3',
 '8.10.0.2.3',
 'F#-1',
 '8.10.0.2.3',
 'F#-1',
 '8.10.3',
 'F#-1',
 'F#-1',
 '11.0.2.6',
 'F#-1',
 'F#-1',
 'F#-1',
 '11.0.2.6',
 'D3',
 'F#-1',
 'F#-1',
 'F#-1',
 '5.8.10',
 'D3',
 'F#-1',
 '7.9.11.3',
 'F#-1',
 'F#-1',
 'F#-1',
 'F#-1',
 '3.5.6',
 'F#-1',
 'F#-1',
 '11.0.2

In [95]:
new_note = note.Note('F#-1') # why is it not recognized?

In [96]:
def create_midi(prediction_output, output_file):
    offset = 0
    output_notes = []

    for pattern in prediction_output:
        try:
            if ('.' in pattern) or pattern.isdigit():
                notes_in_chord = pattern.split('.')
                notes = []
                for current_note in notes_in_chord:
                    new_note = note.Note(int(current_note))
                    new_note.storedInstrument = instrument.Piano()
                    notes.append(new_note)
                new_chord = chord.Chord(notes)
                new_chord.offset = offset
                output_notes.append(new_chord)
            else:
                print(pattern)
                new_note = note.Note()
                new_note.pitch = pitch.Pitch(pattern)
                new_note.offset = offset
                new_note.storedInstrument = instrument.Piano()
                output_notes.append(new_note)

            offset += 0.5
        except Exception as e:
            print(f'Error loading pattern: {pattern}: {e}')

    midi_stream = stream.Stream(output_notes)
    midi_stream.write('midi', fp=output_file)

output_file = "generated_music.mid"
create_midi(generated_notes, output_file)

F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accidental type
F#-1
Error loading pattern: F#-1: #- is not a supported accident