In [49]:
import os
import glob
import numpy as np
from music21 import converter, instrument, note, chord, stream, pitch
import tensorflow as tf
from tqdm import tqdm
from keras import Model
from keras.layers import Dense, Reshape, InputLayer, Embedding, GlobalAveragePooling1D,\
                            Layer, MultiHeadAttention, LayerNormalization, LSTM, Dropout,\
                            Conv1D, RepeatVector, GRU, TimeDistributed
from keras.optimizers import Adam
from keras import Input
from keras.losses import CategoricalCrossentropy, binary_crossentropy, categorical_crossentropy
from keras.models import Sequential, load_model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [2]:
with open("jazz.txt", 'r') as f:
    notes = f.read().split('\n')

pitchnames = sorted(set(item for item in notes))
pitchnames

['0',
 '0.1',
 '0.1.2',
 '0.1.2.3',
 '0.1.2.3.6.9',
 '0.1.2.5',
 '0.1.2.7',
 '0.1.3.4.6.9',
 '0.1.3.5.7.8',
 '0.1.3.7.8',
 '0.1.4.5',
 '0.1.4.7',
 '0.1.5',
 '0.1.5.6.7',
 '0.1.5.7',
 '0.1.5.8',
 '0.1.6',
 '0.2',
 '0.2.3',
 '0.2.3.6',
 '0.2.3.7',
 '0.2.4',
 '0.2.4.5',
 '0.2.4.5.7',
 '0.2.4.5.7.9',
 '0.2.4.6',
 '0.2.4.6.7',
 '0.2.4.6.9',
 '0.2.4.7',
 '0.2.4.7.8',
 '0.2.4.7.9',
 '0.2.4.8',
 '0.2.5',
 '0.2.5.6',
 '0.2.5.7',
 '0.2.5.7.8',
 '0.2.5.8',
 '0.2.6',
 '0.2.6.7',
 '0.2.6.8',
 '0.2.7',
 '0.3',
 '0.3.4',
 '0.3.5',
 '0.3.5.7',
 '0.3.5.8',
 '0.3.6',
 '0.3.6.8',
 '0.3.6.9',
 '0.3.7',
 '0.4',
 '0.4.5',
 '0.4.5.6',
 '0.4.5.7',
 '0.4.6',
 '0.4.7',
 '0.4.8',
 '0.5',
 '0.5.6',
 '0.6',
 '1',
 '1.2',
 '1.2.3',
 '1.2.4',
 '1.2.4.5.6',
 '1.2.4.5.6.9',
 '1.2.4.6',
 '1.2.4.6.8',
 '1.2.4.6.9',
 '1.2.4.7',
 '1.2.4.8',
 '1.2.4.8.9',
 '1.2.5',
 '1.2.5.8',
 '1.2.5.9',
 '1.2.6',
 '1.2.6.7',
 '1.2.6.8',
 '1.2.6.9',
 '1.2.7',
 '1.2.7.8',
 '1.3',
 '1.3.4',
 '1.3.4.6',
 '1.3.4.8',
 '1.3.5.6.8',
 '1.3.5.7.10

In [10]:
def prepare_sequences(notes, sequence_length, num_notes):
    # Create a dictionary to map pitches to integers
    pitchnames = sorted(set(item for item in notes))
    note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
    
    network_input = []
    network_output = []

    # Create input sequences and the corresponding outputs
    for i in range(0, len(notes) - sequence_length, 1):
        sequence_in = notes[i: i + sequence_length]
        sequence_out = notes[i + sequence_length]
        network_input.append([note_to_int[char] for char in sequence_in])
        network_output.append(note_to_int[sequence_out])

    # One-hot encode the input sequences
    network_input = to_categorical(network_input, num_notes)
    network_input = np.reshape(network_input, (-1, sequence_length, num_notes))
    
    # One-hot encode the output notes
    network_output = to_categorical(network_output, num_notes)

    return network_input, network_output


In [28]:
# Define the model parameters
latent_dim = 64
sequence_length = 100
num_notes = len(set(notes))

class Decoder(Model):
    def __init__(self, latent_dim, sequence_length, num_notes, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.latent_dim = latent_dim
        self.sequence_length = sequence_length
        self.num_notes = num_notes

        # Change dense layer to take sequence input
        self.dense = Dense(units=self.latent_dim, activation='relu')
        self.reshape = Reshape(target_shape=(self.sequence_length, self.latent_dim))

        self.gru = GRU(units=self.latent_dim, return_sequences=True)
        self.final_dense = TimeDistributed(Dense(self.num_notes, activation='softmax'))

    def call(self, inputs):
        x = self.dense(inputs)
        x = self.reshape(x)
        x = self.gru(x)
        x = self.final_dense(x)
        return x


class VAE(Model):
    def __init__(self, sequence_length, num_notes, latent_dim=64, num_heads=4, ff_dim=128, **kwargs):
        super(VAE, self).__init__(**kwargs)
        
        self.sequence_length = sequence_length
        self.num_notes = num_notes
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim

        self.encoder = self.build_encoder()
        self.decoder = self.build_decoder()

    def build_encoder(self):
        inputs = Input(shape=(self.sequence_length, self.num_notes))
        x = self.transformer_encoder(inputs, self.latent_dim, self.num_heads, self.ff_dim)
        self.z_mean = Dense(self.latent_dim, name="z_mean")(x)
        self.z_log_var = Dense(self.latent_dim, name="z_log_var")(x)
        return Model(inputs, [self.z_mean, self.z_log_var], name="encoder")

    def build_decoder(self):
        latent_inputs = Input(shape=(self.sequence_length, self.latent_dim,))
        x = self.transformer_encoder(latent_inputs, self.latent_dim, self.num_heads, self.ff_dim)
        outputs = TimeDistributed(Dense(self.num_notes, activation="softmax"))(x)
        return Model(latent_inputs, outputs, name="decoder")


    def transformer_encoder(self, inputs, head_size, num_heads, ff_dim, dropout=0):
        x = LayerNormalization(epsilon=1e-6)(inputs)
        x = MultiHeadAttention(
            key_dim=head_size, num_heads=num_heads, dropout=dropout
        )(x, x)
        x = Dropout(dropout)(x)
        res = x + inputs
        x = LayerNormalization(epsilon=1e-6)(res)
        x = Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(x)
        x = Dropout(dropout)(x)
        x = Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
        return x + res

    def call(self, inputs):
        z_mean, z_log_var = self.encoder(inputs)
        z = self.reparameterize(z_mean, z_log_var)
        # No need to reshape z, pass it directly to the decoder
        decoded = self.decoder(z)
        return decoded

    def reparameterize(self, mean, log_var):
        eps = tf.random.normal(shape=tf.shape(mean))
        return eps * tf.exp(log_var * 0.5) + mean

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var = self.encoder(data)
            z = self.reparameterize(z_mean, z_log_var)
            reconstruction = self.decoder(z)  # Now, z does not need to be flattened
            reconstruction_loss = tf.reduce_mean(
                categorical_crossentropy(data, reconstruction)
            )
            reconstruction_loss *= self.num_notes
            kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
            kl_loss = tf.reduce_mean(kl_loss)
            kl_loss *= -0.5
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

vae = VAE(sequence_length, num_notes)
vae.compile(optimizer='adam')

In [16]:
def data_generator(notes, sequence_length, num_notes, batch_size):
    # Create a dictionary to map pitches to integers
    pitchnames = sorted(set(item for item in notes))
    note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
    
    while True:
        for i in range(0, len(notes) - sequence_length, batch_size):
            network_input = []
            network_output = []
            
            # Create input sequences and the corresponding outputs in batches
            for j in range(i, min(i + batch_size, len(notes) - sequence_length)):
                sequence_in = notes[j: j + sequence_length]
                sequence_out = notes[j + sequence_length]
                network_input.append([note_to_int[char] for char in sequence_in])
                network_output.append(note_to_int[sequence_out])
            
            # One-hot encode the input sequences
            network_input = to_categorical(network_input, num_notes)
            network_input = np.reshape(network_input, (-1, sequence_length, num_notes))
            
            # One-hot encode the output notes
            network_output = to_categorical(network_output, num_notes)
            
            yield network_input, network_output


In [31]:
# RUN HERE:

batch_size = 64  # adjust to your system memory
generator = data_generator(notes, sequence_length, num_notes=num_notes, batch_size=batch_size)

steps_per_epoch = (len(notes) - sequence_length) // batch_size  # adjust to your data size and batch size

vae.fit(generator, epochs=2, steps_per_epoch=steps_per_epoch)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2c00acf9360>

In [43]:
def generate_music(model, seed, sequence_length, num_notes, temperature=1.0):
    """Generate a sequence of notes from a seed sequence."""
    # Copy the seed sequence
    sequence = list(seed)
    
    # Limit the sequence to the sequence_length
    sequence = sequence[:sequence_length]
    
    for i in range(100):  # Generate 100 notes
        # Prepare the seed sequence
        seed_sequence = sequence[-sequence_length:]
        
        # One-hot encode the seed sequence
        seed_sequence_one_hot = to_categorical(seed_sequence, num_notes)
        seed_sequence_one_hot = np.reshape(seed_sequence_one_hot, (1, sequence_length, num_notes))

        # Get the output from the decoder
        output_sequence = model(seed_sequence_one_hot)
        
        # Get the last note from the output sequence
        last_note = output_sequence.numpy()[0, -1, :]
        
        # Apply temperature
        prediction = np.log(last_note + 1e-7) / temperature
        prediction = np.exp(prediction) / np.sum(np.exp(prediction))
        
        # Sample the next note from the adjusted prediction
        # Check if prediction contains NaN values
        if np.isnan(prediction).any():
            # print("Prediction contains NaN values:", prediction)
            pass
        else:
            next_note = np.random.choice(range(num_notes), p=prediction)
        
            # Append the note to the sequence
            sequence.append(next_note)

    return sequence


In [44]:
# Inverse of the note_to_int dictionary
note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
int_to_note = {i: n for n, i in note_to_int.items()}

# The first sequence in the first batch of the generator
first_batch = next(generator)
seed_sequence_one_hot = first_batch[0][0]

# Convert the one-hot encoded sequence to integer format
seed_sequence = [int_to_note[np.argmax(note)] for note in seed_sequence_one_hot]

# Convert the note names in seed_sequence to integers using the note_to_int dictionary
seed_sequence = [note_to_int[note] for note in seed_sequence]

# Now you can use seed_sequence as the seed for the generate_music function
generated_sequence = generate_music(vae, seed_sequence, sequence_length, num_notes=num_notes, temperature=0.7)
generated_sequence

[734,
 199,
 734,
 717,
 734,
 199,
 734,
 708,
 717,
 694,
 759,
 694,
 750,
 360,
 717,
 774,
 733,
 783,
 774,
 726,
 734,
 784,
 775,
 727,
 775,
 784,
 734,
 726,
 774,
 783,
 733,
 774,
 733,
 783,
 774,
 726,
 734,
 784,
 775,
 727,
 775,
 784,
 734,
 726,
 774,
 783,
 733,
 766,
 725,
 733,
 741,
 766,
 726,
 734,
 742,
 767,
 727,
 767,
 742,
 734,
 726,
 766,
 741,
 733,
 725,
 766,
 725,
 733,
 741,
 766,
 726,
 734,
 742,
 767,
 727,
 767,
 742,
 734,
 726,
 774,
 733,
 783,
 774,
 726,
 734,
 784,
 775,
 727,
 775,
 784,
 734,
 726,
 774,
 783,
 733,
 774,
 733,
 783,
 774,
 726]

In [46]:
int_to_note = dict((number, note) for number, note in enumerate(sorted(set(item for item in notes))))
note_sequence = [int_to_note[i] for i in generated_sequence]
note_sequence

['D5',
 '11.1',
 'D5',
 'C#5',
 'D5',
 '11.1',
 'D5',
 'B4',
 'C#5',
 'A4',
 'F#4',
 'A4',
 'E5',
 '4.10',
 'C#5',
 'G#3',
 'D4',
 'G4',
 'G#3',
 'C5',
 'D5',
 'G5',
 'G#4',
 'C6',
 'G#4',
 'G5',
 'D5',
 'C5',
 'G#3',
 'G4',
 'D4',
 'G#3',
 'D4',
 'G4',
 'G#3',
 'C5',
 'D5',
 'G5',
 'G#4',
 'C6',
 'G#4',
 'G5',
 'D5',
 'C5',
 'G#3',
 'G4',
 'D4',
 'F3',
 'C4',
 'D4',
 'E-4',
 'F3',
 'C5',
 'D5',
 'E-5',
 'F4',
 'C6',
 'F4',
 'E-5',
 'D5',
 'C5',
 'F3',
 'E-4',
 'D4',
 'C4',
 'F3',
 'C4',
 'D4',
 'E-4',
 'F3',
 'C5',
 'D5',
 'E-5',
 'F4',
 'C6',
 'F4',
 'E-5',
 'D5',
 'C5',
 'G#3',
 'D4',
 'G4',
 'G#3',
 'C5',
 'D5',
 'G5',
 'G#4',
 'C6',
 'G#4',
 'G5',
 'D5',
 'C5',
 'G#3',
 'G4',
 'D4',
 'G#3',
 'D4',
 'G4',
 'G#3',
 'C5']

In [48]:
n = note.Note('D5')

In [50]:
def create_midi(prediction_output, output_file):
    offset = 0
    output_notes = []

    for pattern in prediction_output:
        try:
            if ('.' in pattern) or pattern.isdigit():
                notes_in_chord = pattern.split('.')
                notes = []
                for current_note in notes_in_chord:
                    new_note = note.Note(int(current_note))
                    new_note.storedInstrument = instrument.Guitar()
                    notes.append(new_note)
                new_chord = chord.Chord(notes)
                new_chord.offset = offset
                output_notes.append(new_chord)
            else:
                print(pattern)
                new_note = note.Note()
                new_note.pitch = pitch.Pitch(pattern)
                new_note.offset = offset
                new_note.storedInstrument = instrument.Piano()
                output_notes.append(new_note)

            offset += 0.5
        except Exception as e:
            print(f'Error loading pattern: {pattern}: {e}')

    midi_stream = stream.Stream(output_notes)
    midi_stream.write('midi', fp=output_file)

output_file = "generated_trash.mid"
create_midi(note_sequence, output_file)

D5
D5
C#5
D5
D5
B4
C#5
A4
F#4
A4
E5
C#5
G#3
D4
G4
G#3
C5
D5
G5
G#4
C6
G#4
G5
D5
C5
G#3
G4
D4
G#3
D4
G4
G#3
C5
D5
G5
G#4
C6
G#4
G5
D5
C5
G#3
G4
D4
F3
C4
D4
E-4
F3
C5
D5
E-5
F4
C6
F4
E-5
D5
C5
F3
E-4
D4
C4
F3
C4
D4
E-4
F3
C5
D5
E-5
F4
C6
F4
E-5
D5
C5
G#3
D4
G4
G#3
C5
D5
G5
G#4
C6
G#4
G5
D5
C5
G#3
G4
D4
G#3
D4
G4
G#3
C5
