In [18]:
import os
import glob
import numpy as np
from music21 import converter, instrument, note, chord, stream
import mido
import tensorflow as tf
from tqdm import tqdm
from keras import Model
from keras.layers import Dense, Reshape, InputLayer, Embedding, GlobalAveragePooling1D,\
                            Layer, MultiHeadAttention, LayerNormalization, LSTM
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.models import Sequential, load_model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

### Preprocess MIDI files -> tokenize the notes and chords.

In [3]:
def preprocess_midi_files(folder_path, output_file=None):
    """Tokenize every ``*.mid`` file in a folder into note/chord strings.

    Each file is parsed with music21. When ``partitionByInstrument`` yields
    parts, only the first part is walked; otherwise the flattened notes of the
    whole score are used. A ``Note`` becomes its pitch string and a ``Chord``
    becomes its normal-order pitch classes joined by ``'.'``.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.mid`` files.
        output_file: Optional path. When given, the tokens are written there,
            one per line.

    Returns:
        list[str]: All tokens, files visited in sorted path order.
    """
    notes = []

    # Sorted so the token order (and therefore the dataset) is reproducible.
    midi_paths = sorted(glob.glob(os.path.join(folder_path, "*.mid")))

    for midi_path in tqdm(midi_paths):
        try:
            midi = converter.parse(midi_path)

            parts = instrument.partitionByInstrument(midi)
            if parts:
                notes_in_file = parts.parts[0].recurse()
            else:
                notes_in_file = midi.flat.notes

            # Collect per file so a failure half-way through does not leave a
            # partial token run in the corpus.
            file_tokens = []
            for element in notes_in_file:
                if isinstance(element, note.Note):
                    file_tokens.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    file_tokens.append('.'.join(str(n) for n in element.normalOrder))
        except Exception as e:
            # Best-effort: a file that cannot be processed is reported and
            # skipped. The exception type is printed because the message alone
            # can be an opaque value.
            print(f'Error loading {midi_path}: {type(e).__name__}: {e}')
        else:
            notes.extend(file_tokens)

    if output_file is not None:
        with open(output_file, 'w') as f:
            f.write('\n'.join(notes))

    return notes

# Tokenize the jazz corpus; the tokens are written to a text file, one per line.
input_folder = './Jazz_midi/'
output_file = 'jazz.txt'
preprocess_midi_files(folder_path=input_folder, output_file=output_file)

  5%|▍         | 46/934 [00:55<20:35,  1.39s/it]

Error loading ./Jazz_midi\Andy.mid: 1714211658288


 18%|█▊        | 166/934 [03:20<13:19,  1.04s/it]

Error loading ./Jazz_midi\Chromazone.mid: 1714079026912


 30%|██▉       | 279/934 [05:43<11:33,  1.06s/it]

Error loading ./Jazz_midi\Fragile.mid: 1714131821280


 30%|███       | 281/934 [05:48<20:13,  1.86s/it]

Error loading ./Jazz_midi\FreedoomOfSpeech.mid: 1714219085008


 32%|███▏      | 298/934 [06:07<16:05,  1.52s/it]

Error loading ./Jazz_midi\GlasgowKiss.mid: 1714133881808


 34%|███▍      | 321/934 [06:43<12:44,  1.25s/it]

Error loading ./Jazz_midi\HelpMe.mid: 1714119524656


 65%|██████▌   | 609/934 [13:17<21:10,  3.91s/it]

Error loading ./Jazz_midi\PlayingForTime.mid: 1714174530512


 93%|█████████▎| 864/934 [19:04<02:05,  1.79s/it]

Error loading ./Jazz_midi\WhatsNewInBaltimore.mid: 1714124881680


100%|██████████| 934/934 [20:32<00:00,  1.32s/it]


### Encode the notes -> create a dictionary to map the unique notes and chords to integers:

In [6]:
def create_note_to_int_mapping(notes):
    """Map each distinct token to its rank in sorted order.

    Args:
        notes: Iterable of hashable, mutually comparable tokens.

    Returns:
        dict: token -> integer index, starting at 0.
    """
    vocabulary = sorted(set(notes))
    return {token: index for index, token in enumerate(vocabulary)}

### Prepare the dataset

In [16]:
def prepare_dataset(notes, sequence_length=100):
    """Build sliding-window training pairs from a token sequence.

    Every window of ``sequence_length`` consecutive tokens is an input and the
    token that follows it is the target.

    Args:
        notes: Sequence of tokens (strings).
        sequence_length: Number of tokens per input window.

    Returns:
        tuple: ``(network_input, network_output, note_to_int)`` where
            ``network_input`` has shape ``(n_patterns, sequence_length, 1)``
            with token indices divided by the vocabulary size,
            ``network_output`` is one-hot with ``vocab_size`` columns, and
            ``note_to_int`` is the token -> index mapping.

    Raises:
        ValueError: If ``notes`` has no more than ``sequence_length`` tokens,
            so that no (window, next-token) pair exists.
    """
    note_to_int = create_note_to_int_mapping(notes)
    vocab_size = len(note_to_int)

    n_patterns = len(notes) - sequence_length
    if n_patterns <= 0:
        raise ValueError(
            f"Need more than {sequence_length} notes to build a dataset, "
            f"got {len(notes)}."
        )

    # Encode once instead of re-encoding every overlapping window.
    encoded = [note_to_int[token] for token in notes]
    network_input = [encoded[i:i + sequence_length] for i in range(n_patterns)]
    network_output = encoded[sequence_length:]

    # Reshape the input and output data
    network_input = np.reshape(network_input, (n_patterns, sequence_length, 1))
    network_input = network_input / float(vocab_size)
    # Fix the one-hot width to the vocabulary size: without num_classes the
    # width is inferred from the largest target index, which is too small when
    # the highest-ranked token never appears as a target.
    network_output = to_categorical(network_output, num_classes=vocab_size)

    return network_input, network_output, note_to_int


### Build VAE-Transformer model

In [None]:
class SimpleVAE(Model):
    """Fully-connected variational autoencoder.

    The encoder flattens each sample and maps it to ``2 * latent_dim`` values
    that are split into mean and log-variance; the decoder maps a latent vector
    back to a tensor of shape ``input_shape`` with sigmoid activations.

    Args:
        input_shape: Shape of one sample, without the batch dimension.
        latent_dim: Size of the latent vector.
    """

    def __init__(self, input_shape, latent_dim):
        super(SimpleVAE, self).__init__()
        flat_dim = int(np.prod(input_shape))

        self.encoder = Sequential([
            InputLayer(input_shape=input_shape),
            # Flatten first: Dense acts on the last axis only, so without this
            # the encoder output keeps the sequence axis and the axis=1 split
            # in encode() would cut the sequence instead of mean/logvar.
            Reshape((flat_dim,)),
            Dense(256, activation='relu'),
            Dense(128, activation='relu'),
            Dense(latent_dim * 2),
        ])

        self.decoder = Sequential([
            InputLayer(input_shape=(latent_dim,)),
            Dense(128, activation='relu'),
            Dense(256, activation='relu'),
            Dense(flat_dim, activation='sigmoid'),
            Reshape(input_shape),
        ])

    def encode(self, x):
        """Return ``(mean, logvar)``, each of shape ``(batch, latent_dim)``."""
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        """Sample ``z = mean + eps * exp(logvar / 2)`` with ``eps ~ N(0, 1)``."""
        # tf.shape() is used because the static shape has an unknown batch
        # dimension when the model is traced.
        eps = tf.random.normal(shape=tf.shape(mean), dtype=mean.dtype)
        return eps * tf.exp(logvar * 0.5) + mean

    def decode(self, z):
        """Map latent vectors back to tensors of shape ``input_shape``."""
        return self.decoder(z)

    def call(self, inputs, training=None, mask=None):
        """Return ``(x_recon, mean, logvar)`` for a batch of inputs."""
        mean, logvar = self.encode(inputs)
        z = self.reparameterize(mean, logvar)
        x_recon = self.decode(z)
        return x_recon, mean, logvar


In [None]:
class TransformerBlock(Layer):
    """Self-attention block: attention and feed-forward, each with a residual
    connection followed by layer normalization.

    Args:
        d_model: Width of the feed-forward output; also passed as ``key_dim``
            to the attention layer.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        **kwargs: Standard ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, d_model, num_heads, d_ff, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = Sequential([
            Dense(d_ff, activation='relu'),
            Dense(d_model),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None, mask=None):
        """Apply self-attention and the feed-forward network to ``inputs``."""
        attn_output = self.att(inputs, inputs)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "d_ff": self.d_ff,
        })
        return config

In [None]:
class Sampling(Layer):
    """Reparameterized sampling: ``z = mean + eps * exp(logvar / 2)``."""

    def call(self, inputs):
        """Draw one sample per row of ``mean``.

        Args:
            inputs: Pair ``(mean, logvar)`` of equally shaped tensors.

        Returns:
            Tensor with the same shape as ``mean``.
        """
        mean, logvar = inputs
        # tf.shape() gives the runtime shape; the static ``mean.shape`` has an
        # unknown batch dimension when the layer is traced.
        eps = tf.random.normal(shape=tf.shape(mean), dtype=mean.dtype)
        return eps * tf.exp(logvar * 0.5) + mean

In [None]:
class VAETransformer(Model):
    """Model pairing an LSTM encoder with a Transformer/LSTM decoder.

    ``call`` encodes the input into ``mean``/``logvar``, draws ``z`` with the
    ``Sampling`` layer and passes ``z`` through ``decode``.
    """

    def __init__(self, input_shape, latent_dim, num_heads, d_ff, num_layers, num_tokens):
        """Create the encoder, the sampling layer and the decoder layers.

        Args:
            input_shape: Shape given to the encoder's ``InputLayer``.
            latent_dim: The encoder's last Dense layer has ``2 * latent_dim``
                units, later split into mean and log-variance.
            num_heads: Forwarded to every ``TransformerBlock``.
            d_ff: Forwarded to every ``TransformerBlock``.
            num_layers: Number of ``TransformerBlock`` instances.
            num_tokens: ``input_dim`` of the embedding, number of rows of the
                positional table, and number of softmax output units.
        """
        super(VAETransformer, self).__init__()

        # VAE Encoder: two stacked LSTMs, then a Dense layer producing the
        # concatenated mean and log-variance.
        self.encoder = Sequential([
            InputLayer(input_shape=input_shape),
            LSTM(512, return_sequences=True),
            LSTM(256),
            Dense(latent_dim * 2),
        ])

        # Sampling layer
        self.sampling = Sampling()

        # Transformer Decoder
        self.embedding = Embedding(input_dim=num_tokens, output_dim=512)
        # NOTE(review): the positional table gets `num_tokens` rows, i.e. it is
        # sized by the vocabulary rather than by a sequence length — confirm.
        self.pos_encoding = positional_encoding(num_tokens, 512)
        self.transformer_layers = [
            TransformerBlock(d_model=512, num_heads=num_heads, d_ff=d_ff)
            for _ in range(num_layers)
        ]
        self.decoder_lstm = LSTM(512, return_sequences=True)
        self.output_layer = Dense(num_tokens, activation='softmax')

    def encode(self, x):
        """Run the encoder and split its output in two halves along axis 1."""
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return mean, logvar

    def decode(self, z):
        """Embed ``z``, add positional encodings, then apply the Transformer
        blocks, the decoder LSTM and the softmax output layer.

        NOTE(review): ``call`` passes the sample produced by ``Sampling`` here
        (a float tensor), whereas an ``Embedding`` layer looks up integer
        indices — confirm this is the intended decoder input.
        """
        x = self.embedding(z)
        # Scale the embeddings by sqrt(embedding width) before adding positions.
        x *= tf.math.sqrt(tf.cast(self.embedding.output_dim, tf.float32))
        # Use only as many positional rows as x has entries on axis 1.
        x += self.pos_encoding[:, :x.shape[1], :]

        for layer in self.transformer_layers:
            x = layer(x)

        x = self.decoder_lstm(x)
        return self.output_layer(x)

    def call(self, inputs, training=None, mask=None):
        """Return ``(x_recon, mean, logvar)`` for ``inputs``."""
        mean, logvar = self.encode(inputs)
        z = self.sampling((mean, logvar))
        x_recon = self.decode(z)
        return x_recon, mean, logvar



def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) in the table.
        d_model: Encoding width (columns).

    Returns:
        float32 tensor of shape ``(1, position, d_model)``; even columns hold
        sines and odd columns hold cosines of the angles from ``get_angles``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_indices, d_model)

    # Sine on the even columns, cosine on the odd ones (in place).
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Prepend a broadcastable batch axis.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

def get_angles(pos, i, d_model):
    """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    Args:
        pos: Position indices (array or scalar).
        i: Column indices (array or scalar); columns ``2k`` and ``2k + 1``
            share the same rate.
        d_model: Encoding width used to scale the exponent.

    Returns:
        The product of ``pos`` and the per-column rates, following NumPy
        broadcasting between ``pos`` and ``i``.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))


### Training (note: the model was changed, so the steps from here on still need to be re-checked)

In [None]:
def vae_loss(x, x_recon, mean, logvar, beta=1.0):
    """Reconstruction loss plus a ``beta``-weighted KL term.

    Args:
        x: Original inputs.
        x_recon: Reconstructed inputs.
        mean: Latent means.
        logvar: Latent log-variances.
        beta: Weight applied to the KL term.

    Returns:
        Scalar tensor: mean binary cross-entropy between ``x`` and ``x_recon``
        plus ``beta`` times ``-0.5 * mean(1 + logvar - mean**2 - exp(logvar))``.
    """
    reconstruction = tf.reduce_mean(tf.keras.losses.binary_crossentropy(x, x_recon))
    kl_terms = 1 + logvar - tf.square(mean) - tf.exp(logvar)
    kl_divergence = -0.5 * tf.reduce_mean(kl_terms)
    return reconstruction + beta * kl_divergence

In [None]:
def train_models(input_data, output_data, vae, transformer, epochs=100, batch_size=64):
    """Train the VAE, then the next-note model, and save both.

    The VAE is trained with an explicit gradient loop because ``vae_loss``
    needs the latent ``mean`` and ``logvar`` in addition to the reconstruction,
    which a Keras ``loss(y_true, y_pred)`` callback cannot receive. The
    next-note model is trained with ``fit``.

    Args:
        input_data: Array of input windows; also the VAE's reconstruction
            target.
        output_data: One-hot next-token targets for ``transformer``.
        vae: Model whose call returns ``(x_recon, mean, logvar)``.
        transformer: Keras model mapping ``input_data`` to token
            probabilities.
        epochs: Number of epochs for each model.
        batch_size: Batch size for each model.

    Side effects:
        Writes ``vae_best_model.h5``, ``transformer_best_model.h5``,
        ``vae_final_model.h5`` and ``transformer_final_model.h5``.
    """
    _train_vae(vae, input_data, epochs, batch_size)

    transformer.compile(optimizer=Adam(), loss=CategoricalCrossentropy())
    # fit() gets no validation data, so the default monitor ("val_loss") would
    # never be available and no "best" checkpoint would be written.
    checkpoint_transformer = ModelCheckpoint(
        "transformer_best_model.h5", monitor="loss", save_best_only=True
    )
    transformer.fit(
        input_data,
        output_data,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[checkpoint_transformer],
    )

    # NOTE(review): whether a subclassed model can be written to a ".h5" file
    # depends on the installed Keras version — confirm these save calls.
    vae.save("vae_final_model.h5")
    transformer.save("transformer_final_model.h5")


def _train_vae(vae, input_data, epochs, batch_size):
    """Minimize ``vae_loss`` with Adam; save the model after its best epoch."""
    optimizer = Adam()
    features = np.asarray(input_data, dtype="float32")
    dataset = tf.data.Dataset.from_tensor_slices(features)
    # Bounded shuffle buffer so very large corpora do not get duplicated in memory.
    dataset = dataset.shuffle(max(1, min(len(features), 10000))).batch(batch_size)

    best_loss = float("inf")
    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0
        for batch in dataset:
            with tf.GradientTape() as tape:
                x_recon, mean, logvar = vae(batch, training=True)
                loss = vae_loss(batch, x_recon, mean, logvar)
            gradients = tape.gradient(loss, vae.trainable_variables)
            optimizer.apply_gradients(zip(gradients, vae.trainable_variables))
            total_loss += float(loss)
            n_batches += 1

        if n_batches == 0:
            # Nothing to train on.
            break

        epoch_loss = total_loss / n_batches
        print(f"VAE epoch {epoch + 1}/{epochs} - loss: {epoch_loss:.4f}")
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            vae.save("vae_best_model.h5")

### Generate music:

In [None]:
def generate_music(vae, transformer, int_to_note, seed_sequence, sequence_length=100, output_length=500):
    """Generate tokens one at a time from a seed window.

    At every step the current window of token indices is divided by the
    vocabulary size, passed through ``vae`` (only its first output is used) and
    then through ``transformer``; the next token is sampled from the resulting
    probabilities and appended to the window while the oldest entry is dropped.

    Args:
        vae: Callable returning a 3-tuple; the first element is fed to
            ``transformer``.
        transformer: Callable returning one probability per vocabulary entry
            (a tensor with ``.numpy()`` or an array-like).
        int_to_note: Mapping from token index to token string.
        seed_sequence: ``sequence_length`` token indices (not normalized).
        sequence_length: Window length expected by the models.
        output_length: Number of tokens to generate.

    Returns:
        list: The generated tokens, in order.
    """
    vocab_size = len(int_to_note)
    window = np.asarray(seed_sequence).reshape(-1)
    generated_notes = []

    for _ in range(output_length):
        seed_input = np.reshape(window, (1, sequence_length, 1))
        seed_input = seed_input / float(vocab_size)

        # Pass the window through the VAE; only its first output is used.
        vae_output, _, _ = vae(seed_input)

        # Predict the distribution of the next token.
        prediction = transformer(vae_output)
        if hasattr(prediction, "numpy"):
            prediction = prediction.numpy()

        # np.random.choice validates sum(p) == 1 in float64 with a tight
        # tolerance; float32 softmax output can miss it, so renormalize.
        probabilities = np.asarray(prediction, dtype=np.float64).ravel()
        probabilities = probabilities / probabilities.sum()
        predicted_note = int(np.random.choice(vocab_size, p=probabilities))

        # Slide the window: drop the oldest index, append the new one.
        window = np.append(window[1:], predicted_note)

        generated_notes.append(int_to_note[predicted_note])

    return generated_notes

### To generate music in a specific genre, train on a dataset of MIDI files from that genre

In [None]:
# Preprocess MIDI files
folder_path = "path/to/your/midi/files"
notes_file = "notes.txt"
# preprocess_midi_files() takes an output path and writes one token per line;
# the tokens are read back from that file so this cell does not rely on the
# function's return value.
preprocess_midi_files(folder_path, notes_file)
with open(notes_file) as f:
    notes = [token for token in f.read().split('\n') if token]

# Prepare the dataset
sequence_length = 100
network_input, network_output, note_to_int = prepare_dataset(notes, sequence_length)
int_to_note = {i: n for n, i in note_to_int.items()}

# Create the VAE and Transformer models
input_shape = (sequence_length, 1)
latent_dim = 32
num_heads = 8
ff_dim = 32
num_layers = 2

vae = SimpleVAE(input_shape, latent_dim)
# NOTE(review): no `SimpleTransformer` definition is visible in this notebook —
# confirm where it is defined (or define it) before running this cell.
transformer = SimpleTransformer(input_shape, num_heads, ff_dim, num_layers)

# Train the models
epochs = 100
batch_size = 64
train_models(network_input, network_output, vae, transformer, epochs, batch_size)

# Load the best models
vae_best = load_model("vae_best_model.h5", custom_objects={"SimpleVAE": SimpleVAE})
transformer_best = load_model("transformer_best_model.h5", custom_objects={"SimpleTransformer": SimpleTransformer})

# Generate music
# randint's upper bound is exclusive, so len(network_input) covers every window.
seed_index = np.random.randint(len(network_input))
# network_input rows are token indices divided by the vocabulary size, while
# generate_music() expects raw indices and normalizes them itself; undo the
# scaling so the seed is not normalized twice.
seed_sequence = np.rint(network_input[seed_index] * len(note_to_int)).astype(int).ravel()
generated_notes = generate_music(vae_best, transformer_best, int_to_note, seed_sequence, sequence_length)

# Convert generated notes to a MIDI file
output_midi = stream.Stream()

for n in generated_notes:
    # Chord tokens are pitch classes joined by '.'; a chord with a single pitch
    # class is a bare digit string and must take the chord branch too.
    if '.' in n or n.isdigit():
        notes_in_chord = n.split('.')
        chord_notes = []
        for current_note in notes_in_chord:
            new_note = note.Note(int(current_note))
            new_note.storedInstrument = instrument.Piano()
            chord_notes.append(new_note)
        new_chord = chord.Chord(chord_notes)
        output_midi.append(new_chord)
    else:
        new_note = note.Note(n)
        new_note.storedInstrument = instrument.Piano()
        output_midi.append(new_note)

output_midi.write('midi', fp='generated_music.mid')
