In [1]:
!pip install gdown
!pip install pretty_midi
!pip install miditok
!pip install midi-clip

!wget https://raw.githubusercontent.com/roostico/NesGen/refs/heads/main/utility.py

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25ldone
[?25h  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592292 sha256=5901c20573859af0d61f697697fd9a66b44f77625e8ae18f9

In [2]:
import os
import random
import shutil
from tqdm import tqdm
from pathlib import Path
import pretty_midi
import numpy as np
from miditok import REMI, TokenizerConfig
import json
import tensorflow as tf
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
import random
from random import shuffle

import sys
import pickle
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, Reshape, Dropout, LSTM, Bidirectional
from tensorflow.keras.layers import BatchNormalization, Activation, ZeroPadding2D
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
     

# Data section

In [3]:
!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip
!unzip "maestro-v3.0.0-midi.zip"
!rm "maestro-v3.0.0-midi.zip"

dataset_path = "/kaggle/working/maestro-v3.0.0"

  pid, fd = os.forkpty()


--2024-12-17 09:41:31--  https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.117.207, 74.125.20.207, 74.125.195.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.117.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58416533 (56M) [application/octet-stream]
Saving to: 'maestro-v3.0.0-midi.zip'


2024-12-17 09:41:32 (87.7 MB/s) - 'maestro-v3.0.0-midi.zip' saved [58416533/58416533]

Archive:  maestro-v3.0.0-midi.zip
  inflating: maestro-v3.0.0/2004/MIDI-Unprocessed_XP_08_R1_2004_01-02_ORIG_MID--AUDIO_08_R1_2004_01_Track01_wav.midi  
  inflating: maestro-v3.0.0/2004/MIDI-Unprocessed_XP_09_R1_2004_05_ORIG_MID--AUDIO_09_R1_2004_06_Track06_wav.midi  
  inflating: maestro-v3.0.0/2004/MIDI-Unprocessed_XP_14_R1_2004_01-03_ORIG_MID--AUDIO_14_R1_2004_01_Track01_wav.midi  
  inflating: maestro-v3.0.0/2004/MIDI-Unprocessed_XP_01_R1_2004

## Prepare the dataset

In [4]:
def _find_midi_files(root):
  """Return the sorted, absolute paths of all *.mid / *.midi files below `root`."""
  root = Path(root).resolve()
  return sorted(p for ext in ("mid", "midi") for p in root.glob(f"**/*.{ext}"))


# Paths to the files of the dataset
midi_paths = _find_midi_files(dataset_path)

midis_dir = "midis"
os.makedirs(midis_dir, exist_ok=True)

# Flatten the dataset: move every file into `midis_dir` under a numeric name.
for i, midi_path in enumerate(midi_paths):
  new_midi_path = os.path.join(midis_dir, f"{i}.midi")
  shutil.move(str(midi_path), new_midi_path)

# List the files from the very directory they were moved into, rather than from
# a hard-coded absolute path that is only correct for one working directory.
midis = _find_midi_files(midis_dir)


def sample():
  """Return the path (as a string) of one randomly chosen file from `midis`."""
  return str(random.choice(midis))

In [5]:
# Value handed to the tokenizer configuration as `beat_res`.
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}

# Keyword arguments used to build the TokenizerConfig below.
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    beat_res=BEAT_RES,
    num_velocities=6,
    special_tokens=["BOS", "EOS"],
    use_chords=True,
    use_rests=True,
    use_tempos=True,
    num_tempos=8,
    tempo_range=(50, 200),  # (min_tempo, max_tempo)
)

# Build the configuration object, then the REMI tokenizer that uses it.
config = TokenizerConfig(**TOKENIZER_PARAMS)
tokenizer = REMI(config)

In [7]:
# Vocabulary size requested from the tokenizer training on the MIDI files.
vocab_size = 1000
tokenizer.train(files_paths=midis, vocab_size=vocab_size)






In [6]:
# Rebuild every entry of `midis` as a Path and report how many files there are.
processed = list(map(Path, map(str, midis)))
print(len(processed))

1276


In [8]:
valid_perc = 0.05
augment = False
SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible

total_num_files = len(processed)
num_files_valid = round(total_num_files * valid_perc)

# Shuffle a sorted copy with a dedicated RNG: the split then depends neither on
# the file-system listing order nor on the global random state, and re-running
# the cell leaves `processed` untouched.
shuffled_paths = sorted(processed)
random.Random(SPLIT_SEED).shuffle(shuffled_paths)
midi_paths_valid = shuffled_paths[:num_files_valid]
midi_paths_train = shuffled_paths[num_files_valid:]

# Chunk MIDIs and perform data augmentation on each subset independently

for files_paths, subset_name in (
    (midi_paths_train, "train"),
    (midi_paths_valid, "valid"),
):
    if not files_paths:
        # Nothing to chunk (e.g. a tiny dataset rounds the validation share to 0).
        print(f"No files in the {subset_name} subset, skipping.")
        continue
    print(files_paths[0])

    # Split the MIDIs into chunks of approximately 1024 tokens each

    subset_chunks_dir = Path(f"Maestro_{subset_name}")

    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Perform data augmentation
    if augment:
        augment_dataset(
            subset_chunks_dir,
            pitch_offsets=[-12, 12],
            velocity_offsets=[-3, 3],
            duration_offsets=[-0.5, 0.5],
        )

# From here on the two lists hold the chunk files instead of the full pieces.
midi_paths_train = [p for ext in ("mid", "midi") for p in Path("Maestro_train").glob(f"**/*.{ext}")]
midi_paths_valid = [p for ext in ("mid", "midi") for p in Path("Maestro_valid").glob(f"**/*.{ext}")]

/kaggle/working/midis/1241.midi


Splitting music files (Maestro_train): 100%|██████████| 1212/1212 [00:17<00:00, 67.71it/s]


/kaggle/working/midis/253.midi


Splitting music files (Maestro_valid): 100%|██████████| 64/64 [00:00<00:00, 69.48it/s]


In [9]:
def midi_valid(midi) -> bool:
    """Return True when every time signature of `midi` has a numerator of 4.

    The time signatures are read from `midi.time_signatures` when that
    attribute exists and from `midi.time_signature_changes` otherwise, so the
    check works whichever of the two names the loaded object exposes.
    A file without any time signature entry is accepted.
    """
    time_signatures = getattr(midi, "time_signatures", None)
    if time_signatures is None:
        time_signatures = getattr(midi, "time_signature_changes", [])
    # Reject anything that is not 4/*, i.e. not 4 beats per bar.
    return all(ts.numerator == 4 for ts in time_signatures)


for subset in ("train", "valid"):
    tokens_dir = Path(f"tokenized_{subset}")

    # Drop the tokens of a previous run from the directory that is actually
    # written below, so stale JSON files never mix with fresh ones.
    if tokens_dir.exists():
        shutil.rmtree(tokens_dir)

    tokenizer.tokenize_dataset(
        Path(f"Maestro_{subset}"),
        tokens_dir,
        # Must be passed by keyword: the third positional parameter of
        # tokenize_dataset is `overwrite_mode`, not the validation function.
        validation_fn=midi_valid,
    )

Tokenizing music files (working/tokenized_train): 100%|██████████| 15857/15857 [05:23<00:00, 49.07it/s]
Tokenizing music files (working/tokenized_valid): 100%|██████████| 815/815 [00:16<00:00, 49.64it/s]


In [10]:
def read_json(path: str) -> dict:
  """Open the file at `path`, parse its content as JSON and return the result."""
  with open(path, "r") as handle:
    raw_text = handle.read()
  return json.loads(raw_text)

def read_json_files(json_file_paths):
    """Load several JSON files, showing a progress bar while doing so.

    Args:
        json_file_paths: Iterable of paths to JSON files.
    Returns:
        A list with the parsed content of each file, in input order.
        As soon as one file is missing or is not valid JSON, a message is
        printed and an empty list is returned instead (everything read so far
        is discarded).
    """
    loaded = []
    for file_path in tqdm(json_file_paths):
        try:
            parsed = read_json(file_path)
        except FileNotFoundError:
            print(f"Error: File not found - {file_path}")
            return []  # abort: signal the failure with an empty result
        except json.JSONDecodeError:
            print(f"Error decoding JSON in file: {file_path}")
            return []  # abort: signal the failure with an empty result
        loaded.append(parsed)
    return loaded


In [11]:
# Locate the token files (JSON) written for both subsets.
tokenized_train = [*Path("tokenized_train").resolve().glob("**/*.json")]
data_objects_train = read_json_files(tokenized_train)

tokenized_valid = [*Path("tokenized_valid").resolve().glob("**/*.json")]
data_objects_valid = read_json_files(tokenized_valid)

# read_json_files returns an empty list on failure, so emptiness means error.
if not data_objects_train:
    print("Error reading JSON files.")
else:
    print(f"\nSuccessfully read {len(data_objects_train)} training JSON files.")

100%|██████████| 15857/15857 [00:02<00:00, 6696.15it/s]
100%|██████████| 815/815 [00:00<00:00, 6996.57it/s]


Successfully read 15857 training JSON files.





In [12]:
def _first_ids(song):
    """Return the first entry stored under the "ids" key of `song` as an array."""
    return np.array(song["ids"][0])


encoded_train = list(map(_first_ids, data_objects_train))
encoded_valid = list(map(_first_ids, data_objects_valid))

### (Optional) decode one piece

In [None]:
# Decode the first 1024 ids of the first training sequence and write them as MIDI.
sample_ids = encoded_train[0][:1024]
sample_score = tokenizer.decode([sample_ids])
sample_score.dump_midi("sample.mid")

In [13]:
# Join the per-file id arrays of each subset into one long 1-D stream.
all_ids_train, all_ids_valid = (
    np.concatenate(arrays, axis=0) for arrays in (encoded_train, encoded_valid)
)

In [14]:
import datetime

# The current month and day become part of every file name written below.
today = datetime.datetime.today()
day = today.day
month = today.month

# Save the trained tokenizer.
name = "tokenizer{:d}_{:02d}{:02d}.json".format(vocab_size, month, day)
tokenizer.save(name)

# Token ids are integers: write them with fmt="%d". The default format of
# np.savetxt ("%.18e") stores every id as a 25-character float, which makes
# the files several times larger for no benefit. np.loadtxt reads both forms.
np.savetxt("ids_train_{:02d}{:02d}.txt".format(month, day), all_ids_train, fmt="%d")
np.savetxt("ids_valid_{:02d}{:02d}.txt".format(month, day), all_ids_valid, fmt="%d")


In [15]:
# Store the ids as 32-bit integers.
all_ids_train, all_ids_valid = (
    ids.astype(np.int32) for ids in (all_ids_train, all_ids_valid)
)

## ...or skip the data preparation and download the already prepared tokenizer and id files

In [3]:
# Download the prepared tokenizer and id files by their Google Drive ids;
# the trailing comment on each line names the file that the id refers to.
!gdown 1ZIPjenm4tEzAKPc-ONE4gYLzILR3YYqe # tokenizer1000_1217.json
!gdown 1LN8wrTcUOzlPkQs7Gh-RD9Z2ftbua_E6 # ids_train_1217.txt
!gdown 12SOuWNUM9ofo5dhGWvNEj09c_dYisB7g # ids_valid_1217.txt

  pid, fd = os.forkpty()


Downloading...
From: https://drive.google.com/uc?id=1ZIPjenm4tEzAKPc-ONE4gYLzILR3YYqe
To: /kaggle/working/tokenizer1000_1217.json
100%|██████████████████████████████████████| 64.2k/64.2k [00:00<00:00, 83.8MB/s]
100%|████████████████████████████████████████| 397M/397M [00:08<00:00, 49.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=12SOuWNUM9ofo5dhGWvNEj09c_dYisB7g
To: /kaggle/working/ids_valid_1217.txt
100%|██████████████████████████████████████| 20.6M/20.6M [00:00<00:00, 32.8MB/s]


In [4]:
def _load_ids(path):
    """Read the id file at `path` with np.loadtxt and cast the values to int32."""
    return np.loadtxt(path).astype(np.int32)


# Restore the tokenizer from its saved parameters, then load both id streams.
tokenizer = REMI(params="tokenizer1000_1217.json")
all_ids_train = _load_ids("ids_train_1217.txt")
all_ids_valid = _load_ids("ids_valid_1217.txt")

## Tensorflow datasets

### Recommended: limit arrays

In [5]:
# Fraction of each id stream to keep (1 keeps everything).
perc = 1

keep_train = int(perc * len(all_ids_train))
keep_valid = int(perc * len(all_ids_valid))
all_ids_train = all_ids_train[:keep_train]
all_ids_valid = all_ids_valid[:keep_valid]

print(f"Loaded {len(all_ids_train)} training ids")

Loaded 15864667 training ids


In [6]:
# One tf.data element per token id, for each of the two subsets.
ids_dataset_train, ids_dataset_valid = (
    tf.data.Dataset.from_tensor_slices(ids)
    for ids in (all_ids_train, all_ids_valid)
)

In [7]:
seq_length = 512
vocab_size = len(tokenizer)
BATCH_SIZE = 64
BUFFER_SIZE = 60000

def normalize(sequence):
    """Cast `sequence` to float32 and rescale it around vocab_size / 2.

    An id of 0 maps to -1 and an id of vocab_size would map to +1.
    """
    # Convert to float32
    input_seq = tf.cast(sequence, tf.float32)
    normalized_seq = (input_seq - vocab_size / 2) / (vocab_size / 2)
    return normalized_seq

def make_sequence_dataset(ids_dataset, shuffle):
    """Turn a dataset of single token ids into batches of normalized sequences.

    Args:
        ids_dataset: tf.data.Dataset yielding one token id per element.
        shuffle: Whether to shuffle the sequences before they are batched.
    Returns:
        A dataset whose elements have shape (BATCH_SIZE, seq_length, 1).
    """
    sequences = (
        ids_dataset
        .batch(seq_length, drop_remainder=True)  # Create sequences of shape (seq_length,)
        .map(normalize)
        .map(lambda x: (tf.expand_dims(x, -1)))  # Add channel dimension: (seq_length, 1)
    )
    if shuffle:
        # Shuffle the individual sequences BEFORE batching. Shuffling after the
        # final batch() only reorders whole batches, so every batch would hold
        # the same neighbouring sequences in every epoch.
        sequences = sequences.shuffle(BUFFER_SIZE)
    return (
        sequences
        .batch(BATCH_SIZE, drop_remainder=True)  # Batch for training: (batch_size, seq_length, 1)
        .prefetch(tf.data.AUTOTUNE)
    )

train_ds = make_sequence_dataset(ids_dataset_train, shuffle=True)
# The validation data is kept in its original order.
valid_ds = make_sequence_dataset(ids_dataset_valid, shuffle=False)

for real_seqs in train_ds.take(1):
    print(f"Input Shape: {real_seqs.shape}, Input Type: {real_seqs.dtype}")
    

Input Shape: (64, 512, 1), Input Type: <dtype: 'float32'>


# The model

In [8]:
def generator(latent_dim, seq_shape):
    """Build the generator network.

    Args:
        latent_dim: Length of the noise sequence; the input shape is (latent_dim, 1).
        seq_shape: Shape of one generated sequence (without the batch axis).
    Returns:
        A Sequential model whose output has shape `seq_shape` and, because of
        the final tanh, values in [-1, 1].
    """
    model = Sequential()
    model.add(Input(shape=(latent_dim, 1)))
    # The Input layer above already fixes the input shape; passing
    # `input_shape` to the LSTM as well only triggers a Keras warning.
    model.add(LSTM(512, return_sequences=True))
    model.add(Bidirectional(LSTM(512)))
    model.add(Dense(256))
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(1024))
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(BatchNormalization(momentum=0.8))
    # One output unit per value of the sequence, reshaped to `seq_shape` below.
    model.add(Dense(int(np.prod(seq_shape)), activation='tanh'))
    model.add(Reshape(seq_shape))
    return model

def discriminator(seq_shape):
    """Build the discriminator network.

    Args:
        seq_shape: Shape of one input sequence (without the batch axis).
    Returns:
        A Sequential model that keeps the time axis (both LSTMs return
        sequences) and ends in a Dense(1) layer without activation, i.e. it
        outputs one raw score (logit) per timestep.
    """
    model = Sequential()
    model.add(Input(shape=seq_shape))
    # The Input layer above already fixes the input shape; passing
    # `input_shape` to the LSTM as well only triggers a Keras warning.
    model.add(LSTM(512, return_sequences=True))
    model.add(Bidirectional(LSTM(512, return_sequences=True)))
    model.add(Dense(512))
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(Dense(512))
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(Dense(1))
    return model

### Smaller version

In [None]:
def generator(latent_dim, seq_shape):
    """Build the smaller generator network.

    Args:
        latent_dim: Length of the noise sequence; the input shape is (latent_dim, 1).
        seq_shape: Shape of one generated sequence (without the batch axis).
    Returns:
        A Sequential model whose output has shape `seq_shape` and, because of
        the final tanh, values in [-1, 1].
    """
    model = Sequential()
    model.add(Input(shape=(latent_dim, 1)))
    # The Input layer above already fixes the input shape; passing
    # `input_shape` to the LSTM as well only triggers a Keras warning.
    model.add(LSTM(128, return_sequences=True))  # Reduced units
    model.add(Bidirectional(LSTM(128)))  # Reduced units
    model.add(Dense(64))  # Reduced units
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(128))  # Reduced units
    model.add(LeakyReLU(negative_slope=0.2))
    model.add(BatchNormalization(momentum=0.8))
    # One output unit per value of the sequence, reshaped to `seq_shape` below.
    model.add(Dense(int(np.prod(seq_shape)), activation='tanh'))
    model.add(Reshape(seq_shape))
    return model


def discriminator(seq_shape):
    """Build the smaller discriminator network.

    Args:
        seq_shape: Shape of one input sequence (without the batch axis).
    Returns:
        A Sequential model that keeps the time axis and outputs one raw score
        (logit) per timestep.
    """
    model = Sequential()
    model.add(Input(shape=seq_shape))
    # The Input layer above already fixes the input shape; passing
    # `input_shape` to the LSTM as well only triggers a Keras warning.
    model.add(LSTM(256, return_sequences=True))  # Maintain timestep output
    model.add(Bidirectional(LSTM(256, return_sequences=True)))         # Maintain timestep output
    # No sigmoid here: the GAN loss is BinaryCrossentropy(from_logits=True) and
    # applies the sigmoid itself, so a sigmoid output would be squashed twice.
    model.add(Dense(1))         # Predict for each timestep
    return model

In [9]:
class GAN():
  """GAN that pairs the `generator` and `discriminator` networks of this notebook.

  Sequences are expected in the normalized form the generator emits (tanh
  output); the discriminator is expected to output logits, which is what the
  loss (`from_logits=True`) assumes.
  """

  def __init__(self, vocab_size, seq_length, latent_dim = 512):
    """Create both networks, their optimizers, the loss and the accuracy metric.

    Args:
        vocab_size: Number of token ids; used to map generator output back to ids.
        seq_length: Number of values in one sequence.
        latent_dim: Length of the noise sequence fed to the generator.
    """
    self.vocab_size = vocab_size
    self.seq_length = seq_length
    self.seq_shape = (self.seq_length, 1)
    self.latent_dim = latent_dim
    # Mean loss of every finished epoch; filled by train(), read by plot_loss().
    self.disc_loss = []
    self.gen_loss = []

    self.loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    self.generator_opt = tf.keras.optimizers.Adam(1e-4)
    self.discriminator_opt = tf.keras.optimizers.Adam(1e-4)
    self.disc_accuracy = tf.keras.metrics.BinaryAccuracy()

    # Build the discriminator
    self.discriminator = discriminator(self.seq_shape)

    # Build the generator
    self.generator = generator(self.latent_dim, self.seq_shape)

    # The generator takes noise as input and generates note sequences
    z = Input(shape=(self.latent_dim, 1))
    generated_seq = self.generator(z)

    validity = self.discriminator(generated_seq)

    # The combined model (stacked generator and discriminator).
    # It is not used by the custom training loop in this class.
    self.combined = Model(z, validity)

  def _discriminator_loss(self, real_output, fake_output):
    """Loss of the discriminator: real outputs should be 1, fake outputs 0."""
    real_loss = self.loss_fun(tf.ones_like(real_output), real_output)
    fake_loss = self.loss_fun(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss

  def _generator_loss(self, fake_output):
    """Loss of the generator: its fakes should be classified as real (1)."""
    return self.loss_fun(tf.ones_like(fake_output), fake_output)

  @tf.function
  def _train_step(self, real_batch_x, batch_size):
    """Run one optimization step (one batch) on both networks.

    Args:
        real_batch_x: Batch of real sequences.
        batch_size: Number of fake sequences to generate.
    Returns:
        Tuple `(gen_loss, disc_loss)` for this batch.
    """
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        # GENERATOR -> FAKE BATCH
        # The noise must come from a TensorFlow op: a NumPy call inside a
        # tf.function runs only while the function is traced, which would
        # freeze one noise sample and reuse it for every training step.
        noise = tf.random.normal((batch_size, self.latent_dim, 1))
        fake_batch_x = self.generator(noise, training=True)  # Shape: (batch_size, seq_length, 1)

        real_output = self.discriminator(real_batch_x, training=True)
        fake_output = self.discriminator(fake_batch_x, training=True)

        gen_loss = self._generator_loss(fake_output)
        disc_loss = self._discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, self.generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, self.discriminator.trainable_variables)

    self.generator_opt.apply_gradients(zip(gradients_of_generator, self.generator.trainable_variables))
    self.discriminator_opt.apply_gradients(zip(gradients_of_discriminator, self.discriminator.trainable_variables))

    # The discriminator outputs logits while BinaryAccuracy thresholds at 0.5,
    # so convert the logits to probabilities before updating the metric.
    disc_labels = tf.concat((tf.ones_like(real_output), tf.zeros_like(fake_output)), axis=0)
    disc_output = tf.concat((real_output, fake_output), axis=0)
    self.disc_accuracy.update_state(disc_labels, tf.sigmoid(disc_output))
    return (gen_loss, disc_loss)

  def train(self, epochs, batch_size, train_dataset, valid_dataset, steps_each_print=50):
    """Train both networks and record the mean losses of every epoch.

    Args:
        epochs: Number of passes over `train_dataset`.
        batch_size: Number of fake sequences generated per step.
        train_dataset: Dataset of real sequence batches; must support `len()`.
        valid_dataset: Accepted for interface compatibility; currently unused.
        steps_each_print: Refresh the progress-bar description every this many steps.
    """
    print("\nStarting Training\n")
    iteration_count = len(train_dataset)

    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch + 1,))
        # Start every epoch with a fresh accuracy so the value describes this epoch only.
        self.disc_accuracy.reset_state()
        gen_loss_sum = 0.0
        disc_loss_sum = 0.0
        steps_done = 0
        pbar = tqdm(total=iteration_count, position=0, leave=True)

        for step, real_seqs in enumerate(train_dataset):
            gen_loss, disc_loss = self._train_step(real_seqs, batch_size)
            gen_loss_sum += float(gen_loss)
            disc_loss_sum += float(disc_loss)
            steps_done += 1
            pbar.update(1)  # one tick per processed batch

            if step % steps_each_print == 0:
                pbar.set_description(
                    f"D Loss: {float(disc_loss):.4f}, " +
                    f"D Accuracy: {100 * float(self.disc_accuracy.result()):.2f}%, " +
                    f"G Loss: {float(gen_loss):.4f}"
                )
        pbar.close()

        if steps_done:
            # Keep the per-epoch means so plot_loss() has something to draw.
            self.gen_loss.append(gen_loss_sum / steps_done)
            self.disc_loss.append(disc_loss_sum / steps_done)
        print(f"Epoch {epoch + 1} complete. Discriminator accuracy: {float(self.disc_accuracy.result())}")

    print("\nTraining Complete.\n")

  def save(self):
    """Save the discriminator and the generator as .h5 files in `Model/`."""
    # Create the Model directory if it does not exist yet
    os.makedirs('Model/', exist_ok=True)

    # Save the trained discriminator and generator
    self.discriminator.save('Model/discriminator.h5')
    self.generator.save('Model/generator.h5')
    print("The trained C-RNN-GAN model (generator and discriminator) have been saved in the Model folder.")


  def generate(self):
    """Generate one sequence from random noise.

    Returns:
        A list with one array per timestep, each holding the generated value
        mapped back to the token id range and clipped to [0, vocab_size - 1].
        The values are floats; cast them to integers before decoding.
    """
    # Random noise for a single sequence (only one sample is returned)
    noise = np.random.normal(0, 1, (1, self.latent_dim, 1))
    predictions = self.generator.predict(noise)

    # Invert the normalization (x - vocab_size / 2) / (vocab_size / 2).
    half_vocab = self.vocab_size / 2
    token_values = predictions[0] * half_vocab + half_vocab
    # A tanh output of exactly 1 would map to vocab_size, which is not a valid id.
    token_values = np.clip(token_values, 0, self.vocab_size - 1)
    return list(token_values)


  def plot_loss(self):
    """Plot the per-epoch discriminator and generator losses, save and show the figure."""
    plt.plot(self.disc_loss, c='red')
    plt.plot(self.gen_loss, c='blue')
    plt.title("GAN Loss per Epoch")
    plt.legend(['Discriminator', 'Generator'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    # Save BEFORE showing: once the figure has been shown it may already be
    # released, and savefig would then write an empty image.
    os.makedirs('Result', exist_ok=True)
    plt.savefig('Result/GAN_Loss_per_Epoch_final.png', transparent=True)
    plt.show()
    plt.close()

# Instantiate the GAN for the tokenizer's vocabulary and the chosen sequence length.
model = GAN(vocab_size=vocab_size, seq_length=seq_length)

  super().__init__(**kwargs)


### Silence TensorFlow log messages

In [None]:
# Set the level of TensorFlow's logger to 'ERROR'.
tf.get_logger().setLevel('ERROR')

In [None]:
# Number of passes over the training dataset.
EPOCHS = 50

model.train(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    train_dataset=train_ds,
    valid_dataset=valid_ds,
    steps_each_print=2,
)


Starting Training


Start of epoch 1


D Loss: 1.4613, D Accuracy: 52.66%, G Loss: 0.4899:  21%|██▏       | 104/484 [02:19<06:56,  1.10s/it]

In [None]:
# Write the generator and the discriminator to the Model/ directory.
model.save()

In [None]:
# Pack the Model/ directory (recursively) into model.zip.
!zip -r model.zip Model/

In [None]:
# Generate one sequence and flatten it into a 1-D array of integer token ids.
generated_ids = np.concatenate(model.generate()).astype(np.int32)
print(generated_ids)

# Ids outside the vocabulary cannot be decoded. Report the ones that are too
# large, then clamp everything into [0, max_id] in one vectorized step
# (this also avoids a loop variable named `id`, which shadows the builtin).
max_id = len(tokenizer) - 1
for bad_id in generated_ids[generated_ids > max_id]:
    print(f"Found id {bad_id}, setting to {max_id}")
generated_ids = np.clip(generated_ids, 0, max_id)

# Decode the ids and write the result as a MIDI file.
decoded = tokenizer.decode([generated_ids])
decoded.dump_midi("generated.mid")