In [None]:
!pip install gdown
!pip install pretty_midi
!pip install miditok
!pip install midi-clip

!wget https://raw.githubusercontent.com/roostico/NesGen/refs/heads/main/utility.py

In [None]:
!nvidia-smi

In [None]:
import os
import random
import shutil
from tqdm import tqdm
from pathlib import Path
import pretty_midi
import numpy as np
from miditok import REMI, TokenizerConfig
import json
import tensorflow as tf
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
import random
from random import shuffle

import sys
import pickle
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, Reshape, Dropout, LSTM, Bidirectional
from tensorflow.keras.layers import BatchNormalization, Activation, ZeroPadding2D
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
     

In [None]:
# Install the 'mixed_float16' Keras policy globally, so it applies to every
# layer/model created after this cell has run.
import tensorflow.keras.mixed_precision as mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# NOTE(review): tensorflow is already imported as `tf` in the imports cell;
# this second import is redundant but harmless.
import tensorflow as tf

# Enable on-demand memory growth on every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: the error is only reported and the notebook keeps going.
        # (Presumably raised when the GPUs were already initialized — TODO confirm.)
        print(e)

# Data section

## Prepare the dataset

In [None]:
# Collect every MIDI file of the dataset and move it into one flat local folder.
# NOTE(review): `dataset_path` is not defined in any visible cell; it must be
# assigned (e.g. in a configuration cell) before this cell is run.

dataset_root = Path(dataset_path).resolve()
# Sorted so that the "<index>.midi" renaming below is reproducible between runs
# (glob order depends on the file system).
midi_paths = sorted(dataset_root.glob("**/*.mid")) + sorted(dataset_root.glob("**/*.midi"))

midis_dir = "midis"
os.makedirs(midis_dir, exist_ok=True)


# Move (not copy) each file, renaming it to "<index>.midi" to avoid name clashes.
for i, midi_path in enumerate(midi_paths):
  new_midi_path = os.path.join(midis_dir, f"{i}.midi")
  shutil.move(str(midi_path), new_midi_path)


# List the files from the folder they were actually written to, instead of a
# hard-coded "/kaggle/working/midis" that only matches when the working
# directory happens to be /kaggle/working.
midis_root = Path(midis_dir).resolve()
midis = list(midis_root.glob("**/*.mid")) + list(midis_root.glob("**/*.midi"))

def sample():
  """Return the path (as a string) of one randomly chosen file from `midis`."""
  chosen_path = random.choice(midis)
  return str(chosen_path)

In [None]:
# Resolution table handed to MidiTok as `beat_res`
# (presumably {(start_beat, end_beat): samples_per_beat} — see the MidiTok docs).
BEAT_RES = {
    (0, 1): 12,
    (1, 2): 4,
    (2, 4): 2,
    (4, 8): 1,
}

# Keyword arguments forwarded unchanged to TokenizerConfig.
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    beat_res=BEAT_RES,
    num_velocities=24,
    special_tokens=["PAD", "BOS", "EOS"],
    use_chords=True,
    use_rests=True,
    use_tempos=True,
    num_tempos=32,
    tempo_range=(50, 200),  # (min_tempo, max_tempo)
)

config = TokenizerConfig(**TOKENIZER_PARAMS)

# REMI tokenizer built from the configuration above.
tokenizer = REMI(config)

In [None]:
tokenizer.train(vocab_size=30000, files_paths=midis)

In [None]:
processed = [Path(f"{s}") for s in midis]
print(len(processed))

In [None]:
# Fraction of the files held out for validation.
valid_perc = 0.3

total_num_files = len(processed)
num_files_valid = round(total_num_files * valid_perc)
# In-place shuffle; no seed is set in the visible cells, so the split differs
# between runs.
shuffle(processed)
midi_paths_valid = processed[:num_files_valid]
midi_paths_train = processed[num_files_valid:]

# Chunk the MIDI files and augment each subset independently.

for files_paths, subset_name in (
    (midi_paths_train, "train"),
    (midi_paths_valid, "valid"),
):
    # Show one example path of the subset (raises IndexError if the subset is empty).
    print(files_paths[0])

    # Split the files into chunks, written to "Maestro_<subset>";
    # max_seq_len=1024 and num_overlap_bars=2 are passed to MidiTok as-is.

    subset_chunks_dir = Path(f"Maestro_{subset_name}")

    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Augment the chunks in the same directory with the pitch, velocity and
    # duration offsets listed below.

    augment_dataset(
        subset_chunks_dir,
        pitch_offsets=[-12, 12],
        velocity_offsets=[-4, 4],
        duration_offsets=[-0.5, 0.5],
    )

# From here on the two names refer to the chunk files found in the
# "Maestro_*" directories, not to the original file lists.
midi_paths_train = list(Path("Maestro_train").glob("**/*.mid")) + list(Path("Maestro_train").glob("**/*.midi"))
midi_paths_valid = list(Path("Maestro_valid").glob("**/*.mid")) + list(Path("Maestro_valid").glob("**/*.midi"))

In [None]:
def midi_valid(midi) -> bool:
    """Tell whether a loaded MIDI object only uses time signatures with numerator 4.

    Args:
        midi: loaded MIDI object. Its time signatures are read from
            `time_signature_changes` when that attribute exists, otherwise
            from `time_signatures` (the name used by symusic scores).

    Returns:
        True when every time signature has numerator 4, or when the object
        exposes no time signatures at all; False otherwise.
    """
    time_signatures = getattr(midi, "time_signature_changes", None)
    if time_signatures is None:
        time_signatures = getattr(midi, "time_signatures", ())

    # 4/* means 4 beats per bar; anything else makes the file invalid.
    return all(ts.numerator == 4 for ts in time_signatures)


for subset in ("train", "valid"):
    tokenized_dir = Path(f"tokenized_{subset}")

    # Remove the output of a previous run. The directory removed here is the
    # one that is written below, so no stale token files survive a re-run.
    if tokenized_dir.exists():
        shutil.rmtree(tokenized_dir)

    # validation_fn must be passed by keyword: the third positional parameter
    # of tokenize_dataset is `overwrite_mode`, so a positional function would
    # be taken as a truthy flag and never be called.
    tokenizer.tokenize_dataset(
        Path(f"Maestro_{subset}"),
        tokenized_dir,
        validation_fn=midi_valid,
    )

In [None]:
def read_json(path: str) -> dict:
  """Parse the JSON document stored at `path` and return the resulting object."""
  raw_text = Path(path).read_text()
  return json.loads(raw_text)

def read_json_files(json_file_paths):
    """Load a collection of JSON files, showing a progress bar.

    Args:
        json_file_paths: iterable of paths to JSON files.

    Returns:
        A list with one parsed object per file, in input order. As soon as a
        file is missing or contains invalid JSON, a message is printed and an
        empty list is returned instead (everything loaded so far is discarded).
    """
    loaded = []

    for file_path in tqdm(json_file_paths):
        try:
            parsed = read_json(file_path)
        except FileNotFoundError:
            print(f"Error: File not found - {file_path}")
            return []  # all-or-nothing: give up on the first failure
        except json.JSONDecodeError:
            print(f"Error decoding JSON in file: {file_path}")
            return []  # all-or-nothing: give up on the first failure

        loaded.append(parsed)

    return loaded


In [None]:
# Load every tokenized JSON file of the training subset ...
tokenized_train = list(Path("tokenized_train").resolve().glob("**/*.json"))
data_objects_train = read_json_files(tokenized_train)

# ... and of the validation subset.
tokenized_valid = list(Path("tokenized_valid").resolve().glob("**/*.json"))
data_objects_valid = read_json_files(tokenized_valid)

# read_json_files returns [] on any failure, so an empty list means that the
# load failed or that no file was found.
# NOTE(review): only the training subset is checked here.
if data_objects_train:
    print(f"\nSuccessfully read {len(data_objects_train)} training JSON files.")
else:
    print("Error reading JSON files.")

In [None]:
encoded_train = [np.array(song["ids"][0]) for song in data_objects_train]
encoded_valid = [np.array(song["ids"][0]) for song in data_objects_valid]

In [None]:
all_ids_train = np.concatenate(encoded_train)
all_ids_valid = np.concatenate(encoded_valid)

## ... or skip all the data preparation

In [None]:
!gdown 1SDRkoWwyuSl4udoCHdcitjLLm9d0kfxS # tokenizer_maestro0612.json
!gdown 1IQToXD9s8g4L-AlK-MY4qvGoLZ-p7bMw # ids_train
!gdown 1DWjViUKpW07LfbGimlhhhGdK7oQaJpj- # ids_valid

In [None]:
tokenizer = REMI(params="tokenizer_maestro0612.json")
all_ids_train = np.loadtxt("ids_train").astype(dtype=np.int32)
all_ids_valid = np.loadtxt("ids_valid").astype(dtype=np.int32)

## Tensorflow datasets

### Recommended: limit arrays

In [None]:
# Keep only the leading `perc` fraction of each id array.
# NOTE(review): the arrays are overwritten with their own slices, so re-running
# this cell truncates the already-truncated arrays again.
perc = 0.3
all_ids_train = all_ids_train[:int(perc * len(all_ids_train))]
all_ids_valid = all_ids_valid[:int(perc * len(all_ids_valid))]
print(f"Loaded {len(all_ids_train)} training ids")

In [None]:
ids_dataset_train = tf.data.Dataset.from_tensor_slices(all_ids_train)
ids_dataset_valid = tf.data.Dataset.from_tensor_slices(all_ids_valid)

## Tensorflow dataset version

In [None]:
seq_length = 512
vocab_size = len(tokenizer)
BATCH_SIZE = 128
BUFFER_SIZE = 10000

def normalize_and_split(sequence):
    """Scale a sequence of token ids and pair it with an all-ones target.

    The ids are cast to float32 and mapped with (id - vocab_size / 2) /
    (vocab_size / 2), using the module-level `vocab_size`.

    Returns:
        (scaled_sequence, target) where target is a tensor of ones with the
        same shape and dtype as the scaled sequence.
    """
    half_vocab = vocab_size / 2
    as_float = tf.cast(sequence, tf.float32)
    scaled = (as_float - half_vocab) / half_vocab
    return scaled, tf.ones_like(scaled)

def _sequence_batches(ids_dataset):
    """Turn a dataset of single token ids into batches of (sequence, target) pairs."""
    # Group consecutive ids into windows of shape (seq_length,)
    windows = ids_dataset.batch(seq_length, drop_remainder=True)
    pairs = windows.map(normalize_and_split)
    # Add a trailing channel axis to the inputs only: (seq_length, 1)
    pairs = pairs.map(lambda x, y: (tf.expand_dims(x, -1), y))
    # Stack windows into training batches: (BATCH_SIZE, seq_length, 1)
    batches = pairs.batch(BATCH_SIZE, drop_remainder=True)
    return batches.prefetch(tf.data.AUTOTUNE)


train_ds = _sequence_batches(ids_dataset_train)

valid_ds = _sequence_batches(ids_dataset_valid)

for real_seqs, targets in train_ds.take(1):
    print(f"Input Shape: {real_seqs.shape}, Input Type: {real_seqs.dtype}")
    print(f"Target Shape: {targets.shape}, Target Type: {targets.dtype}")
    

## Numpy array version (not updated)

In [None]:
seq_length = 512
vocab_size = len(tokenizer)
BATCH_SIZE = 64

def normalize(sequence, vocab_size):
    """Map ids via (id - vocab_size / 2) / (vocab_size / 2); 0 maps to -1 and vocab_size to 1."""
    half_vocab = vocab_size / 2
    centered = sequence - half_vocab
    return centered / half_vocab

def prepare_numpy_dataset(array, seq_length, vocab_size, batch_size, buffer_size):
    """Cut a flat id array into shuffled, normalized batches of sequences.

    Args:
        array: 1-D array of token ids.
        seq_length: number of ids per sequence.
        vocab_size: vocabulary size forwarded to `normalize`.
        batch_size: number of sequences per batch.
        buffer_size: accepted but not used by this function.

    Returns:
        An array of shape (num_batches, batch_size, seq_length); trailing ids
        and sequences that do not fill a whole sequence / batch are dropped.
    """
    # Keep only as many ids as fit into whole sequences, then fold them into rows
    usable_ids = (len(array) // seq_length) * seq_length
    sequences = array[:usable_ids].reshape((-1, seq_length))
    print(sequences.shape)

    # Normalizing yields a new array, so the shuffle below leaves `array` untouched
    sequences = normalize(sequences, vocab_size)

    # Shuffle the rows in place (uses numpy's global random state)
    np.random.shuffle(sequences)

    # Drop the rows that do not fill a complete batch and group the rest
    num_batches = len(sequences) // batch_size
    full_rows = sequences[:num_batches * batch_size]
    return full_rows.reshape((num_batches, batch_size, seq_length))

# Prepare datasets
# NOTE(review): BUFFER_SIZE is not assigned in this cell (only in the
# "Tensorflow dataset version" cell), and prepare_numpy_dataset never reads it.
train_ds = prepare_numpy_dataset(all_ids_train, seq_length, vocab_size, BATCH_SIZE, BUFFER_SIZE)
valid_ds = prepare_numpy_dataset(all_ids_valid, seq_length, vocab_size, BATCH_SIZE, BUFFER_SIZE)
print("Train batch shape:", train_ds[0].shape)  # (batch_size, seq_length) = (64, 512); no channel axis is added

# The model

In [None]:
def generator(latent_dim, seq_shape):
    """Build the larger generator network (uncompiled).

    NOTE(review): the "Smaller version" cell defines a function with the same
    name; whichever cell runs last wins.

    Args:
        latent_dim: number of noise steps; the model input has shape (latent_dim, 1).
        seq_shape: shape of one generated sequence. The last Dense layer has
            np.prod(seq_shape) tanh units, reshaped to this shape.

    Returns:
        A Sequential model.
    """
    net = Sequential()
    net.add(LSTM(256, input_shape=(latent_dim, 1), return_sequences=True))
    net.add(Bidirectional(LSTM(256)))

    # Three widening Dense stages, each followed by LeakyReLU and batch norm
    for width in (128, 256, 512):
        net.add(Dense(width))
        net.add(LeakyReLU(alpha=0.2))
        net.add(BatchNormalization(momentum=0.8))

    net.add(Dense(np.prod(seq_shape), activation='tanh'))
    net.add(Reshape(seq_shape))
    return net

def discriminator(seq_shape):
    """Build the larger discriminator network (uncompiled).

    Both recurrent layers return full sequences, so the final sigmoid unit is
    applied at every time step.

    Args:
        seq_shape: shape of one input sequence.

    Returns:
        A Sequential model.
    """
    layers = [
        LSTM(256, input_shape=seq_shape, return_sequences=True),
        Bidirectional(LSTM(256, return_sequences=True)),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

### Smaller version

In [None]:

def generator(latent_dim, seq_shape):
    """Build the reduced-size generator network (uncompiled).

    Args:
        latent_dim: number of noise steps; the model input has shape (latent_dim, 1).
        seq_shape: shape of one generated sequence. The last Dense layer has
            np.prod(seq_shape) tanh units, reshaped to this shape.

    Returns:
        A Sequential model.
    """
    layers = [
        LSTM(128, input_shape=(latent_dim, 1), return_sequences=True),
        Bidirectional(LSTM(128)),
        Dense(64),
        LeakyReLU(alpha=0.2),
        Dense(128),
        LeakyReLU(alpha=0.2),
        Dense(np.prod(seq_shape), activation='tanh'),
        Reshape(seq_shape),
    ]
    return Sequential(layers)


def discriminator(seq_shape):
    """Build the reduced discriminator: two sequence-returning recurrent layers
    followed by one sigmoid unit applied at every time step (uncompiled)."""
    layers = [
        LSTM(256, input_shape=seq_shape, return_sequences=True),
        Bidirectional(LSTM(256, return_sequences=True)),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

## GAN class

In [None]:
class GAN():
  """GAN over normalized token-id sequences of shape (seq_length, 1).

  The discriminator and generator come from the module-level `discriminator`
  and `generator` factories. `combined` stacks the generator and the frozen
  discriminator and is used to train the generator.

  Attributes:
    disc_loss / gen_loss: float losses recorded by `train`, one entry per
      sampled epoch.
  """

  def __init__(self, vocab_size, seq_length, latent_dim = 1000):
    """Build and compile the discriminator, the generator and the combined model.

    Args:
      vocab_size: size of the token vocabulary (used to de-normalize outputs).
      seq_length: number of tokens per sequence.
      latent_dim: number of noise steps fed to the generator.
    """
    self.vocab_size = vocab_size
    self.seq_length = seq_length
    self.seq_shape = (self.seq_length, 1)
    self.latent_dim = latent_dim
    self.disc_loss = []
    self.gen_loss = []

    # Build and compile the discriminator. Each compiled model gets its own
    # optimizer: an optimizer instance keeps state for one set of variables,
    # and sharing it between two models fails on recent Keras versions.
    self.discriminator = self.build_discriminator()
    self.discriminator.compile(
        loss='binary_crossentropy',
        optimizer=Adam(0.0002, 0.5),
        metrics=['accuracy'],
    )

    # Build the generator
    self.generator = self.build_generator()

    # The generator takes noise as input and generates note sequences
    z = Input(shape=(self.latent_dim, 1))
    generated_seq = self.generator(z)

    # For the combined model we will only train the generator
    self.discriminator.trainable = False

    # The discriminator takes generated sequences as input and determines validity
    validity = self.discriminator(generated_seq)

    # The combined model (stacked generator and discriminator)
    # trains the generator to fool the discriminator
    self.combined = Model(z, validity)
    self.combined.compile(loss='binary_crossentropy', optimizer=Adam(0.0002, 0.5))

  @staticmethod
  def _first_scalar(value):
    """Return the first entry of a train_on_batch result as a Python float.

    Accepts both a bare scalar (model compiled without metrics) and a
    list/array such as [loss, metric, ...].
    """
    return float(np.ravel(value)[0])

  def build_discriminator(self):
    """Wrap the `discriminator` factory output in a functional Model."""
    model = discriminator(self.seq_shape)

    seq = Input(shape=self.seq_shape)
    validity = model(seq)

    return Model(seq, validity)

  def build_generator(self):
    """Wrap the `generator` factory output in a functional Model."""
    model = generator(self.latent_dim, self.seq_shape)

    noise = Input(shape=(self.latent_dim, 1))
    seq = model(noise)

    return Model(noise, seq)

  def train(self, epochs, batch_size, train_dataset, valid_dataset, sample_interval=50):
    """Alternate discriminator and generator updates over `train_dataset`.

    Args:
      epochs: number of passes over `train_dataset`.
      batch_size: kept for backward compatibility. The noise batch is sized
        from each real batch, so that real inputs, fake inputs and targets
        always have the same number of samples.
      train_dataset: sized iterable of (real_seqs, targets) batches.
      valid_dataset: accepted but not used.
      sample_interval: losses are printed and recorded every
        `sample_interval` epochs.
    """
    print("\nStarting Training\n")

    # Training the model
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch + 1,))

        # Stay None when the dataset yields no batch at all
        d_loss = None
        g_loss = None

        with tqdm(enumerate(train_dataset), total=len(train_dataset)) as pbar:
            for step, (real_seqs, targets) in pbar:
                n_samples = int(real_seqs.shape[0])

                # Random noise for generator input
                noise = np.random.normal(0, 1, (n_samples, self.latent_dim, 1))

                # Generate a batch of new note sequences: (n_samples, seq_length, 1)
                gen_seqs = self.generator.predict(noise, verbose=0)

                # Targets for fake sequences: zeros with the shape of the real targets
                fake_targets = tf.zeros_like(targets)

                # Train the discriminator on a real and on a fake batch
                d_loss_real = self.discriminator.train_on_batch(real_seqs, targets)
                d_loss_fake = self.discriminator.train_on_batch(gen_seqs, fake_targets)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

                # Train the generator: it is rewarded when the discriminator
                # labels its output as "real", hence the real targets
                noise = np.random.normal(0, 1, (n_samples, self.latent_dim, 1))
                g_loss = self._first_scalar(self.combined.train_on_batch(noise, targets))

                # Update tqdm description every step
                pbar.set_description(
                    f"Step {step} | " +
                    f"D Loss: {d_loss[0]:.4f}, " +
                    f"D Accuracy: {100 * d_loss[1]:.2f}%, " +
                    f"G Loss: {g_loss:.4f}"
                )

        if d_loss is None:
            print("No batches in train_dataset; nothing was trained this epoch.")
            continue

        # Print progress and save losses every few epochs
        if epoch % sample_interval == 0:
            print(
                f"{epoch + 1} / {epochs} [D loss: {d_loss[0]:.4f}, acc.: {100 * d_loss[1]:.2f}%] "
                f"[G loss: {g_loss:.4f}]"
            )
            self.disc_loss.append(float(d_loss[0]))
            self.gen_loss.append(g_loss)

    print("\nTraining Complete.\n")

  def save(self):
    """Save the discriminator and the generator as HDF5 files under 'Model/'."""
    # Create the Model directory if it does not exist yet
    os.makedirs('Model/', exist_ok=True)

    # Save the trained discriminator and generator
    self.discriminator.save('Model/discriminator.h5')
    self.generator.save('Model/generator.h5')
    print("The trained C-RNN-GAN model (generator and discriminator) have been saved in the Model folder.")

  def generate(self):
    """Generate one sequence from random noise and map it back to token ids.

    Returns:
      A list with one float32 array per time step, holding the de-normalized
      id clipped to [0, vocab_size - 1] (values are not rounded).
    """
    # Only one sequence is returned, so a single noise sample is enough
    noise = np.random.normal(0, 1, (1, self.latent_dim, 1))
    predictions = self.generator.predict(noise, verbose=0)

    # Scale in float32: under a float16 policy the output lacks the precision
    # needed to tell neighbouring ids apart once multiplied by vocab_size / 2
    first_sequence = np.asarray(predictions, dtype=np.float32)[0]

    # Inverse of the training normalization (id - vocab_size / 2) / (vocab_size / 2).
    # A tanh output of exactly 1 maps to vocab_size, which is not a valid id,
    # hence the clip.
    half_vocab = self.vocab_size / 2
    ids = np.clip(first_sequence * half_vocab + half_vocab, 0, self.vocab_size - 1)
    return list(ids)

  def plot_loss(self):
    """Plot the recorded discriminator and generator losses, save and show the figure."""
    plt.plot(self.disc_loss, c='red')
    plt.plot(self.gen_loss, c='blue')
    plt.title("GAN Loss per Epoch")
    plt.legend(['Discriminator', 'Generator'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    # Save before showing: once shown, the figure may already be released and
    # the saved file would be blank. The target folder is created on demand.
    os.makedirs('Result', exist_ok=True)
    plt.savefig('Result/GAN_Loss_per_Epoch_final.png', transparent=True)
    plt.show()
    plt.close()

model = GAN(vocab_size, seq_length)

## Training

In [None]:
EPOCHS = 1


model.train(EPOCHS, BATCH_SIZE, train_ds, valid_ds, sample_interval=1)

## Generation

In [None]:
# generate() returns a list of per-step values; join them into one flat vector
# and cast to int32 (the cast truncates towards zero).
generated_ids = np.concatenate(model.generate()).astype(np.int32)
print(generated_ids)
# NOTE(review): the ids are passed wrapped in a list — confirm this is the
# layout tokenizer.decode expects for this tokenizer.
decoded = tokenizer.decode([generated_ids])
decoded.dump_midi("generated.mid")