In [None]:
# Download the MAESTRO v3.0.0 MIDI archive, extract it into the current
# working directory, and delete the archive afterwards.
!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip
!unzip "maestro-v3.0.0-midi.zip"
!rm "maestro-v3.0.0-midi.zip"

# Directory the later cells search for the extracted MIDI files.
# NOTE(review): this only matches the extraction above when the working
# directory is /kaggle/working — confirm when running elsewhere.
dataset_path = "/kaggle/working/maestro-v3.0.0"

In [None]:
!pip install gdown
!pip install pretty_midi
!pip install miditok
!pip install pretty_midi midi-clip
!pip install transformers
!pip install accelerate

## Download NESMDB dataset

In [None]:
# Download the NESMDB MIDI archive from Google Drive (by file id), extract it
# into the current working directory, then delete the archive.
!gdown 1gIli7G1wu0QWDLzRc-CPWB8C4Hu0XVn3
!unzip nesmdb_midi.zip
!rm nesmdb_midi.zip

In [None]:
# Utility library
# Download the project's helper modules next to the notebook so that they can
# be imported like regular Python modules.

!wget https://raw.githubusercontent.com/roostico/NesGen/refs/heads/main/utility.py
!wget https://raw.githubusercontent.com/roostico/NesGen/refs/heads/main/transformer.py

# NOTE(review): wildcard import — the names it brings into the notebook
# namespace are not visible from here; a later cell imports
# `show_midi_info` and `playMidi` explicitly.
from utility import *

## Import libraries

In [None]:
import os
import random
import shutil
from tqdm import tqdm
from pathlib import Path
import pretty_midi
import numpy as np
from miditok import REMI, TokenizerConfig
import json
import keras_nlp.layers as nlp_layers
from tensorflow import keras
import tensorflow as tf
from miditok.utils import split_files_for_training
from miditok.data_augmentation import augment_dataset
import random
from random import shuffle

## Move files and rename them

In [None]:
# Collect the MIDI files of both datasets, move them into one flat directory
# per dataset under a numeric name, then list the relocated files.

def _find_midi_files(root):
  """Return every `.mid` and `.midi` file found recursively under `root`.

  `root` is resolved to an absolute path first, so the returned paths are
  absolute as well. All `.mid` matches come before the `.midi` matches.
  """
  root = Path(root).resolve()
  return list(root.glob("**/*.mid")) + list(root.glob("**/*.midi"))


def _move_and_number(paths, target_dir):
  """Move each file in `paths` into `target_dir`, renaming it to `<index>.midi`."""
  for i, midi_path in enumerate(paths):
    shutil.move(str(midi_path), os.path.join(target_dir, f"{i}.midi"))


midi_paths = _find_midi_files(dataset_path)
nesmdb_paths = _find_midi_files("/kaggle/working/nesmdb_midi")

midis_dir = "midis"
os.makedirs(midis_dir, exist_ok=True)

nesmdb_dir = "nesmdb"
os.makedirs(nesmdb_dir, exist_ok=True)

_move_and_number(midi_paths, midis_dir)
_move_and_number(nesmdb_paths, nesmdb_dir)

# List the files from the very directories they were moved into. The previous
# version re-listed hard-coded /kaggle/working/... paths, which silently
# produced empty lists whenever the working directory was somewhere else.
midis = _find_midi_files(midis_dir)
nes_midis = _find_midi_files(nesmdb_dir)

def sample():
  """Return the path, as a string, of one randomly chosen file from `midis`."""
  chosen = random.choice(midis)
  return str(chosen)

## Tokenizer




In [None]:
# Beat-resolution table handed to the tokenizer: keys are (start, end) beat
# ranges, values the resolution used inside each range.
# NOTE(review): the exact meaning of the values is defined by miditok's
# `TokenizerConfig.beat_res` — confirm against its documentation.
BEAT_RES = {(0, 1): 12, (1, 2): 4, (2, 4): 2, (4, 8): 1}

# All tokenizer settings in one place; forwarded as keyword arguments below.
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    beat_res=BEAT_RES,
    num_velocities=24,
    special_tokens=["PAD", "BOS", "EOS"],
    use_chords=True,
    use_rests=True,
    use_tempos=True,
    use_time_signatures=True,
    use_programs=False,  # no multitrack here
    num_tempos=32,
    tempo_range=(50, 200),  # (min_tempo, max_tempo)
)

# Build the REMI tokenizer from that configuration.
config = TokenizerConfig(**TOKENIZER_PARAMS)
tokenizer = REMI(config)

### (Optional): load a trained tokenizer

In [None]:
# Download a previously saved tokenizer from Google Drive (by file id) and load
# it, replacing whatever `tokenizer` currently holds.
# NOTE(review): assumes the download ends up at /kaggle/working/tokenizer.json — confirm.
!gdown 1XUgih6NF5mNOma5tUF1Ep7pTyc5Ps_FW
tokenizer = REMI(params=Path("/kaggle/working/tokenizer.json"))
print(f"Vocab size: {len(tokenizer)}")

### (Optional): train the tokenizer

In [None]:
# Train the tokenizer to a target vocabulary size of 30000 using every file of
# both datasets (MAESTRO and NESMDB lists concatenated).
# NOTE(review): at this point the lists still include the files that are later
# held out for validation and testing — confirm this is intended.
tokenizer.train(vocab_size=30000, files_paths=midis + nes_midis)

In [None]:
# Copy the MAESTRO file list into a fresh list of Path objects; this is the
# list that gets shuffled and split below.
processed = [Path(str(midi_file)) for midi_file in midis]
print(len(processed))

In [None]:
# Hold out 15% of the files for validation and 15% for testing; everything
# else is used for training.
total_num_files = len(processed)
num_files_valid = round(total_num_files * 0.15)
num_files_test = round(total_num_files * 0.15)

# In-place shuffle with no fixed seed, so the split differs from run to run.
shuffle(processed)
held_out = num_files_valid + num_files_test
midi_paths_valid = processed[:num_files_valid]
midi_paths_test = processed[num_files_valid:held_out]
midi_paths_train = processed[held_out:]

# Chunk MIDIs and perform data augmentation on each subset independently
subsets = (
    (midi_paths_train, "train"),
    (midi_paths_valid, "valid"),
    (midi_paths_test, "test"),
)
for files_paths, subset_name in subsets:
    print(files_paths[0])

    # Split the files into chunks of roughly 1024 tokens, written to Maestro_<subset>.
    subset_chunks_dir = Path(f"Maestro_{subset_name}")
    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Augment the chunks of this subset with pitch, velocity and duration offsets.
    augment_dataset(
        subset_chunks_dir,
        pitch_offsets=[-12, 12],
        velocity_offsets=[-4, 4],
        duration_offsets=[-0.5, 0.5],
    )

# From here on the three lists refer to the chunk directories rather than to
# the original files.
midi_paths_train, midi_paths_valid, midi_paths_test = (
    list(Path(f"Maestro_{name}").glob("**/*.mid")) + list(Path(f"Maestro_{name}").glob("**/*.midi"))
    for name in ("train", "valid", "test")
)


### Tokenize the dataset

In [None]:
def midi_valid(midi) -> bool:
    """Tell whether a file uses only time signatures with numerator 4.

    Args:
        midi: object exposing `time_signature_changes`, an iterable of items
            that each have a `numerator` attribute.

    Returns:
        True when every time signature change has numerator 4 (also when there
        are none at all), False as soon as one differs.
    """
    # 4/* everywhere means 4 beats per bar throughout the file.
    return all(ts.numerator == 4 for ts in midi.time_signature_changes)



# Tokenize the chunked files of every subset into JSON files under
# /kaggle/working/tokenized_<subset>.
#
# Each output directory is emptied first so that JSON files from an earlier run
# cannot leak into this one. The previous cleanup removed a directory called
# "tokenized", which is never written to. The loop variable is no longer called
# `dir`, which shadowed the built-in for the rest of the notebook.
#
# NOTE(review): `midi_valid` is passed as the third positional argument. In
# miditok 3.x that slot appears to be `overwrite_mode`, not `validation_fn` —
# confirm against the installed version whether the filter is actually applied.
for subset in ("train", "valid", "test"):
    tokens_dir = Path(f"/kaggle/working/tokenized_{subset}")
    if tokens_dir.exists():
        shutil.rmtree(tokens_dir)

    tokenizer.tokenize_dataset(
        Path(f"/kaggle/working/Maestro_{subset}"),
        tokens_dir,
        midi_valid,
    )

### Save the tokenizer

In [None]:
# Persist the tokenizer to the same tokenizer.json path that the optional
# "load a trained tokenizer" cell reads from.
tokenizer.save("/kaggle/working/tokenizer.json")

### Utility function to read a JSON tokenized file

In [None]:
def read_json(path: str) -> dict:
  """Load one JSON file and return the decoded object.

  Args:
      path: location of the JSON file.

  Returns:
      The decoded JSON content.

  Raises:
      FileNotFoundError: if `path` does not exist.
      json.JSONDecodeError: if the file does not contain valid JSON.
  """
  # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
  with open(path, "r", encoding="utf-8") as f:
    return json.load(f)

def read_json_files(json_file_paths):
    """Load several JSON files, showing a progress bar while doing so.

    Args:
        json_file_paths: iterable of paths to JSON files.

    Returns:
        A list with the decoded content of every file, in input order.
        If any file is missing or is not valid JSON, a message is printed and
        an empty list is returned instead; results read so far are discarded.
    """
    loaded = []
    for file_path in tqdm(json_file_paths):
        try:
            payload = read_json(file_path)
        except FileNotFoundError:
            print(f"Error: File not found - {file_path}")
            return []  # abort on the first unreadable file
        except json.JSONDecodeError:
            print(f"Error decoding JSON in file: {file_path}")
            return []  # abort on the first malformed file
        loaded.append(payload)
    return loaded


## Read the tokenized version of files from the JSON

In [None]:
# List the tokenized JSON files of each subset (absolute paths) ...
tokenized_train, tokenized_valid, tokenized_test = (
    list(Path(f"tokenized_{split}").resolve().glob("**/*.json"))
    for split in ("train", "valid", "test")
)

# ... and load their content.
data_objects_train = read_json_files(tokenized_train)
data_objects_valid = read_json_files(tokenized_valid)
data_objects_test = read_json_files(tokenized_test)

# An empty training list means that reading failed or that no file was found.
if not data_objects_train:
    print("Error reading JSON files.")
else:
    print(f"\nSuccessfully read {len(data_objects_train)} training JSON files.")

## Create the list of tokenized songs, taking the IDs of each one

In [None]:
def _first_id_sequences(songs):
    """Return, for each loaded JSON object, entry 0 of its "ids" field as a NumPy array."""
    # NOTE(review): only index 0 of "ids" is used — presumably the first track
    # of each file; confirm against the tokenizer's output layout.
    return [np.array(song["ids"][0]) for song in songs]


encoded_train = _first_id_sequences(data_objects_train)
encoded_valid = _first_id_sequences(data_objects_valid)
encoded_test = _first_id_sequences(data_objects_test)

End of pre-processing; proceeding with data and model preparation with TensorFlow.



---




# Tensorflow data and model setup

## Creating a Tensorflow dataset with all IDs

In [None]:
# Join the per-file id arrays of each subset end to end into one long 1-D array.
all_ids_train, all_ids_valid, all_ids_test = (
    np.concatenate(per_song_ids, axis=0)
    for per_song_ids in (encoded_train, encoded_valid, encoded_test)
)


In [None]:
# Display the total number of token ids in the validation subset.
len(all_ids_valid)

### Save numpy arrays

In [None]:
# Write each id array to a text file, one id per line.
# `fmt="%d"` stores the ids as plain integers. The default format writes them
# as floats ("1.000000000000000000e+00"), which the loading cell — it reads the
# files back with dtype=np.int32 — would then have to parse as integers.
for file_name, ids in (
    ("ids_train", all_ids_train),
    ("ids_valid", all_ids_valid),
    ("ids_test", all_ids_test),
):
    np.savetxt(file_name, ids, fmt="%d", delimiter=",")

### Load numpy arrays

In [None]:
# Download three files from Google Drive by file id.
# NOTE(review): presumably the saved `ids_train`, `ids_valid` and `ids_test`
# arrays that the next cell loads — confirm which id maps to which file.
!gdown 1mYPtsOMIKj0WO_oAYswZSycfKNoQNvzZ
!gdown 1FXyv6ONlswDqc34SRPAgo5smpCPL3R-N
!gdown 1iNhcdDBduwUCS8YZZXdVtB9zjlqgyWgP

In [None]:
# Read the three id files back as int32 arrays.
all_ids_train, all_ids_valid, all_ids_test = (
    np.loadtxt(file_name, dtype=np.int32)
    for file_name in ("ids_train", "ids_valid", "ids_test")
)

### (recommended): limit arrays

In [None]:
# Keep only the first 75% of every id array.
# Caution: the arrays are overwritten, so running this cell again shrinks them
# by another 25%.
all_ids_train, all_ids_valid, all_ids_test = (
    ids[: int(0.75 * len(ids))]
    for ids in (all_ids_train, all_ids_valid, all_ids_test)
)

### Create Tensorflow datasets

In [None]:
# Wrap each id array in a tf.data.Dataset that yields one token id per element.
ids_dataset_train, ids_dataset_valid, ids_dataset_test = (
    tf.data.Dataset.from_tensor_slices(ids)
    for ids in (all_ids_train, all_ids_valid, all_ids_test)
)

### Convert into sequences

In [None]:
seq_length = 1024

# Group the id stream into windows of seq_length + 1 tokens: the extra token
# lets every window yield an input and a target of seq_length tokens each.
# The incomplete window at the end of the stream is dropped.
window_length = seq_length + 1
sequences_train = ids_dataset_train.batch(window_length, drop_remainder=True)
sequences_valid = ids_dataset_valid.batch(window_length, drop_remainder=True)
sequences_test = ids_dataset_test.batch(window_length, drop_remainder=True)

## Preparing labels

In [None]:
def split_input_target(sequence):
    """Turn one token window into an (input, target) pair for next-token prediction.

    Args:
        sequence: 1-D tensor of token ids.

    Returns:
        A tuple `(input_seq, target_seq)` of int32 tensors: the window without
        its last token, and the window without its first token (i.e. the input
        shifted one position to the left).
    """
    inputs, targets = sequence[:-1], sequence[1:]
    return tf.cast(inputs, tf.int32), tf.cast(targets, tf.int32)


# Apply the split to every window of each subset.
train_ds, valid_ds, test_ds = (
    windows.map(split_input_target)
    for windows in (sequences_train, sequences_valid, sequences_test)
)

### Creating training batches

In [None]:
# Number of sequences per training step.
BATCH_SIZE = 16

# Size of the shuffle buffer. tf.data does not shuffle a whole dataset in
# memory (it may be unbounded); it draws elements at random from a buffer of
# this many elements instead.
BUFFER_SIZE = 10000


def _shuffle_and_batch(dataset):
    """Shuffle `dataset`, group it into full batches of BATCH_SIZE and prefetch."""
    shuffled = dataset.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)


# Caution: the datasets are replaced in place, so re-running this cell without
# re-running the previous one would batch already batched data.
train_ds = _shuffle_and_batch(train_ds)
valid_ds = _shuffle_and_batch(valid_ds)
test_ds = _shuffle_and_batch(test_ds)

# Building the model

In [None]:
from transformers import TFAutoModelForCausalLM, MistralConfig

# Architecture hyper-parameters of the (small) Mistral-style decoder.
architecture = dict(
    vocab_size=len(tokenizer),
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=8,
    num_attention_heads=8,
    num_key_value_heads=4,
    sliding_window=256,
    max_position_embeddings=8192,
)

# Ids of the special tokens, looked up in the tokenizer's vocabulary.
special_token_ids = dict(
    pad_token_id=tokenizer['PAD_None'],
    bos_token_id=tokenizer['BOS_None'],
    eos_token_id=tokenizer['EOS_None'],
)

model_config = MistralConfig(**architecture, **special_token_ids)

# Create a randomly initialised TensorFlow model from the configuration and
# compile it for next-token prediction (the loss is computed on raw logits).
model = TFAutoModelForCausalLM.from_config(model_config)
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    loss=loss,
    optimizer="adam",
    weighted_metrics=["sparse_categorical_accuracy"],
    jit_compile=True,
)

### Check that the model behaves as expected

In [None]:
# Run one training batch through the model and inspect what comes out.
for input_example_batch, target_example_batch in train_ds.take(1):
  example_batch_predictions = model(input_example_batch)
  logits = example_batch_predictions.logits
  print(logits.shape, "# (batch_size, sequence_length, vocab_size)")

# Shapes of the raw predictions and of the targets.
print("Prediction shape:", logits.shape)
print("Target shape:", target_example_batch.shape)

# Taking the most likely token at each position removes the vocabulary axis.
predicted_classes = tf.argmax(logits, axis=-1)
print("Reduced prediction shape:", predicted_classes.shape)

# After the reduction, predictions and targets must have the same shape.
shapes_match = predicted_classes.shape == target_example_batch.shape
print(
    "Shapes are compatible for comparison."
    if shapes_match
    else "Shapes are NOT compatible for comparison."
)

# Data types on both sides of the comparison.
print("Prediction dtype:", logits.dtype)
print("Target dtype:", target_example_batch.dtype)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Checkpoints go to ./training_checkpoints, one weights file per epoch
# ("{epoch}" is filled in by Keras).
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")

# Stop once the validation loss has not improved for 5 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Save the model weights (not the full model) to the checkpoint path.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [None]:
# Number of passes over the training set.
EPOCHS = 15

# Train on the training batches and evaluate on the validation batches after
# every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    # Callbacks are currently disabled, so neither early stopping nor
    # checkpointing is active during this run:
    #callbacks=[early_stopping]
)

In [None]:
# Save the trained model to /kaggle/working/nesgen-automodel and zip that directory.
# NOTE(review): the directory is passed to zip as an absolute path, so the
# archive entries presumably keep a kaggle/working/ prefix — the loading cells
# refer to kaggle/working/nesgen-automodel after unzipping.
model.save_pretrained("/kaggle/working/nesgen-automodel")
!zip -r nesgen-automodel.zip /kaggle/working/nesgen-automodel

## Load a previously trained model

In [None]:
!pip install gdown
!gdown 1kZe0BCf7EWyp7HEXyRtP-D37AsnLjT2Y
!unzip nesgen-automodel.zip

model.from_pretrained("/kaggle/working/kaggle/working/nesgen-automodel")

## Save the history

In [None]:
import pickle

# Store only the metrics dictionary of the training run (`history.history`),
# not the Keras History object itself.
with open('/kaggle/working/trainHistoryDict', 'wb') as history_file:
    pickle.dump(history.history, history_file)

## Load the history

In [None]:
# Imported here as well, so the history can be loaded on a fresh kernel
# without running the "Save the history" cell first.
import pickle

# After this cell `history` is the plain metrics dictionary that was saved,
# not a Keras History object.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
with open('/kaggle/working/trainHistoryDict', "rb") as file_pi:
    history = pickle.load(file_pi)

## Generation

### Select a seed from the test set

In [None]:
def get_seed():
    """Return one input sequence from the test set to use as generation seed.

    Takes the first batch that `test_ds` yields and returns the first input
    sequence of that batch.

    Returns:
        The first element of the first input batch of `test_ds`.

    Raises:
        ValueError: if `test_ds` yields no batch at all.
    """
    for seed_ids, _ in test_ds.take(1):
        return seed_ids[0]

    # Reached only when the loop body never ran, i.e. the dataset is empty.
    raise ValueError("test_ds is empty: cannot pick a seed sequence")

### Generate and dump MIDI

In [None]:
# Set to True to also write the seed itself to "seed.mid".
dump_seed = False

# Pick a seed and add a leading batch axis of size 1, as `generate` works on batches.
seed = get_seed()
input_ids = tf.expand_dims(tf.convert_to_tensor(seed), 0)

if dump_seed:
    midi = tokenizer.decode([seed])
    midi.dump_midi("seed.mid")

# Sample a continuation of the seed.
generation_options = dict(
    max_new_tokens=256,      # upper bound on the number of generated tokens
    num_return_sequences=1,  # one continuation per seed
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.7,         # lower values give more conservative samples
)
outputs = model.generate(input_ids=input_ids, **generation_options)

# Drop the seed tokens from the front so that only the newly generated part is kept.
input_length = input_ids.shape[1]
generated_tokens = outputs[:, input_length:]

# Decode the generated tokens and write the result to a MIDI file.
generated = tokenizer.decode([generated_tokens[0]])
generated.dump_midi("nesgen2.mid")

# Second part: Fine tuning

In [None]:
# Find every NESMDB MIDI file (both extensions) under the extracted archive.
nesmdb_root = Path("/kaggle/working/nesmdb_midi").resolve()
nesmdb_paths = [*nesmdb_root.glob("**/*.mid"), *nesmdb_root.glob("**/*.midi")]

# Move the files into a flat "nesmdb" directory, renamed to <index>.midi.
nesmdb_dir = "nesmdb"
os.makedirs(nesmdb_dir, exist_ok=True)
for index, source in enumerate(nesmdb_paths):
  shutil.move(str(source), os.path.join(nesmdb_dir, f"{index}.midi"))

# List the relocated files.
nesmdb_flat = Path("/kaggle/working/nesmdb").resolve()
nes_midis = [*nesmdb_flat.glob("**/*.mid"), *nesmdb_flat.glob("**/*.midi")]

In [None]:
# Copy the NESMDB file list into a fresh list of Path objects; this is the
# list that gets shuffled and split below.
processed = [Path(str(nes_file)) for nes_file in nes_midis]
print(len(processed))

### Print info of a MIDI

In [None]:
# Import the two MIDI helpers by name from the downloaded utility module.
from utility import show_midi_info, playMidi

# Inspect the first NESMDB file of the list.
# NOTE(review): presumably prints the file's information — see utility.py.
show_midi_info(str(processed[0]))

In [None]:
# 15% of the NESMDB files go to validation, 15% to testing, the rest to training.
total_num_files = len(processed)
num_files_valid = round(total_num_files * 0.15)
num_files_test = round(total_num_files * 0.15)

# In-place shuffle with no fixed seed, so the split differs from run to run.
shuffle(processed)
first_train_index = num_files_valid + num_files_test
midi_paths_valid = processed[:num_files_valid]
midi_paths_test = processed[num_files_valid:first_train_index]
midi_paths_train = processed[first_train_index:]

# Chunk MIDIs and perform data augmentation on each subset independently
splits = {
    "train": midi_paths_train,
    "valid": midi_paths_valid,
    "test": midi_paths_test,
}
for subset_name, files_paths in splits.items():
    print(files_paths[0])

    # Split the files into chunks of roughly 1024 tokens, written to Nesmdb_<subset>.
    subset_chunks_dir = Path(f"Nesmdb_{subset_name}")
    split_files_for_training(
        files_paths=files_paths,
        tokenizer=tokenizer,
        save_dir=subset_chunks_dir,
        max_seq_len=1024,
        num_overlap_bars=2,
    )

    # Augment the chunks of this subset with pitch, velocity and duration offsets.
    augment_dataset(
        subset_chunks_dir,
        pitch_offsets=[-12, 12],
        velocity_offsets=[-4, 4],
        duration_offsets=[-0.5, 0.5],
    )


def _chunk_files(folder):
    """List the `.mid` and then the `.midi` files found recursively under `folder`."""
    return list(Path(folder).glob("**/*.mid")) + list(Path(folder).glob("**/*.midi"))


# From here on the three lists refer to the chunk directories rather than to
# the original files.
midi_paths_train = _chunk_files("Nesmdb_train")
midi_paths_valid = _chunk_files("Nesmdb_valid")
midi_paths_test = _chunk_files("Nesmdb_test")

## Tokenize the augmented NESMDB dataset into JSON files

In [None]:
def midi_valid(midi) -> bool:
    """Accept a file only if none of its time signatures has a numerator other than 4.

    Args:
        midi: object exposing `time_signature_changes`, an iterable of items
            that each have a `numerator` attribute.

    Returns:
        False if at least one time signature change has a numerator different
        from 4, True otherwise (including when there are no changes at all).
    """
    for ts in midi.time_signature_changes:
        if ts.numerator != 4:
            return False  # not 4/*, i.e. not 4 beats per bar
    return True



# Tokenize the chunked NESMDB files of every subset into JSON files under
# /kaggle/working/tokenized_<subset>.
#
# Each output directory is emptied first, so the JSON files produced earlier
# for MAESTRO (same directory names) cannot get mixed into the NESMDB data.
# The cleanup uses the exact path that is written to, not a path relative to
# the working directory. The loop variable is no longer called `dir`, which
# shadowed the built-in for the rest of the notebook.
#
# NOTE(review): `midi_valid` is passed as the third positional argument. In
# miditok 3.x that slot appears to be `overwrite_mode`, not `validation_fn` —
# confirm against the installed version whether the filter is actually applied.
for subset in ("train", "valid", "test"):
    tokens_dir = Path(f"/kaggle/working/tokenized_{subset}")
    if tokens_dir.exists():
        shutil.rmtree(tokens_dir)

    tokenizer.tokenize_dataset(
        Path(f"/kaggle/working/Nesmdb_{subset}"),
        tokens_dir,
        midi_valid,
    )

## Read the tokenized versions from JSON files

In [None]:
def _json_files(folder):
    """List every `.json` file found recursively under `folder`, as absolute paths."""
    return list(Path(folder).resolve().glob("**/*.json"))


# Locate and load the tokenized NESMDB files of each subset.
tokenized_train = _json_files("tokenized_train")
data_objects_train = read_json_files(tokenized_train)

tokenized_valid = _json_files("tokenized_valid")
data_objects_valid = read_json_files(tokenized_valid)

tokenized_test = _json_files("tokenized_test")
data_objects_test = read_json_files(tokenized_test)

# An empty training list means that reading failed or that no file was found.
if not data_objects_train:
    print("Error reading JSON files.")
else:
    print(f"\nSuccessfully read {len(data_objects_train)} training JSON files.")

In [None]:
# For every loaded file keep entry 0 of its "ids" field, as a NumPy array.
# NOTE(review): presumably the first track of each file — confirm that the
# remaining entries of "ids" can be ignored for this dataset.
encoded_train, encoded_valid, encoded_test = (
    [np.array(song["ids"][0]) for song in songs]
    for songs in (data_objects_train, data_objects_valid, data_objects_test)
)

In [None]:
# Join the per-file id arrays of each subset end to end into one long 1-D array.
all_ids_train = np.concatenate(encoded_train, axis=0)
all_ids_valid = np.concatenate(encoded_valid, axis=0)
all_ids_test = np.concatenate(encoded_test, axis=0)

### Save numpy arrays

In [None]:
# Write each id array to a text file, one id per line.
# `fmt="%d"` stores the ids as plain integers. The default format writes them
# as floats ("1.000000000000000000e+00"), which the loading cell — it reads the
# files back with dtype=np.int32 — would then have to parse as integers.
for file_name, ids in (
    ("ids_train.txt", all_ids_train),
    ("ids_valid.txt", all_ids_valid),
    ("ids_test.txt", all_ids_test),
):
    np.savetxt(file_name, ids, fmt="%d", delimiter=",")

### Load numpy arrays

In [None]:
# Download the files
# Fetched from Google Drive by file id, one per subset.
# NOTE(review): presumably saved as ids_train.txt / ids_valid.txt /
# ids_test.txt, the names the next cell loads — confirm.
!gdown 1RRuql2uT_HFZSX9gau3ffHU3DFGxlpCw # train
!gdown 1itYNImS7mdXm-If8818I9N2HR-PMUksL # valid
!gdown 1H_rnDT8YCn-yRVQWXJdMd7SoREJFoXBt # test

In [None]:
# Read the three NESMDB id files back as int32 arrays.
all_ids_train, all_ids_valid, all_ids_test = (
    np.loadtxt(file_name, dtype=np.int32)
    for file_name in ("ids_train.txt", "ids_valid.txt", "ids_test.txt")
)

In [None]:
# Display the total number of token ids in the training subset.
len(all_ids_train)

### Create Tensorflow datasets

In [None]:
# Wrap each id array in a tf.data.Dataset that yields one token id per element.
make_ids_dataset = tf.data.Dataset.from_tensor_slices
ids_dataset_train = make_ids_dataset(all_ids_train)
ids_dataset_valid = make_ids_dataset(all_ids_valid)
ids_dataset_test = make_ids_dataset(all_ids_test)

### Convert into sequences

In [None]:
seq_length = 1024

# Cut the id stream into windows of seq_length + 1 tokens (one extra token so
# that input and target can both be seq_length long); the incomplete last
# window is dropped.
sequences_train, sequences_valid, sequences_test = (
    id_stream.batch(seq_length + 1, drop_remainder=True)
    for id_stream in (ids_dataset_train, ids_dataset_valid, ids_dataset_test)
)

## Preparing labels

In [None]:
def split_input_target(sequence):
    """Build the (input, target) pair used for next-token prediction.

    Args:
        sequence: 1-D tensor of token ids.

    Returns:
        Two int32 tensors: `sequence` without its last token (the input) and
        `sequence` without its first token (the target, i.e. the input shifted
        by one position).
    """
    as_int32 = lambda ids: tf.cast(ids, tf.int32)
    return as_int32(sequence[:-1]), as_int32(sequence[1:])


# Apply the split to every window of each subset.
train_ds = sequences_train.map(split_input_target)
valid_ds = sequences_valid.map(split_input_target)
test_ds = sequences_test.map(split_input_target)

### Creating training batches

In [None]:
# Number of sequences per training step.
BATCH_SIZE = 16

# Size of the shuffle buffer: tf.data shuffles within a buffer of this many
# elements instead of loading a (possibly unbounded) dataset into memory.
BUFFER_SIZE = 10000

# Shuffle, group into full batches and prefetch — the same pipeline for all
# three subsets.
# Caution: the datasets are replaced in place, so re-running this cell without
# re-running the previous one would batch already batched data.
train_ds, valid_ds, test_ds = (
    pairs
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
    for pairs in (train_ds, valid_ds, test_ds)
)

## Get the model

In [None]:
# Download the archive of the pretrained model from Google Drive (by file id).
!gdown 1jJnzLC66vhuraBmf7FaysPUC4tuvq9jR

# Unpack it, move the model directory out of the kaggle/working/ prefix stored
# in the archive to ./nesgen-automodel, then remove the leftover directory
# tree and the archive itself.
!unzip nesgen-automodel.zip
!mv kaggle/working/nesgen-automodel nesgen-automodel
!rm -rf kaggle
!rm -rf nesgen-automodel.zip

## Load the pretrained model

In [None]:
from transformers import TFAutoModelForCausalLM

# Load the pretrained model from the directory unpacked above.
model = TFAutoModelForCausalLM.from_pretrained("./nesgen-automodel")

# Compile it for fine-tuning with the same loss, optimizer and metric settings
# as used for pre-training (the loss is computed on raw logits).
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
compile_options = dict(
    loss=loss,
    optimizer="adam",
    weighted_metrics=["sparse_categorical_accuracy"],
    jit_compile=True,
)
model.compile(**compile_options)

In [None]:
# Number of fine-tuning passes over the NESMDB training set.
EPOCHS = 5

# Fine-tune on the NESMDB batches, evaluating on the validation batches after
# each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(train_ds, validation_data=valid_ds, epochs=EPOCHS)

In [None]:
model.save_pretrained("nesgen_v1") 
!zip -r nesgen-automodel.zip nesgen-automodel