## **Questions:**
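1. Why does loading the best checkpoint at the end of training report `There were missing keys in the checkpoint model loaded: ['lm_head.weight']`? (Likely harmless: GPT-2 ties `lm_head.weight` to the input embedding matrix, so the checkpoint may not store it separately — to be confirmed.)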

In [None]:
# Print the preferred text encoding reported by the locale module
# (the recorded output for this cell is UTF-8).
import locale
print(locale.getpreferredencoding())

UTF-8


In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Return "UTF-8" unconditionally; `do_setlocale` is accepted but ignored."""
    return "UTF-8"
# Monkey-patch the locale module so every later call to locale.getpreferredencoding() gets "UTF-8".
# NOTE(review): presumably a workaround for Colab refusing to run `!` shell commands
# under a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

In [None]:
# Install necessary libraries

# Hugging Face datasets and transformers (with the PyTorch extra)
!pip install datasets transformers[torch]
# Weights & Biases client used for experiment tracking
!pip install wandb
# note_seq, used below to build and synthesize NoteSequences
!pip install note_seq
# Hugging Face evaluate, used by compute_metrics_fn
!pip install evaluate
# Upgrade accelerate to the latest release (-U)
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, accelerate, datasets
Successfully installed accelerate-0.26.1 datas

In [None]:
# Check if the code is running in Google Colab environment
if "google.colab" in str(get_ipython()):
    # Inform the user about installing dependencies in Colab
    print("Installing dependencies...")

    # Install the fluidsynth system package and the ALSA/JACK development
    # packages with apt-get (these are system packages, not pip packages)
    !apt-get install fluidsynth
    !apt-get install -qq libasound2-dev libjack-dev

    # Install/upgrade the pyfluidsynth Python library using pip
    !pip install -qU pyfluidsynth

Installing dependencies...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fluidsynth is already the newest version (2.2.5-1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [None]:
# Import necessary libraries and modules

import os  # Operating system library for interacting with the file system
import wandb  # Library for experiment tracking and visualization
from huggingface_hub import notebook_login  # Log in to the Hugging Face Hub from a notebook
import note_seq  # Music generation library

from argparse import Namespace  # Namespace class for organizing command-line arguments

from datasets import load_dataset  # Load datasets for training and evaluation
from transformers import AutoTokenizer  # AutoTokenizer for automatically selecting tokenization method
from transformers import DataCollatorForLanguageModeling  # Data collator for language modeling
from transformers import set_seed  # Set seed for reproducibility
from transformers import Trainer, TrainingArguments  # Trainer and TrainingArguments for model training

import evaluate  # Hugging Face `evaluate` package (pip-installed above), used to load metrics
import numpy as np  # NumPy for numerical operations

from transformers import AutoConfig, GPT2LMHeadModel  # AutoConfig and GPT2LMHeadModel for GPT-2 model

In [None]:
# Set the Protocol Buffers Python implementation to "python"
# This line is used to resolve compatibility issues related to Protocol Buffers (protobuf)
# It explicitly selects the pure Python implementation of the protobuf library
# NOTE(review): protobuf presumably reads this variable when it is first imported, and
# note_seq has already been imported in an earlier cell — confirm the setting still takes
# effect here, or move this assignment above the imports.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [None]:
# Set parameters for WandB (Weights & Biases) integration
wandb_project = "lmd_musicgen"  # project name passed to wandb.init() in train()
# NOTE(review): `entity` and `data_processed` are not referenced anywhere else in the
# visible notebook — confirm whether they should be passed to wandb.init() or removed.
entity = "musicgen"
data_processed = "lmd_processed"

In [None]:
# This line sets the environment variable WANDB_LOG_MODEL to 'checkpoint',
# which configures WandB to log the model checkpoints during training
%env WANDB_LOG_MODEL='checkpoint'

env: WANDB_LOG_MODEL='checkpoint'


In [None]:
# Log in to Weights & Biases (the recorded output shows a prompt for an API key)
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Log in to the Hugging Face Hub from this notebook (renders an interactive login widget)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Helpers for turning token sequences into NoteSequences so that audio can be logged.
# Both timing constants are in seconds and assume 120 BPM (one quarter note = 60 / 120 = 0.5 s).

NOTE_LENGTH_16TH_120BPM = 0.25 * 60 / 120  # a quarter of a quarter note: 0.125 s
BAR_LENGTH_120BPM = 4.0 * 60 / 120  # four quarter notes: 2.0 s

def token_sequence_to_note_sequence(token_sequence, use_program=True, use_drums=True, instrument_mapper=None, only_piano=False):
    """
    Convert a token sequence to a NoteSequence.

    The timing state (bar index, current time, open notes) is initialised before
    the loop, so a malformed sequence that emits BAR_START, NOTE_ON or TIME_DELTA
    before any TRACK_START / BAR_START token is rendered from time 0 instead of
    raising UnboundLocalError. Such sequences are likely when sampling from a
    partially trained model.

    Args:
    - token_sequence: List of tokens representing a musical sequence, or a single
      whitespace-separated string of tokens.
    - use_program: Flag to use program information for non-drum instruments.
    - use_drums: Flag to use drums information.
    - instrument_mapper: Optional dictionary mapping instrument names (the text
      after "INST=") to replacement values that can be converted with int().
    - only_piano: Flag to map every non-drum note to instrument 0 / program 0.

    Returns:
    A NoteSequence object representing the musical sequence.

    Raises:
    ValueError if an INST / NOTE_ON / NOTE_OFF / TIME_DELTA token carries a value
    that cannot be parsed as a number.
    """

    if isinstance(token_sequence, str):
        token_sequence = token_sequence.split()

    note_sequence = empty_note_sequence()

    # Rendering state. Everything the branches below read is bound here, so the
    # order in which tokens arrive can never trigger an unbound-variable error.
    current_program = 1
    current_is_drum = False
    current_instrument = 0
    track_count = 0
    current_bar_index = 0
    current_time = 0.0
    current_notes = {}

    # Tokens without an effect on the rendered notes (PIECE_START, TRACK_END,
    # KEYS_START, KEYS_END, KEY=..., DENSITY=..., [PAD] and anything unknown)
    # match none of the branches below and are skipped.
    for token in token_sequence:
        if token == "PIECE_END":
            print("The end.")
            break
        elif token == "TRACK_START":
            # Every track starts again at the first bar.
            current_bar_index = 0
            track_count += 1
        elif token.startswith("INST"):
            instrument = token.split("=")[-1]
            if instrument != "DRUMS" and use_program:
                if instrument_mapper is not None and instrument in instrument_mapper:
                    instrument = instrument_mapper[instrument]
                current_program = int(instrument)
                current_instrument = track_count
                current_is_drum = False
            if instrument == "DRUMS" and use_drums:
                current_instrument = 0
                current_program = 0
                current_is_drum = True
        elif token == "BAR_START":
            # Snap the clock to the start of the bar and forget notes opened in earlier bars.
            current_time = current_bar_index * BAR_LENGTH_120BPM
            current_notes = {}
        elif token == "BAR_END":
            current_bar_index += 1
        elif token.startswith("NOTE_ON"):
            pitch = int(token.split("=")[-1])
            note = note_sequence.notes.add()
            note.start_time = current_time
            # Default duration of four sixteenths; overwritten by a matching NOTE_OFF.
            note.end_time = current_time + 4 * NOTE_LENGTH_16TH_120BPM
            note.pitch = pitch
            note.instrument = current_instrument
            note.program = current_program
            note.velocity = 80
            note.is_drum = current_is_drum
            current_notes[pitch] = note
        elif token.startswith("NOTE_OFF"):
            pitch = int(token.split("=")[-1])
            # A NOTE_OFF without a NOTE_ON in the same bar is ignored.
            if pitch in current_notes:
                current_notes[pitch].end_time = current_time
        elif token.startswith("TIME_DELTA"):
            # The token value counts sixteenth notes.
            current_time += float(token.split("=")[-1]) * NOTE_LENGTH_16TH_120BPM

    # Make the instruments right: renumber them so that each distinct
    # (program, is_drum) pair gets one index, in order of first appearance.
    instruments_drums = []
    for note in note_sequence.notes:
        pair = [note.program, note.is_drum]
        if pair not in instruments_drums:
            instruments_drums.append(pair)
        note.instrument = instruments_drums.index(pair)

    if only_piano:
        for note in note_sequence.notes:
            if not note.is_drum:
                note.instrument = 0
                note.program = 0

    return note_sequence

def empty_note_sequence(qpm=120.0, total_time=0.0):
    """
    Build a NoteSequence that contains no notes.

    The sequence gets a single tempo entry, note_seq's standard
    ticks-per-quarter resolution and the requested total time.

    Args:
    - qpm: Tempo in quarter notes per minute.
    - total_time: Value stored in the sequence's total_time field.

    Returns:
    The newly created NoteSequence.
    """
    sequence = note_seq.protobuf.music_pb2.NoteSequence()
    sequence.total_time = total_time
    sequence.ticks_per_quarter = note_seq.constants.STANDARD_PPQ
    # One tempo entry carrying the requested speed.
    tempo = sequence.tempos.add()
    tempo.qpm = qpm
    return sequence

## Download Dataset and tokenizer from Hugging Face

In the previous notebook, we trained a tokenizer. We'll use it here, first to do some basic EDA to understand our data, and then to decide which model size works best (number of layers, heads, etc.).

In [None]:
# First create a custom trainer that logs generated audio samples to W&B after evaluation
# Sample rate passed to the synthesizer and to wandb.Audio
SAMPLE_RATE=44100

class CustomTrainer(Trainer):
    """
    Trainer that logs generated audio to Weights & Biases after each evaluation loop.

    After the regular evaluation finishes, four sequences are sampled from the
    current model (each starting from "PIECE_START" and stopping at "TRACK_END"
    or after 2048 tokens), synthesized to audio and logged as
    "Generated_audio_voice_1" ... "Generated_audio_voice_4".
    """

    def evaluation_loop(
        self,
        dataloader,
        description,
        prediction_loss_only=None,
        ignore_keys=None,
        metric_key_prefix="eval",
    ):
        """
        Run the standard evaluation loop, then log sampled audio to W&B.

        All arguments are forwarded unchanged to `Trainer.evaluation_loop`, and
        its output is returned unchanged. Audio is only generated when a W&B run
        is active.
        """
        # Call super class method to get the eval outputs
        eval_output = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
            ignore_keys,
            metric_key_prefix,
        )

        # Generate samples and log them with `wandb.Audio`.
        if wandb.run is not None:
            # Encode a starting token to begin the generation and place it on the
            # device the model lives on (works on CPU as well as on GPU).
            input_ids = self.tokenizer.encode("PIECE_START", return_tensors="pt").to(self.model.device)

            # Loop-invariant lookups: the stop token id and the synthesizer function.
            track_end_id = self.tokenizer.encode("TRACK_END")[0]
            synth = note_seq.fluidsynth

            # Draw four independent samples from the same prompt
            for voice_num in range(1, 5):
                generated_ids = self.model.generate(
                    input_ids,
                    max_length=2048,
                    do_sample=True,
                    temperature=0.75, # Sampling temperature (higher values for more randomness, lower for more determinism)
                    top_p = 0.9, # Top-p (nucleus) sampling parameter to control diversity
                    top_k = 50, # Restrict sampling to the top-k most likely tokens
                    eos_token_id=track_end_id
                )

                # Decode the generated tokens into a token sequence
                token_sequence = self.tokenizer.decode(generated_ids[0])

                # Convert the token sequence into a NoteSequence
                note_sequence = token_sequence_to_note_sequence(token_sequence)

                # Synthesize the audio from the NoteSequence
                array_of_floats = synth(note_sequence, sample_rate=SAMPLE_RATE)

                # Convert the float audio samples to int16 format
                int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)

                # Log the generated audio using the wandb.Audio method
                wandb.log({"Generated_audio_voice_" + str(voice_num): wandb.Audio(int16_data, SAMPLE_RATE)})

        # Return the evaluation output
        return eval_output

In [None]:
# Maximum sequence length in tokens: used as the truncation limit in tokenize()
# and as the model's n_positions in create_model().
CONTEXT_LENGTH = 2048

In [None]:
# Training hyperparameters. train() unpacks this mapping straight into
# TrainingArguments(**config) and also records it as the W&B run config.

config = dict(
    # Where checkpoints and outputs go, and how long to train
    output_dir="output",
    num_train_epochs=1,
    # Per-device batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    # Evaluate and save on a step schedule
    evaluation_strategy="steps",
    save_strategy="steps",
    # Evaluation and logging both happen every 2497 steps
    eval_steps=2497,
    logging_steps=2497,
    logging_first_step=True,
    # Checkpointing: same cadence as evaluation, keep at most two checkpoints
    save_total_limit=2,
    save_steps=2497,
    # Optimisation: cosine schedule with a short warmup and weight decay
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    warmup_ratio=0.01,
    weight_decay=0.01,
    # Reproducibility (also read directly by train() via config["seed"])
    seed=1,
    # Reload the best checkpoint when training ends
    load_best_model_at_end=True,
    # Send logs to Weights & Biases
    report_to="wandb",
    # Do not restrict evaluation to the loss only
    prediction_loss_only=False,
    # No gradient accumulation
    gradient_accumulation_steps=1,
)

In [None]:
def compute_metrics_fn(eval_pred):
    """
    Compute next-token accuracy for a causal language model.

    In a causal LM the logits at position i predict the token at position i + 1,
    while the labels passed to the metric function are aligned with the inputs.
    Predictions are therefore shifted one step against the labels before they
    are compared; comparing them position by position would score the model
    against the wrong tokens.

    The accuracy is computed with NumPy, so no metric script is loaded on every
    evaluation. The result uses the same "accuracy" key as the `evaluate`
    accuracy metric.

    Args:
    - eval_pred: Object with `predictions` (logits of shape
      (batch, sequence, vocab), or a tuple whose first item is the logits) and
      `label_ids` (shape (batch, sequence), with -100 marking ignored positions).

    Returns:
    A dictionary {"accuracy": float}. The value is NaN when no label position
    is left to score.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids

    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Reduce to token ids first: masking the full logits tensor would copy
    # (positions x vocab) floats for no benefit.
    predictions = np.argmax(logits, axis=-1)

    # Align "prediction made at position i" with "label at position i + 1".
    shifted_predictions = predictions[..., :-1]
    shifted_labels = labels[..., 1:]

    # Filter out ignored positions (padding is labelled -100).
    not_pad_mask = shifted_labels != -100
    if not not_pad_mask.any():
        return {"accuracy": float("nan")}

    is_correct = shifted_predictions[not_pad_mask] == shifted_labels[not_pad_mask]
    return {"accuracy": float(is_correct.mean())}

In [None]:
def get_raw_data_and_tokenizer():
    """
    Fetch the training corpus and its tokenizer from the Hugging Face Hub.

    The "train" split of "juancopi81/mmm_track_lmd_8bars_nots" is downloaded and
    shuffled into a 90% / 10% train/test partition. The tokenizer comes from
    "aimusicgen/lmd_tokenizer".

    Returns:
    - raw_datasets: The result of `train_test_split`, holding "train" and "test".
    - tokenizer: The tokenizer loaded with AutoTokenizer.
    """
    # Download the single published split ...
    full_split = load_dataset("juancopi81/mmm_track_lmd_8bars_nots", split="train")

    # ... and hold out 10% of it, shuffled, for evaluation.
    partitions = full_split.train_test_split(test_size=0.1, shuffle=True)

    # The tokenizer is loaded last and returned together with the partitions.
    return partitions, AutoTokenizer.from_pretrained("aimusicgen/lmd_tokenizer")

In [None]:
def tokenize(element, tokenizer):
    """
    Turn the "text" field of a dataset element (or batch) into token ids.

    Inputs longer than CONTEXT_LENGTH tokens are truncated; nothing is padded.

    Args:
    - element: Mapping with a "text" entry to tokenize.
    - tokenizer: Callable tokenizer applied to that text.

    Returns:
    A dictionary with a single "input_ids" entry taken from the tokenizer output.
    """
    encoded = tokenizer(
        element["text"],
        max_length=CONTEXT_LENGTH,  # truncation limit, shared with the model's context size
        truncation=True,
        padding=False,
    )

    # Keep only the ids; any other tokenizer output is dropped.
    return dict(input_ids=encoded["input_ids"])

In [None]:
def create_tokenized_dataset(raw_datasets, tokenizer):
    """
    Apply `tokenize` to every split of the raw datasets.

    Args:
    - raw_datasets: Dataset collection with at least a "train" split.
    - tokenizer: Tokenizer forwarded to `tokenize` for each batch.

    Returns:
    The mapped datasets, with the original columns of the "train" split removed
    so that only the output of `tokenize` remains.
    """
    # The raw columns are replaced by the tokenizer output.
    original_columns = raw_datasets["train"].column_names

    return raw_datasets.map(
        tokenize,
        batched=True,
        remove_columns=original_columns,
        fn_kwargs=dict(tokenizer=tokenizer),
    )

In [None]:
from transformers import AutoConfig, GPT2LMHeadModel

def create_model(tokenizer):
    """
    Build a GPT-2 language model sized for the given tokenizer.

    The configuration starts from the published "gpt2" config and overrides the
    vocabulary size, the context length (CONTEXT_LENGTH), the architecture sizes
    and the special token ids. The model itself is created from that
    configuration, not from pretrained weights.

    Args:
    - tokenizer: Tokenizer that supplies the vocabulary size and the
      pad / bos / eos token ids.

    Returns:
    A GPT2LMHeadModel instance.
    """
    # Architecture sizes; change these based on the size of the data.
    architecture = {"n_layer": 6, "n_head": 8, "n_embd": 512}

    # Special token ids are taken over from the tokenizer as they are.
    special_token_ids = {
        "pad_token_id": tokenizer.pad_token_id,
        "bos_token_id": tokenizer.bos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    gpt2_config = AutoConfig.from_pretrained(
        "gpt2",
        vocab_size=len(tokenizer),
        n_positions=CONTEXT_LENGTH,
        **architecture,
        **special_token_ids,
    )

    return GPT2LMHeadModel(gpt2_config)


In [None]:
def train(config):
    """
    Train a language model based on the provided configuration.

    Steps: seed the random number generators, load the data and tokenizer,
    tokenize the data, create the model, open a W&B run and train with
    CustomTrainer. The W&B run is always closed: normally after a successful
    training, and with exit code 1 if anything raises after the run was opened,
    so a failed or interrupted training does not leave a run open in the kernel.

    Args:
    - config: Dictionary of keyword arguments for TrainingArguments. It must
      contain "seed" and is also stored as the W&B run config.

    Returns:
    None

    Raises:
    Whatever the data loading, model creation or training raises; exceptions
    are re-raised unchanged after the W&B run has been closed.
    """
    # Set a seed for reproducibility
    set_seed(config["seed"])

    # Load raw data and tokenizer
    raw_datasets, tokenizer = get_raw_data_and_tokenizer()

    # Tokenize the datasets
    tokenized_datasets = create_tokenized_dataset(raw_datasets=raw_datasets, tokenizer=tokenizer)

    # Create the GPT-2 language model
    model = create_model(tokenizer)

    # Initialize WandB for logging and monitoring
    # NOTE(review): the module-level `entity` is not passed here, so W&B picks the
    # account's default entity — confirm that this is intended.
    wandb.init(project=wandb_project, job_type="training", config=config)

    try:
        # Create a data collator for causal language modeling (mlm=False)
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

        # Create training arguments
        train_args = TrainingArguments(**config)

        # Initialize the custom trainer for training the model.
        # compute_metrics is left disabled, as in the original setup.
        trainer = CustomTrainer(
            model=model,
            tokenizer=tokenizer,
            args=train_args,
            data_collator=data_collator,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            # compute_metrics=compute_metrics_fn
        )

        # Start training
        trainer.train()
    except BaseException:
        # Also covers KeyboardInterrupt (stopping the cell): close the run as failed.
        wandb.finish(exit_code=1)
        raise

    # Finish logging with WandB
    wandb.finish()

In [None]:
# Trigger the training process with the specified configuration:
# train() loads the data, tokenizes it, builds the model and trains it.
train(config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33256 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/146 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/5.79k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Map:   0%|          | 0/29930 [00:00<?, ? examples/s]

Map:   0%|          | 0/3326 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mnaomitunstead[0m ([33mmusicgen[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
2497,3.005,2.589697
4994,2.063,1.010919


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


VBox(children=(Label(value='11.211 MB of 11.232 MB uploaded\r'), FloatProgress(value=0.9981161511236186, max=1…

0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▃▃▆▆█
train/global_step,▁▃▃▃▃▃▃▆▆▆▆▆▆█
train/learning_rate,▁█▃
train/loss,█▃▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,1.01092
eval/runtime,19.7773
eval/samples_per_second,168.173
eval/steps_per_second,84.086
train/epoch,1.0
train/global_step,7483.0
train/learning_rate,0.00013
train/loss,2.063
train/total_flos,2444369578229760.0
train/train_loss,2.06452
