# Qualitative Evaluation of Model Performance

Given the same prefix sequence of notes, does adding expressive features change model output? That is, do models respect expressive features?

## Imports

Import necessary packages.

In [None]:
from os.path import exists, basename
from os import makedirs, mkdir, remove
import numpy as np
import pandas as pd
import representation
from typing import Tuple
from IPython.display import display, HTML, Audio
from time import sleep
import utils
import torch
import dataset
import music_x_transformers
import train
import encode
import decode
from tqdm import tqdm
from read_mscz.read_mscz import read_musescore
from read_mscz.music import MusicExpress
from evaluate_baseline import unpad_prefix

## Constants

Define the constants used throughout this notebook, such as filepaths, the device, and display flags.

In [None]:
# directory holding the encoding, the list of models, and every model directory
DATA_DIR = "/home/pnlong/musescore/datav"

# notebook-wide switches
SHOW_SEQUENCES = False # presumably toggles the display of sequences as tables
RATING = False # are we rating the different models against each other
DEVICE = "cpu" # name of the device handed to torch.device


# directory in which the prefix data for this notebook is kept (created on demand)
TEST_DATA_DIR = "/home/pnlong/musescore/test_data/evalqual"
if not exists(TEST_DATA_DIR):
    makedirs(TEST_DATA_DIR)

# the MuseScore file used as the prefix has to be supplied by the user
PREFIX_MSCZ_FILEPATH = f"{TEST_DATA_DIR}/simple.mscz"
if not exists(PREFIX_MSCZ_FILEPATH):
    raise FileNotFoundError("Must provide a valid MuseScore prefix filepath.")

# everything before the first period of the file name; used to name output files
PREFIX_OUTPUT = basename(PREFIX_MSCZ_FILEPATH).partition(".")[0]

## Load in Encoding Data

Load the encoding and pull out encoding-related settings, such as whether absolute time and velocity are used.

In [None]:
# read the encoding stored alongside the data
encoding = representation.load_encoding(filepath = f"{DATA_DIR}/encoding.json")

# settings recorded in the encoding
use_absolute_time = encoding["use_absolute_time"]
include_velocity = encoding["include_velocity"]

# index of the "type" field within an event
type_dim = encoding["dimensions"].index("type")

# codes of the start-of-song and end-of-song event types
eos_type_string = "end-of-song"
eos = encoding["type_code_map"][eos_type_string]
sos = encoding["type_code_map"]["start-of-song"]

# only the second of the two returned coding functions is kept, under the decoding name
_, unidimensional_decoding_function = representation.get_unidimensional_coding_functions(encoding = encoding)

# torch device that models and tensors are moved onto
device = torch.device(DEVICE)

# helper function for displaying sequences
def show(sequence: "torch.Tensor | np.ndarray", columns: list = encoding["dimensions"], show_index: bool = True, show_sequences: bool = SHOW_SEQUENCES):
    """
    Display a sequence as an HTML data table.

    Parameters
    ----------
    sequence : the sequence to display; when it has three dimensions,
        `squeeze(0)` is applied before building the table.
    columns : column names of the table (defaults to the encoding's dimensions).
    show_index : whether the row index is included in the rendered table.
    show_sequences : when False, nothing is displayed and the function returns
        immediately. Defaults to the notebook-level `SHOW_SEQUENCES` flag so
        that calls which do not pass this argument follow that flag.
    """

    # displaying is opt-in
    if not show_sequences:
        return

    # drop the leading dimension of a three-dimensional input
    if len(sequence.shape) == 3:
        sequence = sequence.squeeze(0)

    # render as an HTML table
    table = pd.DataFrame(data = sequence, columns = columns)
    display(HTML(table.to_html(index = show_index)))


## Prepare Prefix Sequence

Prepare the prefix sequence by extracting relevant data from the MuseScore file.

In [None]:
# text file listing the prepared prefix sequence filepaths, one per line
paths = f"{TEST_DATA_DIR}/paths.txt"

# start from a clean file, since prefix paths are appended to it below;
# on a first run the file does not exist yet, so only remove it when present
if exists(paths):
    remove(paths)

# helper function to prepare a prefix
def prepare_prefix(music: MusicExpress, prefix_path: str, include_expressive_features: bool = True):
    """
    Encode the first track of `music` and save it as a prefix sequence.

    The encoded data is written to `prefix_path` with `np.save`, and
    `prefix_path` is appended as a new line to the file at `paths`.
    Note that `music.tracks` is overwritten to hold only its first track.

    Parameters
    ----------
    music : the MusicExpress object to encode.
    prefix_path : where to save the encoded data.
    include_expressive_features : when False, rows whose type field equals
        `representation.EXPRESSIVE_FEATURE_TYPE_STRING` are dropped.
    """

    # restrict the music to a single track before extracting
    music.tracks = [music.tracks[0]]
    data = encode.extract_data(
        music = music,
        use_implied_duration = True,
        include_velocity = include_velocity,
        use_absolute_time = use_absolute_time,
    )

    # optionally keep only the rows that are not expressive features
    if not include_expressive_features:
        rows_to_keep = data[:, type_dim] != representation.EXPRESSIVE_FEATURE_TYPE_STRING
        data = data[rows_to_keep]
    show(sequence = data, columns = representation.DIMENSIONS)

    # write the encoded data to disk
    np.save(file = prefix_path, arr = data)

    # record the prefix path in the text file of paths
    with open(paths, "a") as paths_output:
        paths_output.write(f"{prefix_path}\n")

# get MusicExpress object, both normal and expressive-feature-realized
# NOTE: the order of the two `prepare_prefix` calls matters, since each call appends
# its path to `paths`; `evaluate` treats row 0 of the batch as the notes-only prefix
# and row 1 as the expressive-feature prefix.

# notes-only prefix: every note in a track gets the velocity of that track's first note
music = read_musescore(path = PREFIX_MSCZ_FILEPATH, timeout = 10)
for track in music.tracks:
    for note in track.notes:
        note.velocity = track.notes[0].velocity
prepare_prefix(music = music, prefix_path = f"{TEST_DATA_DIR}/{PREFIX_OUTPUT}.npy", include_expressive_features = False)

# expressive-feature prefix: re-read the file so the velocity changes above do not carry over
music = read_musescore(path = PREFIX_MSCZ_FILEPATH, timeout = 10)
music.realize_expressive_features()
prepare_prefix(music = music, prefix_path = f"{TEST_DATA_DIR}/{PREFIX_OUTPUT}.realized.npy", include_expressive_features = True)

## Evaluate a Model

Given the model name, generate two samples:

- One with a prefix of only constant-velocity notes.
- One with a prefix of notes and expressive features.

Do the two samples differ? Does the latter respect expressive feature markings?

In [None]:
def evaluate(model: str, show_sequences: bool = False) -> Tuple[str, str]:
    """
    Given the model name (`model`), generate two samples with a prefix of...
    - just notes at constant velocity.
    - notes and expressive features.
    Returns the filepaths to both samples.
    
    Do the two samples differ? Does the expressive feature sample respect the provided expressive features?

    Parameters
    ----------
    model : name of the model, i.e. the name of its directory inside `DATA_DIR`.
    show_sequences : whether to display the generated sequences as tables.

    Returns
    -------
    The `.wav` filepaths of the notes-only sample and of the expressive-feature
    sample, in that order. Alongside each, a `.npy` file and a `.xml` file are
    written into the model's `evalqual` directory.

    Raises
    ------
    FileNotFoundError : if the model's directory does not exist.
    """

    # LOAD IN MODEL
    ##################################################
    # Load in a model.

    # get directories
    model_dir = f"{DATA_DIR}/{model}"
    if not exists(model_dir):
        raise FileNotFoundError(f"{model_dir} does not exist.")
    evalqual_output_dir = f"{model_dir}/evalqual" # outputs of this notebook for this model
    if not exists(evalqual_output_dir):
        mkdir(evalqual_output_dir)

    # load training configurations
    train_args = utils.load_json(filepath = f"{model_dir}/train_args.json")

    # create the dataset
    max_seq_len = train_args["max_seq_len"]
    conditioning = train_args["conditioning"]
    unidimensional = train_args.get("unidimensional", False) # absent key is treated as False
    n_tokens_per_event = len(encoding["dimensions"]) if unidimensional else 1
    is_anticipation = (conditioning == encode.CONDITIONINGS[-1]) # presumably the last conditioning is anticipation -- TODO confirm
    sigma = train_args["sigma"]
    test_dataset = dataset.MusicDataset(paths = paths, encoding = encoding, conditioning = conditioning, max_seq_len = max_seq_len, use_augmentation = False, is_baseline = False, unidimensional = unidimensional, for_generation = True)

    # create the model
    # NOTE: from here on, `model` refers to the model object rather than the model name
    # print("Creating model...")
    model = music_x_transformers.MusicXTransformer(
        dim = train_args["dim"],
        encoding = encoding,
        depth = train_args["layers"],
        heads = train_args["heads"],
        max_seq_len = max_seq_len,
        max_temporal = encoding["max_" + ("time" if use_absolute_time else "beat")],
        rotary_pos_emb = train_args["rel_pos_emb"],
        use_abs_pos_emb = train_args["abs_pos_emb"],
        emb_dropout = train_args["dropout"],
        attn_dropout = train_args["dropout"],
        ff_dropout = train_args["dropout"],
        unidimensional = unidimensional,
    ).to(device)

    # load the checkpoint
    # NOTE(review): the checkpoint is selected by `train.PARTITIONS[1]` -- confirm which partition that is
    checkpoint_filepath = f"{model_dir}/checkpoints/best_model.{train.PARTITIONS[1]}.pth"
    model_state_dict = torch.load(f = checkpoint_filepath, map_location = device)
    model.load_state_dict(state_dict = model_state_dict)
    # print(f"Loaded model weights from: {checkpoint_filepath}")
    model.eval()

    # create data loader, get the singular batch
    # a batch size of 2 without shuffling is meant to put both prefixes listed in `paths`
    # into one batch, in the order in which they were written
    test_data_loader = torch.utils.data.DataLoader(dataset = test_dataset, num_workers = 4, collate_fn = test_dataset.collate, batch_size = 2, shuffle = False)
    test_iter = iter(test_data_loader)
    
    ##################################################


    # GENERATE SAMPLES
    ##################################################

    # get prefix
    prefix = next(test_iter)
    prefix = prefix["seq"]
    prefix = prefix.to(device)
    # show(prefix[0].reshape(-1, len(encoding["dimensions"])), show_sequences = True)

    # generate
    generated = model.generate(
        seq_in = prefix,
        seq_len = train.DEFAULT_MAX_SEQ_LEN,
        eos_token = eos,
        temperature = 1.0,
        filter_logits_fn = "top_k",
        filter_thres = 0.9,
        monotonicity_dim = ("type", "time" if use_absolute_time else "beat"),
        joint = False,
        notes_are_controls = False,
        is_anticipation = is_anticipation,
        sigma = sigma
    )
    # put the prefix in front of the generated continuation, then move to numpy
    generated = torch.cat(tensors = (prefix, generated), dim = 1).cpu().numpy()

    # helper function to synthesize a sequence
    # index 0 ("note") is the notes-only sample, index 1 ("total") the expressive-feature sample
    suffixes = ["note", "total"]
    audio_output_filepaths = tuple(f"{evalqual_output_dir}/{PREFIX_OUTPUT}.{suffix}.wav" for suffix in suffixes)
    symbolic_output_filepaths = tuple(f"{evalqual_output_dir}/{PREFIX_OUTPUT}.{suffix}.xml" for suffix in suffixes)
    def synthesize_generation(notes_only: bool = True):
        """Synthesize the generated sequence as audio, saving the sequence in the process."""
        i = int(not notes_only) # row of the batch: 0 when notes_only, 1 otherwise
        generation = unpad_prefix(prefix = generated[i], sos_token = sos, pad_value = dataset.PAD_VALUE, n_tokens_per_event = n_tokens_per_event) # wrangle a bit
        if show_sequences:
            print("NOTES ONLY" if notes_only else "EXPRESSIVE FEATURES")
            # a unidimensional sequence is reshaped into rows of `n_tokens_per_event` tokens for display
            show(sequence = generation.reshape(int(generation.shape[0] / n_tokens_per_event), n_tokens_per_event) if unidimensional else generation,
                 columns = encoding["unidimensional_encoding_order" if unidimensional else "dimensions"],
                 show_sequences = show_sequences)
        np.save(file = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.{suffixes[i]}.npy", arr = generation) # save as a numpy array

        # convert to audio
        music = decode.decode(codes = generation, encoding = encoding, unidimensional_decoding_function = unidimensional_decoding_function) # convert to a MusicExpress object
        music.write(path = audio_output_filepaths[i]) # .wav output
        music.write(path = symbolic_output_filepaths[i]) # .xml output
    
    # save audios
    synthesize_generation(notes_only = True)
    synthesize_generation(notes_only = False)

    # return audio filepaths
    return audio_output_filepaths

    ##################################################
    

## Test Generations

Given a model, test the quality of the generated music.

In [None]:
# only run this single-model check when the models are not being rated against each other
if not RATING:
    
    # generate audio for a given model
    model = "prefix_conditional_ape_20M" # name of the model's directory inside DATA_DIR
    audio_output_note, audio_output_total = evaluate(model = model, show_sequences = False)

    # audio with notes only, constant velocity prefix
    print(f"NOTES ONLY: {audio_output_note}")
    display(Audio(audio_output_note))

    # audio with expressive features prefix
    print(f"EXPRESSIVE FEATURES: {audio_output_total}")
    display(Audio(audio_output_total))

## Generate Samples

Get the available models in the provided data directory, then generate samples for each of the models.

In [None]:
# table of generated samples: one row per model with its two audio filepaths
samples_output_filepath = f"{DATA_DIR}/evalqual.csv"

# create if samples are not generated yet (set `regenerate` to True to force regeneration)
regenerate = False
if (not exists(samples_output_filepath)) or regenerate:

    # read in the list of trained models, one name per line; blank lines are skipped
    # so that an empty model name is never handed to `evaluate`
    with open(f"{DATA_DIR}/models.txt", "r") as models_output:
        models = sorted(name for name in (line.strip() for line in models_output) if name)

    # generate both samples for every model, collecting the audio filepaths
    audio_outputs_note, audio_outputs_total = [], []
    for model_name in (progress_bar := tqdm(iterable = models, desc = "Generating samples for models")):
        progress_bar.set_postfix(model = f"{model_name}")
        note_filepath, total_filepath = evaluate(model = model_name)
        audio_outputs_note.append(note_filepath)
        audio_outputs_total.append(total_filepath)
    samples = pd.DataFrame(data = {"model": models, "note": audio_outputs_note, "total": audio_outputs_total})
    samples.to_csv(path_or_buf = samples_output_filepath, sep = ",", na_rep = train.NA_VALUE, header = True, index = False, mode = "w") # write data frame

    del models, audio_outputs_note, audio_outputs_total

# if samples have previously been generated
else:

    # load in the data frame
    samples = pd.read_csv(filepath_or_buffer = samples_output_filepath, sep = ",", na_values = train.NA_VALUE, header = 0, index_col = False)

# display
display(HTML(samples.to_html(index = False))) # show data frame

## Rate the Models

Listen to the `.wav` files -- did adding expressive features make a difference?

In [None]:
# get ratings
# for every model, play the expressive-feature ("total") sample and ask for a rating:
# - an empty answer skips the model (its rating stays missing)
# - a value from 1 to 5 is recorded as the rating
# - any other integer (e.g. 0) plays the notes-only ("note") baseline sample instead,
#   and no rating is recorded for that model
print("Playing generated samples, please rate 1-5.\nEnter 0 if the sample is incoherent to sanity check the baseline.")
ratings = [None] * len(samples)
for i, model in enumerate(samples["model"]):
    display(Audio(samples.at[i, "total"]))
    rating = input(f"Rate {model}: ")
    if len(rating) == 0:
        continue

    # keep asking until the answer parses as an integer
    while True:
        try:
            rating = int(rating)
            break
        except ValueError:
            rating = input("Please enter a numeric value: ")

    if (1 <= rating) and (rating <= 5):
        ratings[i] = rating
    else:
        print(f"Playing baseline version of {model}: ")
        display(Audio(samples.at[i, "note"])) # the baseline is the notes-only sample
        sleep(2) # time.sleep accepts its duration positionally only
samples["rating"] = ratings

# save rating to file
ratings_output_filepath = f"{DATA_DIR}/ratings.csv"
samples.to_csv(path_or_buf = ratings_output_filepath, sep = ",", na_rep = train.NA_VALUE, header = True, index = False, mode = "w") # write data frame