# Qualitative Evaluation of Model Performance

Given the same prefix sequence of notes, does adding expressive features change model output? That is, do models respect expressive features?

## Constants

Define the constants (such as filepaths and the encoding) needed to run this notebook.

In [None]:
# choose the data directory (the encoding, the list of models, and the per-model directories are read from here)
DATA_DIR = "/home/pnlong/musescore/datav"
# whether the `show` helper displays sequences (it returns immediately when this is False)
SHOW_SEQUENCES = False

In [None]:
# imports
from os.path import exists, basename
from os import makedirs, remove
import numpy as np
import pandas as pd
import representation
from IPython.display import display, HTML, Audio
from time import perf_counter
import utils
import torch
import dataset
import music_x_transformers
import train
import encode
import decode
from read_mscz.read_mscz import read_musescore
from read_mscz.music import BetterMusic

# filepaths
TEST_DATA_DIR = "/home/pnlong/musescore/test_data/evalqual"
makedirs(TEST_DATA_DIR, exist_ok = True) # create the directory if it is missing; no separate existence check, so no check-then-create race
PREFIX_MSCZ_FILEPATH = f"{TEST_DATA_DIR}/simple.mscz"
if not exists(PREFIX_MSCZ_FILEPATH): # the prefix file has to be supplied by the user, so fail early if it is absent
    raise FileNotFoundError("Must provide a valid MuseScore prefix filepath.")
PREFIX_OUTPUT = basename(PREFIX_MSCZ_FILEPATH).split(".")[0] # filename up to its first ".", used to name the output files

# read the encoding stored in the data directory
encoding = representation.load_encoding(filepath = f"{DATA_DIR}/encoding.json")

# flags derived from which dimensions the encoding contains
include_velocity = "velocity" in encoding["dimensions"]
use_absolute_time = ("beat" not in encoding["dimensions"]) or ("position" not in encoding["dimensions"]) # False only when both beat and position are present

# helper function for displaying sequences
def show(sequence: torch.tensor, columns: list = encoding["dimensions"], show_index: bool = True):
    """
    Display a sequence as an HTML table, but only when SHOW_SEQUENCES is enabled.

    sequence: the sequence to display; squeeze(0) is applied first if it has three dimensions
    columns: column labels of the table (by default, the dimensions of the encoding)
    show_index: whether the row index is included in the rendered table

    Returns the number of seconds spent displaying, or None when SHOW_SEQUENCES is off.
    """
    if not SHOW_SEQUENCES: # displaying is switched off, so there is nothing to do
        return
    start_time = perf_counter()
    if len(sequence.shape) == 3: # remove the leading dimension before building the table
        sequence = sequence.squeeze(0)
    table = pd.DataFrame(data = sequence, columns = columns)
    display(HTML(table.to_html(index = show_index)))
    return perf_counter() - start_time

## Prepare Prefix Sequence

Prepare the prefix sequence by extracting relevant data from the MuseScore file.

In [None]:
# path to save prepared prefix sequence filepaths to
paths = f"{TEST_DATA_DIR}/paths.txt"
# prepare_prefix appends to this file, so start from a clean slate;
# on a fresh run the file does not exist yet and an unconditional remove would raise FileNotFoundError
if exists(paths):
    remove(paths)

# helper function
def prepare_prefix(music: BetterMusic, prefix_path: str):
    """
    Extract the data of the first track of `music`, save it to `prefix_path`,
    and append that path to the `paths` text file.

    music: the BetterMusic object to prepare; it is modified in place (only its first track is kept)
    prefix_path: where the extracted data is saved as a numpy array
    """

    # keep only the first track, then extract the data from the BetterMusic object
    first_track = music.tracks[0]
    music.tracks = [first_track]
    prefix_data = encode.extract_data(
        music = music,
        use_implied_duration = True,
        include_velocity = include_velocity,
        use_absolute_time = use_absolute_time,
    )
    show(sequence = prefix_data, columns = representation.DIMENSIONS)

    # write the extracted data to disk
    np.save(file = prefix_path, arr = prefix_data)

    # record the prefix path on its own line of the paths file
    with open(paths, "a") as paths_file:
        paths_file.write(prefix_path + "\n")

# get BetterMusic object, both normal and expressive-feature-realized
# normal version: read the prefix and give every note the velocity of the first note in its track,
# so that velocity is uniform within each track before the prefix is prepared
music = read_musescore(path = PREFIX_MSCZ_FILEPATH, timeout = 10)
for track in music.tracks:
    for note in track.notes:
        note.velocity = track.notes[0].velocity
prepare_prefix(music = music, prefix_path = f"{TEST_DATA_DIR}/{PREFIX_OUTPUT}.npy")
# realized version: read the file again, since the object above had its velocities and tracks modified,
# then call realize_expressive_features() before preparing
# NOTE(review): presumably this applies the expressive features to the notes themselves -- confirm against BetterMusic
music = read_musescore(path = PREFIX_MSCZ_FILEPATH, timeout = 10)
music.realize_expressive_features()
prepare_prefix(music = music, prefix_path = f"{TEST_DATA_DIR}/{PREFIX_OUTPUT}.realized.npy")


## List the Models

List the models that can be loaded.

In [None]:
# read in the list of trained models (one name per line), then print it as a bulleted list
with open(f"{DATA_DIR}/models.txt", "r") as models_output:
    models = [line.strip() for line in models_output]
for model in models:
    print(f"  - {model}")

## Load a Model

Specify the model to evaluate (from the list generated above) by setting the `model` variable below. Then, load in the model's parameters.

In [None]:
# which model to use (the name of one of the trained models)
# NOTE: the model-loading cell further down rebinds `model` to the network object, so re-run this cell
# before re-running any cell that builds a path from the model name
model = "anticipation_conditional_aug_ape_20M"

In [None]:
# get directories
model_dir = f"{DATA_DIR}/{model}" # `model` must still be the model name (a string) at this point
evalqual_output_dir = f"{model_dir}/evalqual" # generated sequences and audio are written here
makedirs(evalqual_output_dir, exist_ok = True) # create the directory if it is missing; no separate existence check, so no check-then-create race

In [None]:
# set the device to cpu
device = torch.device("cpu")

# load the configuration that the model was trained with
train_args = utils.load_json(filepath = f"{model_dir}/train_args.json")
conditioning = train_args["conditioning"]
max_seq_len = train_args["max_seq_len"]

# create the dataset over the prepared prefix sequences listed in `paths`, without augmentation
test_dataset = dataset.MusicDataset(
    paths = paths,
    encoding = encoding,
    conditioning = conditioning,
    max_seq_len = max_seq_len,
    use_augmentation = False,
    is_baseline = False,
)

# build the transformer from the training configuration
print("Creating model...")
use_absolute_time = ("beat" not in encoding["dimensions"]) or ("position" not in encoding["dimensions"])
temporal_dimension = "time" if use_absolute_time else "beat"
dropout = train_args["dropout"] # the same rate is used for embedding, attention, and feedforward dropout
model = music_x_transformers.MusicXTransformer(
    dim = train_args["dim"],
    encoding = encoding,
    depth = train_args["layers"],
    heads = train_args["heads"],
    max_seq_len = max_seq_len,
    max_temporal = encoding["max_" + temporal_dimension],
    rotary_pos_emb = train_args["rel_pos_emb"],
    use_abs_pos_emb = train_args["abs_pos_emb"],
    emb_dropout = dropout,
    attn_dropout = dropout,
    ff_dropout = dropout,
)
model = model.to(device)

# restore the weights from the checkpoint, then switch to evaluation mode
checkpoint_filepath = f"{model_dir}/checkpoints/best_model.{train.PARTITIONS[1]}.pth"
model.load_state_dict(state_dict = torch.load(f = checkpoint_filepath, map_location = device))
print(f"Loaded model weights from: {checkpoint_filepath}")
model.eval()
        
# look up the codes of the special event types
type_code_map = encoding["type_code_map"]
sos = type_code_map["start-of-song"]
eos = type_code_map["end-of-song"]
note_token = type_code_map["note"]
grace_note_token = type_code_map["grace-note"]
expressive_feature_token = type_code_map[representation.EXPRESSIVE_FEATURE_TYPE_STRING]

# generation settings that follow from the training configuration
is_anticipation = (conditioning == encode.CONDITIONINGS[-1]) # compared against the last entry of encode.CONDITIONINGS
if use_absolute_time:
    sigma = train_args["sigma"]
else:
    sigma = encode.SIGMA_METRICAL

# data loader over the prepared prefixes: one sequence per batch, not shuffled
test_data_loader = torch.utils.data.DataLoader(
    dataset = test_dataset,
    batch_size = 1,
    shuffle = False,
    num_workers = 4,
    collate_fn = dataset.MusicDataset.collate,
)
test_iter = iter(test_data_loader)
def get_next_batch() -> torch.tensor:
    """Take the next batch from `test_iter` and return its "seq" entry without the rows whose first column is the end-of-song code."""
    seq = next(test_iter)["seq"]
    keep = (seq[:, :, 0] != eos) # mask of the rows that are not the eos token
    return seq[keep].unsqueeze(dim = 0) # masking over the first two dimensions merges them, so add the batch dimension back

## Generate Sequences

Now armed with the loaded model, generate a sequence given a prefix with and without expressive features.

### Notes Only

Generate with a prefix of only notes.

In [None]:
# take the next prefix and drop its expressive features, so that only the other rows remain
batch = get_next_batch()
is_not_expressive_feature = (batch[:, :, 0] != expressive_feature_token)
prefix_note = batch[is_not_expressive_feature].unsqueeze(dim = 0)

# sample a continuation of the notes-only prefix
temporal_dimension = "time" if use_absolute_time else "beat"
generated_note = model.generate(
    seq_in = prefix_note,
    seq_len = train.DEFAULT_MAX_SEQ_LEN,
    eos_token = eos,
    temperature = 1.0,
    filter_logits_fn = "top_k",
    filter_thres = 0.9,
    monotonicity_dim = ("type", temporal_dimension),
    notes_only = True,
    is_anticipation = is_anticipation,
    sigma = sigma,
)
show(generated_note)

# put the prefix in front of the generated part and save the whole sequence as a numpy array
generated_note = torch.cat(tensors = (prefix_note, generated_note), dim = 1).numpy()
np.save(file = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.note.npy", arr = generated_note)

# decode the first (and only) sequence into a BetterMusic object and write it to a .wav file
audio_output_note = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.note.wav"
music = decode.decode(codes = generated_note[0], encoding = encoding)
music.write(path = audio_output_note)

### Notes and Expressive Features

Generate with a prefix of notes and expressive features.

In [None]:
# take the next prefix as it is, expressive features included
batch = get_next_batch()
prefix_total = batch

# sample a continuation of the full prefix
temporal_dimension = "time" if use_absolute_time else "beat"
generated_total = model.generate(
    seq_in = prefix_total,
    seq_len = train.DEFAULT_MAX_SEQ_LEN,
    eos_token = eos,
    temperature = 1.0,
    filter_logits_fn = "top_k",
    filter_thres = 0.9,
    monotonicity_dim = ("type", temporal_dimension),
    notes_only = False,
    is_anticipation = is_anticipation,
    sigma = sigma,
)
show(generated_total)

# put the prefix in front of the generated part and save the whole sequence as a numpy array
generated_total = torch.cat(tensors = (prefix_total, generated_total), dim = 1).numpy()
np.save(file = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.total.npy", arr = generated_total)

# decode the first (and only) sequence into a BetterMusic object and write it to a .wav file
audio_output_total = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.total.wav"
music = decode.decode(codes = generated_total[0], encoding = encoding)
music.write(path = audio_output_total)

## Let's Compare the Audios!

Compare the `.wav` files -- did adding expressive features make a difference?

In [None]:
# print a caption and show an audio player for each of the two generated .wav files
comparisons = (
    (f"NOTES ONLY: {audio_output_note}", audio_output_note),
    (f"EXPRESSIVE FEATURES: {audio_output_total}", audio_output_total),
)
for caption, wav_path in comparisons:
    print(caption)
    display(Audio(wav_path))