# Qualitative Evaluation of Model Performance

Given the same prefix sequence of notes, does adding expressive features change model output? That is, do models respect expressive features?

## Constants

Constants needed to run this notebook, such as file paths and the encoding.

In [1]:
# choose the data directory (an absolute, machine-specific path); the encoding, the list of
# trained models, and each model's own directory are all read from inside it
DATA_DIR = "/home/pnlong/musescore/datavaa"

In [3]:
# imports
from os.path import exists, basename
from os import makedirs
import numpy as np
import representation
import utils
import torch
import dataset
import music_x_transformers
import train
from encode import extract_data
import decode
from read_mscz.read_mscz import read_musescore

# filepaths
TEST_DATA_DIR = "/home/pnlong/musescore/test_data/evalqual"
makedirs(TEST_DATA_DIR, exist_ok = True) # no-op when the directory already exists
PREFIX_MSCZ_FILEPATH = f"{TEST_DATA_DIR}/test.mscz"
if not exists(PREFIX_MSCZ_FILEPATH): # the prefix MuseScore file has to be supplied by the user
    raise FileNotFoundError("Must provide a valid MuseScore prefix filepath.")
PREFIX_OUTPUT = basename(PREFIX_MSCZ_FILEPATH) # file name (extension included) that starts every output file name

# load the encoding
encoding = representation.load_encoding(filepath = f"{DATA_DIR}/encoding.json")

# some more variables, both derived from which dimensions the encoding has
include_velocity = ("velocity" in encoding["dimensions"])
use_absolute_time = not (("beat" in encoding["dimensions"]) and ("position" in encoding["dimensions"])) # absolute time unless both beat and position are present

## Prepare Prefix Sequence

Prepare the prefix sequence by extracting relevant data from the MuseScore file.

In [4]:
# read the MuseScore prefix file into a BetterMusic object
music = read_musescore(path = PREFIX_MSCZ_FILEPATH, timeout = 10)
music.realize_expressive_features()

# keep just the first track, then extract the data from the BetterMusic object
music.tracks = [music.tracks[0]]
data = extract_data(
    music = music,
    use_implied_duration = True,
    include_velocity = include_velocity,
    use_absolute_time = use_absolute_time,
)

# write the encoded prefix to disk
prefix_path = "/".join((TEST_DATA_DIR, basename(PREFIX_MSCZ_FILEPATH) + ".npy"))
np.save(file = prefix_path, arr = data)

# write a text file whose only line is the path to the encoded prefix
paths = f"{TEST_DATA_DIR}/paths.txt"
with open(paths, "w") as paths_file:
    paths_file.write(f"{prefix_path}\n")

## List the Models

List the models that can be loaded.

In [5]:
# read in the list of trained models, one name per line (blank lines are skipped)
with open(f"{DATA_DIR}/models.txt", "r") as models_file:
    models = [line.strip() for line in models_file if line.strip()]

# the loop variable is deliberately not called `model`, so that re-running this cell cannot
# overwrite the `model` variable the rest of the notebook relies on
for model_name in models:
    print(f"  - {model_name}")

  - baseline_aug_ape_31M
  - prefix_aug_ape_31M
  - anticipation_aug_ape_31M
  - prefix_conditional_aug_ape_31M
  - anticipation_conditional_aug_ape_31M


## Load a Model

Specify the model to evaluate (one of the names listed above) by setting the `model` variable below. Then load that model's parameters.

In [6]:
# which model to use: must be one of the names printed by the model list above
# (the default is the model named in this notebook's recorded checkpoint output; an empty
# string would make the model directory collapse to the data directory itself)
model = "baseline_aug_ape_31M"

In [7]:
# `model` must still be the model's *name* here: the notebook later rebinds `model` to the
# network object, so re-running this cell after that (or leaving the name empty) would
# otherwise build a bogus path and create a stray directory
if not isinstance(model, str) or len(model) == 0:
    raise ValueError("`model` must be set to the name of a trained model (see the list above).")

# get directories
model_dir = f"{DATA_DIR}/{model}"
evalqual_output_dir = f"{model_dir}/evalqual"
makedirs(evalqual_output_dir, exist_ok = True) # no-op when the directory already exists

In [12]:
# everything in this notebook runs on the cpu
device = torch.device("cpu")

# read the configuration the model was trained with
train_args = utils.load_json(filepath = f"{model_dir}/train_args.json")
max_seq_len = train_args["max_seq_len"]

# build the dataset from the text file of paths
test_dataset = dataset.MusicDataset(
    paths = paths,
    encoding = encoding,
    max_seq_len = max_seq_len,
    use_augmentation = False,
    is_baseline = False,
)

# build the network; note that this rebinds `model` from the model's name to the network itself
print("Creating model...")
use_absolute_time = ("beat" not in encoding["dimensions"]) or ("position" not in encoding["dimensions"])
temporal_dimension = "time" if use_absolute_time else "beat"
dropout_rate = train_args["dropout"] # one rate shared by the embedding, attention and feed-forward dropouts
model = music_x_transformers.MusicXTransformer(
    dim = train_args["dim"],
    encoding = encoding,
    depth = train_args["layers"],
    heads = train_args["heads"],
    max_seq_len = max_seq_len,
    max_temporal = encoding["max_" + temporal_dimension],
    rotary_pos_emb = train_args["rel_pos_emb"],
    use_abs_pos_emb = train_args["abs_pos_emb"],
    emb_dropout = dropout_rate,
    attn_dropout = dropout_rate,
    ff_dropout = dropout_rate,
).to(device)

# restore the weights from the best checkpoint and switch to evaluation mode
checkpoint_filepath = f"{model_dir}/checkpoints/best_model.{train.PARTITIONS[1]}.pth"
model.load_state_dict(state_dict = torch.load(f = checkpoint_filepath, map_location = device))
print(f"Loaded model weights from: {checkpoint_filepath}")
model.eval()

# look up the codes of the special token types
type_code_map = encoding["type_code_map"]
sos, eos = type_code_map["start-of-song"], type_code_map["end-of-song"]
note_token = type_code_map["note"]
grace_note_token = type_code_map["grace-note"]
expressive_token = type_code_map[representation.EXPRESSIVE_FEATURE_TYPE_STRING]

# wrap the dataset in a data loader and pull out the sequence of its first batch
test_data_loader = torch.utils.data.DataLoader(
    dataset = test_dataset,
    num_workers = 4,
    collate_fn = dataset.MusicDataset.collate,
    batch_size = 1,
    shuffle = False,
)
test_iter = iter(test_data_loader)
batch = next(test_iter)["seq"]

Creating model...
Loaded model weights from: /home/pnlong/musescore/datavaa/baseline_aug_ape_31M/checkpoints/best_model.valid.pth


## Generate Sequences

Now armed with the loaded model, generate a sequence given a prefix with and without expressive features.

### Notes Only

Generate with a prefix of only notes.

In [20]:
# make sure prefix is correct: keep only the rows whose first column (the type code) is not
# the expressive-feature type, then restore the leading batch dimension
is_not_expressive = (batch[:, :, 0] != expressive_token)
prefix_note = batch[is_not_expressive].unsqueeze(dim = 0)

# generate new samples
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'tuple' object has no attribute 'pop'". The traceback is not kept, so which
# tuple was involved is unknown. `monotonicity_dim` is passed as a list here in case
# `generate` mutates it -- TODO confirm against music_x_transformers.MusicXTransformer.generate.
generated_note = model.generate(
    seq_in = prefix_note,
    seq_len = train.DEFAULT_MAX_SEQ_LEN,
    eos_token = eos,
    temperature = 1.0,
    filter_logits_fn = "top_k",
    filter_thres = 0.9,
    monotonicity_dim = ["type", "time" if use_absolute_time else "beat"],
    notes_only = True
)
generated_note = torch.cat(tensors = (prefix_note, generated_note), dim = 1).numpy() # prepend the prefix to the generated continuation
np.save(file = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.note.npy", arr = generated_note) # save as a numpy array

# convert to audio
music = decode.decode(codes = generated_note[0], encoding = encoding) # convert to a BetterMusic object
audio_output_note = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.note.wav"
music.write(path = audio_output_note)

AttributeError: 'tuple' object has no attribute 'pop'

### Notes and Expressive Features

Generate with a prefix of notes and expressive features.

In [None]:
# make sure prefix is correct: here the prefix is the whole batch, expressive features included
prefix_total = batch

# generate new samples
# NOTE(review): the recorded notes-only run in this notebook ended with
# "AttributeError: 'tuple' object has no attribute 'pop'". The traceback is not kept, so which
# tuple was involved is unknown. `monotonicity_dim` is passed as a list here in case
# `generate` mutates it -- TODO confirm against music_x_transformers.MusicXTransformer.generate.
generated_total = model.generate(
    seq_in = prefix_total,
    seq_len = train.DEFAULT_MAX_SEQ_LEN,
    eos_token = eos,
    temperature = 1.0,
    filter_logits_fn = "top_k",
    filter_thres = 0.9,
    monotonicity_dim = ["type", "time" if use_absolute_time else "beat"],
    notes_only = True
)
generated_total = torch.cat(tensors = (prefix_total, generated_total), dim = 1).numpy() # prepend the prefix to the generated continuation
np.save(file = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.total.npy", arr = generated_total) # save as a numpy array

# convert to audio
music = decode.decode(codes = generated_total[0], encoding = encoding) # convert to a BetterMusic object
audio_output_total = f"{evalqual_output_dir}/{PREFIX_OUTPUT}.total.wav"
music.write(path = audio_output_total)

## Let's Compare the Audios!

Compare the `.wav` files -- did adding expressive features make a difference?

In [None]:
import IPython

# show an audio player for each rendering, one after the other, each under its own label
for label, audio_path in (("NOTES ONLY", audio_output_note), ("EXPRESSIVE FEATURES", audio_output_total)):
    print(label)
    IPython.display.display(IPython.display.Audio(audio_path))