In [43]:
import mir_eval
import glob
import json
import numpy as np
import tensorflow as tf
import pretty_midi
import librosa


from basic_pitch import inference
from basic_pitch import models

from basic_pitch.constants import (
    ANNOT_N_FRAMES,
    ANNOTATIONS_FPS,
    ANNOTATIONS_N_SEMITONES,
    AUDIO_N_SAMPLES,
    N_FREQ_BINS_CONTOURS,
    AUDIO_SAMPLE_RATE,
    FFT_HOP
)

BATCH_SIZE = 3

tfkl = tf.keras.layers

In [95]:
# Load in the ground truth MIDI files
# glob is a pattern matching utility for files

#use maestro-v3.0.0.json to get needed files

with open('datasets/maestro/maestro-v3.0.0/maestro-v3.0.0.json', 'r') as f:
    data = json.load(f)
    midi_filenames = data['midi_filename']
    audio_filenames = data['audio_filename']

In [7]:
# initialize spotify basic pitch model
model = models.model()
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])


In [9]:
y = model.predict(x_train[1])



IndexError: list index out of range

In [None]:
# Compare the two sets of MIDI files using mir_eval
scores = mir_eval.midi.evaluate(gt_midi, model_midi)



AttributeError: module 'mir_eval' has no attribute 'midi'

In [80]:
# preprocess audio
x_train = []
y_pred = []

for idx in range(0, 10):

    audio_file = "datasets/maestro/maestro-v3.0.0/" + audio_filenames['{}'.format(idx)]

    # overlap 30 frames
    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len
    audio_windowed, _, audio_original_length = inference.get_audio_input(audio_file, overlap_len, hop_size)

    x_train.append(audio_windowed)
    output = model.predict(audio_windowed)
    unwrapped_output = {k: inference.unwrap_output(output[k], audio_original_length, n_overlapping_frames) for k in output}
    y_pred.append(unwrapped_output)
    





In [81]:
def midi_to_piano_onset_matrix(midi_path, frames_per_second=ANNOTATIONS_FPS):
    """
    Convert MIDI file to a binary matrix representing onset of piano keys using a set FPS.

    Parameters:
    - midi_path (str): Path to the MIDI file.
    - frames_per_second (int): Number of frames per second for the binary representation.

    Returns:
    - numpy.ndarray: Binary matrix where rows represent the 88 piano keys and columns are time frames.
    """

    # Load the MIDI file
    midi_data = pretty_midi.PrettyMIDI(midi_path)

    # Duration of the MIDI file in seconds
    duration = midi_data.get_end_time()

    # 88 keys for standard piano
    num_piano_keys = 88

    # Calculate the total number of frames based on the FPS
    total_frames = int(duration * frames_per_second)

    # Initialize binary matrix with zeros
    binary_matrix = np.zeros((total_frames, num_piano_keys))

    for instrument in midi_data.instruments:
        for note in instrument.notes:
            # Only consider valid piano notes (from 21 to 108)
            if 21 <= note.pitch <= 108:
                # Find the frame for this onset time
                onset_frame = int(note.start * frames_per_second)

                # Prevent indexing beyond the matrix size
                if onset_frame < total_frames:
                    # Adjust the pitch value to fit within our matrix's row indices (0-87)
                    adjusted_pitch = note.pitch - 21

                    # Mark the onset in the binary matrix
                    binary_matrix[onset_frame, adjusted_pitch] = 1

    return binary_matrix


In [82]:
idx = 0
midi_file = "datasets/maestro/maestro-v3.0.0/" + midi_filenames['{}'.format(idx)]

pm_midi = pretty_midi.PrettyMIDI(midi_file)

duration = pm_midi.get_end_time()

onsets = midi_to_piano_onset_matrix(midi_file, frames_per_second=ANNOTATIONS_FPS)
padding = y_pred[idx]['onset'].shape[0] - onsets.shape[0]
onsets = np.pad(onsets, [(0, padding), (0, 0)], 'constant')

print("Y_train MIDI to binary onset matrix shape: ", onsets.shape)
print("Y_prediction from model shape: ", y_pred[idx]['onset'].shape)


Y_train MIDI to binary onset matrix shape:  (60567, 88)
Y_prediction from model shape:  (60567, 88)


In [88]:
x_train = []
y_train = []
y_pred = []

for idx in range(0, 10):

    # preprocess audio
    audio_file = "datasets/maestro/maestro-v3.0.0/" + audio_filenames['{}'.format(idx)]

    n_overlapping_frames = 30
    overlap_len = n_overlapping_frames * FFT_HOP
    hop_size = AUDIO_N_SAMPLES - overlap_len
    audio_windowed, _, audio_original_length = inference.get_audio_input(audio_file, overlap_len, hop_size)

    x_train.append(audio_windowed)
    output = model.predict(audio_windowed)
    unwrapped_output = {k: inference.unwrap_output(output[k], audio_original_length, n_overlapping_frames) for k in output}
    y_pred.append(unwrapped_output)


    # preprocess midi
    midi_file = "datasets/maestro/maestro-v3.0.0/" + midi_filenames['{}'.format(idx)]

    pm_midi = pretty_midi.PrettyMIDI(midi_file)
    onsets = midi_to_piano_onset_matrix(midi_file, frames_per_second=ANNOTATIONS_FPS)
    padding = y_pred[idx]['onset'].shape[0] - onsets.shape[0]
    onsets = np.pad(onsets, [(0, padding), (0, 0)], 'constant')
    y_train.append(onsets)



In [103]:
frames_ratio = []
for idx in range(0,10):
    y_pred[idx]['onset'].shape
    filename = "datasets/maestro/maestro-v3.0.0/" + audio_filenames['{}'.format(idx)]
    audio, sr = librosa.load(filename, sr=AUDIO_SAMPLE_RATE)
    # get file length in seconds
    file_length = librosa.get_duration(y = audio, sr = sr)
    # get frames / duration
    frames_ratio.append(y_pred[idx]['onset'].shape[0] / file_length)

print(frames_ratio)

[85.99908121847122, 85.99975048108587, 85.9984533224385, 85.99935673157775, 85.99752409290758, 85.99765904452198, 85.99927325581396, 85.99889281996315, 85.99592687252594, 85.99768746472216]


In [83]:
# Example tensors (replace with actual data)
y_true = tf.constant(onsets)  # Ground truth binary representation
y_prediction = tf.constant(y_pred[idx]['onset'])  # Model's predicted probabilities

bce = tf.keras.losses.BinaryCrossentropy()
loss = bce(y_true, y_prediction)
print(loss.numpy())



0.64883465


In [89]:
# train the model

# Initialize the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
loss_fn = tf.keras.losses.BinaryCrossentropy()



In [90]:
num_epochs = 3

In [None]:

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Loop through batches
    for step, (x_batch, y_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            # Forward pass
            logits = model(x_batch, training=True)
            
            # Compute the loss
            loss_value = loss_fn(y_batch, logits)
        
        # Get gradients
        gradients = tape.gradient(loss_value, model.trainable_weights)
        
        # Update weights
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
        
        # Update the metric
        metric.update_state(y_batch, logits)
        
        # Print progress
        if step % 100 == 0:
            print(f"Step {step}: loss = {loss_value:.4f}, accuracy = {metric.result().numpy():.4f}")
            
    # Reset metric at the end of epoch
    metric.reset_states()
    val_metric = tf.keras.metrics.SparseCategoricalAccuracy()

    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val, training=False)
        val_metric.update_state(y_batch_val, val_logits)
        
    print(f"Validation accuracy: {val_metric.result().numpy():.4f}")
    val_metric.reset_states()

