# Score Melody Dataset

### Imports

In [4]:
import pandas as pd
from utils import read_annotations, read_mid, getNotes, getBeats,save_mid
from random import randrange
import numpy as np
from miditoolkit.midi import parser as mid_parser  
from miditoolkit.midi import containers as ct
import miditoolkit as mtk
from tqdm import tqdm
from melody_extraction import extractMelody,split2midi
import random
random.seed(420)

### Read annotations

In [None]:
# Location of the ASAP dataset and of its annotation file.
# NOTE(review): this path is spelled "Sample_data" while later cells use
# "Sample_Data"; on a case-sensitive filesystem only one of them can exist - confirm.
dataset_dir = "../Sample_data/asap-dataset"
annotations_file=f"{dataset_dir}/asap_annotations.json"

# Load the annotations; process=False is passed straight to the utils helper.
annotations=read_annotations(annotations_file,process=False)

# Distinct score / performance file names listed in the annotations.
score_filenames=annotations['score_filename'].unique()
performance_filenames=annotations['performance_filename'].unique()

### Helper functions

In [None]:
def perfNote2ScoreNote(perf_note,score,performance,score_beats,performance_beats):
    """
    Find the score note that corresponds to a performance note.

    The beat interval containing perf_note is located in the performance.
    A window reaching two beats to either side of it is then cut out of both
    the performance and the score (using the same beat indices), and the note
    is matched inside that window by identifyNote.

    Args:
        perf_note: performance note to look up.
        score: MIDI object of the score (handed to notesBetween).
        performance: MIDI object of the performance (handed to getStartBeat
            and notesBetween).
        score_beats: beat times of the score, indexable by beat number.
        performance_beats: beat times of the performance, indexable by beat
            number.

    Returns:
        (score_note, prob_dist) as produced by identifyNote, or (None, None)
        when the note lies outside the annotated beats or cannot be matched.
    """
    # Beat interval in which perf_note starts; None when it is outside the beats
    start_beat=getStartBeat(perf_note,performance_beats,performance)
    if start_beat is None:
        return None,None

    # The same indices are used for both beat lists, so clamp to the shorter one
    last_beat=min(len(score_beats),len(performance_beats))-1
    end_beat=min(start_beat+2,last_beat)
    start_beat=max(0,start_beat-2)
    if start_beat>end_beat:
        # The performance beat has no counterpart in the score beats
        return None,None

    # Notes of the performance inside the beat window
    perf_start_time=performance_beats[start_beat]
    perf_end_time=performance_beats[end_beat]
    perf_notes=notesBetween(perf_start_time,perf_end_time,performance)

    # Notes of the score inside the same beat window
    score_start_time=score_beats[start_beat]
    score_end_time=score_beats[end_beat]
    score_notes=notesBetween(score_start_time,score_end_time,score)

    # Pick the note in score_notes that corresponds to perf_note
    score_note,prob_dist=identifyNote(perf_note,score_notes,perf_notes)

    return(score_note,prob_dist)

def identifyNote(perf_note,score_notes,perf_notes):
    """
    Return the note in score_notes that corresponds to perf_note.

    Every score note with the same pitch as perf_note is a candidate. Each
    candidate is scored by the similarity between the pitches surrounding it
    in score_notes and the pitches surrounding perf_note in perf_notes; the
    first candidate with the highest score wins.

    Args:
        perf_note: performance note to match.
        score_notes: score notes of the window being searched.
        perf_notes: performance notes of the same window.

    Returns:
        (score_note, prob_dist) where prob_dist is the output of
        getDistribution for the candidates, or (None, None) when perf_note is
        not part of perf_notes or no score note has its pitch.
    """
    radius=5

    # perf_note must be inside the window, otherwise it has no neighbourhood
    if not noteInNotes(perf_note,perf_notes):
        return(None,None)

    # Pitches that surround perf_note in the performance
    perf_pitches_near=pitchesNear(perf_note,perf_notes,radius=radius)

    # (index, similarity) for every score note sharing the pitch of perf_note
    sim_list=[
        (i,similarity(perf_pitches_near,pitchesNear(candidate,score_notes,radius=radius)))
        for i,candidate in enumerate(score_notes)
        if candidate.pitch==perf_note.pitch
    ]

    # No score note with that pitch in the window
    if not sim_list:
        return(None,None)

    # max() keeps the first candidate when several share the best similarity
    max_sim_index=max(sim_list,key=lambda x:x[1])[0]

    prob_dist=getDistribution(sim_list)

    return score_notes[max_sim_index],prob_dist

def getDistribution(sim_list):
    """
    Turn (index, similarity) pairs into a probability distribution.

    Args:
        sim_list: sequence of (index, similarity) pairs.

    Returns:
        List of (index, probability) tuples sorted by decreasing probability.
        Indices stay integers. An empty input gives an empty list; when all
        similarities are zero the probability is spread uniformly instead of
        dividing by zero.
    """
    if len(sim_list)==0:
        return []

    # Keep indices apart from the similarities so they are not cast to float
    indices=[int(index) for index,_ in sim_list]
    sims=np.array([sim for _,sim in sim_list],dtype=float)

    total=sims.sum()
    if total>0:
        probs=sims/total
    else:
        # No candidate carries any evidence: fall back to a uniform distribution
        probs=np.full(len(sims),1.0/len(sims))

    # Convert to a list of tuples of plain Python numbers
    distribution=list(zip(indices,probs.tolist()))
    # Sort the list by probability (stable, so ties keep their input order)
    return sorted(distribution,key=lambda x:x[1],reverse=True)
    

def similarity(set1,set2):
    """
    Returns the similarity between two sets: the size of their intersection
    divided by the size of their union.

    Two empty sets share no evidence, so their similarity is defined as 0.0
    instead of raising ZeroDivisionError.
    """
    union=set1.union(set2)
    if not union:
        return 0.0
    return len(set1.intersection(set2))/len(union)


def notesNear(note,notes,radius=5):
    """
    Returns the notes in notes that are near note.

    Args:
        note: note whose neighbourhood is wanted.
        notes: sequence of notes searched with findNote.
        radius: number of positions taken before the note.

    Returns:
        The slice of notes around the position of note, or an empty list when
        note does not occur in notes.
    """
    note_index=findNote(note,notes)
    if note_index is None:
        # note is not part of notes, so it has no neighbourhood
        return []

    start_index=max(0,note_index-radius)
    # NOTE(review): the slice end is exclusive, so the window holds `radius`
    # notes before the note but only `radius-1` after it - confirm whether
    # note_index+radius+1 was intended.
    end_index=min(len(notes),note_index+radius)
    return notes[start_index:end_index]

def findNote(note,notes):
    """
    Locate note inside notes.

    Two notes count as equal when pitch, start and end all agree. The index
    of the first equal note is returned, or None if there is none.
    """
    wanted=(note.pitch,note.start,note.end)
    matches=(
        position
        for position,candidate in enumerate(notes)
        if (candidate.pitch,candidate.start,candidate.end)==wanted
    )
    return next(matches,None)

def pitchesNear(note,notes,radius=5):
    """
    Collect the distinct pitches found around note.

    The neighbourhood is whatever notesNear returns for the same arguments;
    the result is the set of pitches of those notes.
    """
    return {neighbour.pitch for neighbour in notesNear(note,notes,radius)}
    

def noteInNotes(note,notes):
    """
    Tell whether notes contains a note equal to note.

    Equality means identical pitch, start and end.
    """
    return any(
        note.pitch==other.pitch and note.start==other.start and note.end==other.end
        for other in notes
    )


def getStartBeat(note,beats,midi_obj):
    '''
    Returns the index of the beat interval in which note starts.

    The start tick of note is converted to a time with the tick-to-time
    mapping of midi_obj and compared against consecutive entries of beats.
    Interval i covers beats[i] <= t < beats[i+1], so a note that starts
    exactly on a beat belongs to the interval opened by that beat.

    Args:
        note: note with a `start` attribute in ticks.
        beats: increasing beat times, indexable by position.
        midi_obj: object providing get_tick_to_time_mapping().

    Returns:
        The interval index, or None when the note starts before the first
        beat or at/after the last one.
    '''
    t2tmap=midi_obj.get_tick_to_time_mapping()

    start_time=t2tmap[note.start]

    for i in range(len(beats)-1):
        # Half-open interval: notes lying exactly on a beat are not dropped
        if beats[i]<=start_time<beats[i+1]:
            return i
    return None


def notesBetween(start_time,end_time, midi_obj : mtk.MidiFile):
    """
    Select the notes of midi_obj lying completely inside a time window.

    A note is kept when its start time is not before start_time and its end
    time is not after end_time; tick positions are converted with the
    tick-to-time mapping of midi_obj.
    """
    all_notes=getNotes(midi_obj)
    tick_to_time=midi_obj.get_tick_to_time_mapping()

    def inside_window(candidate):
        begins=tick_to_time[candidate.start]
        finishes=tick_to_time[candidate.end]
        return start_time<=begins and finishes<=end_time

    return [candidate for candidate in all_notes if inside_window(candidate)]


In [None]:
import os
def extractScoreMelody(score_filename,performance_filename,annotations,dataset_dir="Sample_Data/asap-dataset"):
    """
    Extract the melody of a score using the velocities of one performance.

    The score is first given performance velocities by getScoreWithVelocity,
    written to a temporary MIDI file, and that file is passed to
    extractMelody.

    Args:
        score_filename: score MIDI path relative to dataset_dir.
        performance_filename: performance MIDI path relative to dataset_dir.
        annotations: annotations handed on to getScoreWithVelocity.
        dataset_dir: root directory of the dataset.

    Returns:
        Whatever extractMelody returns for the velocity-annotated score.
    """
    import tempfile

    score_w_velocity=getScoreWithVelocity(score_filename,performance_filename,annotations,dataset_dir)

    # A unique temporary file avoids clobbering a "temp.mid" in the working
    # directory; the descriptor is closed right away because dump() takes a path.
    fd,score_w_velocity_filename=tempfile.mkstemp(suffix=".mid")
    os.close(fd)

    try:
        # save the score with velocity
        score_w_velocity.dump(score_w_velocity_filename)

        # Extract the melody from the score with velocity
        score_w_melody=extractMelody(score_w_velocity_filename)
    finally:
        # Remove the temporary file even when dumping or extraction fails
        os.remove(score_w_velocity_filename)

    return(score_w_melody)


def getScoreWithVelocity(score_filename,performance_filename,annotations,dataset_dir="Sample_Data/asap-dataset"):
    """
    Copy the velocities of a performance onto the notes of its score.

    Every performance note is matched to a score note with
    perfNote2ScoreNote. Each matched score note receives the velocity of the
    first performance note matched to it. The score is then reduced to its
    first instrument and the matched score notes are appended to it.

    Args:
        score_filename: score MIDI path relative to dataset_dir.
        performance_filename: performance MIDI path relative to dataset_dir.
        annotations: annotations used by getBeats.
        dataset_dir: root directory of the dataset.

    Returns:
        The modified score MIDI object.
    """
    score_path=os.path.join(dataset_dir,score_filename)
    performance_path=os.path.join(dataset_dir,performance_filename)

    # Read in score and performance
    score_obj=read_mid(score_path)
    performance_obj=read_mid(performance_path)

    # Get notes of the performance
    performance_notes=getNotes(performance_obj)

    # Get beats
    score_beats = getBeats(score_filename,annotations,"score")
    performance_beats = getBeats(performance_filename,annotations,"performance")

    # Map every performance note to its score note
    not_matched=0
    perf2score_dict={}
    for perf_note in tqdm(performance_notes,total=len(performance_notes)):
        score_note,_=perfNote2ScoreNote(perf_note,score_obj,performance_obj,score_beats,performance_beats)
        if score_note is not None:
            perf2score_dict[perf_note]=score_note
        else:
            not_matched+=1

    print("Not matched: ",not_matched)
    # Guard against a performance without notes
    total_notes=len(performance_notes)
    print("Not matched percentage",int(100*not_matched/total_notes) if total_notes else 0)

    # Reverse perf2score_dict: score note -> performance notes matched to it
    score2perf_dict={}
    for perf_note,score_note in perf2score_dict.items():
        score2perf_dict.setdefault(score_note,[]).append(perf_note)

    # Each score note takes the velocity of the first performance note matched to it
    for score_note,matched_perf_notes in score2perf_dict.items():
        score_note.velocity=matched_perf_notes[0].velocity

    # Keep only the first instrument. Deleting the tail in one step avoids
    # removing from the list while iterating over it, which skipped tracks.
    del score_obj.instruments[1:]

    # Add the matched notes to the remaining instrument
    # NOTE(review): if getNotes hands out the note objects already stored in
    # instrument 0, those notes end up in the track twice - confirm.
    for score_note in score2perf_dict:
        score_obj.instruments[0].notes.append(score_note)

    return(score_obj)

### Extract melody for the score

In [None]:
from utils import score2PerfFileMap
import json

def genScore2MelodyDataset(annotations,dataset_dir="../Sample_Data/asap-dataset",store_dir="../Store/Score2Melody"):
    """
    Build the score-melody dataset for every (score, performance) pair.

    For each pair listed by score2PerfFileMap the melody-annotated score is
    computed with extractScoreMelody and saved under store_dir, named after
    the performance file with ".mid" replaced by "_score.mid". The
    annotations are finally written to store_dir/annotations.json.

    Args:
        annotations: annotations table; must support to_json().
        dataset_dir: root directory of the source dataset.
        store_dir: root directory for the generated files (created on demand).
    """
    file_map=score2PerfFileMap(annotations)

    score_filenames=list(file_map.keys())

    for score_filename in tqdm(score_filenames):
        for performance_filename in file_map[score_filename]:
            print("--------------------")
            score_w_melody=extractScoreMelody(score_filename,performance_filename,annotations, dataset_dir)
            print("--------------------")

            score_output_path=os.path.join(store_dir,performance_filename.replace(".mid","_score.mid"))

            # Performance names contain sub-folders; make sure they exist
            output_folder=os.path.dirname(score_output_path)
            if output_folder:
                os.makedirs(output_folder,exist_ok=True)

            save_mid(score_w_melody,score_output_path)

    # save annotations
    os.makedirs(store_dir,exist_ok=True)
    annotations_output_path=os.path.join(store_dir,"annotations.json")
    annotations.to_json(annotations_output_path)

In [None]:
# Generate the whole dataset with the default dataset_dir and store_dir.
genScore2MelodyDataset(annotations)

### Test for a single pair, saved in Store/Score2Melody
TODO: Create the output folder (`Store/Score2Melody`) if it doesn't exist.

In [8]:
# Map from score file name to the list of its performance file names.
file_map=score2PerfFileMap(annotations)

score_filenames=list(file_map.keys())

print(score_filenames)
# Try the pipeline on one score and the first performance listed for it.
score_filename="Mozart/Piano_Sonatas/11-3/midi_score.mid"
performance_filename=file_map[score_filename][0]

print(score_filename)
# NOTE(review): extractScoreMelody as defined in this notebook returns a single
# object, so unpacking two values here fails - this cell looks like it was
# written against an older version of the function; confirm the intended return.
score_w_melody,performance_w_melody=extractScoreMelody(score_filename,performance_filename,annotations, dataset_dir)

store_dir="../Store/Score2Melody"

# Save both results next to each other in store_dir.
save_mid(score_w_melody,os.path.join(store_dir,"score.mid"))
save_mid(performance_w_melody,os.path.join(store_dir,"performance.mid"))

['Bach/Fugue/bwv_846/midi_score.mid', 'Bach/Fugue/bwv_848/midi_score.mid', 'Balakirev/Islamey/midi_score.mid', 'Beethoven/Piano_Sonatas/1-1/midi_score.mid', 'Beethoven/Piano_Sonatas/10-1/midi_score.mid', 'Brahms/Six_Pieces_op_118/2/midi_score.mid', 'Chopin/Ballades/1/midi_score.mid', 'Chopin/Ballades/2/midi_score.mid', 'Debussy/Images_Book_1/1_Reflets_dans_lEau/midi_score.mid', 'Debussy/Pour_le_Piano/1/midi_score.mid', 'Glinka/The_Lark/midi_score.mid', 'Haydn/Keyboard_Sonatas/31-1/midi_score.mid', 'Haydn/Keyboard_Sonatas/32-1/midi_score.mid', 'Liszt/Annees_de_pelerinage_2/1_Gondoliera/midi_score.mid', 'Liszt/Ballade_2/midi_score.mid', 'Mozart/Fantasie_475/midi_score.mid', 'Mozart/Piano_Sonatas/11-3/midi_score.mid', 'Prokofiev/Toccata/midi_score.mid', 'Rachmaninoff/Preludes_op_23/4/midi_score.mid', 'Rachmaninoff/Preludes_op_23/6/midi_score.mid', 'Ravel/Gaspard_de_la_Nuit/1_Ondine/midi_score.mid', 'Ravel/Miroirs/3_Une_Barque/midi_score.mid', 'Schubert/Impromptu_op.90_D.899/1/midi_score.m

 25%|██▌       | 707/2821 [00:05<00:15, 138.57it/s]


KeyboardInterrupt: 

# Melody Dataset to tokens

In [None]:
# Folder holding the generated *_score.mid files.
score2melodyFolder="../Store/Score2Melody"

# read in annotations
# NOTE(review): annotations_path is built but never read; the loop below still
# uses the `annotations` object left over from an earlier cell.
annotations_path=os.path.join(score2melodyFolder,"annotations.json")

file_map=score2PerfFileMap(annotations)

score_filenames=list(file_map.keys())

for score_filename in score_filenames:
    performance_filenames=file_map[score_filename]

    for performance_filename in performance_filenames:

        # convert to tokens
        # (unfinished: only the two paths are computed so far, nothing is tokenised)
        score_path=os.path.join(score2melodyFolder,performance_filename.replace(".mid","_score.mid"))
        performance_path=os.path.join(score2melodyFolder,performance_filename)



In [None]:
# One hard-coded score / performance pair used for experiments.
score_filename="Mozart/Piano_Sonatas/11-3/midi_score.mid"
performance_filename="Mozart/Piano_Sonatas/11-3/Stahievitch02.mid"

# Despite its name, store_dir points at the source dataset here.
store_dir="../Sample_Data/asap-dataset"

score_path=os.path.join(store_dir,score_filename)
performance_path=os.path.join(store_dir,performance_filename)

# Reload the annotations from the dataset folder.
annotations=read_annotations("../Sample_Data/asap-dataset/asap_annotations.json",process=False)

# Read in score and performance
score_obj=read_mid(score_path)
performance_obj=read_mid(performance_path)

In [38]:

from utils import read_mid, getNotes
from matplotlib import pyplot as plt
from ipywidgets import interact

def filterNotes(notes,threshold=32):
    """Keep the notes whose velocity is strictly greater than threshold."""
    return list(filter(lambda candidate: candidate.velocity>threshold,notes))

def plotWindows(midi_file,window_sizes,start_time,composer):
    """
    Plot one velocity histogram per window size for a MIDI file.

    Args:
        midi_file: path of the MIDI file to read.
        window_sizes: iterable of window lengths; each window runs from
            start_time to start_time + window size.
        start_time: start of every window.
        composer: text used as the figure title.

    The histograms are laid out three per row. At least two rows are drawn,
    and further rows are added when more than six window sizes are given.
    """
    import math

    # The file does not change between windows, so read it only once
    mid=read_mid(midi_file)

    data=[]
    for window_size in window_sizes:
        # notes between start_time and start_time+window_size
        notes_in_window=notesBetween(start_time,start_time+window_size,mid)
        data.append((window_size,[note.velocity for note in notes_in_window]))

    # subplot data histograms
    n_cols=3
    n_rows=max(2,math.ceil(len(data)/n_cols))
    fig, axs = plt.subplots(n_rows, n_cols)
    # increase space
    fig.subplots_adjust(hspace=1.5)
    # increase size (same height per row as the two-row layout)
    fig.set_size_inches(18.5, 10.5*n_rows/2)
    fig.suptitle(composer, fontsize=16)

    for i,(window_size,velocities) in enumerate(data):
        ax=axs[i//n_cols,i%n_cols]
        ax.hist(velocities,bins=32)
        ax.set_title("Window size: "+str(window_size))
        ax.set_xlabel("Velocity")
        ax.set_ylabel("Frequency")


# Composer label -> path of one generated *_score.mid file for that composer.
# The keys are used as-is for the dropdown options of the interactive plots.
dictionary={
    "Bach":"../Store/Score2Melody/Bach/Fugue/bwv_846/Shi05M_score.mid",
    "Chopin":"../Store/Score2Melody/Chopin/Ballades/1/Ali01_score.mid",
    "Mozart": "../Store/Score2Melody/Mozart/Piano_Sonatas/11-3/Stahievitch02_score.mid",
    "Rachmanioff":"../Store/Score2Melody/Rachmaninoff/Preludes_op_23/4/ChenGuang12M_score.mid"
    }

@interact(composer=list(dictionary.keys()),window_size=(1,100,1),start_time=(0,50,1))
def plotWindowVelocities(composer,window_size,start_time):
    """
    Interactive histogram of the note velocities inside a time window.

    Args:
        composer: key of `dictionary` selecting the MIDI file.
        window_size: length of the window.
        start_time: start of the window.
    """
    midi_file=dictionary[composer]
    mid=read_mid(midi_file)

    # notes between start_time and start_time+window_size
    notes_in_window=notesBetween(start_time,start_time+window_size,mid)
    velocities=[note.velocity for note in notes_in_window]

    plt.hist(velocities,bins=128)
    plt.title(composer)
    plt.xlabel("Velocity")
    plt.ylabel("Frequency")
    plt.show()

interactive(children=(Dropdown(description='composer', options=('Bach', 'Chopin', 'Mozart', 'Rachmanioff'), va…

In [39]:
@interact(composer=list(dictionary.keys()),window_size=(1,100,1),start_time=(0,50,1))
def plotWindowDurations(composer,window_size,start_time):
    """
    Interactive histogram of the note durations inside a time window.

    Durations are note.end - note.start, i.e. in the same tick units as the
    note attributes, not in seconds.

    Args:
        composer: key of `dictionary` selecting the MIDI file.
        window_size: length of the window.
        start_time: start of the window.
    """
    midi_file=dictionary[composer]
    mid=read_mid(midi_file)

    # notes between start_time and start_time+window_size
    notes_in_window=notesBetween(start_time,start_time+window_size,mid)
    durations=[note.end-note.start for note in notes_in_window]

    plt.hist(durations,bins=128)
    plt.title(composer)
    # The axis shows durations, not velocities
    plt.xlabel("Duration (ticks)")
    plt.ylabel("Frequency")
    plt.show()

interactive(children=(Dropdown(description='composer', options=('Bach', 'Chopin', 'Mozart', 'Rachmanioff'), va…

In [40]:
# Composer label -> path of one generated *_score.mid file (redefined so that
# this cell can run without the earlier plotting cells).
dictionary={
    "Bach":"../Store/Score2Melody/Bach/Fugue/bwv_846/Shi05M_score.mid",
    "Chopin":"../Store/Score2Melody/Chopin/Ballades/1/Ali01_score.mid",
    "Mozart": "../Store/Score2Melody/Mozart/Piano_Sonatas/11-3/Stahievitch02_score.mid",
    "Rachmanioff":"../Store/Score2Melody/Rachmaninoff/Preludes_op_23/4/ChenGuang12M_score.mid"
    }

@interact(composer=list(dictionary.keys()),window_size=(1,100,1),start_time=(0,50,1))
def plotWindowPitches(composer,window_size,start_time):
    """
    Interactive histogram of the note pitches inside a time window.

    Args:
        composer: key of `dictionary` selecting the MIDI file.
        window_size: length of the window.
        start_time: start of the window.
    """
    midi_file=dictionary[composer]
    mid=read_mid(midi_file)

    # notes between start_time and start_time+window_size
    notes_in_window=notesBetween(start_time,start_time+window_size,mid)
    pitches=[note.pitch for note in notes_in_window]

    plt.hist(pitches,bins=128)
    plt.title(composer)
    plt.xlabel("Pitch")
    plt.ylabel("Frequency")
    plt.show()

interactive(children=(Dropdown(description='composer', options=('Bach', 'Chopin', 'Mozart', 'Rachmanioff'), va…

In [41]:
# Composer label -> path of one generated *_score.mid file (same table as in
# the previous plotting cells, repeated so the cell is self-contained).
dictionary={
    "Bach":"../Store/Score2Melody/Bach/Fugue/bwv_846/Shi05M_score.mid",
    "Chopin":"../Store/Score2Melody/Chopin/Ballades/1/Ali01_score.mid",
    "Mozart": "../Store/Score2Melody/Mozart/Piano_Sonatas/11-3/Stahievitch02_score.mid",
    "Rachmanioff":"../Store/Score2Melody/Rachmaninoff/Preludes_op_23/4/ChenGuang12M_score.mid"
    }
def numMelodyNotes(notes,threshold):
    """Count the notes whose velocity is at least threshold."""
    return sum(1 for candidate in notes if candidate.velocity>=threshold)

@interact(composer=list(dictionary.keys()),window_size=(1,100,1),start_time=(0,50,1),min_velocity=(0,127,1))
def plotWindowVelocities(composer,window_size,start_time,min_velocity=50):
    """
    Interactive plot of melody-note density against the velocity threshold.

    For every threshold from min_velocity to 127 the notes of the window
    with at least that velocity are counted with numMelodyNotes, and the
    count divided by window_size is plotted.

    Args:
        composer: key of `dictionary` selecting the MIDI file.
        window_size: length of the window; also the divisor of the counts.
        start_time: start of the window.
        min_velocity: lowest velocity threshold shown.
    """
    midi_file=dictionary[composer]
    mid=read_mid(midi_file)

    # notes between start_time and start_time+window_size
    notes_in_window=notesBetween(start_time,start_time+window_size,mid)

    velocity_thresholds=range(min_velocity,128,1)
    notes_per_second=[
        numMelodyNotes(notes_in_window,velocity_threshold)/window_size
        for velocity_threshold in velocity_thresholds
    ]

    plt.plot(list(velocity_thresholds),notes_per_second)
    plt.xlabel("Velocity threshold")
    plt.ylabel("Melody notes per second")
    # reference lines at 0.25 and 2 notes per second
    plt.axhline(y=0.25,color="red")
    plt.axhline(y=2,color="red")
    # Show the figure explicitly, like the other interactive plots do
    plt.show()


interactive(children=(Dropdown(description='composer', options=('Bach', 'Chopin', 'Mozart', 'Rachmanioff'), va…

In [None]:
window_sizes=[1,2,3,5,10,30] # seconds
# NOTE(review): velocity_thresholds is not used by the loop below.
velocity_thresholds=[30]
start_time=0


# One figure of velocity histograms per composer entry.
for composer,midi_file in dictionary.items():
    plotWindows(midi_file,window_sizes,start_time,composer)

In [35]:

from copy import deepcopy
from re import M

def inNotes(note,notes):
    """Return True when notes holds a note with the same start, end and pitch."""
    return any(
        note.start==other.start and note.end==other.end and note.pitch==other.pitch
        for other in notes
    )

def notesBetween(start_time,end_time, midi_obj : mtk.MidiFile,instrument=-1):
    """
    Select the notes of midi_obj lying completely inside a time window.

    The notes come from getNotes(midi_obj, instrument). A note is kept when
    its start time is not before start_time and its end time is not after
    end_time; tick positions are converted with the tick-to-time mapping of
    midi_obj.
    """
    all_notes=getNotes(midi_obj,instrument)
    tick_to_time=midi_obj.get_tick_to_time_mapping()

    def inside_window(candidate):
        begins=tick_to_time[candidate.start]
        finishes=tick_to_time[candidate.end]
        return start_time<=begins and finishes<=end_time

    return [candidate for candidate in all_notes if inside_window(candidate)]


def velocityThresholdFiltering(window_notes,mps_range=(0.25,2),window_length=-1):
    """
    Pick the loudest notes of a window as melody notes.

    The velocity threshold is raised from 0 until the number of notes at or
    above it falls inside the range derived from mps_range (melody notes per
    second) and window_length. Notes at or above that threshold are returned.

    Args:
        window_notes: notes of the window, each with `velocity`, `start`
            and `end`.
        mps_range: (low, high) notes-per-second bounds. The accepted note
            counts are range(int(low*window_length), int(high*window_length)),
            so the upper bound is exclusive.
        window_length: window length in seconds; -1 derives it from the first
            and last note.

    Returns:
        The melody notes, sorted by decreasing velocity. When no threshold
        gives an acceptable count, only notes of velocity 127 are returned.
    """
    if window_length==-1:
        if len(window_notes)==0:
            # Nothing to measure the window with and nothing to filter
            return []
        # NOTE(review): start/end are used as tick indices elsewhere in this
        # file, so this length is presumably in ticks, not seconds - confirm.
        window_length=window_notes[-1].end-window_notes[0].start

    # sort by velocity
    window_notes=sorted(window_notes,key=lambda x: x.velocity,reverse=True)

    num_notes_range=range(int(mps_range[0]*window_length),int(mps_range[1]*window_length),1)

    # Fallback when no threshold matches: the highest velocity value
    threshold=127
    for candidate in range(128):
        num_notes=sum(1 for note in window_notes if note.velocity>=candidate)
        if num_notes in num_notes_range:
            threshold=candidate
            print("Threshold: ",threshold)
            print("Num notes: ",num_notes)
            break

    # filter notes
    melody_notes=[note for note in window_notes if note.velocity>=threshold]

    # print mps (skipped for a zero-length window to avoid dividing by zero)
    if window_length:
        print("Melody notes per second: ",len(melody_notes)/window_length)

    return melody_notes


def isStableThreshold(threshold,notes):
    """
    Tell whether raising the threshold by one keeps the melody unchanged.

    The number of notes with velocity >= threshold is compared with the
    number of notes with velocity >= threshold + 1.

    Args:
        threshold: velocity threshold to test.
        notes: notes with a `velocity` attribute.

    Returns:
        True when both counts are equal, False otherwise.
    """
    # number of melody notes for threshold and threshold+1
    num_melody_notes_values=[
        sum(1 for note in notes if note.velocity>=candidate)
        for candidate in range(threshold,threshold+2)
    ]

    # Stable when all consecutive counts agree
    return all(
        current==following
        for current,following in zip(num_melody_notes_values,num_melody_notes_values[1:])
    )
    
def extractMelodyAgain(mid,window_size=5,instrument=0):
    """
    Split the notes of one instrument into melody and non-melody notes.

    The piece is cut into consecutive windows of window_size seconds. In each
    window the melody notes are selected with velocityThresholdFiltering.
    All remaining notes of the instrument count as normal notes, and both
    groups are handed to split2midi.

    Args:
        mid: MIDI object providing get_tick_to_time_mapping().
        window_size: window length in seconds (integer, used as range step).
        instrument: instrument index passed to getNotes / notesBetween.

    Returns:
        The result of split2midi(normal_notes, melody_notes).
    """
    import math

    notes=getNotes(mid,instrument=instrument)

    # Latest note end in seconds; 0 when the instrument has no notes
    t2tmap=mid.get_tick_to_time_mapping()
    max_time=max((t2tmap[note.end] for note in notes),default=0)

    # Chop up the notes into windows of window_size seconds and filter each
    # one. The end is rounded up so the last partial window is not dropped.
    melody_notes=[]
    for start_time in range(0,math.ceil(max_time),window_size):
        window=notesBetween(start_time,start_time+window_size,mid,instrument=instrument)
        melody_notes+=velocityThresholdFiltering(window,window_length=window_size)

    # Get normal notes (all notes minus melody notes)
    normal_notes=[note for note in notes if not inNotes(note,melody_notes)]

    return split2midi(normal_notes,melody_notes)

In [36]:
# Composer label -> path of one generated *_score.mid file.
dictionary={
    "Bach":"../Store/Score2Melody/Bach/Fugue/bwv_846/Shi05M_score.mid",
    "Chopin":"../Store/Score2Melody/Chopin/Ballades/1/Ali01_score.mid",
    "Mozart": "../Store/Score2Melody/Mozart/Piano_Sonatas/11-3/Stahievitch02_score.mid",
    "Rachmanioff":"../Store/Score2Melody/Rachmaninoff/Preludes_op_23/4/ChenGuang12M_score.mid"
    }

# Run the windowed melody extraction on one piece.
composer="Chopin"
midi_file=dictionary[composer]

mid=read_mid(midi_file)

mid_out=extractMelodyAgain(mid,window_size=5,instrument=0)

# Write the result into the current working directory.
mid_out.dump("test.mid")

Threshold:  74
Num notes:  9
Melody notes per second:  1.8
Threshold:  54
Num notes:  9
Melody notes per second:  1.8
Threshold:  0
Num notes:  2
Melody notes per second:  0.4
Threshold:  43
Num notes:  9
Melody notes per second:  1.8
Threshold:  41
Num notes:  9
Melody notes per second:  1.8
Threshold:  37
Num notes:  9
Melody notes per second:  1.8
Threshold:  58
Num notes:  9
Melody notes per second:  1.8
Threshold:  0
Num notes:  9
Melody notes per second:  1.8
Threshold:  45
Num notes:  9
Melody notes per second:  1.8
Threshold:  48
Num notes:  8
Melody notes per second:  1.6
Threshold:  33
Num notes:  9
Melody notes per second:  1.8
Threshold:  0
Num notes:  9
Melody notes per second:  1.8
Threshold:  67
Num notes:  9
Melody notes per second:  1.8
Threshold:  73
Num notes:  8
Melody notes per second:  1.6
Threshold:  50
Num notes:  9
Melody notes per second:  1.8
Threshold:  46
Num notes:  9
Melody notes per second:  1.8
Threshold:  0
Num notes:  9
Melody notes per second:  1.8
T

# Training

In [None]:
# Folder with the generated dataset.
data_dir="../Store/Score2Melody/"

# NOTE(review): genScore2MelodyDataset writes "annotations.json" into this
# folder, not "annotations.csv" - confirm which file is meant here.
annotations_file=data_dir+"annotations.csv"