In [1]:
import os
from random import randrange

import numpy as np
from miditoolkit.midi import containers as ct
from miditoolkit.midi import parser as mid_parser
from tqdm import tqdm



def filterMelodyNotes(notes,window=10):
    """
    Split notes into (normal notes, ghost notes) using a sliding context window.

    Every note is handed to isMelodyNote together with the neighbouring slice
    notes[i-window:i+window] (the note itself is part of that slice). Notes
    for which isMelodyNote returns a truthy value are collected in the second
    list ("ghost notes"); all remaining notes go into the first list.

    Args:
        notes: sequence of note objects, in the order they should be scanned.
        window: number of positions to include before the note, and the
            exclusive offset after it, when building the context. Default 10.

    Returns:
        A tuple (normal_notes, ghost_notes) of lists, each in input order.
    """
    normal_notes=[]
    ghost_notes=[]
    for i,note in enumerate(notes):
        # Clamp the start at 0. A negative start is interpreted by Python as
        # an offset from the END of the list, which left the first `window`
        # notes with an empty context on any list longer than 2*window.
        context=notes[max(0,i-window):i+window]
        if isMelodyNote(note,context):
            ghost_notes.append(note)
        else:
            normal_notes.append(note)
    return (normal_notes,ghost_notes)

def isMelodyNote(note,context_notes,heuristic="velocity",params=None):
    """
    Decide whether `note` is flagged by the selected heuristic.

    The `heuristic` name is looked up in the module-level `heuristics`
    registry (an unknown name raises KeyError) and the registered function is
    called as fn(note, context_notes, params). Its result is returned as-is.
    """
    detector=heuristics[heuristic]
    verdict=detector(note,context_notes,params)
    return verdict

def velocityThreshold(test_note,context_notes,params):
    '''
    Report whether test_note's velocity is an outlier among nearby velocities.

    The velocities of `context_notes` plus the velocity of `test_note` itself
    are passed to getOutliers; the note is flagged when its velocity value
    appears in the returned outlier list.

    Args:
        test_note: object with a `velocity` attribute.
        context_notes: iterable of objects with a `velocity` attribute.
        params: optional dict; params["method"] selects the getOutliers
            method. None, or a dict without "method", means method 1.

    Returns:
        True if the note's velocity is an outlier, otherwise False.
    '''
    method=params.get("method",1) if params is not None else 1
    all_velocities=[note.velocity for note in context_notes]+[test_note.velocity]
    outliers=getOutliers(all_velocities,method)

    # Always return a real bool; the non-outlier case used to fall off the end
    # of the function and yield None.
    return test_note.velocity in outliers

def getOutliers(data,method=1,z_threshold=2,iqr_factor=1.5):
    """
    Return the values of `data` that the selected rule marks as outliers.

    method 1 -- median-centred z-score, upper tail only:
        y is an outlier when (y - median(data)) / std(data) > z_threshold.
        Values are returned in their original order.
    method 2 -- IQR fence, lower tail only:
        y is an outlier when y < Q1 - iqr_factor * (Q3 - Q1).
        Values are returned in ascending order.

    NOTE(review): the two methods test opposite tails (high values vs. low
    values). That asymmetry is preserved from the original code -- confirm it
    is intended.

    Args:
        data: iterable of numbers.
        method: 1 or 2, see above.
        z_threshold: cut-off used by method 1 (default 2).
        iqr_factor: fence multiplier used by method 2 (default 1.5).

    Returns:
        A list of the outlying values (possibly empty).

    Raises:
        ValueError: if `method` is neither 1 nor 2.
    """
    if method not in (1,2):
        # Previously this fell through to `return outliers` with the name
        # unbound and surfaced as an UnboundLocalError.
        raise ValueError(f"unknown outlier method: {method!r} (expected 1 or 2)")

    data=list(data)
    if not data:
        # Nothing to compare against; also avoids numpy warnings/errors on
        # empty input.
        return []

    if method==1:
        median=np.median(data)
        std=np.std(data)
        if std==0:
            # All values are identical: no value can exceed the threshold, and
            # dividing by zero would only produce NaNs plus a RuntimeWarning.
            return []
        return [y for y in data if (y-median)/std>z_threshold]

    ordered=sorted(data)
    q1,q3=np.percentile(ordered,[25,75])
    lower_bound=q1-(iqr_factor*(q3-q1))
    return [y for y in ordered if y<lower_bound]

def split2midi(normal_notes,ghost_notes,ticks_per_beat=None):
    """
    Build a two-instrument MIDI object from two lists of notes.

    The first instrument (program 0) receives the normal notes and the second
    instrument (program 1) receives the ghost notes. Notes whose velocity is 0
    are dropped from both lists. Nothing is written to disk; the caller is
    expected to dump the returned object.

    Args:
        normal_notes: iterable of note objects with a `velocity` attribute.
        ghost_notes: iterable of note objects with a `velocity` attribute.
        ticks_per_beat: optional resolution to set on the new file. When None
            the default of a freshly created MidiFile is kept. Pass the source
            file's value when the note times come from a parsed file, since
            they are presumably expressed in that file's ticks -- TODO confirm.

    Returns:
        The newly created mid_parser.MidiFile holding both instruments.
    """
    mido_obj = mid_parser.MidiFile()
    if ticks_per_beat is not None:
        mido_obj.ticks_per_beat = ticks_per_beat

    # program 0 -> normal notes, program 1 -> ghost notes
    normal_instrument = mid_parser.Instrument(program=0)
    ghost_instrument = mid_parser.Instrument(program=1)
    normal_instrument.notes = [note for note in normal_notes if note.velocity != 0]
    ghost_instrument.notes = [note for note in ghost_notes if note.velocity != 0]

    mido_obj.instruments.append(normal_instrument)
    mido_obj.instruments.append(ghost_instrument)
    return mido_obj



# Registry of note heuristics, keyed by the name accepted by isMelodyNote's
# `heuristic` argument. Each entry is called as fn(note, context_notes, params).
heuristics=dict(
    velocity=velocityThreshold,
)

In [2]:
import pandas as pd

dataset_dir = "Datasets/asap-dataset"

# Read the annotations JSON and transpose it so every top-level JSON key
# becomes one row. Those keys (used below as performance paths) are then moved
# out of the index into a regular `performance_filename` column.
annotations_all = (
    pd.read_json(f"{dataset_dir}/asap_annotations.json")
    .transpose()
    .rename_axis("performance_filename")
    .reset_index()
)

# Derive the score path from the performance path: same directory, file name
# replaced by `midi_score.mid`.
annotations_all["score_filename"] = annotations_all["performance_filename"].map(
    lambda path: f"{'/'.join(path.split('/')[:-1])}/midi_score.mid"
)

# Only keep rows where score_and_performance_aligned is True. The explicit
# `== True` is kept on purpose: after the transpose the column may be
# object-dtype -- TODO confirm before simplifying to a plain boolean mask.
annotations = annotations_all[annotations_all["score_and_performance_aligned"] == True]

# unique score / performance paths among the aligned rows
score_filenames = annotations["score_filename"].unique()
performance_filenames = annotations["performance_filename"].unique()

In [3]:
 

# Process at most `total` performances for each listed composer: split every
# selected performance into normal / ghost notes and write the result as a
# two-instrument MIDI file.
total=2
composers=["Bach","Chopin","Mozart"]
counts=[0 for _ in composers]  # processed files per composer, parallel to `composers`

output_dir="Store/Filter"
os.makedirs(output_dir,exist_ok=True)  # dump() below needs the directory to exist

for filename in tqdm(performance_filenames):
    # the first path component is the composer name
    composer=filename.split("/")[0]
    if composer not in composers:
        continue
    composer_idx=composers.index(composer)
    if counts[composer_idx]>=total:
        continue
    counts[composer_idx]+=1

    # Reuse dataset_dir (defined with the annotations) instead of repeating
    # the literal path, and parse with the already-imported mid_parser.
    full_filename=f"{dataset_dir}/{filename}"
    midi_file=mid_parser.MidiFile(full_filename)

    # merge the notes of all instruments and order them by onset
    notes=[]
    for instrument in midi_file.instruments:
        notes.extend(instrument.notes)
    notes=sorted(notes,key=lambda x:x.start)

    normal_notes,ghost_notes=filterMelodyNotes(notes)
    print(f"{filename} has {len(ghost_notes)} ghost notes")
    out_midi=split2midi(normal_notes,ghost_notes)

    # Carry over the source resolution and tempo map so the note times keep
    # their meaning in the written file; a fresh MidiFile would otherwise use
    # its own defaults.
    out_midi.ticks_per_beat=midi_file.ticks_per_beat
    out_midi.tempo_changes=midi_file.tempo_changes

    # NOTE(review): only the composer and the file's base name are used, so two
    # performances with the same base name under different pieces would
    # overwrite each other -- confirm this cannot happen in the dataset.
    output_filename=f"{composer}_{filename.split('/')[-1]}"
    out_midi.dump(f"{output_dir}/{output_filename}")

  0%|          | 1/1037 [00:00<02:41,  6.40it/s]

Bach/Fugue/bwv_846/Shi05M.mid has 3 ghost notes
Bach/Fugue/bwv_848/Denisova06M.mid has 23 ghost notes


 43%|████▎     | 441/1037 [00:01<00:01, 374.19it/s]

Chopin/Ballades/1/Ali01.mid has 48 ghost notes


 45%|████▍     | 463/1037 [00:02<00:03, 190.69it/s]

Chopin/Ballades/1/BuiJL04M.mid has 43 ghost notes


 86%|████████▌ | 892/1037 [00:02<00:00, 339.54it/s]

Mozart/Fantasie_475/Huangci05M.mid has 64 ghost notes


100%|██████████| 1037/1037 [00:03<00:00, 297.68it/s]

Mozart/Piano_Sonatas/11-3/Stahievitch02.mid has 27 ghost notes



