Notebook mainly for filtering down the files that could be used in the dataset.

In [None]:
import itertools
import os

import pretty_midi
pretty_midi.pretty_midi.MAX_TICK = 1e16
import random

# Lowest and highest allowed pitch per instrument name.
# NOTE(review): the keys look like NES channel names and the values like MIDI
# pitch numbers -- confirm. Nothing in the visible part of this notebook reads
# these tables.
nes_ins_name_to_min_pitch = dict(p1=33, p2=33, tr=21)
nes_ins_name_to_max_pitch = dict(p1=108, p2=108, tr=108)


def instrument_is_monophonic(ins):
    """Return True if no note of ``ins`` starts before the previous one ends.

    Side effect: ``ins.notes`` is replaced by a new list sorted by note start
    time (kept for backward compatibility with existing callers).

    Args:
        ins: object with a ``notes`` sequence whose items expose ``start`` and
            ``end`` attributes.

    Returns:
        bool: True when every note ends at or before the start of the next
        note in start order. An instrument with zero or one note is
        monophonic.
    """
    ins.notes = sorted(ins.notes, key=lambda x: x.start)
    notes = ins.notes

    # With notes ordered by start, a note that overlaps any later note also
    # overlaps its immediate successor, so comparing neighbours is enough.
    # (The old post-sort sanity assert was dropped: it compared the first
    # start against -1 and therefore raised for notes starting below -1.)
    return not any(n0.end > n1.start for n0, n1 in zip(notes, notes[1:]))


def emit_nesmdb_midi_examples(
        midi_fp,
        output_fname,
        min_num_instruments=1,
        filter_mid_len_below_seconds=5.,
        filter_mid_len_above_seconds=600.,
        filter_mid_bad_times=True,
        filter_ins_max_below=67,
        filter_ins_min_above=108,
        filter_ins_duplicate=True,
        output_include_drums=True,
        output_max_num=16,
        output_max_num_seconds=180.):
    """Append ``midi_fp`` to ``output_fname`` if the MIDI file passes all filters.

    A file is accepted only when it is at most 512 KiB, loads with
    ``pretty_midi``, contains exactly one instrument with program 40 or 41,
    has an accepted duration, has no bad note times (optional), has an
    accepted pitch range, is monophonic, and uses more than one distinct
    note velocity. Rejected files are skipped silently apart from a few
    progress prints.

    Args:
        midi_fp: path of the MIDI file to examine.
        output_fname: text file the accepted path is appended to, one path
            per line.
        min_num_instruments: must be positive; otherwise unused here.
        filter_mid_len_below_seconds: reject files shorter than this.
        filter_mid_len_above_seconds: reject files longer than this.
        filter_mid_bad_times: if true, reject files containing a note with a
            negative start/end or an end before its start.
        filter_ins_max_below: reject if the highest pitch is below this.
        filter_ins_min_above: reject if the lowest pitch is above this.
        filter_ins_duplicate: unused by this function.
        output_include_drums: unused by this function.
        output_max_num: unused by this function.
        output_max_num_seconds: unused by this function.

    Returns:
        None in every case; the only output is the appended line.

    Raises:
        ValueError: if ``min_num_instruments`` is not positive.
    """
    print('starting')

    if min_num_instruments <= 0:
        raise ValueError('min_num_instruments must be positive')

    # Ignore unusually large MIDI files (only ~25 of these in the dataset)
    if os.path.getsize(midi_fp) > (512 * 1024):  # 512K
        print('too large')
        return

    try:
        midi = pretty_midi.PrettyMIDI(midi_fp)
    except Exception:
        # Any parse failure just skips the file. KeyboardInterrupt and
        # SystemExit are deliberately not caught so a long run can be stopped.
        print('improper prettymidi load')
        return

    # Filter MIDIs that are not just one violin instrument
    # NOTE(review): program 41 may be Viola rather than Violin in 0-based
    # General MIDI numbering -- confirm it is meant to be included.
    violins = [ins for ins in midi.instruments if ins.program in (40, 41)]
    if len(violins) != 1:
        return
    ins = violins[0]

    # Filter MIDIs with extreme length
    midi_len = midi.get_end_time()
    if midi_len < filter_mid_len_below_seconds or midi_len > filter_mid_len_above_seconds:
        return

    # An instrument without notes has no pitch range to inspect; reject it
    # instead of letting min()/max() raise on an empty list.
    if not ins.notes:
        return

    # Filter out negative times and snap note times to a 44100-per-second grid
    for n in ins.notes:
        if filter_mid_bad_times:
            if n.start < 0 or n.end < 0 or n.end < n.start:
                return
        n.start = round(n.start * 44100.) / 44100.
        n.end = round(n.end * 44100.) / 44100.

    # Filter out instruments with bizarre ranges
    pitches = [n.pitch for n in ins.notes]
    min_pitch = min(pitches)
    max_pitch = max(pitches)
    if max_pitch < filter_ins_max_below or min_pitch > filter_ins_min_above:
        return

    # Sort notes for polyphonic filtering and proper saving
    ins.notes = sorted(ins.notes, key=lambda x: x.start)

    # Filter out polyphonic instruments (check once, then report and test)
    monophonic = instrument_is_monophonic(ins)
    print(monophonic)
    if not monophonic:
        return

    # Filter out files that have no velocity variation
    velocities = {note.velocity for note in ins.notes}
    if len(velocities) == 1:
        return

    # Save this midi file name to a text file
    with open(output_fname, 'a') as f:
        print('success!')
        print(midi_fp)
        f.write(midi_fp + '\n')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scratch cell: one [start, end] row per note of ``ins``.
# NOTE(review): in file order ``np`` is only imported in a later cell and
# ``ins`` is only bound as a local inside emit_nesmdb_midi_examples, so this
# cell relies on names already present in the notebook kernel.
startends = np.array([[n.start, n.end] for n in ins.notes])

In [None]:
# Start of each note minus the end of the preceding row's note; a negative
# value means the two notes overlap (assuming rows are ordered by start).
startends[1:,0] - startends[:-1,1]

In [None]:
# Run the filter on a single file. If it passes, its path is appended as a
# line of text to 'testout.midi' (a plain text list despite the extension).
emit_nesmdb_midi_examples('6/62e625a3b10002509a1402b5d8c94ca0.mid', 'testout.midi')

In [None]:
# Scratch cell: load one file directly to inspect the PrettyMIDI object.
midi = pretty_midi.PrettyMIDI('0/088a14b1c7a4f8113e724c2c27fadcce.mid')

In [None]:
# List the attributes and methods available on the loaded object.
dir(midi)

In [None]:
# NOTE(review): incomplete expression (probably a leftover from
# tab-completion); this cell is a syntax error if executed.
midi.

In [None]:
# Presumably renders the file to audio via pretty_midi's fluidsynth wrapper --
# confirm that fluidsynth is installed before running.
midi.fluidsynth()

In [None]:
# NOTE(review): emit_nesmdb_midi_examples only ever executes a bare ``return``
# or falls off the end, so ``midi`` is rebound to None by these two cells.
midi = emit_nesmdb_midi_examples('0/088a14b1c7a4f8113e724c2c27fadcce.mid', 'temp')

In [None]:
midi = emit_nesmdb_midi_examples('0/016521b8455db300c5a74a831e6b8538.mid', 'temp')

In [None]:
# Prints None when run right after the cells above.
print(midi)

In [None]:
# Fails with AttributeError if ``midi`` is None (see note above); it only
# works while ``midi`` still holds a loaded PrettyMIDI object.
midi.instruments[1].program

In [None]:
import glob
import shutil
import multiprocessing

import numpy as np
import pretty_midi
from tqdm import tqdm

# Every file two directory levels below 'dataset/' whose name contains '.mid'
# (the pattern matches both .mid and .midi).
midi_fps = glob.glob('dataset/*/*.mid*')
# Despite the name, this is the output text FILE that accepted paths are
# appended to, not a directory.
out_dir = 'filelist3.txt'

# Module-level wrapper so the pool can hand each path to the filter with the
# shared output file.
def _task(x):
    emit_nesmdb_midi_examples(x, out_dir)

# Filter all files with 8 worker processes; tqdm shows progress over the
# lazily produced results.
# NOTE(review): all workers append to the same file without any locking --
# confirm that lines cannot interleave on the target platform.
with multiprocessing.Pool(8) as p:
    r = list(tqdm(p.imap(_task, midi_fps), total=len(midi_fps))) # ~1:37:40

In [None]:
# Display the list of candidate MIDI paths found by the glob above.
midi_fps