In [1]:
'''
*** This is a slightly modified version of the original data processing script available with the dataset

This is the data processing script for POP909:A Pop song Dataset for Music Arrangement Generation
============
It will allow you to quickly process the POP909 Files (Midi) into the Google Magenta's music representation 
    as like [Music Transformer](https://magenta.tensorflow.org/music-transformer) 
            [Performance RNN](https://magenta.tensorflow.org/performance-rnn).

'''
import pickle
import os
import sys
from helpers.midi import MidiEventProcessor
import pretty_midi as pyd
import numpy as np

BASE_DIR = "/home/rithomas"
DATA_DIR = os.path.join(BASE_DIR, "data", "POP909-Dataset", "POP909")
OUTPUT_DIR = os.path.join(BASE_DIR, "cache", "preprocessed", "POP909")
save_path = os.path.join(OUTPUT_DIR, "pop909-event-token.npy")

In [2]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

def prepare_midi_notes(notes):
    for i in range(len(notes)):
        notes[i].start = round(notes[i].start,2)    
    notes.sort(key = lambda x:x.start)
    return notes

def preprocess_midi(path):
    data = pyd.PrettyMIDI(path)    

    mpr = MidiEventProcessor()
    separated_notes = {}
    repr_seq = {}
    
    for instr in data.instruments:
        separated_notes[instr.name] = prepare_midi_notes(instr.notes)
        repr_seq[instr.name] = mpr.encode(separated_notes[instr.name])
        print(len(repr_seq[instr.name]))

    return repr_seq

def preprocess_pop909(midi_root, save_dir):
    save_py = []
    midi_paths = [d for d in os.listdir(midi_root)]
    i = 0
    out_fmt = '{}-{}.data'
    for path in midi_paths:
        if (path.isnumeric()):
            print(' ', end='[{}]'.format(path), flush=True)
            filename = midi_root + '/' + path + '/' + path + '.mid'
            try:
                data = preprocess_midi(filename)
            except KeyboardInterrupt:
                print(' Abort')
                return
            except EOFError:
                print('EOF Error')
                return
            save_py.append(data)
        
    save_py = np.array(save_py)
    print(save_py.size)
    np.save(save_path, save_py)
            
    
# replace the folder with your POP909 data folder
#preprocess_pop909(DATA_DIR, OUTPUT_DIR)

In [3]:
pop909 = np.load(save_path, allow_pickle=True)
song = pop909[0]
print(song.keys())
print(song)

dict_keys(['MELODY', 'BRIDGE', 'PIANO'])
{'MELODY': [355, 355, 355, 355, 355, 355, 355, 355, 355, 355, 355, 355, 355, 355, 324, 374, 63, 276, 191, 256, 375, 66, 278, 194, 375, 65, 300, 375, 63, 193, 277, 376, 65, 257, 191, 321, 375, 66, 193, 277, 374, 68, 194, 323, 375, 66, 259, 196, 294, 66, 194, 194, 286, 376, 65, 285, 193, 256, 375, 63, 280, 191, 256, 376, 61, 301, 189, 258, 374, 63, 293, 191, 262, 375, 61, 301, 374, 58, 256, 189, 276, 375, 56, 258, 186, 355, 277, 184, 291, 375, 54, 275, 182, 257, 374, 56, 278, 58, 258, 184, 355, 269, 186, 275, 375, 58, 267, 186, 266, 375, 58, 277, 186, 375, 61, 294, 189, 261, 375, 59, 299, 187, 256, 374, 58, 301, 374, 59, 258, 186, 297, 58, 258, 187, 353, 186, 291, 375, 58, 268, 186, 264, 374, 58, 272, 186, 260, 375, 63, 293, 191, 262, 374, 61, 301, 374, 58, 258, 189, 297, 375, 54, 186, 299, 182, 373, 58, 347, 186, 375, 56, 346, 373, 59, 184, 346, 375, 56, 256, 187, 299, 184, 375, 58, 355, 355, 288, 186, 355, 290, 374, 63, 276, 191, 256, 375, 66, 2