In [4]:
'''
*** This is a slightly modified version of the original data processing script available with the dataset

This is the data processing script for POP909:A Pop song Dataset for Music Arrangement Generation
============
It will allow you to quickly process the POP909 Files (Midi) into the Google Magenta's music representation 
    as like [Music Transformer](https://magenta.tensorflow.org/music-transformer) 
            [Performance RNN](https://magenta.tensorflow.org/performance-rnn).

'''
import pickle
import os
import sys
from helpers.midi import MidiEventProcessor
import pretty_midi as pyd
import numpy as np
import tensorflow as tf

BASE_DIR = "/home/rithomas"
#BASE_DIR = "/home/richhiey/Desktop/workspace/projects/virtual_musicians"

#DATA_DIR = os.path.join(BASE_DIR, "data", "POP909-Dataset", "POP909")
DATA_DIR = os.path.join(BASE_DIR, "data", "POP909-Dataset", "POP909")

#OUTPUT_DIR = os.path.join(BASE_DIR, "cache", "preprocessed", "POP909")
OUTPUT_DIR = os.path.join(BASE_DIR, "data", "preprocessed")

MIDI_EVENTS_PATH = os.path.join(OUTPUT_DIR, "pop909-event-token.npy")
TFRECORD_DATASET = os.path.join(OUTPUT_DIR, "train.tfrecords")

In [2]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

def prepare_midi_notes(notes):
    for i in range(len(notes)):
        notes[i].start = round(notes[i].start,2)    
    notes.sort(key = lambda x:x.start)
    return notes

def preprocess_midi(path):
    data = pyd.PrettyMIDI(path)    

    mpr = MidiEventProcessor()
    separated_notes = {}
    repr_seq = {}
    
    for instr in data.instruments:
        separated_notes[instr.name] = prepare_midi_notes(instr.notes)
        repr_seq[instr.name] = mpr.encode(separated_notes[instr.name])
        print(len(repr_seq[instr.name]))

    return repr_seq

def preprocess_pop909(midi_root, save_dir):
    save_py = []
    midi_paths = [d for d in os.listdir(midi_root)]
    i = 0
    out_fmt = '{}-{}.data'
    for path in midi_paths:
        if (path.isnumeric()):
            filename = midi_root + '/' + path + '/' + path + '.mid'
            print(' ', end='[{}]'.format(filename), flush=True)
            try:
                data = preprocess_midi(filename)
            except KeyboardInterrupt:
                print(' Abort')
                return
            except EOFError:
                print('EOF Error')
                return
            save_py.append(data)
        
    save_py = np.array(save_py)
    print(save_py.size)
    np.save(MIDI_EVENTS_PATH, save_py)
            
    
# replace the folder with your POP909 data folder
preprocess_pop909(DATA_DIR, OUTPUT_DIR)

 [/home/rithomas/data/POP909-Dataset/POP909/883/883.mid]1892
1378
3949
 [/home/rithomas/data/POP909-Dataset/POP909/440/440.mid]1413
1594
6341
 [/home/rithomas/data/POP909-Dataset/POP909/437/437.mid]1478
2051
3591
 [/home/rithomas/data/POP909-Dataset/POP909/070/070.mid]1497
619
4102
 [/home/rithomas/data/POP909-Dataset/POP909/193/193.mid]1335
851
5574
 [/home/rithomas/data/POP909-Dataset/POP909/007/007.mid]1719
1257
5525
 [/home/rithomas/data/POP909-Dataset/POP909/100/100.mid]1427
1877
5134
 [/home/rithomas/data/POP909-Dataset/POP909/094/094.mid]1453
899
4230
 [/home/rithomas/data/POP909-Dataset/POP909/177/177.mid]2200
784
5670
 [/home/rithomas/data/POP909-Dataset/POP909/199/199.mid]993
290
2710
 [/home/rithomas/data/POP909-Dataset/POP909/530/530.mid]1551
2768
5964
 [/home/rithomas/data/POP909-Dataset/POP909/889/889.mid]1542
1450
4506
 [/home/rithomas/data/POP909-Dataset/POP909/547/547.mid]1385
1392
4000
 [/home/rithomas/data/POP909-Dataset/POP909/867/867.mid]1958
1498
4304
 [/home/rith

 [/home/rithomas/data/POP909-Dataset/POP909/712/712.mid]2146
1580
4910
 [/home/rithomas/data/POP909-Dataset/POP909/765/765.mid]1126
1277
3914
 [/home/rithomas/data/POP909-Dataset/POP909/258/258.mid]1726
1599
4980
 [/home/rithomas/data/POP909-Dataset/POP909/322/322.mid]1703
991
4602
 [/home/rithomas/data/POP909-Dataset/POP909/355/355.mid]1760
1763
9599
 [/home/rithomas/data/POP909-Dataset/POP909/252/252.mid]1407
472
5506
 [/home/rithomas/data/POP909-Dataset/POP909/225/225.mid]1531
1126
5647
 [/home/rithomas/data/POP909-Dataset/POP909/328/328.mid]1594
1162
3982
 [/home/rithomas/data/POP909-Dataset/POP909/662/662.mid]1328
1660
4984
 [/home/rithomas/data/POP909-Dataset/POP909/615/615.mid]1680
1806
4757
 [/home/rithomas/data/POP909-Dataset/POP909/781/781.mid]1938
698
6176
 [/home/rithomas/data/POP909-Dataset/POP909/718/718.mid]2127
1566
4483
 [/home/rithomas/data/POP909-Dataset/POP909/126/126.mid]1831
1694
4476
 [/home/rithomas/data/POP909-Dataset/POP909/151/151.mid]1291
352
2557
 [/home/ri

 [/home/rithomas/data/POP909-Dataset/POP909/630/630.mid]1679
2884
3842
 [/home/rithomas/data/POP909-Dataset/POP909/647/647.mid]1546
1990
6393
 [/home/rithomas/data/POP909-Dataset/POP909/394/394.mid]1877
1074
4986
 [/home/rithomas/data/POP909-Dataset/POP909/200/200.mid]1787
1110
4635
 [/home/rithomas/data/POP909-Dataset/POP909/277/277.mid]1524
1240
5507
 [/home/rithomas/data/POP909-Dataset/POP909/299/299.mid]1002
707
3364
 [/home/rithomas/data/POP909-Dataset/POP909/544/544.mid]1666
2402
4841
 [/home/rithomas/data/POP909-Dataset/POP909/533/533.mid]2633
2191
5065
 [/home/rithomas/data/POP909-Dataset/POP909/813/813.mid]1541
1583
4467
 [/home/rithomas/data/POP909-Dataset/POP909/449/449.mid]1802
2326
6007
 [/home/rithomas/data/POP909-Dataset/POP909/864/864.mid]1508
1645
5098
 [/home/rithomas/data/POP909-Dataset/POP909/174/174.mid]1329
1336
4657
 [/home/rithomas/data/POP909-Dataset/POP909/103/103.mid]2055
657
6255
 [/home/rithomas/data/POP909-Dataset/POP909/097/097.mid]1712
1572
4505
 [/home/

 [/home/rithomas/data/POP909-Dataset/POP909/465/465.mid]1792
840
4883
 [/home/rithomas/data/POP909-Dataset/POP909/562/562.mid]1527
1217
4388
 [/home/rithomas/data/POP909-Dataset/POP909/481/481.mid]1851
112
7666
 [/home/rithomas/data/POP909-Dataset/POP909/515/515.mid]1388
1217
3638
 [/home/rithomas/data/POP909-Dataset/POP909/835/835.mid]1470
1619
4262
 [/home/rithomas/data/POP909-Dataset/POP909/418/418.mid]2060
2169
4968
 [/home/rithomas/data/POP909-Dataset/POP909/842/842.mid]1601
2312
6043
 [/home/rithomas/data/POP909-Dataset/POP909/152/152.mid]1102
645
4193
 [/home/rithomas/data/POP909-Dataset/POP909/125/125.mid]1413
1741
5567
 [/home/rithomas/data/POP909-Dataset/POP909/028/028.mid]1902
506
3344
 [/home/rithomas/data/POP909-Dataset/POP909/616/616.mid]1362
743
2701
 [/home/rithomas/data/POP909-Dataset/POP909/782/782.mid]1517
1976
4431
 [/home/rithomas/data/POP909-Dataset/POP909/661/661.mid]1778
2575
5795
 [/home/rithomas/data/POP909-Dataset/POP909/226/226.mid]2087
573
5894
 [/home/rith

 [/home/rithomas/data/POP909-Dataset/POP909/670/670.mid]1866
2047
4802
 [/home/rithomas/data/POP909-Dataset/POP909/039/039.mid]1254
517
4015
 [/home/rithomas/data/POP909-Dataset/POP909/143/143.mid]1555
2049
5605
 [/home/rithomas/data/POP909-Dataset/POP909/134/134.mid]1647
2376
6609
 [/home/rithomas/data/POP909-Dataset/POP909/824/824.mid]2070
1905
5427
 [/home/rithomas/data/POP909-Dataset/POP909/409/409.mid]1534
2112
4284
 [/home/rithomas/data/POP909-Dataset/POP909/853/853.mid]701
829
2725
 [/home/rithomas/data/POP909-Dataset/POP909/573/573.mid]1482
1563
4453
 [/home/rithomas/data/POP909-Dataset/POP909/490/490.mid]1796
1471
4258
 [/home/rithomas/data/POP909-Dataset/POP909/504/504.mid]2130
1841
5446
 [/home/rithomas/data/POP909-Dataset/POP909/403/403.mid]1796
1344
4686
 [/home/rithomas/data/POP909-Dataset/POP909/859/859.mid]2375
3861
5343
 [/home/rithomas/data/POP909-Dataset/POP909/597/597.mid]1900
894
4415
 [/home/rithomas/data/POP909-Dataset/POP909/474/474.mid]2011
2542
3958
 [/home/ri

 [/home/rithomas/data/POP909-Dataset/POP909/112/112.mid]1823
1637
5076
 [/home/rithomas/data/POP909-Dataset/POP909/086/086.mid]2028
1799
6776
 [/home/rithomas/data/POP909-Dataset/POP909/802/802.mid]4156
549
3640
 [/home/rithomas/data/POP909-Dataset/POP909/458/458.mid]2012
2546
11411
 [/home/rithomas/data/POP909-Dataset/POP909/875/875.mid]1564
1683
5611
 [/home/rithomas/data/POP909-Dataset/POP909/555/555.mid]1755
1417
3823
 [/home/rithomas/data/POP909-Dataset/POP909/522/522.mid]1329
1323
4400
 [/home/rithomas/data/POP909-Dataset/POP909/288/288.mid]1409
910
4006
 [/home/rithomas/data/POP909-Dataset/POP909/385/385.mid]1852
2608
4613
 [/home/rithomas/data/POP909-Dataset/POP909/211/211.mid]1840
663
6697
 [/home/rithomas/data/POP909-Dataset/POP909/266/266.mid]1048
1776
4797
 [/home/rithomas/data/POP909-Dataset/POP909/621/621.mid]1676
1005
3774
 [/home/rithomas/data/POP909-Dataset/POP909/656/656.mid]2072
2356
4581
 [/home/rithomas/data/POP909-Dataset/POP909/751/751.mid]1444
1837
4207
 [/home/

 [/home/rithomas/data/POP909-Dataset/POP909/673/673.mid]1396
1805
3810
 [/home/rithomas/data/POP909-Dataset/POP909/604/604.mid]1959
1148
3714
 [/home/rithomas/data/POP909-Dataset/POP909/790/790.mid]1854
1680
4460
 [/home/rithomas/data/POP909-Dataset/POP909/339/339.mid]1709
724
2697
 [/home/rithomas/data/POP909-Dataset/POP909/243/243.mid]1281
635
4156
 [/home/rithomas/data/POP909-Dataset/POP909/234/234.mid]1757
1934
6778
 [/home/rithomas/data/POP909-Dataset/POP909/333/333.mid]1201
941
3131
 [/home/rithomas/data/POP909-Dataset/POP909/344/344.mid]1480
1527
5153
 [/home/rithomas/data/POP909-Dataset/POP909/249/249.mid]1535
1414
6294
 [/home/rithomas/data/POP909-Dataset/POP909/697/697.mid]1638
1517
5069
 [/home/rithomas/data/POP909-Dataset/POP909/703/703.mid]1812
1670
4421
 [/home/rithomas/data/POP909-Dataset/POP909/774/774.mid]2013
1186
4877
 [/home/rithomas/data/POP909-Dataset/POP909/679/679.mid]1595
1828
4311
 [/home/rithomas/data/POP909-Dataset/POP909/868/868.mid]1813
1747
5406
 [/home/r

4732
 [/home/rithomas/data/POP909-Dataset/POP909/898/898.mid]1093
1036
4128
 [/home/rithomas/data/POP909-Dataset/POP909/556/556.mid]1903
1445
4961
 [/home/rithomas/data/POP909-Dataset/POP909/188/188.mid]1882
741
5580
 [/home/rithomas/data/POP909-Dataset/POP909/111/111.mid]1720
1464
5322
 [/home/rithomas/data/POP909-Dataset/POP909/085/085.mid]1505
1573
5582
 [/home/rithomas/data/POP909-Dataset/POP909/166/166.mid]1390
914
3043
 [/home/rithomas/data/POP909-Dataset/POP909/061/061.mid]1539
809
4024
 [/home/rithomas/data/POP909-Dataset/POP909/182/182.mid]2200
904
5660
 [/home/rithomas/data/POP909-Dataset/POP909/016/016.mid]1527
2035
5235
 [/home/rithomas/data/POP909-Dataset/POP909/451/451.mid]2033
1720
6953
 [/home/rithomas/data/POP909-Dataset/POP909/426/426.mid]2334
1365
4879
 [/home/rithomas/data/POP909-Dataset/POP909/892/892.mid]1908
1464
5258
 [/home/rithomas/data/POP909-Dataset/POP909/906/906.mid]1483
1019
3608
 [/home/rithomas/data/POP909-Dataset/POP909/760/760.mid]614
940
3629
 [/home