In [1]:
import json
import os
from collections import Counter
from glob import glob
from time import time

import IPython.display
import matplotlib.pyplot as plt
import numpy as np
import pretty_midi

from utils.common_utils import *

# Notebook extensions.
# blackcellmagic: third-party IPython extension (presumably provides the
# `%%black` cell formatter -- confirm it is installed in the kernel env).
%load_ext blackcellmagic
# autoreload mode 2: reload imported modules before each cell executes, so
# edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Global matplotlib style for every figure drawn in this notebook.
# NOTE(review): the bare 'seaborn' style name is deprecated in newer
# Matplotlib releases in favour of 'seaborn-v0_8' -- confirm against the
# Matplotlib version pinned for this project.
plt.style.use('seaborn')

#### Load Melody Track List

In [2]:
# Read the melody-track annotations from disk.
with open('./melody_track_dict.json', 'r') as f:
    melody_track_dict = json.load(f)

# Re-key the annotations by bare file name, i.e. the part of each original
# key that follows its last '/'. If two keys share a file name, the entry
# that comes later in the JSON wins.
new_melody_track_dict = {
    track_path.rsplit('/', 1)[-1]: track_info
    for track_path, track_info in melody_track_dict.items()
}

print('The number of melody_track_dict :', len(new_melody_track_dict))

The number of melody_track_dict : 63968


In [None]:
# Preprocess the Lakh MIDI files into fixed-size phrase arrays.
#
# For every MIDI file under `data_path/<folder>/`:
#   1. keep it only if `check_time_sign` accepts it,
#   2. look up its melody track in `new_melody_track_dict` (None if absent),
#   3. reduce its instruments with `unify_tracks` and require at least two,
#   4. convert it to a pianoroll and cut it into windows with `get_window`,
#   5. save every window that passes the emptiness checks as
#      `<output_path>/{melody|no_melody}/phrase_<n>.npy`.
#
# Side effects: creates the output directories, writes .npy files, and leaves
# `pianoroll` (last processed file), the counters, the *_meta dicts and
# `failed_files` in the notebook namespace for later cells.

# ---- configuration ---------------------------------------------------------
data_path = '/workspace/music/data/lakh/'
# NOTE: the directory name encodes PHRASE_RES / PHRASE_BARS; keep them in sync.
output_path = '/workspace/music/data/lakh_melody2music_res_4_bar_4/'

PHRASE_RES = 4          # `res` passed to get_pianoroll / get_window / check_empty_bar
PHRASE_BARS = 4         # `bar` passed to get_window
EMPTY_BAR_THRES = 0.01  # `thres` passed to check_empty_bar
MIN_MELODY_SUM = 24     # minimum sum over the melody channel for a phrase to be kept
LOG_EVERY = 100000      # print progress every LOG_EVERY saved phrases

melody_dir = output_path + 'melody/'
no_melody_dir = output_path + 'no_melody/'

# Create the output folders up front. Without this every open() below would
# fail and (being inside the per-file try block) no phrase would be written.
for save_dir in (melody_dir, no_melody_dir):
    os.makedirs(save_dir, exist_ok=True)

# ---- bookkeeping -----------------------------------------------------------
phrase_count = 0      # phrases written so far; also used to number the files
melody_count = 0      # accepted-time-signature files with a known melody track
no_melody_count = 0   # accepted-time-signature files without one

melody_meta = {'tracks': [], 'num_tracks': []}
no_melody_meta = {'tracks': [], 'num_tracks': []}

# (file_path, repr(error)) for every file that raised while being processed.
failed_files = []

# ---- main loop -------------------------------------------------------------
start_time = time()
# sorted(): glob order is OS-dependent; sorting makes phrase numbering reproducible.
for folder_path in sorted(glob(data_path + '*')):
    for file_path in sorted(glob(folder_path + '/*')):
        try:
            pm = pretty_midi.PrettyMIDI(file_path)
            file_name = file_path.split('/')[-1]

            # check whether 4/4 time sign
            if not check_time_sign(pm):
                continue

            # check melody track
            if file_name in new_melody_track_dict:
                melody_track = new_melody_track_dict[file_name]['melody']
                melody_count += 1
            else:
                melody_track = None
                no_melody_count += 1
            has_melody = melody_track is not None

            # keep tracks for only melody, piano, bass, guitar, drum, string
            pm = unify_tracks(pm, melody_track)

            # check the number of inst
            if len(pm.instruments) < 2:
                continue

            # keep meta info
            inst_list = [inst.name for inst in pm.instruments]
            meta = melody_meta if has_melody else no_melody_meta
            meta['tracks'].append(inst_list)
            meta['num_tracks'].append(len(pm.instruments))

            # convert to pianoroll object
            pianoroll, beat_start, event_time = get_pianoroll(pm, res=PHRASE_RES)

            # get phrase collection from pianoroll
            window_pianoroll = get_window(pianoroll, res=PHRASE_RES, bar=PHRASE_BARS)

            # constraint & save
            for i in range(window_pianoroll.shape[0]):
                phrase = window_pianoroll[i]

                # empty check
                if check_empty_bar(phrase, thres=EMPTY_BAR_THRES, res=PHRASE_RES):
                    continue

                # melody empty check (only for files with a known melody track)
                if has_melody and np.sum(phrase[:, :, MELODY_REL_NUM]) < MIN_MELODY_SUM:
                    continue

                save_file_name = 'phrase_' + str(phrase_count) + '.npy'
                save_path = (melody_dir if has_melody else no_melody_dir) + save_file_name

                # save file
                with open(save_path, 'wb') as f:
                    np.save(f, phrase)

                phrase_count += 1
                if phrase_count % LOG_EVERY == 0:
                    print('I am on %d (%0.2f sec) - melody files: %d, no_melody files: %d' % (phrase_count, time()-start_time, melody_count, no_melody_count))
                    # restart the clock so the next report covers only the next batch
                    start_time = time()
        except Exception as err:
            # Best effort: a file that fails anywhere above is skipped, but it
            # is recorded instead of silently dropped. Catching Exception
            # (not a bare except) keeps Kernel > Interrupt working.
            failed_files.append((file_path, repr(err)))
            continue

print('Done - phrases saved: %d, files skipped due to errors: %d' % (phrase_count, len(failed_files)))

#### Preprocess Lakh Dataset

In [None]:
# Visualize the beginning of `pianoroll`, i.e. the pianoroll of the last file
# the preprocessing loop converted (the name is left over from that cell).
preview_len = 256  # number of leading entries along the first axis to draw
plot_pianoroll(
    pianoroll[:preview_len],
    save_path='',
    res=4,
    SIZE=[10, 10],
    CHAR_FONT_SIZE=17,
    NUM_FONT_SIZE=13,
    LABEL_PAD=13,
)