In [1]:
import os
import numpy as np 
import pandas as pd 
import pickle

import string
from string import digits

import re
from collections import defaultdict

from music21 import *

In [2]:
# with open('../chords_dict.pkl', 'rb') as picklefile:
#     d = pickle.load(picklefile)

In [2]:
def open_midi(midi_path, remove_drums):
    '''
    Read a MIDI file and guess which part carries the melody.

    There is a one-line method to read MIDIs
    but to remove the drums we need to manipulate some
    low level MIDI events.

    Parameters
    ----------
    midi_path : path of the MIDI file to read.
    remove_drums : when truthy, every event on MIDI channel 10 is dropped
        from every track before the file is converted.

    Returns
    -------
    tuple ``(stream, melody_track)`` where ``stream`` is the result of
    ``midi.translate.midiFileToStream`` and ``melody_track`` is the index
    of the part guessed to be the melody (0 when no keyword matches).
    '''

    mf = midi.MidiFile()
    mf.open(midi_path)
    try:
        mf.read()
    finally:
        # Release the file handle even when the file cannot be parsed.
        mf.close()

    if remove_drums:
        for track in mf.tracks:
            # By convention channel 10 is reserved for percussion
            track.events = [ev for ev in track.events if ev.channel != 10]

    # It is hard to identify the melody track. I'll do my best by looking for
    # keywords, but even if I'm confusing the melody with the bass line I'm
    # assuming it is still useful to capture the relationship between that
    # and the harmony of the song.
    keywords = ('lead', 'voice', 'melody', 'karaoke')

    # By convention the first track is the melody
    melody_track = 0
    for index, track in enumerate(mf.tracks):
        if len(track.events) < 2:
            # Nothing to inspect (e.g. a track emptied by the drum filter);
            # indexing events[1] here used to abort the whole file.
            continue
        # NOTE(review): events[1] is presumably the first event after the
        # leading delta-time, which often holds the track name - confirm.
        label = str(track.events[1].data).lower()
        if any(keyword in label for keyword in keywords):
            # The part index is taken to be one less than the track index
            # (presumably because the first track carries no notes - confirm).
            # Clamp at 0 so a match on track 0 does not become -1, which
            # would silently select the last part.
            melody_track = max(index - 1, 0)
            break

    return (midi.translate.midiFileToStream(mf), melody_track)

In [3]:
# Melody vocabulary: each scale degree in natural, flat and sharp form
# (in that order), followed by a final 'rest' token.
notes_unique = [
    accidental + degree
    for degree in ('i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii')
    for accidental in ('', 'b', '#')
] + ['rest']

In [4]:
def get_melo_encoded(df,melody_track):
    '''Return the melody part encoded as one vector per note or rest.

    Parameters
    ----------
    df : music21 stream of the whole song (despite the name, not a DataFrame).
    melody_track : index into ``df.parts`` of the part to encode.

    Returns
    -------
    pandas.DataFrame with columns
      * ``offset`` - ``float(nt.offset)`` of each encoded element,
      * ``input``  - vector of length ``len(notes_unique) + 1``: a one-hot
        over ``notes_unique`` followed by the duration in quarter lengths,
      * ``group``  - ``floor(offset / 4)``.

    Elements that are neither ``note.Note`` nor ``note.Rest`` are skipped.
    ``notes_unique.index`` raises ValueError when the computed figure is not
    in the vocabulary.
    '''
    music_key = df.analyze('key')

    # Slot positions are derived from the vocabulary instead of hard-coded.
    rest_slot = notes_unique.index('rest')
    duration_slot = len(notes_unique)  # one extra place for the note duration

    offsets = []
    encoded = []
    for nt in df.parts[melody_track].notesAndRests:
        if isinstance(nt, note.Note):
            # for some reason Music21 doesn't allow to convert augmented notes
            # into a chord, so I add and remove 'C' and it works
            roman_chord = chord.Chord(nt.pitch.name + " " + "C")
            roman_chord.remove('C')
            figure = roman.romanNumeralFromChord(roman_chord, music_key).figure
            slot = notes_unique.index(figure)
        elif isinstance(nt, note.Rest):
            slot = rest_slot
        else:
            continue

        note_array = np.zeros(len(notes_unique) + 1)
        note_array[slot] = 1
        note_array[duration_slot] = nt.duration.quarterLength
        offsets.append(float(nt.offset))
        encoded.append(note_array)

    melody = pd.DataFrame({'offset': offsets, 'input': encoded})
    # split sequences in groups of 4 offsets
    melody['group'] = melody['offset'].apply(lambda x: np.floor(x/float(4)))
    return melody

In [5]:
def get_chords(df):
    '''Extract the harmony of the song with music21's chordify.

    ``df`` is the song stream. The result is a DataFrame with one row per
    chord found in the chordified stream: its ``offset``, its simplified
    roman-numeral name in ``target``, and ``group`` = floor(offset / 4).
    '''
    estimated_key = df.analyze('key')
    chordified_song = df.chordify()

    # One (offset, label) pair per chord, in stream order.
    pairs = [
        (float(ch.offset), simplify_roman_name(ch, estimated_key))
        for ch in chordified_song.recurse().getElementsByClass('Chord')
    ]

    chordify_df = pd.DataFrame({
        'offset': [offset for offset, _ in pairs],
        'target': [label for _, label in pairs],
    })
    chordify_df['group'] = chordify_df['offset'].apply(
        lambda beat: np.floor(beat / float(4))
    )
    return chordify_df

In [6]:
# Cache of simplified chord names, keyed by (str(chord), str(key)).
d = defaultdict(str)
def simplify_roman_name(thisChord, music_key):
    '''Return a simplified roman-numeral name for ``thisChord`` in ``music_key``.

    Thanks @wfaria for this code! https://www.kaggle.com/wfaria/midi-music-data-extraction-using-music21/notebook
    In this method we try to simplify names, even if it ends in
    a different chord, to reduce the chord vocabulary and reduce the number
    of classes for the decoder model.

    Results are memoised in the module-level dict ``d``. The cache is read
    with ``.get`` so that a miss does not insert an empty placeholder: if the
    music21 analysis raises, ``d`` is left untouched.
    '''
    cache_key = (str(thisChord), str(music_key))
    cached = d.get(cache_key)
    if cached:
        return cached

    roman_numeral = roman.romanNumeralFromChord(thisChord, music_key)

    ret = roman_numeral.romanNumeral
    inversion_name = None
    inversion = roman_numeral.inversion()

    # Checking valid inversions.
    if ((roman_numeral.isTriad() and inversion < 3) or
            (inversion < 4 and
                 (roman_numeral.seventh is not None or roman_numeral.isSeventh()))):
        inversion_name = roman_numeral.inversionName()

    if inversion_name is not None:
        ret = ret + str(inversion_name)
    elif roman_numeral.isDominantSeventh():
        ret = ret + "M7"
    elif roman_numeral.isDiminishedSeventh():
        ret = ret + "o7"

    d[cache_key] = ret
    return ret

In [7]:
def group_measure_encoded(df):
    '''Group melodies and harmonies by 4 offsets.

    Parameters
    ----------
    df : DataFrame with columns ``group_x``, ``input`` (one vector per row)
        and ``target`` (chord name, possibly missing).

    Returns
    -------
    DataFrame with one row per distinct ``group_x`` value, in sorted order:
      * ``group``  - the group key,
      * ``input``  - 2-D array stacking that group's ``input`` vectors,
      * ``target`` - that group's targets joined by single spaces, with
        missing values written as ``"null"``.
    '''
    df = df.fillna("null")

    groups = []
    stacked_inputs = []
    target_texts = []
    for key, frame in df.groupby('group_x'):
        groups.append(key)
        stacked_inputs.append(np.array(list(frame['input'])))
        target_texts.append(' '.join(str(t) for t in frame['target']))

    # Groups usually hold different numbers of notes. Letting np.array() stack
    # them raises on ragged input in recent NumPy and yields a 3-D array
    # (rejected by DataFrame) when all groups are the same size, so fill a
    # 1-D object array element by element instead.
    input_texts = np.empty(len(stacked_inputs), dtype=object)
    for position, block in enumerate(stacked_inputs):
        input_texts[position] = block

    return pd.DataFrame({'group': groups, 'input': input_texts, 'target': target_texts})

In [8]:
def get_melo_chord(df,melody_track):
    '''Build the final per-group table pairing melody with harmony.

    Encodes the melody part, extracts the chords, left-joins the chords onto
    the melody rows by ``offset``, groups the result, and returns it sorted by
    ``group`` with columns renamed to ``melody`` / ``harmony`` and remaining
    missing values filled with ``'rest'``.
    '''
    melody_df = get_melo_encoded(df, melody_track)
    harmony_df = get_chords(df)

    # Left join: every melody event is kept, with or without a chord.
    aligned = pd.merge(melody_df, harmony_df, on='offset', how='left')
    per_group = group_measure_encoded(aligned)

    ordered = per_group.sort_values(by='group')
    renamed = ordered.rename(columns={'input': 'melody', 'target': 'harmony'})
    return renamed.fillna('rest')

In [9]:
# Process up to 20 MIDI files from each selected sub-folder of ../lmd_full.
# `processed_songs` is re-initialised for every folder, so after the loop it
# holds only the results of the last folder processed.
for folder in ['0']:#,'6','5','4','3','2','1','0','9','8']:
    processed_songs = []
    subfolder = os.path.join("../lmd_full", folder)
    # Ignore the macOS Finder metadata file if present.
    midis = [name for name in os.listdir(subfolder) if name != '.DS_Store']
    midis = midis[:20]
    for midi_name in midis:
        path = os.path.join(subfolder, midi_name)
        print('processing file ', midi_name)
        try:
            song, melody_track = open_midi(path,True)
            song_roman = get_melo_chord(song,melody_track)
        except Exception as exc:
            # Best effort: skip files that cannot be parsed or encoded, but
            # report why, and let KeyboardInterrupt/SystemExit propagate.
            print('couldnt process file ', midi_name, '-', repr(exc))
        else:
            processed_songs.append(song_roman)
#     with open('processed_songs_{}.pkl'.format(folder), 'wb') as picklefile:
#         pickle.dump(processed_songs, picklefile)
        


processing file  04823afe63438741d70fc9c6083f522a.mid
processing file  08bb1300476659f0b318896670a89b39.mid
processing file  0cbb4ec645d5cc60e14e27b51e1660dd.mid
processing file  096bc34fbe50e58c2ac36a1e8c2fc752.mid
processing file  03f3e2c02f0f61e8142fd1049bd6dd5d.mid
processing file  0a12b5fbdd27f8be2e4412013cd7c8be.mid
processing file  0c0c3ba74a4977fc635b2673ef3db7c9.mid
processing file  0ac68ec6594fc26e383c6fb4590a48d0.mid
processing file  0b63dcdbc167f1a4618d001080091ffc.mid
processing file  0e0e734424ceda5dfe4d6de0b20d49b4.mid
processing file  0d6d0540fc4e1178ae11b4562894e48e.mid
processing file  0ed68d986ba7a29fab2c765b2baced35.mid
processing file  03b0a3a4225538c792fb72db8c22f7ad.mid
couldnt process file  03b0a3a4225538c792fb72db8c22f7ad.mid
processing file  09b64e761bb341dccaaf1e9ecd172068.mid
processing file  096feb77e464ec03aa519165500ccb56.mid
processing file  07a34ec6484183f2b299fba6878eb881.mid
processing file  0b4b476ae6dfd73205b104e4cb9e79e0.mid
processing file  0d1b56

In [15]:
# Persist the list of per-song DataFrames built by the processing loop.
# Because `processed_songs` is reset at the start of each folder iteration,
# this file contains only the songs of the last folder processed.
with open('processed_songs.pkl', 'wb') as picklefile:
    pickle.dump(processed_songs, picklefile)

In [14]:
# Persist `d`, the (chord, key) -> simplified-name cache filled by
# simplify_roman_name, so it can be reloaded instead of recomputed.
with open('dict.pkl', 'wb') as picklefile:
    pickle.dump(d, picklefile)