# Extracting Time Series Data from MIDI Files

Let's start by loading all the file names and their channel mappings.

In [1]:
import os
import midi
import pickle

In [2]:
# Pickle data is a byte stream, so the file is opened in binary mode
# ('rb'); text mode can corrupt it on some platforms and fails on Python 3.
# NOTE(review): pickle.load can execute arbitrary code -- only load a
# channel_mappings.pkl that was produced by a trusted source.
with open('channel_mappings.pkl', 'rb') as f:
    channel_mappings = pickle.load(f)

In [3]:
# Directory containing the MIDI files to process (trailing slash required:
# file names are appended to this string by plain concatenation).
midi_path = './midi/pop/'
# NOTE(review): os.listdir returns entries in arbitrary order, yet files are
# paired with channel_mappings by list index later on -- presumably the
# mappings were built from the same listing order; confirm.
midi_files = os.listdir(midi_path)

In [4]:
# Files and mappings are paired by index, so their counts must agree.
assert len(midi_files) == len(channel_mappings), (
    "found %d MIDI files but %d channel mappings"
    % (len(midi_files), len(channel_mappings)))

Before we begin, we'll want to denote which instruments actually play melodies (basically anything that isn't a toneless percussion instrument, sound effect, or "pad" instrument). See the chart in [this](./flattening_tracks.ipynb) notebook for reference. If a given channel uses a "non-melody instrument", we will only extract the rhythmic information from that channel.

In [5]:
# Instrument numbers treated as melodic: 0-87 and 104-111 (presumably
# zero-based General MIDI program numbers -- see the chart in the
# flattening_tracks notebook). Built from explicit lists so the
# concatenation works whether range() returns a list or a lazy range object.
melody_instruments = list(range(88)) + list(range(104, 112))

Now we'll build new object types to contain the time series objects.

In [6]:
class EventSequence(object):
    """Base container that keeps a running count of events.

    Meant to be subclassed (MelodySequence, RhythmSequence) rather than
    used on its own.
    """

    def __init__(self, num_events=0, resolution=240):
        """Store the starting event count and the tick resolution.

        num_events -- initial number of events (default 0)
        resolution -- ticks per beat (default 240)
        """
        self.resolution = resolution
        self.num_events = num_events

    def add_event(self):
        """Record one more event by incrementing the counter."""
        self.num_events = self.num_events + 1
        
class MelodySequence(EventSequence):
    """Event sequence whose events are notes or chords.

    ``notes`` holds ``(value, tick)`` tuples. ``value`` is a single note
    value, or a list of distinct note values once several notes arriving
    back-to-back share the same tick.
    """

    def __init__(self, num_events=0, resolution=240):
        super(MelodySequence, self).__init__(num_events, resolution)
        self.notes = []

    def add_note(self, note):
        """Add a ``(value, tick)`` note.

        A note whose tick equals that of the most recent entry is folded
        into that entry as part of a chord; otherwise it starts a new
        event.
        """
        starts_new_event = not self.notes or note[1] != self.notes[-1][1]
        if starts_new_event:
            super(MelodySequence, self).add_event()
            self.notes.append(note)
            return
        self.add_note_to_chord(note)

    def add_note_to_chord(self, note):
        """Merge ``note`` into the most recent entry, dropping duplicates."""
        last_entry = self.notes[-1]
        current_value = last_entry[0]
        if isinstance(current_value, int):
            # Last entry is still a single note: promote it to a chord.
            merged = set([current_value, note[0]])
        else:
            # Last entry is already a chord (list): extend it.
            merged = set(current_value + [note[0]])
        self.notes[-1] = (list(merged), last_entry[1])
        
class RhythmSequence(EventSequence):
    """A container for a rhythmic sequence.
    Inherits from EventSequence.

    ``ticks`` holds the tick of each event, with immediate repeats
    collapsed into one entry.
    """

    def __init__(self, num_events=0, resolution=240):
        super(RhythmSequence, self).__init__(num_events, resolution)
        self.ticks = []

    def add_tick(self, tick):
        """Append ``tick`` unless it equals the most recently added tick."""
        if self.ticks and tick == self.ticks[-1]:
            return
        super(RhythmSequence, self).add_event()
        self.ticks.append(tick)

    def normalize(self):
        """Convert every tick to beats by dividing by ``self.resolution``.

        Builds a real list (not a lazy ``map`` object) so that ``ticks``
        still supports ``len()`` and indexing on any Python version.
        """
        self.ticks = [float(tick) / self.resolution for tick in self.ticks]

A couple of helper functions to actually extract the sequence of information from the given channel in the given MIDI file.

In [7]:
def get_melody_sequence(melody, mfile, channel):
    """Pull out the melodies from the given channel in mfile.

    Inputs: melody is an empty MelodySequence object
            mfile is a midi.containers.Pattern object; only its first
            track (mfile[0]) is scanned, so it should be a single-track
            pattern
            channel is an integer channel number from 0-15 (shouldn't be 9)

    Output: the same MelodySequence object, whose ``notes`` attribute is
            replaced by a tick-ordered list of note values (an int for a
            single note, a list for a chord)

    Side effect: mfile is switched to absolute ticks.
    """

    mfile.make_ticks_abs()
    for event in mfile[0]:
        # NoteOn events whose second data byte is 0 are skipped
        # (presumably velocity 0, i.e. a note-off in disguise).
        if (isinstance(event, midi.events.NoteOnEvent) and
                event.channel == channel and
                event.data[1] != 0):
            melody.add_note((event.data[0], event.tick))
    ordered = sorted(melody.notes, key=lambda note: note[1])
    # Keep only the note values. A comprehension handles the empty case
    # and avoids subscripting zip(), which is not possible on Python 3.
    melody.notes = [note[0] for note in ordered]
    return melody

def get_rhythm_sequence(rhythm, mfile, channel):
    """Collect the note-onset times of one channel as a rhythm.

    Inputs: rhythm is an empty RhythmSequence object
            mfile is a midi.containers.Pattern object (only its first
            track is scanned)
            channel is an integer channel number from 0-15

    Output: the same RhythmSequence, with its ticks sorted and
            normalized via rhythm.normalize()
    """

    mfile.make_ticks_abs()
    for event in mfile[0]:
        if not isinstance(event, midi.events.NoteOnEvent):
            continue
        # Skip other channels and NoteOn events with a zero second data byte.
        if event.channel != channel or event.data[1] == 0:
            continue
        rhythm.add_tick(event.tick)
    ordered_ticks = sorted(rhythm.ticks)
    rhythm.ticks = ordered_ticks
    rhythm.normalize()
    return rhythm

The main function below will loop through all of the assigned channels, create the relevant time series objects, and call the two helper functions above to get a rhythm and/or melody sequence for each instrument in the song.

In [8]:
def get_sequences(mfile, mapping):
    """Extract a list of melody sequences and rhythm sequences from mfile.

    Inputs: mfile is a midi.containers.Pattern object; the helper
            functions read only its first track, so pass a single-track
            pattern
            mapping is a dictionary associating each channel number with
            the collection of instrument numbers used on that channel,
            or None if the channel is unused

    Output: (melodies, rhythms), two parallel lists with one entry per
            used channel. A melody entry is the channel's list of notes,
            or [] when the channel is not melodic; a rhythm entry is the
            channel's list of normalized ticks.
    """
    melodies = []
    rhythms = []
    res = mfile.resolution
    for channel in mapping:
        instruments = mapping[channel]
        if instruments is None:         # skip channels that aren't used
            continue

        # A single non-melodic instrument marks the whole channel as
        # non-melodic.
        melodic_channel = all(instrument in melody_instruments
                              for instrument in instruments)

        # Channel index 9 (MIDI channel 10) is reserved for percussion,
        # so it never contributes a melody.
        if melodic_channel and channel != 9:
            melody = MelodySequence(resolution=res)
            melodies.append(get_melody_sequence(melody, mfile, channel).notes)
        else:
            melodies.append([])

        # Rhythm is always captured, melodic or not.
        rhythm = RhythmSequence(resolution=res)
        rhythms.append(get_rhythm_sequence(rhythm, mfile, channel).ticks)

    return melodies, rhythms

The following helper function takes any MIDI file and creates an equivalent single track (i.e. MIDI format 0) file.

In [9]:
def make_one_track(mfile):
    """Merge every track of mfile into one tick-ordered track.

    Input:  mfile is a midi.containers.Pattern object; it is switched to
            absolute ticks as a side effect
    Output: a new format-0 midi.containers.Pattern with the same
            resolution, holding a single track of all events sorted by
            absolute tick
    """
    mfile.make_ticks_abs()
    merged_events = [event for track in mfile for event in track]
    merged_events.sort(key=lambda event: event.tick)

    master_track = midi.containers.Track(events=merged_events,
                                         tick_relative=False)
    return midi.containers.Pattern(tracks=[master_track],
                                   resolution=mfile.resolution,
                                   format=0,
                                   tick_relative=False)

We'll do a test run on the first two files in the directory.

In [10]:
# First file: read, flatten to a single track, then extract sequences.
mfile1 = midi.read_midifile(midi_path + midi_files[0])
flat_file1 = make_one_track(mfile1)
mapping1 = channel_mappings[0]
melodies1, rhythms1 = get_sequences(flat_file1, mapping1)

# Second file: same steps.
mfile2 = midi.read_midifile(midi_path + midi_files[1])
flat_file2 = make_one_track(mfile2)
mapping2 = channel_mappings[1]
melodies2, rhythms2 = get_sequences(flat_file2, mapping2)

In [11]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(melodies1[0])

[72, 73, 75, 68, 73, 76, 68, 73, 68, 71, 76, 68, 71, 68, 70, 76, 68, 70, 70, 73, 78, 70, 73, 65, 58, 60, 62]


In [12]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(rhythms1[0])

[3.0, 4.0, 5.0, 5.716666666666667, 5.866666666666666, 6.066666666666666, 6.966666666666667, 8.008333333333333, 8.966666666666667, 8.991666666666667, 9.008333333333333, 10.0, 11.0, 11.966666666666667, 11.983333333333333, 12.0, 13.0, 14.0, 14.683333333333334, 14.833333333333334, 15.025, 16.0, 17.0, 171.0, 171.95, 173.0, 174.00833333333333]


In [13]:
# One melody entry per rhythm entry...
assert len(melodies1) == len(rhythms1)
# ...and every non-empty melody is as long as its rhythm.
for channel_melody, channel_rhythm in zip(melodies1, rhythms1):
    assert (len(channel_melody) == 0 or
            len(channel_melody) == len(channel_rhythm))

In [14]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(melodies2[0])

[[65, 58], [65, 58], [65, 58], [65, 60], [65, 60], [65, 60], [65, 58], [65, 58], [65, 58], [65, 60], [65, 60], [65, 60], [65, 58], [65, 58], [65, 58], [65, 57], [65, 57], [65, 57], [62, 55], [62, 55], [62, 55], [65, 57], [57, 69, 62], [64, 57, 69], 65, 62, [65, 70], [70, 63], [65, 70], 58, [69, 62], [64, 69], [69, 62], [67, 60], [67, 60], [72, 67], [72, 65], [72, 65], [65, 60], [65, 60], [65, 60], [65, 60], [65, 58], [65, 60], [65, 58], [74, 69], [74, 69], [74, 69], [70, 63], [65, 70], [70, 63], [65, 57], [57, 69, 62], [64, 57, 69], 65, 62, [65, 70], [70, 63], [65, 70], 58, [67, 60], [67, 60], [67, 60], [62, 70], [62, 70], [65, 62], 58, [65, 60], [65, 60], [65, 60], [58, 62], [57, 62], [62, 55], [58, 53], [58, 55], [58, 53], [65, 58], 60, 62, 58, 62, 62, 63, 65, 65, 60, 60, 62, 63, [67, 63], [67, 62], [67, 63], [67, 62], 62, [67, 63], [72, 63], 63, [70, 63], [70, 63], [70, 63], [65, 58], 60, 62, 58, 62, 62, 63, 65, 65, 60, 60, 62, 63, [67, 63], [67, 62], [67, 63], [67, 62], 62, [67, 63

In [15]:
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(rhythms2[0])

[0.0, 1.5, 3.0, 4.0, 5.5, 7.0, 8.0, 9.5, 11.0, 12.0, 13.5, 15.0, 16.0, 17.5, 19.0, 20.0, 21.5, 23.0, 24.0, 25.5, 27.0, 28.0, 29.5, 31.0, 31.25, 31.5, 32.0, 33.5, 35.0, 35.5, 36.0, 37.5, 39.0, 40.0, 41.5, 43.25, 44.0, 45.5, 47.25, 48.0, 50.0, 51.0, 52.0, 53.5, 55.0, 56.0, 57.5, 59.0, 60.0, 61.5, 63.0, 64.0, 65.5, 67.0, 67.25, 67.5, 68.0, 69.5, 71.0, 71.5, 72.0, 73.5, 75.0, 76.0, 77.5, 79.0, 79.5, 80.0, 82.0, 83.0, 84.0, 85.5, 87.0, 88.0, 89.5, 91.0, 92.0, 96.0, 96.5, 96.75, 97.5, 98.0, 98.5, 99.25, 100.0, 101.5, 102.0, 102.5, 103.25, 104.0, 105.0, 106.0, 106.5, 106.75, 107.25, 108.0, 108.5, 109.0, 110.0, 110.5, 111.25, 112.0, 112.5, 112.75, 113.5, 114.0, 114.5, 115.25, 116.0, 117.5, 118.0, 118.5, 119.25, 120.0, 121.0, 122.0, 122.5, 122.75, 123.25, 124.0, 124.5, 125.0, 126.0, 126.5, 127.25, 128.0, 129.5, 131.0, 132.0, 133.5, 135.0, 136.0, 137.5, 139.0, 140.0, 141.5, 143.0, 143.5, 143.75, 144.0, 145.5, 147.0, 147.5, 148.0, 149.5, 151.0, 152.0, 153.5, 155.25, 156.0, 157.5, 159.25, 160.0, 1

In [16]:
# One melody entry per rhythm entry...
assert len(melodies2) == len(rhythms2)
# ...and every non-empty melody is as long as its rhythm.
for channel_melody, channel_rhythm in zip(melodies2, rhythms2):
    assert (len(channel_melody) == 0 or
            len(channel_melody) == len(channel_rhythm))

At this point, simultaneous notes are grouped into chords (lists of distinct notes rather than repeated entries), the rhythmic ticks are all unique and normalized to units of "beats", and each melody list has the same length as its rhythm list (unless the melody list is empty). It looks like we're ready to scrape all the sequences!

## Getting ALL the sequences

In [17]:
all_melodies = []
all_rhythms = []

# Best-effort scrape: a file that fails to parse is reported and skipped.
for i in range(len(midi_files)):
    try:
        mfile = midi.read_midifile(midi_path + midi_files[i])
        flat_file = make_one_track(mfile)
        mapping = channel_mappings[i]
        melodies, rhythms = get_sequences(flat_file, mapping)
    except Exception as err:
        # Catch Exception rather than everything, so Ctrl-C still stops the
        # run, and report the cause instead of hiding it.
        print("Error at " + midi_files[i] + ": " + repr(err))
    else:
        # Extend both lists only after a full success so they stay parallel.
        all_melodies += melodies
        all_rhythms += rhythms



In [18]:
# Number of per-channel melody entries collected (empty entries included).
len(all_melodies)

68978

In [19]:
# Number of per-channel rhythm entries collected.
len(all_rhythms)

68978

In [20]:
# Pickle output is a byte stream, so both files are opened in binary mode
# ('wb'); text mode can corrupt the data on some platforms and fails on
# Python 3.
with open('all_melodies.pkl', 'wb') as f:
    pickle.dump(all_melodies, f)
with open('all_rhythms.pkl', 'wb') as g:
    pickle.dump(all_rhythms, g)

In [22]:
# The two lists must be parallel; check it explicitly rather than relying
# on the index loop, which would silently ignore surplus melody entries.
assert len(all_melodies) == len(all_rhythms)
# Every non-empty melody must be as long as its rhythm.
for channel_melody, channel_rhythm in zip(all_melodies, all_rhythms):
    assert (len(channel_melody) == 0 or
            len(channel_rhythm) == len(channel_melody))

In [23]:
# Count the non-empty melody entries. Summing over a generator works on any
# Python version, whereas len(filter(...)) fails when filter() is lazy.
sum(1 for melody in all_melodies if len(melody) > 0)

56721

In [24]:
# Rhythm-only entries = all entries minus those with a non-empty melody.
# Computed from the data instead of numbers copied from earlier outputs.
len(all_rhythms) - sum(1 for melody in all_melodies if len(melody) > 0)

12257

So we have 56,721 different melody tracks paired with rhythm tracks, and an additional 12,257 purely rhythmic tracks.