# Extracting Time Series Data from MIDI Files

Let's start by loading all the file names and their channel mappings.

In [2]:
import os
import midi
import pickle

In [3]:
# Pickle streams are binary data, so the file is opened in 'rb' mode; text
# mode can corrupt the stream on some platforms and fails outright on Python 3.
# NOTE: pickle.load can execute arbitrary code -- only load a mappings file
# that you produced yourself or otherwise trust.
with open('channel_mappings.pkl', 'rb') as f:
    channel_mappings = pickle.load(f)

In [4]:
# Directory holding the MIDI files; the trailing slash matters because file
# names are appended to this string directly further down.
midi_path = './midi/pop/'
# NOTE(review): os.listdir returns entries in arbitrary order, and this list
# is later paired with channel_mappings by position -- presumably the pickle
# was built from the same listing; confirm the two stay aligned.
midi_files = os.listdir(midi_path)

In [5]:
# Sanity check: files and mappings are matched up by index later on, so
# there must be exactly one channel mapping per MIDI file.
assert len(midi_files) == len(channel_mappings)

Before we begin, we'll want to denote which instruments actually play melodies (basically anything that isn't a toneless percussion instrument, sound effect, or "pad" instrument). See the chart in [this](./flattening_tracks.ipynb) notebook for reference. If a given channel uses a "non-melody instrument", we will only extract the rhythmic information from that channel.

In [6]:
# Instrument numbers treated as melodic: 0-87 and 104-111 (presumably
# 0-indexed General MIDI program numbers -- confirm against the chart in the
# flattening_tracks notebook). Everything else contributes rhythm only.
# list(...) keeps this a plain list on both Python 2 and Python 3, where
# range objects cannot be concatenated with +.
melody_instruments = list(range(88)) + list(range(104, 112))

Now we'll build new object types to contain the time series objects.

In [117]:
class EventSequence(object):
    """Base container that keeps a running count of events.

    Meant to be specialised by MelodySequence or RhythmSequence rather
    than used on its own; nothing here enforces that.
    """

    def __init__(self, num_events=0, resolution=240):
        """Record the starting event count and the tick resolution.

        num_events: number of events already in the sequence (default 0).
        resolution: ticks per beat (default 240).
        """
        self.resolution = resolution
        self.num_events = num_events

    def add_event(self):
        """Bump the event count by one."""
        self.num_events = self.num_events + 1
        
class MelodySequence(EventSequence):
    """Ordered list of (pitch, tick) notes, built on EventSequence.

    A note whose tick equals the tick of the most recently stored entry
    is merged into that entry, turning its pitch slot into a list of
    pitches (a chord). Only the last entry is compared, so notes are
    expected to arrive in order.
    """

    # NOTE(review): the 500000 default differs from EventSequence's
    # default of 240 ticks per beat -- confirm it is intentional.
    def __init__(self, num_events=0, resolution=500000):
        """Start with an empty note list.

        num_events: initial event count forwarded to EventSequence.
        resolution: resolution value forwarded to EventSequence.
        """
        super(MelodySequence, self).__init__(num_events, resolution)
        self.notes = []

    def add_note(self, note):
        """Store note, a (pitch, tick) pair, or merge it into a chord.

        When the last stored entry has the same tick, the pitch joins
        that entry and the event count is unchanged; otherwise the note
        is appended and counted as a new event.
        """
        shares_last_tick = bool(self.notes) and note[1] == self.notes[-1][1]
        if shares_last_tick:
            self.add_note_to_chord(note)
            return
        super(MelodySequence, self).add_event()
        self.notes.append(note)

    def add_note_to_chord(self, note):
        """Fold the pitch of note into the most recent entry.

        The last entry keeps its tick; its pitch slot becomes (or is
        extended as) a list of pitches. Requires at least one stored
        entry.
        """
        last = self.notes[-1]
        pitches, tick = last[0], last[1]
        if isinstance(pitches, int):
            # A lone pitch becomes a two-note chord.
            chord = [pitches, note[0]]
        else:
            # Already a chord: build a new list with the extra pitch.
            chord = pitches + [note[0]]
        self.notes[-1] = (chord, tick)
        
class RhythmSequence(EventSequence):
    """Ordered list of event ticks, built on EventSequence.

    A tick equal to the most recently stored tick is dropped, so
    simultaneous events added back-to-back count once.
    """

    # NOTE(review): the 500000 default differs from EventSequence's
    # default of 240 ticks per beat -- confirm it is intentional.
    def __init__(self, num_events=0, resolution=500000):
        """Start with an empty tick list.

        num_events: initial event count forwarded to EventSequence.
        resolution: resolution value forwarded to EventSequence.
        """
        super(RhythmSequence, self).__init__(num_events, resolution)
        self.ticks = []

    def add_tick(self, tick):
        """Append tick as a new event unless it repeats the last tick."""
        repeats_last = bool(self.ticks) and tick == self.ticks[-1]
        if repeats_last:
            return
        super(RhythmSequence, self).add_event()
        self.ticks.append(tick)

A couple of helper functions to actually extract the sequence of information from the given channel in the given MIDI file.

In [118]:
def get_rhythm_sequence(rhythm, mfile, channel):
    """Collect the note-on ticks of one channel into rhythm.

    Inputs: rhythm is an empty RhythmSequence object
            mfile is a midi.containers.Pattern object
            channel is an integer channel number from 0-15

    Output: the same RhythmSequence, with its ticks list sorted ascending

    mfile.make_ticks_abs() is called first, so mfile itself is modified.
    Repeated ticks are only dropped when they arrive back-to-back, so
    simultaneous events coming from different tracks can still leave
    duplicates after sorting.
    """

    mfile.make_ticks_abs()
    note_ons = (
        event
        for track in mfile
        for event in track
        # data[1] is the velocity in python-midi; a NoteOn with velocity 0
        # conventionally means "note off", so it is skipped.
        if isinstance(event, midi.events.NoteOnEvent)
        and event.channel == channel
        and event.data[1] != 0
    )
    for event in note_ons:
        rhythm.add_tick(event.tick)

    # Tracks are walked one after another, so ticks are not globally ordered.
    ordered = sorted(rhythm.ticks)
    rhythm.ticks = ordered
    return rhythm

def get_melody_sequence(melody, mfile, channel):
    """Pull out the melodies from the given channel in mfile.

    Inputs: melody is an empty MelodySequence object
            mfile is a midi.containers.Pattern object
            channel is an integer channel number from 0-15 (shouldn't be 9)

    Output: the same MelodySequence object. On return, melody.notes is a
            plain list of pitches ordered by tick; an entry is a list of
            pitches where notes were merged into a chord. The list is
            empty when the channel has no note-on events.

    mfile.make_ticks_abs() is called first, so mfile itself is modified.
    """

    mfile.make_ticks_abs()
    for track in mfile:
        for event in track:
            # data[0] is the pitch and data[1] the velocity in python-midi;
            # a NoteOn with velocity 0 conventionally means "note off".
            if (isinstance(event, midi.events.NoteOnEvent)
                    and event.channel == channel
                    and event.data[1] != 0):
                melody.add_note((event.data[0], event.tick))

    # Tracks are walked one after another, so order the notes by tick.
    # sorted() is stable, so entries with equal ticks keep insertion order.
    ordered = sorted(melody.notes, key=lambda note: note[1])
    # Keep only the pitch part. A comprehension (rather than indexing the
    # result of zip) also handles a channel with no notes and works on both
    # Python 2 and Python 3.
    melody.notes = [note[0] for note in ordered]
    return melody

The main function below will loop through all of the assigned channels, create the relevant time series objects, and call the two helper functions above to get a rhythm and/or melody sequence for each instrument in the song.

In [126]:
def get_sequences(mfile, mapping):
    """Extract a list of melody sequences and rhythm sequences from mfile.

    Inputs: mfile is a midi.containers.Pattern object
            mapping is a dictionary associating each channel number with
            an iterable of the instrument numbers used on that channel,
            or with None when the channel should be skipped

    Output: a (melodies, rhythms) pair of equally long lists with one
            entry per non-skipped channel, in the iteration order of
            mapping. melodies[i] is the list of pitches for that channel,
            or [] when the channel is not melodic; rhythms[i] is its
            sorted list of ticks.

    A channel is treated as melodic only if every one of its instruments
    is in melody_instruments and it is not channel 9 (conventionally the
    General MIDI percussion channel).
    """
    melodies = []
    rhythms = []
    res = mfile.resolution
    for channel in mapping:
        instruments = mapping[channel]
        if instruments is None:
            continue

        melodic_channel = all(
            instrument in melody_instruments for instrument in instruments
        ) and channel != 9

        if melodic_channel:
            melody = MelodySequence(resolution=res)
            melodies.append(get_melody_sequence(melody, mfile, channel).notes)
        else:
            # Placeholder keeps melodies aligned index-for-index with rhythms.
            melodies.append([])

        rhythm = RhythmSequence(resolution=res)
        rhythms.append(get_rhythm_sequence(rhythm, mfile, channel).ticks)

    return melodies, rhythms

We'll do a test run on the first two files in the directory.

In [127]:
# Read the first two MIDI files in the directory listing.
mfile1 = midi.read_midifile(os.path.join(midi_path, midi_files[0]))
mfile2 = midi.read_midifile(os.path.join(midi_path, midi_files[1]))

# Channel mappings are matched to the files by position.
mapping1, mapping2 = channel_mappings[0], channel_mappings[1]

# Extract per-channel melody and rhythm sequences from each file.
melodies1, rhythms1 = get_sequences(mfile1, mapping1)
melodies2, rhythms2 = get_sequences(mfile2, mapping2)

In [128]:
# Pitches of the first extracted channel in the first file. The parenthesised
# form prints the same on Python 2 and also runs on Python 3.
print(melodies1[0])

[72, 73, 75, 68, 73, 76, 68, 73, 68, 71, 76, 68, 71, 68, 70, 76, 68, 70, 70, 73, 78, 70, 73, 65, 58, 60, 62]


In [129]:
# Ticks of the first extracted channel in the first file. The parenthesised
# form prints the same on Python 2 and also runs on Python 3.
print(rhythms1[0])

[360, 480, 600, 686, 704, 728, 836, 961, 1076, 1079, 1081, 1200, 1320, 1436, 1438, 1440, 1560, 1680, 1762, 1780, 1803, 1920, 2040, 20520, 20634, 20760, 20881]


Note that **either** there are an equal number of note events in corresponding melody and rhythm sequences **or** no melody information was recorded: 

In [134]:
# Every channel either has one melody entry per rhythm tick or no melody at all.
for i, notes in enumerate(melodies1):
    assert len(notes) in (0, len(rhythms1[i]))

In [135]:
# Pitches (and chords, shown as nested lists) of the first extracted channel
# in the second file. The parenthesised form prints the same on Python 2 and
# also runs on Python 3.
print(melodies2[0])

[[65, 58], [65, 58], [65, 58], [58, 65], [65, 60], [65, 60], [60, 65], [65, 58], [65, 58], [58, 65], [65, 60], [65, 60], [60, 65], [65, 58], [65, 58], [58, 65], [65, 57], [65, 57], [65, 57], [62, 55], [62, 55], [55, 62], [65, 57], [62, 69, 57], [69, 64, 57], 65, 62, [70, 65], [70, 63], [70, 65], 58, [69, 62], [69, 64], [69, 62], [67, 60], [67, 60], [67, 72], [72, 65], [72, 65], [65, 60], [65, 60], [65, 60], [65, 60], [65, 58], [65, 60], [58, 65], [69, 74], [69, 74], [69, 74], [70, 63], [70, 65], [70, 63], [65, 57], [62, 69, 57], [69, 64, 57], 65, 62, [70, 65], [70, 63], [70, 65], 58, [67, 60], [67, 60], [67, 60], [70, 62], [70, 62], [62, 65], 58, [60, 65], [65, 60], [65, 60], [62, 58], [62, 57], [62, 55], [58, 53], [58, 55], [58, 53], [65, 58], 60, 62, 58, 62, 62, 63, 65, 65, 60, 60, 62, 63, [63, 67], [62, 67], [63, 67], [62, 67], 62, [63, 67], [72, 63], 63, [70, 63], [70, 63], [70, 63], [65, 58], 60, 62, 58, 62, 62, 63, 65, 65, 60, 60, 62, 63, [63, 67], [62, 67], [63, 67], [62, 67], 6

In [136]:
# Ticks of the first extracted channel in the second file. The parenthesised
# form prints the same on Python 2 and also runs on Python 3.
print(rhythms2[0])

[0, 0, 576, 1152, 1536, 2112, 2688, 3072, 3648, 4224, 4608, 5184, 5760, 6144, 6720, 7296, 7680, 8256, 8832, 9216, 9792, 10368, 10752, 11328, 11904, 12000, 12096, 12288, 12864, 13440, 13632, 13824, 14400, 14976, 15360, 15936, 16608, 16896, 17472, 18144, 18432, 19200, 19584, 19968, 20544, 21120, 21504, 22080, 22656, 23040, 23616, 24192, 24576, 25152, 25728, 25824, 25920, 26112, 26688, 27264, 27456, 27648, 28224, 28800, 29184, 29760, 30336, 30528, 30720, 31488, 31872, 32256, 32832, 33408, 33792, 34368, 34944, 35328, 36864, 37056, 37152, 37440, 37632, 37824, 38112, 38400, 38976, 39168, 39360, 39648, 39936, 40320, 40704, 40896, 40992, 41184, 41472, 41664, 41856, 42240, 42432, 42720, 43008, 43200, 43296, 43584, 43776, 43968, 44256, 44544, 45120, 45312, 45504, 45792, 46080, 46464, 46848, 47040, 47136, 47328, 47616, 47808, 48000, 48384, 48576, 48864, 49152, 49728, 50304, 50688, 51264, 51840, 52224, 52800, 53376, 53760, 54336, 54912, 55104, 55200, 55296, 55872, 56448, 56640, 56832, 57408, 57984

Note that we have a bit of an issue: the `rhythms2[0]` array starts with two consecutive zeros, which should really be flattened down to one, since these events happen at the same time. The issue here is that the note events came from two different tracks, and flattening only occurs within each track with the current setup. We address this issue in the [following notebook](./getting_melodies_and_rhythms_optimized.ipynb).

In [137]:
# Every channel either has one melody entry per rhythm tick or no melody at all.
for i, notes in enumerate(melodies2):
    assert len(notes) in (0, len(rhythms2[i]))

***Important note:*** The rhythm sequences are in units of "ticks", the definition of which changes from file to file and depends on each file's global *resolution* parameter. This will be addressed in the following notebook when we extract the melodies and rhythms from *all* the tracks using our new framework.