# 2024-10-04

Convert MIDI files to a representation suitable for a neural network.

In [83]:
from pathlib import Path
import music21 as m21

# Root directory of the local MAESTRO v3.0.0 dataset copy.
path_to_data = Path("/Users/savv/datasets/maestro-v3.0.0")

# Every .midi file below the root, stored as a path relative to the root.
midi_files = [
    midi_path.relative_to(path_to_data) for midi_path in path_to_data.rglob("*.midi")
]

print(len(midi_files))

midi_files[:30]

1276


[PosixPath('2013/ORIG-MIDI_01_7_7_13_Group__MID--AUDIO_12_R1_2013_wav--1.midi'),
 PosixPath('2013/ORIG-MIDI_03_7_6_13_Group__MID--AUDIO_09_R1_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_7_13_Group__MID--AUDIO_13_R1_2013_wav--1.midi'),
 PosixPath('2013/ORIG-MIDI_03_7_6_13_Group__MID--AUDIO_10_R1_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_6_13_Group__MID--AUDIO_01_R1_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_10_13_Group_MID--AUDIO_08_R3_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_6_13_Group__MID--AUDIO_02_R1_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_02_7_6_13_Group__MID--AUDIO_08_R1_2013_wav--3.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_6_13_Group__MID--AUDIO_04_R1_2013_wav--3.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_6_13_Group__MID--AUDIO_03_R1_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_01_7_7_13_Group__MID--AUDIO_11_R1_2013_wav--1.midi'),
 PosixPath('2013/ORIG-MIDI_03_7_10_13_Group_MID--AUDIO_18_R3_2013_wav--2.midi'),
 PosixPath('2013/ORIG-MIDI_0

In [3]:
# Parse one example performance from the dataset so its contents can be inspected.
file = (
    path_to_data / "2013/ORIG-MIDI_01_7_7_13_Group__MID--AUDIO_12_R1_2013_wav--1.midi"
)
midi = m21.converter.parse(file)
midi

<music21.stream.Score 0x1042817e0>

In [4]:
# Regroup the parsed score by instrument and show the resulting object.
parts = m21.instrument.partitionByInstrument(midi)
parts

<music21.stream.Score 0x10b122f20>

In [24]:
# Take the first part of the parsed score.
part = list(midi.parts)[0]

[<music21.stream.Part Piano>]

In [5]:
# Pick element 8 of the recursive traversal (a single note in this file) and
# show its MIDI pitch number and its duration in quarter notes.
note = list(parts.parts.recurse())[8]

note.pitch.midi, note.duration.quarterLength

  note = list(parts.parts.recurse())[8]


(49, 0.75)

In [6]:
# Pick element 10 of the recursive traversal (a rest in this file) and show
# its duration in quarter notes.
rest = list(parts.parts.recurse())[10]

rest.duration.quarterLength

  rest = list(parts.parts.recurse())[10]


Fraction(1, 12)

In [94]:
# Pick element 11 of the recursive traversal (a chord in this file) and show
# the MIDI pitch of each chord note together with the chord's duration.
chord = list(parts.parts.recurse())[11]

([n.pitch.midi for n in chord.notes], chord.duration.quarterLength)

  chord = list(parts.parts.recurse())[11]


([61, 77], 0.25)

In [7]:
# Show the repr of the rest element picked earlier.
rest

<music21.note.Rest 1/12ql>

Two options for encoding duration: 

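- Express duration in 16th-note steps (integer multiples of 0.25 quarter notes). A "hold" token extends a note over additional steps, which allows longer durations.
    - Triplets are lost (their durations are rounded down to whole 16ths).
- Combine note and duration into a single token, e.g. C4-16th, C4-8th.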

what to do with octaves? 

- Can use a 2-octave range for a melody and clip everything into it. This may not be straightforward: notes that wrap around the range boundary will sound weird.
    - Could instead encode using intervals (+3, -2). It is easier to specify single note-to-note jumps than one range for the whole melody.

what to do with chords?

- If notes are one-hot encoded, just allow multiple notes to be on at the same time.
- With interval encoding this seems more difficult at first, but it is actually the same.

In [79]:
REST = "R"


def get_note_list(file: Path | str, rest: str = REST) -> list[tuple[int | str, float]]:
    """Get note list from midi file.

    Only the first instrument part is read, and a chord is reduced to its
    first note, so every returned event carries at most one pitch.

    Args:
        file: path to midi file
        rest: token stored in place of a pitch for rests. Defaults to REST.

    Returns:
        list of (note, duration) tuples where:

        - note: midi note number, or the rest token.
        - duration: length in quarter notes as reported by music21 (a float,
          or a Fraction for values such as triplets).
    """
    song = m21.converter.parse(file)
    # NOTE(review): some music21 releases may return None from
    # partitionByInstrument when no instrument is found - confirm for the
    # installed version before running this over the whole dataset.
    instruments = m21.instrument.partitionByInstrument(song).parts
    instrument = instruments[0]  # Use first instrument.
    notes = []
    for event in instrument.recurse():
        if isinstance(event, m21.note.Note):
            note = event.pitch.midi
        elif isinstance(event, m21.note.Rest):
            note = rest
        elif isinstance(event, m21.chord.Chord):
            note = event.notes[0].pitch.midi  # Keep only the chord's first note.
        else:
            # Anything that is not a note, rest, or chord is not an event.
            continue
        notes.append((note, event.duration.quarterLength))
    return notes


# Preview the first 20 (note, duration) events of the example file.
get_note_list(file)[:20]

[('R', 2.0),
 ('R', 2.0),
 (77, Fraction(1, 3)),
 (49, 0.75),
 (68, Fraction(1, 3)),
 ('R', Fraction(1, 12)),
 (61, 0.25),
 ('R', 0.5),
 (73, 0.25),
 (51, 0.75),
 (78, 0.25),
 ('R', 0.25),
 (78, 0.25),
 (61, Fraction(1, 3)),
 ('R', 0.5),
 ('R', Fraction(4, 3)),
 (73, 0.25),
 (53, 1.0),
 ('R', Fraction(1, 6)),
 (73, 0.25)]

In [81]:
# Encode events as one token per time step, e.g. "S R H H 77 49 H E": each
# note or rest token is followed by HOLD tokens for the extra steps it lasts.
# Durations are rounded down to whole steps (minimum one step per event).

HOLD = "H"  # Continues the previous note or rest for one more step.
START = "S"  # First token of every encoded song.
END = "E"  # Last token of every encoded song.
note_list = get_note_list(file)


def encode_song(note_list: list[tuple[int | str, float]], step: float = 0.25) -> str:
    """Encode a list of (note, duration) events to a time series representation.

    Every event occupies at least one time step: the note token itself,
    followed by one HOLD token for each additional step it lasts. Durations
    that are not a whole number of steps are rounded down. The sequence is
    wrapped in a START and an END token.

    Args:
        note_list: list of (note, duration) events, duration in quarter notes
        step: Step size for time series (sampling step). Fraction of quarter note. Defaults to 0.25.
            Must be positive.

    Returns:
        string of space-separated tokens

    Raises:
        ValueError: if step is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step!r}")

    tokens = [START]
    for note, duration in note_list:
        # The small tolerance stops float noise (0.6 / 0.2 evaluates to
        # 2.9999999999999996) from dropping a whole step on truncation.
        num_steps = max(1, int(duration / step + 1e-9))
        tokens.append(note)
        tokens.extend([HOLD] * (num_steps - 1))
    tokens.append(END)

    return " ".join(str(token) for token in tokens)


# Preview the first 50 characters (not tokens) of the encoded string, so the
# last token shown may be cut off.
encode_song(note_list)[:50]

'S R H H H H H H H R H H H H H H H 77 49 H H 68 R 6'