In [278]:
# standard library utilities
import copy
import random
from fractions import Fraction

import numpy as np
import pandas as pd
from bidict import bidict
from music21 import converter, note, duration

# Custom Midi note object
from midi_note import MIDINote

In [2]:
# Root folder of the POP909 dataset; each song is read from "<PATH>/<id>/<id>.mid".
PATH = "POP909"

In [3]:
def parse_raw_notes(file):
    """Collect the melody notes of a MIDI file as an array of ``MIDINote`` objects.

    Only parts whose ``partName`` equals ``'MELODY'`` are read, and only
    ``note.Note`` elements are kept. For every kept element a fresh
    ``duration.Duration`` is given the element's quarter length so that its
    duration type name can be read back.

    NOTE(review): a later cell defines ``parse_raw_notes`` again; when the
    notebook runs top to bottom that later definition replaces this one.

    Args:
        file: path of the MIDI file handed to ``converter.parse``.

    Returns:
        ``np.array`` holding one ``MIDINote`` per melody note, in traversal order.
    """
    print("Loading Music File:", file)
    score = converter.parse(file)

    melody_notes = []
    for score_part in score.parts:
        if score_part.partName != 'MELODY':
            continue
        for element in score_part.recurse():
            if not isinstance(element, note.Note):
                continue
            element_duration = duration.Duration()
            element_duration.quarterLength = element.quarterLength
            pitch_name = str(element.pitch)
            melody_notes.append(
                MIDINote(pitch_name, str(element_duration.type), str(element.quarterLength))
            )

    return np.array(melody_notes)

In [4]:
# POP909 song ids (folder names) grouped by musical key.
songs_by_key = {
    "Emin": [
        '033', '034', '056', '071', '078', '116', '130', '240', '272', '276', '283', '317', '322',
        '380', '457', '458', '476', '501', '520', '560', '566', '578', '625', '653', '667', '680',
        '693', '740', '772', '788', '854', '899',
    ],
    "Dmin": [
        '027', '040', '073', '103', '125', '141', '151', '158', '198', '226', '284', '295', '297',
        '344', '378', '384', '389', '446', '451', '587', '604', '610', '627', '682', '689', '791',
        '842', '845', '852', '859', '894',
    ],
    "Amaj": [
        '058', '086', '102', '155', '174', '183', '208', '231', '291', '346', '353', '362', '410',
        '463', '475', '478', '497', '499', '615', '641', '687', '691', '721', '728', '751', '774',
        '810', '867', '891',
    ],
    "Cmaj": [
        '038', '055', '068', '079', '131', '132', '136', '171', '172', '185', '203', '211', '216',
        '233', '243', '278', '293', '312', '319', '320', '326', '331', '368', '386', '432', '459',
        '493', '496', '548', '570', '591', '603', '612', '621', '702', '710', '714', '722', '735',
        '761', '793', '824', '833', '873', '888', '892', '909',
    ],
}

In [5]:
# Number of songs taken from each key.
sample_limit = 4
all_song_ids = []

# Deterministic pick: the first `sample_limit` ids of every key, in stored order.
# (random.sample(song_ids, sample_limit) would draw a random subset instead.)
for song_ids in songs_by_key.values():
    all_song_ids.extend(song_ids[:sample_limit])

In [6]:
def parse_raw_notes(file_path):
    """Read the melody notes of a MIDI file as a list of ``MIDINote.as_map`` values.

    Only parts whose ``partName`` equals ``'MELODY'`` are read, and only
    ``note.Note`` elements are kept. For every kept element a fresh
    ``duration.Duration`` is given the element's quarter length so that its
    duration type name can be read back.

    Args:
        file_path: path of the MIDI file handed to ``converter.parse``.

    Returns:
        A list with one ``as_map`` value per melody note, in traversal order.
    """
    print("Loading Music File:", file_path)
    score = converter.parse(file_path)

    melody_rows = []
    for score_part in score.parts:
        if score_part.partName != 'MELODY':
            continue
        for element in score_part.recurse():
            if not isinstance(element, note.Note):
                continue
            element_duration = duration.Duration()
            element_duration.quarterLength = element.quarterLength
            pitch_name = str(element.pitch)
            parsed_note = MIDINote(pitch_name, str(element_duration.type), str(element.quarterLength))
            melody_rows.append(parsed_note.as_map)

    return melody_rows

In [7]:
# Parse every selected song; entry i of parsed_midi_notes belongs to all_song_ids[i].
parsed_midi_notes = []
for song_id in all_song_ids:
    parsed_midi_notes.append(parse_raw_notes(f"{PATH}/{song_id}/{song_id}.mid"))

Loading Music File: POP909/033/033.mid
Loading Music File: POP909/034/034.mid
Loading Music File: POP909/056/056.mid
Loading Music File: POP909/071/071.mid
Loading Music File: POP909/027/027.mid
Loading Music File: POP909/040/040.mid
Loading Music File: POP909/073/073.mid
Loading Music File: POP909/103/103.mid
Loading Music File: POP909/058/058.mid
Loading Music File: POP909/086/086.mid
Loading Music File: POP909/102/102.mid
Loading Music File: POP909/155/155.mid
Loading Music File: POP909/038/038.mid
Loading Music File: POP909/055/055.mid
Loading Music File: POP909/068/068.mid
Loading Music File: POP909/079/079.mid


In [8]:
# parsed_midi_notes

def get_random_song(song_list):
    """Pick one song uniformly at random.

    Args:
        song_list: non-empty, indexable collection of songs.

    Returns:
        ``(song, index)`` where ``index`` is the position of ``song`` inside
        ``song_list`` (not a dataset id).
    """
    last_index = len(song_list) - 1
    chosen_index = random.randint(0, last_index)
    return song_list[chosen_index], chosen_index

In [48]:
# Preview the parsed notes of one randomly chosen song as a table.
# The printed "song id" is the song's position in parsed_midi_notes
# (what get_random_song returns), not its POP909 folder name.
random_song, random_song_id = get_random_song(parsed_midi_notes)
data_frame = pd.DataFrame(random_song)
print("Randomly selected song id:", random_song_id)
data_frame.head(10)

Randomly selected song id: 14


Unnamed: 0,note,duration_type,length
0,G4,eighth,1/3
1,E4,16th,0.25
2,C5,16th,0.25
3,B4,eighth,1/3
4,B4,quarter,1.0
5,B4,eighth,5/12
6,G4,eighth,1/3
7,E4,eighth,1/3
8,E4,quarter,1.0
9,E4,quarter,1.0


In [79]:
# Show the flat notes of a random song: rows whose note name contains "-" (e.g. "E-5").
# NOTE(review): the original remark "if default set id = 13, 5" presumably means that
# songs at positions 13 and 5 of the default sample contain flats — confirm.
random_song, random_song_id = get_random_song(parsed_midi_notes)
data_frame = pd.DataFrame(random_song)

flats = data_frame[data_frame["note"].str.contains("-")]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 13


Unnamed: 0,note,duration_type,length
386,E-5,eighth,1/3
387,E-5,quarter,2/3


In [128]:
# Initial preprocessing: rewrite flat note names as their enharmonic sharps.
# Keys are the two-character flat prefixes ("-" marks a flat in the parsed note names);
# values are the sharp spelling that replaces them.
# "C-" and "F-" are not listed, so map_flat raises KeyError if it meets one of them.
flats_map = {
    'D-': 'C#',
    'E-': 'D#',
    'G-': 'F#',
    'A-': 'G#',
    'B-': 'A#'
}


def map_flat(song):
    """Rewrite every flat note of a song as its sharp equivalent, in place.

    A note counts as flat when its name contains "-". Its first two characters
    are looked up in ``flats_map`` and replaced by the mapped spelling.

    Args:
        song: iterable of note dicts with a ``"note"`` key; modified in place.

    Raises:
        KeyError: if a flat prefix is missing from ``flats_map``.
    """
    for entry in song:
        name = entry["note"]
        if "-" not in name:
            continue
        flat_prefix = name[:2]
        entry["note"] = name.replace(flat_prefix, flats_map[flat_prefix])


def initial_preprocess(songs):
    """Apply ``map_flat`` to every song and hand the same collection back.

    Args:
        songs: iterable of songs; each song is modified in place.

    Returns:
        The ``songs`` object that was passed in.
    """
    for notes in songs:
        map_flat(notes)
    return songs


# Work on a deep copy so parsed_midi_notes keeps the original (flat) note names.
initial_preprocessed_notes = initial_preprocess(copy.deepcopy(parsed_midi_notes))

In [165]:
# Verify that no flats remain after the first preprocessing stage.
random_song, random_song_id = get_random_song(initial_preprocessed_notes)
# Rebuild the frame from the song just drawn; without this line the check below
# would inspect whatever frame an earlier cell left in `data_frame`.
data_frame = pd.DataFrame(random_song)
print("Randomly selected song id:", random_song_id)
# rows that still contain a flat ("-"); expected to be empty
flats = data_frame[data_frame["note"].str.contains("-")]
flats.head(10)

Randomly selected song id: 3


Unnamed: 0,note,duration_type,length


In [221]:
# Inspect a random song (after flat mapping) for notes whose duration type is "complex".
random_song, random_song_id = get_random_song(initial_preprocessed_notes)
data_frame = pd.DataFrame(random_song)
# show complex notes (the variable is still called `flats`; it holds the filtered rows)
flats = data_frame[data_frame["duration_type"] == "complex"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 9


Unnamed: 0,note,duration_type,length


In [239]:
# Inspect a random song (after flat mapping) for 32nd notes.
random_song, random_song_id = get_random_song(initial_preprocessed_notes)
data_frame = pd.DataFrame(random_song)
# show 32nd notes (the variable is still called `flats`; it holds the filtered rows)
flats = data_frame[data_frame["duration_type"] == "32nd"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 10


Unnamed: 0,note,duration_type,length
1,C#5,32nd,1/12
3,E5,32nd,1/12
5,F#5,32nd,1/12
7,A4,32nd,1/12
9,G#4,32nd,1/12
16,F#4,32nd,1/12
31,E5,32nd,1/12
33,F#5,32nd,1/12
48,B4,32nd,1/12
53,F#4,32nd,1/12


In [240]:
# The five duration types kept for training, with the length map_duration writes
# into a note's "length" field for each of them (quarter == 1.0).
# Insertion order matters: map_duration resolves ties in favour of the earlier entry.
duration_map = {
    'whole': 4.0,
    'half': 2.0,
    'quarter': 1.0,
    'eighth': 0.5,
    '16th': 0.25,
}

In [241]:
def _parse_quarter_length(raw_length):
    """Convert a stored note length to a float.

    Lengths in this notebook are stored as text and may be decimal ("1.25") or
    fractional ("7/3"); plain numbers are accepted as well.
    """
    try:
        return float(raw_length)
    except ValueError:
        # float() cannot read "7/3"; Fraction parses both fraction and decimal text.
        return float(Fraction(raw_length))


def map_duration(song):
    """Snap one note's duration onto the durations listed in ``duration_map``.

    Despite the parameter name, ``song`` is a single note dict. It is modified
    in place:

    * a ``duration_type`` already in ``duration_map`` keeps its type and gets
      the canonical length from the map;
    * ``"32nd"`` becomes ``"16th"`` with length 0.25;
    * ``"complex"`` becomes the entry of ``duration_map`` whose length is
      closest to the note's own length (ties go to the entry listed first);
    * any other type is left untouched.

    Args:
        song: note dict with ``"duration_type"`` and ``"length"`` keys.

    Returns:
        None.

    Raises:
        ValueError: if a complex note's length is neither a number nor a
            decimal/fraction string.
    """
    duration_type = song["duration_type"]

    if duration_type in duration_map:
        song["length"] = duration_map[duration_type]
        return

    # if note is 32nd then it is transformed in to 16th
    if duration_type == "32nd":
        song["duration_type"] = "16th"
        song["length"] = 0.25
        return

    # if a note is complex then map it to its nearest non-complex note
    if duration_type == "complex":
        length = _parse_quarter_length(song["length"])
        # min() keeps the first of several equally close candidates, matching
        # the insertion order of duration_map.
        nearest_type = min(
            duration_map,
            key=lambda candidate: abs(duration_map[candidate] - length),
        )
        song["duration_type"] = nearest_type
        song["length"] = duration_map[nearest_type]


def second_preprocess(songs):
    """Run ``map_duration`` on every note of every song and return ``songs``.

    Args:
        songs: iterable of songs, each an iterable of note dicts (modified in place).

    Returns:
        The ``songs`` object that was passed in.
    """
    every_note = (entry for notes in songs for entry in notes)
    for entry in every_note:
        map_duration(entry)
    return songs


# Second stage (duration normalisation) on a deep copy, so the first-stage data stays intact.
second_preprocess_data = second_preprocess(copy.deepcopy(initial_preprocessed_notes))

In [242]:
# Check a random song after duration normalisation: no "complex" rows should remain.
random_song, random_song_id = get_random_song(second_preprocess_data)
data_frame = pd.DataFrame(random_song)
# show complex notes (the variable is still called `flats`; it holds the filtered rows)
flats = data_frame[data_frame["duration_type"] == "complex"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 14


Unnamed: 0,note,duration_type,length


In [244]:
# Check a random song after duration normalisation: no "32nd" rows should remain.
random_song, random_song_id = get_random_song(second_preprocess_data)
data_frame = pd.DataFrame(random_song)
# show 32nd notes (the variable is still called `flats`; it holds the filtered rows)
flats = data_frame[data_frame["duration_type"] == "32nd"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 13


Unnamed: 0,note,duration_type,length


In [245]:
def octave_frequency_by_song(song):
    """Count how many notes of one song fall into each octave.

    The octave is taken as the last character of each note name.

    Args:
        song: iterable of note dicts with a ``"note"`` key.

    Returns:
        Dict mapping octave character to note count, keyed in order of first
        appearance.
    """
    counts = {}
    for entry in song:
        octave_char = entry["note"][-1]
        counts[octave_char] = counts.get(octave_char, 0) + 1
    return counts


def octave_frequency_by_songs(songs):
    """Build the per-song octave histogram for every song.

    Args:
        songs: iterable of songs.

    Returns:
        List with one ``octave_frequency_by_song`` result per song, in order.
    """
    return [octave_frequency_by_song(notes) for notes in songs]


def get_octave_frequency(songs):
    """Count notes per octave across all songs together.

    The octave is taken as the last character of each note name.

    Args:
        songs: iterable of songs, each an iterable of note dicts with a ``"note"`` key.

    Returns:
        Dict mapping octave character to the total note count, keyed in order
        of first appearance.
    """
    totals = {}
    for notes in songs:
        for entry in notes:
            octave_char = entry["note"][-1]
            totals.setdefault(octave_char, 0)
            totals[octave_char] += 1
    return totals

In [247]:
# Look at a random song for notes outside octaves 4 and 5.
random_song, random_song_id = get_random_song(second_preprocess_data)
data_frame = pd.DataFrame(random_song)
# show notes on outlier octaves: rows whose note name contains neither "4" nor "5"
flats = data_frame[(~data_frame["note"].str.contains("4")) & (~data_frame["note"].str.contains("5"))]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 15


Unnamed: 0,note,duration_type,length
13,C6,16th,0.25
15,C6,eighth,0.5
17,C6,eighth,0.5
29,C6,16th,0.25
59,C6,16th,0.25
60,C6,16th,0.25
61,C6,16th,0.25
62,C6,16th,0.25
63,C6,eighth,0.5
76,C6,16th,0.25


In [248]:
# Octave histogram over all songs together (before octave normalisation).
octave_frequency = get_octave_frequency(second_preprocess_data)
octave_frequency

{'4': 2582, '5': 3585, '6': 1007, '3': 41}

In [250]:
# Octave histogram of each individual song (before octave normalisation).
octave_frequency_list = octave_frequency_by_songs(second_preprocess_data)
octave_frequency_list

[{'4': 223, '5': 151},
 {'5': 369, '6': 108, '4': 3},
 {'4': 302, '5': 163},
 {'4': 307, '5': 173, '6': 1},
 {'5': 153, '6': 231},
 {'5': 256, '6': 148},
 {'4': 431, '3': 32, '5': 22},
 {'5': 272, '6': 199},
 {'5': 174, '4': 207},
 {'5': 237, '4': 180},
 {'5': 328, '4': 154, '6': 15},
 {'4': 228, '5': 261},
 {'5': 370, '4': 72, '6': 12},
 {'4': 308, '3': 9, '5': 179, '6': 2},
 {'4': 164, '5': 267},
 {'5': 210, '6': 291, '4': 3}]

In [309]:
def rescale_octave(song, lower_octave_details, higher_octave_details):
    """Relabel a song's two chosen octaves as "4" and "5", in place.

    Notes whose octave (last character of the name) equals the lower octave are
    moved to octave 4; notes in the higher octave are moved to octave 5; every
    other note is left as it is. When both arguments name the same octave the
    lower rule wins. The resulting octave histogram is printed.

    Args:
        song: iterable of note dicts with a ``"note"`` key; modified in place.
        lower_octave_details: sequence whose first item is the lower octave character.
        higher_octave_details: sequence whose first item is the higher octave character.

    Returns:
        None.
    """
    # Later keys win in a dict literal, so the lower octave is listed last to
    # keep its precedence when both octaves are equal.
    new_octave_for = {higher_octave_details[0]: "5", lower_octave_details[0]: "4"}
    for entry in song:
        name = entry["note"]
        current_octave = name[-1]
        if current_octave in new_octave_for:
            entry["note"] = name[:-1] + new_octave_for[current_octave]
    print(octave_frequency_by_song(song))


def to_dual_octaves(song, octave_sorted):
    """Squeeze a song into its two most frequent octaves, then relabel them 4 and 5.

    The first two entries of ``octave_sorted`` are the dominant octaves. Notes
    below the lower one are raised to it, notes above the higher one are lowered
    to it, and ``rescale_octave`` then renames lower -> "4" and higher -> "5".
    Notes strictly between two non-adjacent dominant octaves are not moved by
    either step.

    Songs with fewer than two octaves are handled explicitly instead of failing
    on tuple unpacking: an empty ranking is a no-op, and a single-octave song is
    moved as a whole to "4" (octave <= "4") or "5" (octave > "4").

    Octaves are compared as single characters, so one-digit octaves are assumed.

    Args:
        song: iterable of note dicts with a ``"note"`` key; modified in place.
        octave_sorted: ``(octave, count)`` pairs, most frequent first.

    Returns:
        None.
    """
    dominant = sorted(octave_sorted[:2], key=lambda entry: entry[0])
    if not dominant:
        # No octave statistics: nothing to move.
        return

    if len(dominant) == 1:
        only_octave = dominant[0][0]
        target_octave = "4" if only_octave <= "4" else "5"
        for song_note in song:
            song_note["note"] = song_note["note"][:-1] + target_octave
        # Same summary line rescale_octave prints for multi-octave songs.
        print(octave_frequency_by_song(song))
        return

    lower_octave_details, higher_octave_details = dominant
    lower_octave = lower_octave_details[0]
    higher_octave = higher_octave_details[0]
    for song_note in song:
        octave = song_note["note"][-1]
        if octave < lower_octave:
            song_note["note"] = song_note["note"][:-1] + lower_octave
        elif octave > higher_octave:
            song_note["note"] = song_note["note"][:-1] + higher_octave

    # The former guard here (two or more octaves, or "4"/"5" missing) could never
    # be false, so the relabelling is simply unconditional.
    rescale_octave(song, lower_octave_details, higher_octave_details)


def octave_preprocessing(song):
    """Rank a song's octaves by frequency and fold the song into octaves 4 and 5.

    The octave histogram is sorted most-frequent first (ties keep first-seen
    order), printed, and handed to ``to_dual_octaves``, which modifies the song
    in place. Songs without notes are skipped because there is nothing to rank.

    The former guard (two or more octaves, or "4"/"5" missing) was true for
    every possible histogram, so every non-empty song is processed — including
    songs that already use exactly octaves 4 and 5.

    Args:
        song: iterable of note dicts with a ``"note"`` key; modified in place.

    Returns:
        None.
    """
    octave_freq = octave_frequency_by_song(song)
    if not octave_freq:
        return

    octave_sorted = sorted(octave_freq.items(), key=lambda entry: -entry[1])
    print(octave_sorted)
    to_dual_octaves(song, octave_sorted)


def third_preprocessing(songs):
    """Run ``octave_preprocessing`` on every song and return ``songs``.

    Args:
        songs: iterable of songs; each song is modified in place.

    Returns:
        The ``songs`` object that was passed in.
    """
    for notes in songs:
        octave_preprocessing(notes)
    return songs


# Third stage (octave normalisation) on a deep copy, so the second-stage data stays intact.
# Each song prints its ranked octave list followed by its histogram after rescaling.
third_preprocessed_data = third_preprocessing(copy.deepcopy(second_preprocess_data))

[('4', 223), ('5', 151)]
{'4': 223, '5': 151}
[('5', 369), ('6', 108), ('4', 3)]
{'4': 372, '5': 108}
[('4', 302), ('5', 163)]
{'4': 302, '5': 163}
[('4', 307), ('5', 173), ('6', 1)]
{'4': 307, '5': 174}
[('6', 231), ('5', 153)]
{'4': 153, '5': 231}
[('5', 256), ('6', 148)]
{'4': 256, '5': 148}
[('4', 431), ('3', 32), ('5', 22)]
{'5': 453, '4': 32}
[('5', 272), ('6', 199)]
{'4': 272, '5': 199}
[('4', 207), ('5', 174)]
{'5': 174, '4': 207}
[('5', 237), ('4', 180)]
{'5': 237, '4': 180}
[('5', 328), ('4', 154), ('6', 15)]
{'5': 343, '4': 154}
[('5', 261), ('4', 228)]
{'4': 228, '5': 261}
[('5', 370), ('4', 72), ('6', 12)]
{'5': 382, '4': 72}
[('4', 308), ('5', 179), ('3', 9), ('6', 2)]
{'4': 317, '5': 181}
[('5', 267), ('4', 164)]
{'4': 164, '5': 267}
[('6', 291), ('5', 210), ('4', 3)]
{'4': 213, '5': 291}


In [310]:
# Check a random song after octave normalisation: no note outside octaves 4 and 5 should remain.
random_song, random_song_id = get_random_song(third_preprocessed_data)
data_frame = pd.DataFrame(random_song)
# show outlier octaves notes: rows whose note name contains neither "4" nor "5"
flats = data_frame[(~data_frame["note"].str.contains("4")) & (~data_frame["note"].str.contains("5"))]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 11


Unnamed: 0,note,duration_type,length


In [311]:
# Octave histogram over all songs after octave normalisation.
octave_frequency = get_octave_frequency(third_preprocessed_data)
octave_frequency

{'4': 3452, '5': 3763}

In [312]:
"""
TO TRAINABLE DATA
"""

'\nTO TRAINABLE DATA\n'

In [313]:

# Pitch-class codes, spaced 5 apart so that adding a duration code (0-4, see
# train_duration_map) to a pitch code never reaches the next pitch's code.
# Only sharp spellings are listed.
# NOTE(review): presumably a bidict so integer codes can be decoded back to names — confirm.
notes_map = bidict({
    'C': 0,
    'C#': 5,
    'D': 10,
    'D#': 15,
    'E': 20,
    'F': 25,
    'F#': 30,
    'G': 35,
    'G#': 40,
    'A': 45,
    'A#': 50,
    'B': 55,
})
# Duration codes added on top of the pitch code by map_note_to_int.
train_duration_map = bidict({
    'whole': 0,
    'half': 1,
    'quarter': 2,
    'eighth': 3,
    '16th': 4,
})

def map_note_to_int(song_note):
    """Encode one note as a single integer.

    The code is ``pitch code + duration code + 60 * (octave - 4)``, where the
    pitch name is the note name without its last character and the octave is
    that last character.

    Args:
        song_note: dict with ``"note"`` (e.g. ``"C#4"``) and ``"duration_type"`` keys.

    Returns:
        The integer code of the note.

    Raises:
        ValueError: if the last character of the note name is not a digit.
        KeyError: if the pitch name or duration type has no code.
    """
    name = song_note["note"]
    pitch_name, octave_char = name[:-1], name[-1]
    octave_offset = (int(octave_char) - 4) * 60
    pitch_code = notes_map[pitch_name]
    duration_code = train_duration_map[song_note["duration_type"]]
    return pitch_code + duration_code + octave_offset

In [314]:
def song_map_to_int(song):
    """Encode every note of one song with ``map_note_to_int``.

    Args:
        song: iterable of note dicts.

    Returns:
        List of integer codes, one per note, in order.
    """
    return [map_note_to_int(entry) for entry in song]


def songs_map_to_int(songs):
    """Encode every song with ``song_map_to_int``.

    Args:
        songs: iterable of songs.

    Returns:
        List with one encoded song (list of integer codes) per input song.
    """
    return [song_map_to_int(notes) for notes in songs]

In [320]:
# Number of consecutive notes in one training input window.
sequence_length = 50
def shift_append(song_in_int, seq_len):
    """Cut one encoded song into sliding windows and their next-note targets.

    Window ``i`` is ``song_in_int[i:i + seq_len]`` and its target is the element
    right after it. A song no longer than ``seq_len`` yields no windows.

    Args:
        song_in_int: sequence of integer note codes.
        seq_len: window length.

    Returns:
        ``(windows, targets)`` — two lists of equal length.
    """
    window_starts = range(len(song_in_int) - seq_len)
    windows = [song_in_int[start:start + seq_len] for start in window_starts]
    targets = [song_in_int[start + seq_len] for start in window_starts]
    return windows, targets


def shift_append_songs(songs_in_int, seq_len):
    """Build the training arrays from all encoded songs.

    Every song is windowed on its own with ``shift_append`` (windows never span
    two songs) and the results are concatenated in song order.

    Args:
        songs_in_int: iterable of encoded songs.
        seq_len: window length.

    Returns:
        ``(X, Y)`` as NumPy arrays built from the collected windows and targets.
    """
    all_windows, all_targets = [], []
    for encoded_song in songs_in_int:
        windows, targets = shift_append(encoded_song, seq_len)
        all_windows.extend(windows)
        all_targets.extend(targets)
    return np.array(all_windows), np.array(all_targets)

In [321]:
# Encode every preprocessed song as a list of integers; the length is the number of songs.
songs_map_int = songs_map_to_int(third_preprocessed_data)
len(songs_map_int)

16

In [322]:
# Sliding windows of `sequence_length` codes (X_train) and the code that follows each window (Y_train).
X_train, Y_train = shift_append_songs(songs_map_int, sequence_length)

In [324]:
# (number of windows, sequence_length)
X_train.shape

(6415, 50)

In [325]:
# one target per window
Y_train.shape

(6415,)

In [326]:
# Persist the training arrays in NumPy's .npy format (the data/ folder must already exist).
with open('data/x_train_sample.npy', 'wb') as x_file:
    np.save(x_file, X_train)
with open('data/y_train_sample.npy', 'wb') as y_file:
    np.save(y_file, Y_train)

