In [1]:
import numpy as np
from music21 import converter, note, duration
import pandas as pd

# Custom Midi note object
from midi_note import MIDINote

#utility
import os
import random
import copy
from bidict import bidict

## Preprocessing Simulation

This notebook is a gist of the overall preprocessing step the researcher applied to the dataset.

![](http://www.musicxlab.com/img/logo-fine.4b1b5226.png)
The study used the dataset from [Music-X-Lab](http://www.musicxlab.com/) from the study [POP909: A Pop-song Dataset for Music Arrangement Generation](https://arxiv.org/abs/2008.07142).

Clone the data repository at [POP909](https://github.com/music-x-lab/POP909-Dataset).

In [2]:
# path of the data storage
PATH = "POP909"

####  Data Selection for simulation
The researcher collected all the songs belonging to 4 musical key for the simulation from the dataset:
 - E minor
 - D minor
 - A major
 - C Major

In [3]:
songs_by_key = dict()

songs_by_key["Emin"] = ['033', '034', '056', '071', '078', '116', '130', '240', '272', '276', '283', '317', '322',
                        '380', '457', '458', '476', '501', '520', '560', '566', '578', '625', '653', '667', '680',
                        '693', '740', '772', '788', '854', '899']
songs_by_key["Dmin"] = ['027', '040', '073', '103', '125', '141', '151', '158', '198', '226', '284', '295', '297',
                        '344', '378', '384', '389', '446', '451', '587', '604', '610', '627', '682', '689', '791',
                        '842', '845', '852', '859', '894']
songs_by_key["Amaj"] = ['058', '086', '102', '155', '174', '183', '208', '231', '291', '346', '353', '362', '410',
                        '463', '475', '478', '497', '499', '615', '641', '687', '691', '721', '728', '751', '774',
                        '810', '867', '891']
songs_by_key["Cmaj"] = ['038', '055', '068', '079', '131', '132', '136', '171', '172', '185', '203', '211', '216',
                        '233', '243', '278', '293', '312', '319', '320', '326', '331', '368', '386', '432', '459',
                        '493', '496', '548', '570', '591', '603', '612', '621', '702', '710', '714', '722', '735',
                        '761', '793', '824', '833', '873', '888', '892', '909']

#### Song Selection
Set how many songs will be selected for each music key using the *sample_limit* parameter.

Note: *To simulate using random sample, set* **random_selection=<span style="color:green">True </span>**

In [4]:
sample_limit = 4
all_song_ids = []
random_selection = True
for key, song_ids in songs_by_key.items():
    if random_selection:
        all_song_ids += random.sample(song_ids, sample_limit)
    else:
        all_song_ids += songs_by_key[key][:sample_limit]

### Parse Raw Notes Function

This function parses a midi file into a python runnable data structure, filters the melody part of the midi song, and save the properties (duration type, length, pitch) a note to create a Note Object.
<p> Example of a Note object: { note: C#4, duration_type: whole, length: 4 }

In [9]:
def parse_raw_notes(file_path):
    print("Loading Music File:", file_path)
    raw_notes = []
    midi_data = converter.parse(file_path)
    for part in midi_data.parts:
        if part.partName == 'MELODY':
            midi_elements = part.recurse()
            for element in midi_elements:
                if isinstance(element, note.Note):
                    note_duration = duration.Duration()
                    note_duration.quarterLength = element.quarterLength
                    raw_note = str(element.pitch)
                    raw_notes.append(MIDINote(raw_note, str(note_duration.type), str(element.quarterLength)).as_map)
    return raw_notes


PATH = "./Varied Rhythm/"
filenames = os.listdir(PATH)
preset_paths = [PATH + filename for filename in filenames]

parsed_midi_notes = []
for song_path in preset_paths:
    parsed_raw_notes = parse_raw_notes(song_path)
    print(len(parsed_raw_notes))
    parsed_midi_notes.append(parsed_raw_notes)

print(parsed_midi_notes[0])

Loading Music File: ./Varied Rhythm/A#Maj.mid
53
Loading Music File: ./Varied Rhythm/A#Min.mid
50
Loading Music File: ./Varied Rhythm/AMaj.mid
51
Loading Music File: ./Varied Rhythm/AMin.mid
53
Loading Music File: ./Varied Rhythm/BMaj.mid
53
Loading Music File: ./Varied Rhythm/BMin.mid
52
Loading Music File: ./Varied Rhythm/C#Maj.mid
53
Loading Music File: ./Varied Rhythm/C#Min.mid
51
Loading Music File: ./Varied Rhythm/CMaj.mid
54
Loading Music File: ./Varied Rhythm/CMin.mid
51
Loading Music File: ./Varied Rhythm/D#Maj.mid
50
Loading Music File: ./Varied Rhythm/D#Min.mid
54
Loading Music File: ./Varied Rhythm/DMaj.mid
52
Loading Music File: ./Varied Rhythm/DMin.mid
51
Loading Music File: ./Varied Rhythm/EMaj.mid
53
Loading Music File: ./Varied Rhythm/EMin.mid
53
Loading Music File: ./Varied Rhythm/F#Maj.mid
51
Loading Music File: ./Varied Rhythm/F#Min.mid
55
Loading Music File: ./Varied Rhythm/FMaj.mid
52
Loading Music File: ./Varied Rhythm/FMin.mid
52
Loading Music File: ./Varied Rhy

### This function returns a random song on a given list
This function will be used throughout the simulation to randomly select and display a song after every preprocessing steps.

In [40]:
def get_random_song(song_list):
    rand_song_id = random.randint(0, len(song_list) - 1)
    rand_song = song_list[rand_song_id]
    return rand_song, rand_song_id

### View random song
Get a general view of what a parsed midi song looks like.

In [41]:
random_song, random_song_id = get_random_song(parsed_midi_notes)
data_frame = pd.DataFrame(random_song)
print("Randomly selected song id:", random_song_id)
data_frame.head(10)

Randomly selected song id: 10


Unnamed: 0,note,duration_type,length
0,C5,eighth,0.5
1,C5,eighth,0.5
2,B-4,eighth,0.5
3,G4,16th,0.25
4,C5,eighth,0.5
5,B-4,eighth,0.5
6,E-5,16th,0.25
7,F5,16th,0.25
8,F5,eighth,0.5
9,C5,16th,0.25


### View random song with flats
Randomly select song to check/show flat notes

In [42]:
# if default set id = 13, 5
random_song, random_song_id = get_random_song(parsed_midi_notes)
data_frame = pd.DataFrame(random_song)

flats = data_frame[data_frame["note"].str.contains("-")]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 0


Unnamed: 0,note,duration_type,length
6,B-4,eighth,0.5
7,B-4,16th,0.25
23,E-5,eighth,0.5
32,E-5,eighth,0.5


## Initial preprocessing
The initial preprocessing includes mapping of **flat** notes to its corresponding **sharp** values

Example: *E-5 (E flat, 5th Octave) is equal to D#5 (D sharp, 5th Octave)*

In [43]:
flats_map = {
    'D-': 'C#',
    'E-': 'D#',
    'G-': 'F#',
    'A-': 'G#',
    'B-': 'A#'
}


def map_flat(song):
    for song_note in song:
        if "-" in song_note["note"]:
            flat_note = song_note["note"][:2]
            song_note["note"] = song_note["note"].replace(flat_note,
                                                          flats_map[flat_note])


def initial_preprocess(songs):
    for song in songs:
        map_flat(song)
    return songs


initial_preprocessed_notes = initial_preprocess(copy.deepcopy(parsed_midi_notes))

### View random song to check for flats
All flats are mapped to sharps, therefore, selecting songs that contain flat values will show empty tables.

In [44]:
random_song, random_song_id = get_random_song(initial_preprocessed_notes)
data_frame = pd.DataFrame(random_song)
print("Randomly selected song id:", random_song_id)
# show if flats still exists
flats = data_frame[data_frame["note"].str.contains("-")]
flats.head(10)

Randomly selected song id: 9


Unnamed: 0,note,duration_type,length


### View random song to check for complex notes
Some songs contain complex notes, they are notes that contain *advanced duration* (notes that are complex and tuplets with fractional durations not found on basic music notes

In [45]:
random_song, random_song_id = get_random_song(initial_preprocessed_notes)
data_frame = pd.DataFrame(random_song)
# show complex notes
flats = data_frame[data_frame["duration_type"] == "complex"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 7


Unnamed: 0,note,duration_type,length


### View random song to check for 32nd duration typed notes
Some songs contain notes that have 32nd duration which is not included in our proposed output classes.

In [46]:
random_song, random_song_id = get_random_song(initial_preprocessed_notes)
data_frame = pd.DataFrame(random_song)
# show 32nd notes
flats = data_frame[data_frame["duration_type"] == "32nd"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 18


Unnamed: 0,note,duration_type,length


## Second Preprocessing
The second preprocessing involves mapping the standard length for each duration type, mapping complex notes
to its nearest non-complex note, and rounding up of all 32nd duration-typed notes to 16th .

In [47]:
duration_map = {
    'whole': 4.0,
    'half': 2.0,
    'quarter': 1.0,
    'eighth': 0.5,
    '16th': 0.25,
}


def map_duration(song):
    duration_type = song["duration_type"]
    if duration_type in duration_map:
        song["length"] = duration_map[duration_type]
        return
    # if note is 32nd then it is transformed in to 16th
    if duration_type == "32nd":
        song["duration_type"] = "16th"
        song["length"] = 0.25

    # if a note is complex then map it to its nearest non-complex note
    if duration_type == "complex":
        length = float(song["length"])
        if length <= 0.25:
            song["duration_type"] = "16th"
            song["length"] = "0.25"

        minimum_duration = "whole"
        distance = float('inf')
        for duration_type, duration_length in duration_map.items():
            if abs(duration_length - length) < distance:
                distance = abs(duration_length - length)
                minimum_duration = duration_type
        song["duration_type"] = minimum_duration
        song["length"] = duration_map[minimum_duration]


def second_preprocess(songs):
    for song in songs:
        for song_note in song:
            map_duration(song_note)
    return songs


second_preprocess_data = second_preprocess(copy.deepcopy(initial_preprocessed_notes))


#### Check if complex notes are still present

In [48]:
random_song, random_song_id = get_random_song(second_preprocess_data)
data_frame = pd.DataFrame(random_song)
# show complex notes
flats = data_frame[data_frame["duration_type"] == "complex"]
print("Randomly selected song id:", random_song_id)
flats.head(10)
random_song

Randomly selected song id: 5


[{'note': 'E5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'D5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'A5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'E5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'F#5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'F#5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'F#5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'E5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'F#5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'F#5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'F#5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'A5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'F#5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'A5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'A5', 'duration_type': '16th', 'length': 0.25},
 {'note': 'A5', 'duration_type': 'eighth', 'length': 0.5},
 {'note': 'G5', 'duration_type': 'eighth', 'length': 0.5

#### Check if 32nd notes are still present

In [49]:
random_song, random_song_id = get_random_song(second_preprocess_data)
data_frame = pd.DataFrame(random_song)
# show 32nd notes
flats = data_frame[data_frame["duration_type"] == "32nd"]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 16


Unnamed: 0,note,duration_type,length


### Utility functions in checking octave frequency

#### Function that returns octave frequency in all songs

In [50]:
def get_octave_frequency(songs):
    octave_freq = {}
    for song in songs:
        for song_note in song:
            octave = song_note["note"][-1]
            if octave not in octave_freq:
                octave_freq[octave] = 0
            octave_freq[octave] += 1
    return octave_freq


octave_frequency = get_octave_frequency(second_preprocess_data)
octave_frequency

{'5': 923, '4': 332}

#### Function that returns octave frequency of a song

In [51]:
def octave_frequency_by_song(song):
    octave_freq = {}
    for song_note in song:
        octave = song_note["note"][-1]
        if octave not in octave_freq:
            octave_freq[octave] = 0
        octave_freq[octave] += 1
    return octave_freq

#### Function that returns octave frequency of each song

In [52]:
def octave_frequency_by_songs(songs):
    octave_freq_list = []
    for song in songs:
        octave_freq_by_song = octave_frequency_by_song(song)
        octave_freq_list.append(octave_freq_by_song)
    return octave_freq_list


#individual songs
octave_frequency_list = octave_frequency_by_songs(second_preprocess_data)
for i, ofl in enumerate(octave_frequency_list):
    print(f"Song no: {i}")
    print(ofl)


Song no: 0
{'5': 51, '4': 2}
Song no: 1
{'5': 49, '4': 1}
Song no: 2
{'5': 38, '4': 13}
Song no: 3
{'4': 8, '5': 45}
Song no: 4
{'5': 51, '4': 2}
Song no: 5
{'5': 51, '4': 1}
Song no: 6
{'4': 16, '5': 37}
Song no: 7
{'4': 18, '5': 33}
Song no: 8
{'5': 32, '4': 22}
Song no: 9
{'4': 14, '5': 37}
Song no: 10
{'5': 39, '4': 11}
Song no: 11
{'5': 30, '4': 24}
Song no: 12
{'4': 26, '5': 26}
Song no: 13
{'4': 24, '5': 27}
Song no: 14
{'5': 25, '4': 28}
Song no: 15
{'5': 44, '4': 9}
Song no: 16
{'5': 34, '4': 17}
Song no: 17
{'4': 18, '5': 37}
Song no: 18
{'5': 42, '4': 10}
Song no: 19
{'4': 20, '5': 32}
Song no: 20
{'5': 48, '4': 4}
Song no: 21
{'4': 9, '5': 44}
Song no: 22
{'4': 30, '5': 24}
Song no: 23
{'5': 47, '4': 5}


### View random song containing notes not belonging to the dominant octaves
Some songs contain notes not belonging to 4th and fifth octaves (outlier notes)

In [53]:
random_song, random_song_id = get_random_song(second_preprocess_data)
data_frame = pd.DataFrame(random_song)

flats = data_frame[(~data_frame["note"].str.contains("4")) & (~data_frame["note"].str.contains("5"))]
print("Randomly selected song id:", random_song_id)
flats.head(10)


Randomly selected song id: 8


Unnamed: 0,note,duration_type,length


## Third Preprocessing


#### This function gets the frequency of octaves of a song and sorts them
The sorted frequency of octaves will identify the most dominant octaves of the song.

In [54]:
def octave_preprocessing(song):
    octave_freq = octave_frequency_by_song(song)

    if len(octave_freq) >= 2 or "4" not in octave_freq or "5" not in octave_freq:
        octave_sorted = []
        for song_key, song_key_frequency in octave_freq.items():
            octave_sorted.append((song_key, song_key_frequency))
        octave_sorted.sort(key=lambda x: -x[1])
        print(octave_sorted)
        to_dual_octaves(song, octave_sorted)

#### This function rescales octaves of notes to the dominant octaves of the song
There are only two octaves proposed in the output class of the model. Hence, songs with
multiple octaves should rescale their outlier octaves to the nearest dominant octaves.

In [55]:
def to_dual_octaves(song, octave_sorted):
    lower_octave_details, higher_octave_details = sorted(octave_sorted[:2], key=lambda x: x[0])
    for song_note in song:
        octave = song_note["note"][-1]
        if octave < lower_octave_details[0]:
            song_note["note"] = song_note["note"][:-1] + lower_octave_details[0]
        elif octave > higher_octave_details[0]:
            song_note["note"] = song_note["note"][:-1] + higher_octave_details[0]

    octave_freq = octave_frequency_by_song(song)

    if "4" not in octave_freq or "5" not in octave_freq:
        rescale_octave(song, lower_octave_details, higher_octave_details)


#### This function rescales octaves to fit the octaves in output class
Some songs have dominant octaves from 3rd to 4th octaves or from 5th to 6th octaves. This method rescales dominant
octaves of the song to 4th and 5th octaves to fit the proposed output classes.

In [56]:
def rescale_octave(song, lower_octave_details, higher_octave_details):
    lower_octave = lower_octave_details[0]
    higher_octave = higher_octave_details[0]
    for midi_note in song:
        octave = midi_note["note"][-1]
        if octave == lower_octave:
            midi_note["note"] = midi_note["note"][:-1] + "4"
        elif octave == higher_octave:
            midi_note["note"] = midi_note["note"][:-1] + "5"
    note_freq_per_note = octave_frequency_by_song(song)
    print(note_freq_per_note)

In [57]:
# run third preprocessing
def third_preprocessing(songs):
    for song in songs:
        octave_preprocessing(song)
    return songs


third_preprocessed_data = third_preprocessing(copy.deepcopy(second_preprocess_data))

[('5', 51), ('4', 2)]
[('5', 49), ('4', 1)]
[('5', 38), ('4', 13)]
[('5', 45), ('4', 8)]
[('5', 51), ('4', 2)]
[('5', 51), ('4', 1)]
[('5', 37), ('4', 16)]
[('5', 33), ('4', 18)]
[('5', 32), ('4', 22)]
[('5', 37), ('4', 14)]
[('5', 39), ('4', 11)]
[('5', 30), ('4', 24)]
[('4', 26), ('5', 26)]
[('5', 27), ('4', 24)]
[('4', 28), ('5', 25)]
[('5', 44), ('4', 9)]
[('5', 34), ('4', 17)]
[('5', 37), ('4', 18)]
[('5', 42), ('4', 10)]
[('5', 32), ('4', 20)]
[('5', 48), ('4', 4)]
[('5', 44), ('4', 9)]
[('4', 30), ('5', 24)]
[('5', 47), ('4', 5)]


### View random song containing notes not belonging to the dominant octaves
Randomly select a song to check the effect of third preprocessing

In [58]:
random_song, random_song_id = get_random_song(third_preprocessed_data)
data_frame = pd.DataFrame(random_song)
# show outlier octaves notes
flats = data_frame[(~data_frame["note"].str.contains("4")) & (~data_frame["note"].str.contains("5"))]
print("Randomly selected song id:", random_song_id)
flats.head(10)

Randomly selected song id: 23


Unnamed: 0,note,duration_type,length


The result of the third preprocessing based on octave_frequency on the sample dataset

In [59]:
octave_frequency = get_octave_frequency(third_preprocessed_data)
octave_frequency

{'5': 923, '4': 332}

### Conversion to Integer
The following preprocessing step will convert the MIDI objects to integers that can be used to train the model.

There are 12 notes in total within an octave (C, C#, D, D#, E, F, F#, G, G#, A, A#, B) with 5 different duration types (whole, half, quarter, eighth, 16th). The mapping that will be used groups similar pitches within an octave together, which leads to groups of 5. In total, there are `12 notes * 5 duration types * 2 octaves = 120` note mappings.

`notes_map` is a bidirectional dictionary that maps the note name to the first index of the pitch's group within an octave. 

`train_duration_map` is a bidirectional dictionary that maps the duration type to its corresponding integer. This is used to offset the number in `notes_map` to get the correct note pitch and duration.

Currently, the baseline octave is the 4th octave. Any octave above that will offset the corresponding integer by `12 notes * 5 durations = 60` indices (e.g. C4 whole note is index 0, C5 whole note is index 60, C6 whole note is 120, etc.).

In [60]:

notes_map = bidict({
    'C': 0,
    'C#': 5,
    'D': 10,
    'D#': 15,
    'E': 20,
    'F': 25,
    'F#': 30,
    'G': 35,
    'G#': 40,
    'A': 45,
    'A#': 50,
    'B': 55,
})
train_duration_map = bidict({
    'whole': 0,
    'half': 1,
    'quarter': 2,
    'eighth': 3,
    '16th': 4,
})


def map_note_to_int(song_note):
    pitch = song_note["note"][:-1]
    octave = int(song_note["note"][-1])

    return notes_map[pitch] + train_duration_map[song_note["duration_type"]] + ((octave - 4) * 60)

#### To trainable data, an example:
Given the explanation of the mapping above, the calculation of the integer representation of a note is as follows:
```
(note_idx * 5) + duration_idx + ((octave - 4) * 60)
```
The `octave - 4` portion of the equation assumes that the baseline/0th index octave is 4th octave. `note_map`'s values already consider the multiplication by 5.

Let's say we wish to convert a `C#4 eighth note` to an integer. `C#4` is the **2nd** note in the notes map (index 1), and `eighth` is the **4th** note in the duration map (index 3). `C#4` is in the 4th octave, which means its index will not be offset. The final calculation will be
```
(1 * 5) + 3 + ((4 - 4) * 60) = 8 
```

In [61]:
map_note_to_int(MIDINote("C#4", "eighth", 0.5).as_map)

8

As another example, let us convert `F5 16th note` to an integer. `F5` is the **6th** note in the notes map (index 5), and `16th` is the **5th** type in the duration map (index 4). `F5` is in the 5th octave, which means its index will be offset by `12 notes * 5 durations = 60`. The final calculation will be
```
(5 * 5) + 4 + ((5-4) * 60) = 89
```

In [62]:
map_note_to_int(MIDINote("F5", "16th", 0.25).as_map)

89

The songs, after being converted to an array of `MIDINote`s, will be converted to an array of integers using the `map_note_to_int` function explained earlier.

#### Applying the integer mapping

In [64]:
def song_map_to_int(song):
    song_notes_to_int = []
    for song_note in song:
        song_notes_to_int.append(map_note_to_int(song_note))
    return song_notes_to_int


def songs_map_to_int(songs):
    songs_in_int = []
    for song in songs:
        songs_in_int.append(song_map_to_int(song))
    return songs_in_int


songs_map_int = songs_map_to_int(third_preprocessed_data)
presets = {}

for i in range(len(songs_map_int)):
    key = filenames[i].split('.')[0].replace("M","m")
    presets[i] = {"key": key, "melody": songs_map_int[i]}

with open(f'data/presets.npy', 'wb') as f:
    np.save(f, presets)
presets

{0: {'key': 'A#maj',
  'melody': [74,
   73,
   63,
   74,
   62,
   88,
   53,
   54,
   74,
   87,
   98,
   87,
   73,
   74,
   73,
   64,
   63,
   63,
   61,
   88,
   89,
   88,
   99,
   78,
   99,
   97,
   72,
   64,
   64,
   73,
   74,
   72,
   78,
   74,
   108,
   73,
   64,
   73,
   63,
   74,
   89,
   73,
   99,
   108,
   88,
   64,
   108,
   99,
   88,
   74,
   89,
   98,
   89]},
 1: {'key': 'A#min',
  'melody': [114,
   79,
   102,
   92,
   77,
   78,
   69,
   89,
   94,
   104,
   93,
   89,
   78,
   113,
   64,
   114,
   78,
   79,
   78,
   104,
   104,
   89,
   104,
   63,
   78,
   113,
   113,
   88,
   113,
   111,
   69,
   89,
   88,
   68,
   79,
   87,
   77,
   68,
   79,
   103,
   104,
   104,
   89,
   87,
   68,
   63,
   113,
   113,
   54,
   64]},
 2: {'key': 'Amaj',
  'melody': [82,
   93,
   83,
   59,
   59,
   69,
   83,
   59,
   58,
   59,
   93,
   94,
   84,
   68,
   69,
   82,
   84,
   69,
   58,
   69,
   59,
   49,
   58,
  

### Generating the input data for the model
This final process called **shift append** applies the *sliding window* algorithm of size `sequence_length` to generate
the X, and Y values needed as an input for the model.

As an example, a song (already mapped to int) has [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] elements.

Given that we have `sequence_length=4`, the process for the shift append is of follows:

**[ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]**
<pre>
    x₁            y₁
[1, 2, 3, 4] --> [5]

       x₂           y₂
   [2, 3, 4, 5] --> [6]

          x₃           y₃
      [3, 4, 5, 6] --> [7]

             x₄           y₄
         [4, 5, 6, 7] --> [8]

                x₅           y₅
            [5, 6, 7, 8] --> [9]

                   x₆            y₆
               [6, 7, 8, 9] --> [10]
</pre>
Everytime the window of size `sequence_length` slides, the values inside the window are stored
as a single `x` value and its corresponding `y` value is the element after the window.

The aggregated `x` values and `y` values would then be:

<pre>
X = [                   Y = [
    [1, 2, 3, 4],               5,
    [2, 3, 4, 5],               6,
    [3, 4, 5, 6],               7,
    [4, 5, 6, 7],               8,
    [5, 6, 7, 8],               9,
    [6, 7, 8, 9]                10,
]                           ]
</pre>

In [56]:
sequence_length = 50

def shift_append(song_in_int, seq_len):
    _X = []
    _Y = []
    limit = len(song_in_int) - seq_len
    for index in range(limit):
        _X.append(song_in_int[index:index + seq_len])
        _Y.append(song_in_int[index + seq_len])

    return _X, _Y


def shift_append_songs(songs_in_int, seq_len):
    X = []
    Y = []
    for song_in_int in songs_in_int:
        x, y = shift_append(song_in_int, seq_len)
        X += x
        Y += y
    return np.array(X), np.array(Y)


X_train, Y_train = shift_append_songs(songs_map_int, sequence_length)

#### Dimensions for X_train

In [57]:
X_train.shape

(6286, 50)

Dimensions for Y_train

In [None]:
Y_train.shape

#### Saving the training data

In [59]:
with open(f'data/x_train_sample.npy', 'wb') as f:
    np.save(f, X_train)
with open(f'data/y_train_sample.npy', 'wb') as f:
    np.save(f, Y_train)