In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [2]:
import glob
import pickle
import numpy as np
from music21 import converter, instrument, note, chord, stream
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Activation
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## The data

The dataset contains piano pieces mostly from Final Fantasy (<a href="https://github.com/Skuldur/Classical-Piano-Composer">source</a>) in midi format.

We use music21 to extract the data from midi files and also for create midi files.

When we create a music21 object from a song it will be built with two objects: <b>Note</b>, <b>Chord</b>.
    
    
### Note
Contains information about:
* <b>pitch</b>:  the frequency of the sound, represented with a leter ([A, B, C, D, E, F, G], high->low)
* <b>octave</b>: which set of pitches you use on piano
* <b>offset</b>: location of the note in the piece

### Chord

A set of notes to be played together.

The dataset consist of <b>358</b> different notes and chords. For simplicity we fix the offset to 0.5.

### Reading the data

With ```converter.parse``` we can create a music21 object from the mid file. After that we split the object to Notes and Chords. The chords will be represented as string, where the "." symbol separate the different notes.

In [3]:
notes = []

for file in glob.glob("./midi_songs/*.mid"):
    midi = converter.parse(file)

    notes_to_parse = None
    try:
        parts = instrument.partitionByInstrument(midi)
    except:
        pass
    if parts:
        notes_to_parse = parts.parts[0].recurse()
    else: 
        notes_to_parse = midi.flat.notes

    for element in notes_to_parse:
        if isinstance(element, note.Note):
            notes.append(str(element.pitch))
        elif isinstance(element, chord.Chord):
            notes.append('.'.join(str(n) for n in element.normalOrder))

After we read and processed all the mid files we will have a long list of notes and chords.

In [4]:
print(notes[0:15])
print(notes[275:300])

['F4', 'F2', 'F4', 'F2', 'F4', 'F2', 'G#4', 'G#2', 'F4', 'F2', 'F4', 'F2', 'G#4', 'G#2', 'F4']
['8.0', '7.11', 'G2', '6.10', '7.11', 'G2', '8.0', '3.8', '0.3', '3.8', '7.11', 'G2', '7.11', '1.6', '10.1', '1.6', '7.11', 'G2', '0.3', 'C2', '10.2', 'D2', 'E-2', '8.0', '7.11']


How much data we have?

In [5]:
print(len(notes))

57359


How many different strings we have?

In [6]:
pitchnames = sorted(set(item for item in notes))

print(len(pitchnames))

358


The different objects:

In [7]:
print(pitchnames)

['0', '0.1', '0.1.5', '0.1.6', '0.2', '0.2.3.7', '0.2.4.7', '0.2.5', '0.2.6', '0.2.7', '0.3', '0.3.5', '0.3.5.8', '0.3.6', '0.3.6.8', '0.3.6.9', '0.3.7', '0.4', '0.4.5', '0.4.6', '0.4.7', '0.5', '0.5.6', '0.6', '1', '1.2', '1.2.4.6.8.10', '1.2.6', '1.2.6.8', '1.3', '1.3.5', '1.3.5.8', '1.3.6', '1.3.7', '1.3.8', '1.4', '1.4.6', '1.4.6.9', '1.4.7', '1.4.7.10', '1.4.7.9', '1.4.8', '1.5', '1.5.8', '1.5.9', '1.6', '1.7', '10', '10.0', '10.0.2.5', '10.0.3', '10.0.4', '10.0.5', '10.1', '10.1.3', '10.1.3.5.6', '10.1.3.6', '10.1.4', '10.1.4.6', '10.1.5', '10.11', '10.11.3', '10.11.3.5', '10.2', '10.2.3', '10.2.4', '10.2.5', '10.3', '11', '11.0', '11.0.4', '11.0.4.6', '11.0.4.7', '11.0.5', '11.1', '11.1.4', '11.1.4.5', '11.1.5', '11.1.6', '11.2', '11.2.4', '11.2.4.6', '11.2.4.7', '11.2.5', '11.2.5.7', '11.2.6', '11.3', '11.3.5', '11.3.6', '11.4', '11.4.5', '2', '2.3', '2.3.7', '2.3.7.10', '2.3.7.9', '2.4', '2.4.5', '2.4.5.9', '2.4.6.9.11', '2.4.7', '2.4.7.10', '2.4.8', '2.4.9', '2.5', '2.5.7', '

## Preprocessing

We have 358 different categories, we map each of them to an integer using a vocabulary (just like if they are words):

In [8]:
n_vocab = len(pitchnames)

note_to_int = dict((note, number) for number, note in enumerate(pitchnames))
int_to_note = dict((number, note) for number, note in enumerate(pitchnames))

We create input sequences of 100 length and we input it to a network, and based on this 100 notes and chords we want to predict the next element of the sequence. 

We use one-hot encoding for the target sequence.

In [24]:
sequence_length = 100

network_input = []
network_output = []

for i in range(0, len(notes) - sequence_length, 1):
    sequence_in  = notes[i:i + sequence_length]
    sequence_out = notes[i + sequence_length]
    network_input.append([note_to_int[char] for char in sequence_in])
    network_output.append(note_to_int[sequence_out])

n_patterns = len(network_input)

network_input  = np.reshape(network_input, (-1, sequence_length, 1)) / float(n_vocab)
network_output = np_utils.to_categorical(network_output)

## The model

<img src="imgs/music_gen.png" width="50%" />

In [46]:
model = Sequential()
model.add(LSTM(512, input_shape=(network_input.shape[1], network_input.shape[2]),return_sequences=True))
model.add(Dropout(0.25))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.25))
model.add(LSTM(512))
model.add(Dropout(0.25))
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Dense(n_vocab))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [47]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_37 (LSTM)               (None, 100, 512)          1052672   
_________________________________________________________________
dropout_37 (Dropout)         (None, 100, 512)          0         
_________________________________________________________________
lstm_38 (LSTM)               (None, 100, 512)          2099200   
_________________________________________________________________
dropout_38 (Dropout)         (None, 100, 512)          0         
_________________________________________________________________
lstm_39 (LSTM)               (None, 512)               2099200   
_________________________________________________________________
dropout_39 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 256)               131328    
__________

### Training

In [None]:
checkpoint = ModelCheckpoint(
        "./piano_weights/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5",
        monitor='loss',
        save_best_only=True,
)

model.fit(network_input, network_output, epochs=200, batch_size=64, callbacks=[checkpoint])

In [48]:
model.load_weights("musicgen_weight.hdf5")

## Generate song

In [60]:
start = np.random.randint(0, len(network_input)-1)

pattern = network_input[start]
prediction_output = []

for note_index in range(500):
    prediction_input = np.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)

    prediction = model.predict(prediction_input, verbose=0)

    index = np.random.choice(n_vocab, 1, p=prediction[0,:])[0]
    
    prediction_output.append(int_to_note[index])

    pattern = np.append(pattern, index)[1:]

## Create song file

### Remove lot of repetition

In [61]:
processed = []
pit = iter(prediction_output)
for c in pit:
    x = c
    same = []
    while c==x:
        same.append(c)
        c = next(pit)
    if len(same)>2:
        processed.extend(same[0:2])
    else:
        processed.extend(same)
    processed.append(c)

StopIteration: 

In [62]:
offset = 0
output_notes = []

for pattern in prediction_output:
    # pattern is a chord
    if ('.' in pattern) or pattern.isdigit():
        notes_in_chord = pattern.split('.')
        notes = []
        for current_note in notes_in_chord:
            new_note = note.Note(int(current_note))
            new_note.storedInstrument = instrument.Piano()
            notes.append(new_note)
        new_chord = chord.Chord(notes)
        new_chord.offset = offset
        output_notes.append(new_chord)
    # pattern is a note
    else:
        new_note = note.Note(pattern)
        new_note.offset = offset
        new_note.storedInstrument = instrument.Piano()
        output_notes.append(new_note)

    # increase offset each iteration so that notes do not stack
    offset += 0.5

midi_stream = stream.Stream(output_notes)

midi_stream.write('midi', fp="test.mid")

'test.mid'

The notebook is based on this <a href="https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5">article</a>. 