In [1]:
import mido
import os
import numpy as np
from scipy import stats
from os import listdir
from os.path import isfile, join
import tensorflow as tf
from tensorflow.contrib import rnn
import time

import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = (12, 8)

In [2]:
TICKS_PER_BEAT = 480
TEMPO = int(mido.bpm2tempo(120))

In [3]:
%run get_training_data.py

### Create training data

In [4]:
def get_training_data(num):
    """Generate training data array for all files in "midis_processed/" directory.
       
    Returns:
        Numpy array of training data.
    """
    training_data = []
    for filename in os.listdir('midis_processed/')[:num]:
        if filename.endswith(".mid"):
            vector = midi_to_vector('midis_processed/' + filename)
            if len(vector) > 50:
                training_data.append(vector)
            else:
                print("Faulty training data: " + filename + ". len(vector)=" + str(len(vector)))
    return np.array(training_data)

In [5]:
training_data = get_training_data(100)

Faulty training data: Adventure_Island_II-Eggplant.mid. len(vector)=0
Faulty training data: Adventure_Island_II-Game_Over.mid. len(vector)=0
Faulty training data: Adventure_Island_II-Victory.mid. len(vector)=0
Faulty training data: Alfred_Chicken_Level_1.mid. len(vector)=0
Faulty training data: alien3sb.mid. len(vector)=0
Faulty training data: alphago.mid. len(vector)=0
Faulty training data: Amagon-Death.mid. len(vector)=0
Faulty training data: Amagon-Start_2.mid. len(vector)=0


In [None]:
training_data = np.array([np.array(x) for x in training_data])
training_data

### Parameters

In [7]:
vocab_size = len(training_data[0][0])
vocab_size

1124

In [8]:
# Parameters
learning_rate = 0.001
training_iters = 5000
display_step = 10
n_input = 50

# number of units in RNN cell
n_hidden = 512

# tf Graph input
x = tf.placeholder("float", [n_input, vocab_size])
y = tf.placeholder("int32", [1, n_input])

# RNN output node weights and biases
softmax_w = tf.Variable(tf.random_normal([n_hidden, vocab_size]))
softmax_b = tf.Variable(tf.random_normal([vocab_size]))

max_grad_norm = 5

In [9]:
# tf Graph generative input
x_generation = tf.placeholder("float", [1, vocab_size])

## Create tensor flow graph

In [10]:
rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden),rnn.BasicLSTMCell(n_hidden)])

In [11]:
x_split = tf.split(x, n_input, 0)
x_split

[<tf.Tensor 'split:0' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:1' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:2' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:3' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:4' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:5' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:6' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:7' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:8' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:9' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:10' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:11' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:12' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:13' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:14' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:15' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:16' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:17' shape=(1, 1124) dtype=float32>,
 <tf.Tensor 'split:1

In [12]:
outputs, states = rnn.static_rnn(rnn_cell, x_split, dtype=tf.float32)
outputs

[<tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_2:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_5:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_8:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_11:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_14:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_17:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_20:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_23:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_26:0' shape=(1, 512) dtype=float32>,
 <tf.Tensor 'rnn/rnn/multi_rnn_cell/cell

In [13]:
logits = [tf.nn.xw_plus_b(output, softmax_w, softmax_b) for output in outputs]

In [14]:
logits = tf.reshape(logits, [1, n_input, vocab_size])
logits

<tf.Tensor 'Reshape:0' shape=(1, 50, 1124) dtype=float32>

In [15]:
loss = tf.contrib.seq2seq.sequence_loss(logits, y, weights=tf.ones([1, n_input], dtype=tf.float32), # check this
                                        average_across_timesteps=False, average_across_batch=True)
loss

<tf.Tensor 'sequence_loss/truediv:0' shape=(50,) dtype=float32>

In [16]:
cost = tf.reduce_sum(loss)
cost

<tf.Tensor 'Sum:0' shape=() dtype=float32>

In [17]:
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)
optimizer

<tf.Operation 'RMSProp' type=NoOp>

In [18]:
tvars = tf.trainable_variables()
tvars

[<tf.Variable 'Variable:0' shape=(512, 1124) dtype=float32_ref>,
 <tf.Variable 'Variable_1:0' shape=(1124,) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/kernel:0' shape=(1636, 2048) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_0/basic_lstm_cell/bias:0' shape=(2048,) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_1/basic_lstm_cell/kernel:0' shape=(1024, 2048) dtype=float32_ref>,
 <tf.Variable 'rnn/multi_rnn_cell/cell_1/basic_lstm_cell/bias:0' shape=(2048,) dtype=float32_ref>]

## Run tensor flow session

In [19]:
session = tf.Session()

In [20]:
def get_midi_xy(midi_file_number):
    offset = np.random.randint(0, len(training_data[midi_file_number]) - n_input - 1)
    midi_x = training_data[midi_file_number][offset:offset+n_input]
    midi_y = training_data[midi_file_number][1 + offset:1 + offset+n_input].dot(range(vocab_size)).reshape(1, n_input)
    return midi_x, midi_y

In [21]:
def seconds_to_hoursminsecstr(seconds):
    hours = seconds//3600
    seconds -= 3600*hours
    minutes = seconds//60
    seconds -= 60*minutes
    if hours < 10: hours = "0" + str(hours)
    if minutes < 10: minutes = "0" + str(minutes)
    if seconds < 10: seconds = "0" + str(seconds)
        
    return str(hours) + ":" + str(minutes) + "." + str(seconds)

In [22]:
training_loss_list = []

init = tf.global_variables_initializer()
session.run(init)
step = 0
loss_total = 0

start_time = time.time()

while step < training_iters:
    for i in range(len(training_data)):
        midi_x, midi_y = get_midi_xy(i)

        _, loss = session.run([optimizer, cost], feed_dict={x: midi_x, y: midi_y})
        loss_total += loss
        if (step+1) % display_step == 0:
            print("Iter=" + str(step+1) + ", Average Loss={:.6f}, Time Elapsed={}".format(loss_total/display_step, 
                 seconds_to_hoursminsecstr(int(time.time()-start_time))))
            loss_total = 0
        training_loss_list.append(loss_total/display_step)
        step += 1

Iter=10, Average Loss=351.220113, Time Elapsed=00:00.13
Iter=20, Average Loss=345.714005, Time Elapsed=00:00.28
Iter=30, Average Loss=378.550507, Time Elapsed=00:00.42
Iter=40, Average Loss=337.493622, Time Elapsed=00:00.55
Iter=50, Average Loss=379.733463, Time Elapsed=00:01.07
Iter=60, Average Loss=320.364847, Time Elapsed=00:01.21
Iter=70, Average Loss=357.275253, Time Elapsed=00:01.35
Iter=80, Average Loss=350.950595, Time Elapsed=00:01.49
Iter=90, Average Loss=337.162985, Time Elapsed=00:02.05
Iter=100, Average Loss=292.205197, Time Elapsed=00:02.20
Iter=110, Average Loss=311.932263, Time Elapsed=00:02.36
Iter=120, Average Loss=309.456189, Time Elapsed=00:02.52
Iter=130, Average Loss=283.006604, Time Elapsed=00:03.10
Iter=140, Average Loss=303.543643, Time Elapsed=00:03.25
Iter=150, Average Loss=269.062836, Time Elapsed=00:03.42
Iter=160, Average Loss=302.218753, Time Elapsed=00:03.58
Iter=170, Average Loss=297.125015, Time Elapsed=00:04.13
Iter=180, Average Loss=308.622328, Time 

KeyboardInterrupt: 

## Generate music from trained model

In [23]:
N_GENERATED_NOTE_VECTORS = 1000

In [24]:
outputs_generation, states_generation = rnn.static_rnn(rnn_cell, [x_generation], dtype=tf.float32)
outputs_generation

[<tf.Tensor 'rnn/rnn/multi_rnn_cell/cell_1/cell_1/basic_lstm_cell/mul_152:0' shape=(1, 512) dtype=float32>]

In [25]:
logits_generation = tf.nn.xw_plus_b(outputs_generation[0], softmax_w, softmax_b)
logits_generation

<tf.Tensor 'xw_plus_b_50:0' shape=(1, 1124) dtype=float32>

In [26]:
seed_note = np.zeros(vocab_size)
seed_note[0] = 1
seed_note = np.reshape(seed_note, [1, vocab_size])

generated_music = [seed_note]
for i in range(N_GENERATED_NOTE_VECTORS):
    onehot_pred = session.run(logits_generation, feed_dict={x_generation: generated_music[-1]})

    next_note = np.zeros(vocab_size)
    next_note[np.argmax(onehot_pred, 1)] = 1
    next_note = np.reshape(next_note,  [1, vocab_size])
    
    generated_music.append(next_note)

# Convert generated music to midi

In [27]:
def get_track_sequences(vector_sequence):
    """Get note sequences for each track from vector sequence output from RNN.

    Args:
       vector_sequence: List of one-hot vectors containing 128 * NUM_TRACKS note_on events, 128 * NUM_TRACKS note_off events,
        and NUM_TIMESHIFTS timeshift events in intervals of 10 ms each.
       
    Returns:
        List of track vector sequences.
    """
    track_sequences = [[] for _ in range(NUM_TRACKS)]
    start_time = 0
    for vector in vector_sequence:
        index = vector.index(1)
        # vector is a time event.
        if index >= NUM_MIDI_PITCHES * 2 * NUM_TRACKS:
            time = (index - NUM_MIDI_PITCHES * 2 * NUM_TRACKS + 1) * TIMESHIFT_LENGTH
            start_time += time
        else:
            # vector is a note_off event.
            if index >= NUM_MIDI_PITCHES * NUM_TRACKS:
                note_type = 'note_off'
                index -= NUM_MIDI_PITCHES * NUM_TRACKS
            # vector is a note_on event.
            else:
                note_type = 'note_on'
            track_num = int(index / NUM_MIDI_PITCHES)
            note = index % NUM_MIDI_PITCHES
            track_sequences[track_num].append({"type": note_type, "note": note, "start_time": start_time})
            time_delay = 0
            
    return track_sequences
    

In [28]:
def get_midi_from_vector_sequence(track_sequences):
    """Get midi from track sequences.

    Args:
       track_sequences: List of vector sequences for each track.
       
    Returns:
        MidiFile object.
    """
    mid = mido.MidiFile()
    tracks = [mido.MidiTrack() for _ in range(NUM_TRACKS)]
    mid.tracks.extend(tracks)

    for i, ts in enumerate(track_sequences):
        prev_start_time = 0
        for event in ts:
            time = event["start_time"] - prev_start_time
            prev_start_time = event["start_time"]
            # I picked a random number for velocity.
            mid.tracks[i].append(mido.Message(event["type"], note=event["note"], velocity=50,
                                              time=int(mido.second2tick(time, TICKS_PER_BEAT, TEMPO))))
    
    return mid

In [29]:
track_sequence = get_track_sequences([list(x[0]) for x in generated_music])
track_sequence

[[{'note': 0, 'start_time': 0, 'type': 'note_on'},
  {'note': 44, 'start_time': 0.01, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.02, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.03, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.04, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.05, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.060000000000000005, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.07, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.08, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.09, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.09999999999999999, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.10999999999999999, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.11999999999999998, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.12999999999999998, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.13999999999999999, 'type': 'note_off'},
  {'note': 44, 'start_time': 0.15, 'type': 'note_off'},
  {'note': 44, 'st

In [30]:
midi = get_midi_from_vector_sequence(track_sequence)
midi

<midi file None type 1, 4 tracks, 501 messages>

# Save midi

In [31]:
midi.save('generated_music.mid')