#### Setting up Kaggle Dataset (run this section only while working in colab environment)

In [None]:
#! pip install -q kaggle

In [None]:
#from google.colab import files

In [None]:
#files.upload()

In [None]:
#!unzip "/content/musicnet_midis.zip"

In [None]:
#!unzip ""

#### kaggle data collection (not needed while working in kaggle because the data is already in kaggle)

In [None]:
#! mkdir ~/.kaggle

In [None]:
#! cp kaggle.json ~/.kaggle/

In [None]:
#! chmod 600 ~/.kaggle/kaggle.json

In [None]:
#! kaggle datasets list

In [None]:
#! kaggle datasets download -d imsparsh/musicnet-dataset

In [None]:
#! unzip /content/musicnet-dataset.zip

#### Peeping into data.

In [None]:
!pip install mido

In [None]:
from mido import MidiFile, MidiTrack, Message
import mido
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.models import load_model
import os
from tqdm import *

In [None]:
!pip install pygame

In [None]:
!pip install IPython

In [None]:
!pip install music21

In [None]:
import pygame
import IPython
import matplotlib.pyplot as plt
import librosa.display
from IPython import *
from music21 import *
from music21 import converter, instrument, note, chord, stream, midi #just so that I remember what to use, you could delete this line.
import glob
import time
import numpy as np
import keras.utils as utils
import pandas as pd

In [None]:
# Load one Beethoven MIDI file with mido and display its list of tracks.
mid=MidiFile('../input/musicnet-dataset/musicnet_midis/musicnet_midis/Beethoven/2313_qt15_1.mid',clip=True)

# Last expression of the cell, so the notebook renders the track list.
mid.tracks

In [None]:
# Print every message stored in the track at index 1 of the loaded file.
for message in mid.tracks[1]:
    print(message)

#### Music21 and MuseScore set up

Set up the environment so that the notes can be rendered as traditional sheet music. The cells below install MuseScore and start a virtual display (Xvfb) with Linux tools, so that music21 can render scores even when the notebook runs on a machine without a screen.

In [None]:
!pip install --upgrade music21

In [None]:
!add-apt-repository ppa:mscore-ubuntu/mscore-stable -y
!apt-get update
!apt-get install musescore -y

In [None]:
!apt-get install xvfb -y

In [None]:
!sh -e /etc/init.d/x11-common start

In [None]:
import os
# Point X11 clients at the virtual display started in a later cell.
# Assigning through os.environ (instead of calling os.putenv directly) updates
# both the process environment and the os.environ mapping, so Python code that
# reads os.environ["DISPLAY"] sees the same value that child processes inherit.
os.environ['DISPLAY'] = ':99.0'

In [None]:
!start-stop-daemon --start --pidfile /var/run/xvfb.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -screen 0 1024x768x24 -ac +extension GLX +render -noreset

In [None]:
from music21 import *
# Configure music21's per-user settings for score rendering.
us = environment.UserSettings()
# Executable music21 should call to render PNGs
# (assumes the apt install above put MuseScore at this path — TODO confirm).
us['musescoreDirectPNGPath'] = '/usr/bin/mscore'
# Directory music21 uses for its temporary files.
us['directoryScratch'] = '/tmp'


#### Convert between Midi file and numpy array in melody format.

Music is more complex than regular text that we feed into sequential model. The quantisation of music will give us Notes.

Combinations of notes become chords. And together in a pattern they make beautiful melody.

We are using Music21 library to read MIDI music files and convert to an array of notes. And also to convert the predicted note array to music stream.

* All complex rhythms are simplified to sixteenth-note versions.
* Chords are simplified to the highest note.

This encoding and decoding is borrowed from Melody-RNN's code. 

In [None]:
# Melody-RNN format is a sequence of small integer codes indicating the following:
# MELODY_NOTE_ON = [0, 127] # (note on at that MIDI pitch)
MELODY_NOTE_OFF = 128 # (stop playing all previous notes)
MELODY_NO_EVENT = 129 # (no change from previous event)
# Each element in the sequence lasts for one sixteenth note.
# This can encode monophonic music only.
# The codes 128 and 129 do not fit in a signed 8-bit integer; streamToNoteArray stores them as int16.

def streamToNoteArray(stream):
    """
    Convert a Music21 stream to a numpy int16 array in Melody-RNN format:
        0-127 - note on at specified pitch
        128   - note off
        129   - no event

    Every array element covers one sixteenth note (0.25 quarter lengths).
    Chords are reduced to their highest pitch, and when several notes start on
    the same sixteenth only the highest one is kept.

    Args:
        stream: a music21 stream (score, part, ...) exposing ``.flat``.

    Returns:
        1-D ``np.int16`` array of length ``total_length + 1``, where
        ``total_length`` is the stream's highest time measured in sixteenths.
        A stream without notes or chords yields an array of "no event" codes.
    """
    flat_stream = stream.flat
    # Plain int() instead of np.int: that alias was removed in NumPy 1.24.
    total_length = int(np.round(flat_stream.highestTime / 0.25)) # in semiquavers

    # Part one, extract [position, duration, pitch] triples from the stream
    stream_list = []
    for element in flat_stream:
        if isinstance(element, note.Note):
            pitch = element.pitch.midi
        elif isinstance(element, chord.Chord):
            pitch = element.sortAscending().pitches[-1].midi # highest chord tone
        else:
            continue
        stream_list.append([np.round(element.offset / 0.25), np.round(element.quarterLength / 0.25), pitch])

    # Array full of "no event" by default.
    output = np.full(total_length + 1, MELODY_NO_EVENT, dtype=np.int16)
    if not stream_list:
        # Nothing to place; the column extraction below would fail on an empty array.
        return output

    np_stream_list = np.array(stream_list, dtype=int)
    df = pd.DataFrame({'pos': np_stream_list.T[0], 'dur': np_stream_list.T[1], 'pitch': np_stream_list.T[2]})
    df = df.sort_values(['pos','pitch'], ascending=[True, False]) # highest pitch first within each position
    df = df.drop_duplicates(subset=['pos']) # keep only that highest pitch

    # Part two, convert into a sequence of note events.
    # Walking the (position-sorted) rows once replaces filtering the whole
    # frame for every semiquaver, and writes in the same order.
    for pos, dur, pitch in zip(df['pos'], df['dur'], df['pitch']):
        if not 0 <= pos < total_length:
            continue # only positions before the final slot receive events
        output[pos] = pitch # set note on
        # Rounding position and duration separately can overshoot the rounded
        # total length by one step, so clamp the note-off to the last slot.
        output[min(pos + dur, total_length)] = MELODY_NOTE_OFF
    return output


def noteArrayToDataFrame(note_array):
    """
    Turn a Melody-RNN code sequence into a dataframe of events.

    "No event" entries are dropped; each remaining row keeps its code and gets
    a duration in quarter lengths equal to the distance (in sixteenth steps,
    times 0.25) to the next remaining row. The last row, which has no
    successor, receives 0.25.
    """
    frame = pd.DataFrame({"code": note_array})
    frame['offset'] = frame.index
    frame['duration'] = frame.index
    events = frame.loc[frame.code != MELODY_NO_EVENT]
    # Gap to the following event, converted from steps to quarter-note fractions.
    step_gap = events.duration.diff(-1) * -1 * 0.25
    events = events.assign(duration=step_gap).fillna(0.25)
    return events[['code','duration']]


def noteArrayToStream(note_array):
    """
    Convert a numpy array containing a Melody-RNN sequence into a music21 stream.

    Every event returned by noteArrayToDataFrame becomes one element of the
    stream: note-off (and, should one be present, no-event) codes become
    rests, every other code becomes a note at that MIDI pitch. The element's
    quarterLength is the duration computed by noteArrayToDataFrame.

    Args:
        note_array: 1-D sequence of Melody-RNN codes.

    Returns:
        A music21 ``stream.Stream`` with the notes and rests appended in order.
    """
    df = noteArrayToDataFrame(note_array)
    melody_stream = stream.Stream()
    for _, row in df.iterrows():
        # iterrows() returns each row as a single Series, so the integer code
        # arrives as a float next to the float duration; normalise it once.
        code = int(row.code)
        if code in (MELODY_NO_EVENT, MELODY_NOTE_OFF):
            new_note = note.Rest() # bit of an oversimplification, doesn't produce long notes.
        else:
            new_note = note.Note(code)
        new_note.quarterLength = row.duration
        melody_stream.append(new_note)
    return melody_stream


#### Showing the Notes

In [None]:
# Parse one MIDI file with music21 and render it as a score.
wm_mid = converter.parse("../input/musicnet-dataset/musicnet_midis/musicnet_midis/Beethoven/2313_qt15_1.mid")
wm_mid.show()
# Round trip: encode the score as a Melody-RNN array, print it,
# then decode the array back into a stream and render that.
wm_mel_rnn = streamToNoteArray(wm_mid)
print(wm_mel_rnn)
noteArrayToStream(wm_mel_rnn).show()

#### Model code that is simple. 

This implementation is for the simpler models. You can skip it and go straight to the complex one; this section is purely experimental.

In [None]:
# Collect the pitch of every note_on message from up to n Beethoven MIDI files.
note_on=[]
n=50
beethoven_dir='../input/musicnet-dataset/musicnet_midis/musicnet_midis/Beethoven'
# List the directory once (the listing is loop-invariant) and sort it so the
# selected files are reproducible; slicing instead of indexing means a folder
# with fewer than n files is not an error.
for file_name in sorted(os.listdir(beethoven_dir))[:n]:
    mid=MidiFile(os.path.join(beethoven_dir,file_name),clip=True)
    for track in mid.tracks:
        for msg in track:
            # Meta messages have their own type names, so checking the type is
            # enough; no need to parse the message's string form.
            # NOTE(review): note_on messages with velocity 0 are still counted,
            # as before — confirm whether they should be excluded.
            if msg.type=='note_on':
                note_on.append(msg.note)

In [None]:
# Build the training pairs: each example is a window of 20 consecutive
# pitches and its label is the pitch that immediately follows the window.
training_data=[note_on[end-20:end] for end in range(20,len(note_on))]
labels=note_on[20:]

In [None]:

# Inspect the first window and its label (the notebook only displays the last expression).
training_data[0]
labels[0]

In [None]:
# Distinct pitch values that occur as labels.
different_labels=set(labels)

Normal LSTM model.

In [None]:
# Two stacked LSTM layers followed by dense layers that output a single value,
# trained as a regression (mean squared error).
# NOTE(review): input_shape is (10,1) but the windows built above hold 20 steps — confirm the intended length.
# NOTE(review): the fit call further down trains `model`, not `model1`.
model1= Sequential()

model1.add(LSTM(200,input_shape=(10,1),unroll=True,return_sequences=True))
model1.add(Dropout(0.4))
model1.add(LSTM(100))
model1.add(Dense(100,'relu'))
model1.add(Dropout(0.2))
model1.add(Dense(1,'relu'))

model1.compile(loss='MSE',optimizer='adam')


WAVENET without residual and skip connection. (You can implement it. It was experimental for me)

In [None]:
from keras.layers import *
from keras.models import *
from keras.callbacks import *
import keras.backend as K

# Stack of causal, dilated 1-D convolutions without residual or skip connections
# (author's note: those connections only help the model converge faster).
model = Sequential()
    
#embedding layer
# NOTE(review): unique_x is not defined anywhere in the visible notebook — confirm where it comes from.
model.add(Embedding(len(unique_x), 100, input_length=32,trainable=True)) 

model.add(Conv1D(64,3, padding='causal',activation='relu'))
model.add(Dropout(0.2))
model.add(MaxPool1D(2))
    
# dilation doubles with each block: 1 (default), 2, 4
model.add(Conv1D(128,3,activation='relu',dilation_rate=2,padding='causal'))
model.add(Dropout(0.2))
model.add(MaxPool1D(2))

model.add(Conv1D(256,3,activation='relu',dilation_rate=4,padding='causal'))
model.add(Dropout(0.2))
model.add(MaxPool1D(2))
          
#model.add(Conv1D(256,5,activation='relu'))    
model.add(GlobalMaxPool1D())
    
model.add(Dense(256, activation='relu'))
# NOTE(review): a single relu output is paired with sparse_categorical_crossentropy below;
# that loss normally expects one output per class — confirm the intended head.
model.add(Dense(1, activation='relu'))
    
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

model.summary()

In [None]:
# Stop training once val_loss has not improved for 20 epochs.
# NOTE(review): no import visible above this cell binds the name `keras` itself — confirm it is in scope.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, verbose=0)

# Turn the Python lists into arrays; the windows get a trailing axis of size 1.
training_data=np.array(training_data)
training_data=training_data.reshape((training_data.shape[0],training_data.shape[1],1))
labels=np.array(labels)

# Hold out 5% of the examples for validation.
# NOTE(review): train_test_split is not imported in the visible notebook — presumably sklearn.model_selection; confirm.
X_train, X_test, y_train, y_test = train_test_split(training_data, labels, test_size=0.05, random_state=42)

In [None]:
# Display the shape of the training inputs.
X_train.shape

In [None]:
# Train `model` with early stopping, validating on the held-out split.
# NOTE(review): `strategy` is not defined in the visible notebook — presumably a tf.distribute strategy; confirm.
model.fit(X_train,y_train,epochs=200,batch_size=32 * strategy.num_replicas_in_sync,
          validation_data=(X_test,y_test),callbacks=[early_stop])

In [None]:
# Save the trained model to an HDF5 file in the working directory.
model.save('musicnetgen.h5')

This small encoder and decoder code comes from Puru Behl. I do not use it with the model I actually run, so it only matters if you use the simpler model above; otherwise, move on to the next section.

In [None]:
# Prediction: extend the first training window by n values predicted one at a time.

n=200
starter_notes=training_data[0]
x=training_data[0].reshape(1,20,1)
tune=list(training_data[0].reshape(-1,))
# NOTE(review): the first prediction is fed 20 steps while later ones are fed 10 (see below) — confirm the intended window length.
for i in range(n) :
    pred=int(model.predict(x)[0][0])
    # If the prediction repeats the previous value, replace it with one of
    # three fixed pitches chosen at random.
    if round(pred)==round(tune[-1]):
        p=np.random.choice(['a','b','c'])
        if p=='a':
            pred=65
        elif p=='b':
            pred=60
        else:
            pred=70
    tune.append(pred)
    # The next model input is the last 10 values of the tune so far.
    x=tune[-10:]
    x=np.array(x)
    x=x.reshape(1,10,1)
    
# Store the finished tune as a list of float32 values.
tune=list(np.array(tune).astype('float32'))

In [None]:
#encoder
# Turn every value of `tune` into a music21 note or chord object, spaced half a
# quarter note apart, and collect them in `output_notes`.

offset = 0
output_notes = []
# create note and chord objects based on the values generated by the model
for patterns in tune:
    # `tune` holds float32 pitches whose str() looks like '60.0'. The '.' split
    # below would read that as the two-note chord 60 + 0, so numeric values are
    # rounded to an integer pitch before being turned into text.
    if isinstance(patterns, (int, float, np.integer, np.floating)):
        pattern=str(int(round(float(patterns))))
    else:
        pattern=str(patterns)
    # pattern is a chord ('4.7.11') or a bare MIDI number ('60')
    if ('.' in pattern) or pattern.isdigit():
        notes_in_chord = pattern.split('.')
        notes = []
        for current_note in notes_in_chord:
            new_note = note.Note(int(current_note))
            new_note.storedInstrument = instrument.Piano()
            notes.append(new_note)
        new_chord = chord.Chord(notes)
        new_chord.offset = offset
        output_notes.append(new_chord)
    # pattern is a note name
    else:
        new_note = note.Note(pattern)
        new_note.offset = offset
        new_note.storedInstrument = instrument.Piano()
        output_notes.append(new_note)
    # increase offset each iteration so that notes do not stack
    offset += 0.5

#### Code complex

This is the model I am running. 

In [None]:
import time
# Convert the first part of every Beethoven MIDI file into a Melody-RNN array
# and save all arrays together in one .npz file.
midi_files = glob.glob("../input/musicnet-dataset/musicnet_midis/musicnet_midis/Beethoven/*.mid")
if not midi_files:
    print("No MIDI files found; the saved dataset will be empty.")

training_arrays = []
for f in midi_files:
    start = time.perf_counter()
    try:
        s = converter.parse(f)
    except Exception as exc:
        # A file that cannot be parsed is skipped, but reported rather than
        # silently dropped.
        print("Skipped:", f, "-", exc)
        continue
    if len(s.parts) == 0:
        print("Skipped:", f, "- no parts found")
        continue
    arr = streamToNoteArray(s.parts[0]) # just extract first voice
    training_arrays.append(arr)
    print("Converted:", f, "it took", time.perf_counter() - start)

# The melodies have different lengths. Building the ragged array implicitly with
# np.array(...) raises on NumPy >= 1.24, so fill an object array explicitly.
training_dataset = np.empty(len(training_arrays), dtype=object)
for position, melody in enumerate(training_arrays):
    training_dataset[position] = melody
np.savez('melody_training_dataset.npz', train=training_dataset)

In [None]:
# Training Hyperparameters:
VOCABULARY_SIZE = 130 # known 0-127 notes + 128 note_off + 129 no_event
SEQ_LEN = 30  # number of events in each model input sequence
BATCH_SIZE = 64
HIDDEN_UNITS = 256  # used for both the embedding size and the LSTM width below
EPOCHS = 30
SEED = 2345  # 2345 seems to be good.
np.random.seed(SEED)

## Load up some melodies I prepared earlier...
# NOTE(review): allow_pickle=True unpickles the stored arrays; only load files from a trusted source.
with np.load('../input/melody-training-datasetnpz/melody_training_dataset.npz', allow_pickle=True) as data:
    train_set = data['train']

print("Training melodies:", len(train_set))

In [None]:
def slice_sequence_examples(sequence, num_steps):
    """Slice a sequence into overlapping windows of length num_steps.

    One window starts at every index from 0 up to, but not including,
    len(sequence) - num_steps - 1; the result is empty when that bound
    is zero or negative.
    """
    window_count = len(sequence) - num_steps - 1
    return [sequence[start:start + num_steps] for start in range(window_count)]

def seq_to_singleton_format(examples):
    """
    Split every example into (all elements but the last, the last element).

    Returns a tuple (xs, ys): xs lists the truncated examples and ys lists the
    final elements, both in the order of `examples`.
    """
    pairs = [(window[:-1], window[-1]) for window in examples]
    xs = [head for head, _ in pairs]
    ys = [last for _, last in pairs]
    return (xs, ys)

# Build the supervised training pairs X and y.
# Every melody is cut into overlapping windows of SEQ_LEN+1 events;
# the first SEQ_LEN events of a window form X and its final event is y.

# Cut all melodies into windows:
slices = []
for melody in train_set:
    slices.extend(slice_sequence_examples(melody, SEQ_LEN+1))

# Separate inputs from targets and convert both into numpy arrays:
inputs, targets = seq_to_singleton_format(slices)
X = np.array(inputs)
y = np.array(targets)

# Report the size of the training corpus:
print("Total Training Corpus:")
for label, array in (("X:", X), ("y:", y)):
    print(label, array.shape)
print()

# Show a single example pair:
print("Looking at one example:")
print("X:", X[95])
print("y:", y[95])
# Note: music data is sparser than text; most codes are 129 (do nothing)
# and any particular note-on is rare.
# That makes a melody-rnn somewhat harder to train.

In [None]:
# Do some stats on the corpus.
# Join all melodies into one long array of codes.
all_notes = np.concatenate(train_set)
print("Number of notes:")
print(all_notes.shape)
all_notes_df = pd.DataFrame(all_notes)
print("Notes that do appear:")
# unique holds the distinct codes, counts how often each occurs.
unique, counts = np.unique(all_notes, return_counts=True)
print(unique)
print("Notes that don't appear:")
# Codes 0..128 that never occur (129, "no event", is outside this range).
print(np.setdiff1d(np.arange(0,129),unique))

print("Plot the relative occurences of each note:")
import matplotlib.pyplot as plt
%matplotlib inline

#plt.style.use('dark_background')
# Bar chart of code frequencies; the log scale keeps rare codes visible.
plt.bar(unique, counts)
plt.yscale('log')
plt.xlabel('melody RNN value')
plt.ylabel('occurences (log scale)')

##### Training RNN

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import load_model

# build the model: 2-layer LSTM network.
# Using Embedding layer and sparse_categorical_crossentropy loss function 
# to save some effort in preparing data.
# (Inputs and targets can stay as integer codes; no one-hot encoding is built.)

print('Build model...')
model_train = Sequential()
# Map each of the VOCABULARY_SIZE codes to a HIDDEN_UNITS-dimensional vector.
model_train.add(Embedding(VOCABULARY_SIZE, HIDDEN_UNITS, input_length=SEQ_LEN))

# LSTM part
# The first layer returns the whole sequence so the second LSTM can consume it.
model_train.add(LSTM(HIDDEN_UNITS, return_sequences=True))
model_train.add(LSTM(HIDDEN_UNITS))

# Project back to vocabulary
model_train.add(Dense(VOCABULARY_SIZE, activation='softmax'))
model_train.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model_train.summary()

In [None]:
# Train on the sliced corpus, then save the model to an HDF5 file
# (the decoding model below loads its weights from this file).
model_train.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS)
model_train.save("zeldic-rnn.h5")

In [None]:
# Build a decoding model (input length 1, batch size 1, stateful)
# It has the same layers as model_train, but the LSTMs are stateful so the
# model can be fed one code at a time during generation.
model_dec = Sequential()
model_dec.add(Embedding(VOCABULARY_SIZE, HIDDEN_UNITS, input_length=1, batch_input_shape=(1,1)))
# LSTM part
model_dec.add(LSTM(HIDDEN_UNITS, stateful=True, return_sequences=True))
model_dec.add(LSTM(HIDDEN_UNITS, stateful=True))

# project back to vocabulary
model_dec.add(Dense(VOCABULARY_SIZE, activation='softmax'))
model_dec.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model_dec.summary()
# set weights from training model
#model_dec.set_weights(model_train.get_weights())
# Load the trained weights from the file written after training.
model_dec.load_weights("zeldic-rnn.h5")

##### sampling from model

In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are raised to the power 1/temperature and renormalised;
    one index is then drawn from the resulting distribution with numpy's
    global random state.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: positive scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        The sampled index as a numpy integer.

    Raises:
        ValueError: if temperature is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    preds = np.asarray(preds).astype('float64')
    # log(0) is -inf, which correctly becomes probability 0 again below.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation yields NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## Sampling function

def sample_model(seed, model_name, length=400, temperature=1.0):
    '''Generate a code sequence from a music RNN, starting at a seed code.

    The seed is fed to the model, the next code is sampled from the predicted
    distribution and fed back in; this repeats `length` times. Returns a numpy
    array holding the seed followed by the `length` sampled codes.
    '''
    generated = [seed]
    current = seed
    for _ in range(length):
        step_input = np.reshape(np.array([current]), (1, 1))
        distribution = model_name.predict(step_input, verbose=0)[0]
        current = sample(distribution, temperature)
        generated.append(current)
    return np.array(generated)

In [None]:
model_dec.reset_states() # Start with LSTM state blank
# Seed with code 60 and sample 127 further codes: 128 sixteenth steps in total.
# NOTE(review): temperature=15.0 flattens the predicted distribution heavily — confirm this value is intended.
o = sample_model(60, model_dec, length=127, temperature=15.0) # generate 8 bars of melody

melody_stream = noteArrayToStream(o) # turn into a music21 stream
melody_stream.show() # show the score.


In [None]:
# Print the generated Melody-RNN code sequence.
print(o)

In [None]:
# Print every element of the generated stream, in order.
for element in melody_stream:
  print(element)

In [None]:
#sp = midi.realtime.StreamPlayer(melody_stream)
#sp.play()

In [None]:
# Wrap the generated melody in a new music21 stream used for MIDI export.
midi_stream = stream.Stream(melody_stream)

In [None]:
# Write the stream to a MIDI file in the working directory.
midi_stream.write('midi','melody_stream.mid')

In [None]:
pip install pydub

In [None]:
from collections import defaultdict
from mido import MidiFile
from pydub import AudioSegment
from pydub.generators import Sine

def note_to_freq(note, concert_A=440.0):
  '''
  Return the frequency in Hz of a MIDI note number in equal temperament.

  MIDI note 69 maps to concert_A and every 12 notes double the frequency.
  Formula from wikipedia: http://en.wikipedia.org/wiki/MIDI_Tuning_Standard#Frequency_values
  '''
  semitones_from_a4 = note - 69
  octaves_from_a4 = semitones_from_a4 / 12.0
  return (2.0 ** octaves_from_a4) * concert_A

# Render the generated MIDI file as sine tones and export the result as WAV.
mid = MidiFile("./melody_stream.mid")
# Silent canvas as long as the whole file; notes are overlaid onto it below.
output = AudioSegment.silent(mid.length * 1000.0)


def _file_tempo_bpm(midi_file, default_bpm=120.0):
  '''Return the tempo (bpm) of the first set_tempo message, or the MIDI default.'''
  for tempo_track in midi_file.tracks:
    for tempo_msg in tempo_track:
      if tempo_msg.type == 'set_tempo':
        # set_tempo stores microseconds per beat
        return 60000000.0 / tempo_msg.tempo
  return default_bpm


# Use the tempo stored in the file. mid.length (the canvas above) is computed
# from that tempo, so a different hard-coded bpm would place notes on a
# different time scale and the end of the melody would fall off the canvas.
# NOTE: only the first tempo is honoured; mid-file tempo changes are ignored.
tempo = _file_tempo_bpm(mid) # bpm

def ticks_to_ms(ticks):
  '''Convert a MIDI tick count to milliseconds at the file's resolution and `tempo`.'''
  tick_ms = (60000.0 / tempo) / mid.ticks_per_beat
  return ticks * tick_ms
  

for track in mid.tracks:
  # position of rendering in ms
  current_pos = 0.0

  current_notes = defaultdict(dict)
  # current_notes = {
  #   channel: {
  #     note: (start_time, message)
  #   }
  # }
  
  for msg in track:
    # msg.time is the delta to the previous message, in ticks
    current_pos += ticks_to_ms(msg.time)

    if msg.type == 'note_on' and msg.velocity > 0:
      current_notes[msg.channel][msg.note] = (current_pos, msg)

    # A note_on with velocity 0 is the conventional alternative to note_off.
    elif msg.type in ('note_on', 'note_off'):
      started = current_notes[msg.channel].pop(msg.note, None)
      if started is None:
        continue # note-off without a matching note-on: nothing to render
      start_pos, start_msg = started

      duration = current_pos - start_pos
      tone_ms = duration - 50 # leave a short gap before the next note
      if tone_ms <= 0:
        continue # too short to render a tone

      signal_generator = Sine(note_to_freq(msg.note))
      rendered = signal_generator.to_audio_segment(duration=tone_ms, volume=-20).fade_out(100).fade_in(30)

      output = output.overlay(rendered, start_pos)

output.export("melody_stream.wav", format="wav")

In [None]:
# Embed an audio player for the rendered WAV file in the notebook output.
IPython.display.Audio('./melody_stream.wav')

In [None]:
# figure out where the user settings are kept.
# from music21 import *
# us = environment.UserSettings()
# us.getSettingsPath()