## Incrementally add new voices to a chorale.
<p>The goals of this round:
    
- Take a chorale, keep three voices, and synthesize the fourth. Save the new voice in an array.
- Repeat: keep three voices, including the new one, discard one of the original ones, synthesize the missing one, and save it.
- Continue dropping a voice, creating a new one, and saving it. Do this for a while, always discarding the oldest voice, until you have a multi-voice chorale that sounds interesting and does not have too many wrong notes.
    
 </p>

In [36]:
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
import matplotlib.pyplot as plt
import pandas as pd
import mido
import time
from midi2audio import FluidSynth
from IPython.display import Audio, display
import os
import muspy
import piano 
import subprocess
from numpy.random import default_rng
rng = default_rng(42) # random seed in parens.

# run on the first CUDA device when one is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# short alias for torch's functional softmax
softmax = torch.nn.functional.softmax

# prefix prepended to output paths (see Chorale.play); '' means the current working directory
base_dir = ''
# file names for external tooling -- NOTE(review): not referenced in the visible part of this notebook
CSD_FILE = 'goldberg_aria1.csd'
NOTES_FILE = "goldberg_aria1.mac.csv"
LOGNAME = 'goldberg5.log'

In [37]:
# global dimensions shared by the model and the helper functions

I = 4  # voices per chorale
T = 2 * 16  # time steps per sample: two 4/4 measures of 1/16th notes
P = len(range(30, 86 + 1))  # distinct pitches: MIDI notes 30 through 86 inclusive
print(f'I voices: {I}, T sample length: {T}, P number of distinct pitches in the input chorales: {P}')

I voices: 4, T sample length: 32, P number of distinct pitches in the input chorales: 57


In [38]:
# function for converting piano-roll arrays of shape (voices, T) into midi files
# the input array has entries that are np.nan or 0 (representing a rest)
# or an integer between 1 and 127 inclusive
#
# Accepts pieces with an arbitrary number of voices,
# e.g. 16 voices by 264 notes, i.e. an array of shape (16, 264)
def _sounding_pitch(value):
    """Return value when it denotes a sounding note, or None for a rest.

    None, np.nan and 0 all count as rests.
    """
    if value is None or np.isnan(value) or value == 0:
        return None
    return value


def piano_roll_to_midi(piece):
    """
    Convert a piano roll into a mido.MidiFile with one track per voice.

    piece is a 2-D array of shape (voices, T).  piece[v, i] is the midi pitch
    of voice v at time step i, where one time step is a 1/16th note.  Entries
    that are np.nan or 0 are rests.

    Returns a mido object mid that you can convert to a midi file by calling
    its .save() method.  Track 0 holds the tempo; tracks 1..voices hold the
    notes of each voice in the order they appear in piece.

    Raises ValueError when piece is not 2-dimensional.
    """
    bpm = 50
    microseconds_per_beat = 60 * 1000000 / bpm
    # 480 ticks per beat (the mido default) and each time step is a 16th note
    ticks_per_step = 120

    piece = np.asarray(piece)
    if piece.ndim != 2:
        raise ValueError(f'piece must have shape (voices, T), got shape {piece.shape}')
    num_voices, num_steps = piece.shape

    mid = mido.MidiFile()

    # create a track containing tempo data
    metatrack = mido.MidiTrack()
    metatrack.append(mido.MetaMessage('set_tempo',
                                      tempo=int(microseconds_per_beat), time=0))
    mid.tracks.append(metatrack)

    # create one track per voice
    tracks = []
    for _ in range(num_voices):
        track = mido.MidiTrack()
        track.append(mido.Message(
            'program_change', program=0, time=0)) # choir aahs=52, piano = 0
        mid.tracks.append(track)
        tracks.append(track)

    past_pitches = [None] * num_voices  # pitch currently sounding in each voice, None = rest
    delta_time = [0] * num_voices       # ticks elapsed since the last message of each voice

    # add notes to the voice tracks: a message is only written when a voice changes pitch
    for i in range(num_steps):
        for v, track in enumerate(tracks):
            pitch = _sounding_pitch(piece[v, i])
            past = past_pitches[v]
            if pitch != past:
                if past is not None:
                    track.append(mido.Message('note_off', note=int(past),
                                              velocity=64, time=delta_time[v]))
                    delta_time[v] = 0
                if pitch is not None:
                    track.append(mido.Message('note_on', note=int(pitch),
                                              velocity=64, time=delta_time[v]))
                    delta_time[v] = 0
            past_pitches[v] = pitch
            delta_time[v] += ticks_per_step

    # release whatever is still sounding so the last note of each voice has an end
    for v, track in enumerate(tracks):
        if past_pitches[v] is not None:
            track.append(mido.Message('note_off', note=int(past_pitches[v]),
                                      velocity=64, time=delta_time[v]))

    return mid

In [39]:
class Chorale:
    """
    A class to store and manipulate an array self.arr that stores a chorale.

    self.arr is a copy of the input with shape (I, T): one row per voice, one
    column per 1/16th-note time step.  self.one_hot is the same data as a
    float array of shape (I, T, P) with a 1 at the pitch of each note.
    """
    def __init__(self, arr, subtract_30=False):
        # arr is an integer array of shape (I, T) with values in range(0, P),
        # or raw midi pitches when subtract_30 is True (30 is then subtracted
        # from the copy that is stored)
        self.arr = arr.copy()
        if subtract_30:
            self.arr -= 30

        # the one_hot representation of the array
        reshaped = self.arr.reshape(-1)
        self.one_hot = np.zeros((I*T, P))
        r = np.arange(I*T)
        self.one_hot[r, reshaped] = 1
        self.one_hot = self.one_hot.reshape(I, T, P)

    def to_image(self):
        # visualize the four tracks as images, one subplot per voice;
        # each image is flipped so that higher pitches are drawn at the top
        fig, axs = plt.subplots(1, 4)
        names = ('soprano', 'alto', 'tenor', 'bass')
        for ax, name, track in zip(axs, names, self.one_hot):
            ax.imshow(np.flip(track.transpose(), axis=0), cmap='hot', interpolation='nearest')
            ax.set_title(name)
        fig.set_figheight(5)
        fig.set_figwidth(15)
        return fig, axs

    def play(self, filename='midi_track.mid'):
        # display an in-notebook widget for playing audio
        # saves the midi file as a file named filename in base_dir/midi_files

        # piano_roll_to_midi indexes its argument as (voices, T), which is the
        # layout of self.arr, so the array is passed without transposing it
        midi_arr = self.arr.copy()
        midi_arr += 30  # back from model pitch indices to midi pitches
        midi = piano_roll_to_midi(midi_arr)
        midi.save(base_dir + 'midi_files/' + filename)
        play_midi('midi_files/' + filename,10)

    def elaborate_on_voices(self, voices, model):
        # voices is a set of voice indices drawn from 0, 1, 2, 3
        # create a mask that keeps the given voices fixed, fill every other
        # voice with random pitches and let the model re-harmonize those
        mask = np.zeros((I, T))
        y = np.random.randint(P, size=(I, T))
        for i in voices:
            mask[i] = 1
            y[i] = self.arr[i].copy()
        return harmonize(y, mask, model)

    # I think we could improve this scoring method. It's pretty lame.
    def score(self):
        # returns (consonance_score, note_score):
        #   consonance_score counts, over every time step and every pair of
        #     voices, the pairs whose interval (mod 12) is marked consonant
        #   note_score counts how often a voice changes pitch between
        #     consecutive time steps
        consonance_dict = {0: 1, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 0,
                           7: 1, 8: 1, 9: 1, 10: 0, 11: 0}
        # take the dimensions from the array itself rather than assuming 4 x 32
        n_voices, n_steps = self.arr.shape

        consonance_score = 0
        for k in range(n_steps):
            for i in range(n_voices):
                for j in range(i):
                    consonance_score += consonance_dict[((self.arr[i, k] - self.arr[j, k]) % 12)]

        note_score = int(np.count_nonzero(self.arr[:, 1:] != self.arr[:, :-1]))
        return consonance_score, note_score
        
# harmonize a melody
def harmonize(y, C, model):
    """
    Fill in the free entries of y by repeated resampling with the model.

    y is the starting array of pitches and C is an array of the same shape
    whose entries are 0 and 1.  Pitches where C==1 are kept; all the others are
    resampled over and over to generate new values.
    For example, to harmonize the soprano line, let y be random except y[0]
    contains the soprano line, let C[1:] be 0 and C[0] be 1.

    Returns the array produced by the last resampling step.
    """
    num_steps = int(2*I*T)
    alpha_max = .999
    alpha_min = .001
    eta = 3/4

    model.eval()
    with torch.no_grad():
        x = y
        for step in range(num_steps):
            # probability that a free entry is re-drawn; decreases linearly
            # with the step number and never drops below alpha_min
            p = np.maximum(alpha_min, alpha_max - step*(alpha_max-alpha_min)/(eta*num_steps))
            # 1 (with probability 1-p) means "keep the current value in this step"
            sampled_binaries = np.random.choice(2, size = C.shape, p=[p, 1-p])

            # mask for this step: the randomly kept entries plus everything fixed by C
            step_mask = C.copy()
            step_mask += sampled_binaries
            step_mask[C==1] = 1

            proposal = model.pred(x, step_mask)
            kept = step_mask==1
            proposal[kept] = x[kept]  # only the unmasked entries take new values
            x = proposal
        return x
    
def generate_random_chorale(model):
    """
    Generate a brand-new sample from the model.

    harmonize is started from uniformly random pitches with an all-zero mask
    (C=0), so no entry is held fixed and every note is resampled.
    """
    random_start = np.random.randint(P, size=(I, T)).astype(int)
    nothing_fixed = np.zeros((I, T)).astype(int)
    return harmonize(random_start, nothing_fixed, model)

In [40]:
hidden_size = 32  # number of channels carried through the convolution stack


class Unit(nn.Module):
    """
    Residual block: conv -> batchnorm -> relu -> conv -> batchnorm, then the
    block input is added back and a final relu is applied.
    The 3x3 convolutions use padding 1, so the tensor shape is unchanged.
    """
    def __init__(self):
        super().__init__()
        channels = hidden_size
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.batchnorm1 = nn.BatchNorm2d(channels)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.batchnorm2 = nn.BatchNorm2d(channels)
        self.relu2 = nn.ReLU()

    def forward(self, x):
        out = self.relu1(self.batchnorm1(self.conv1(x)))
        out = self.batchnorm2(self.conv2(out))
        # residual connection: add the input before the last activation
        return self.relu2(out + x)
    
    

class Net(nn.Module):
    """
    A CNN where you input a starter chorale and a mask and it outputs a prediction for the values
    in the starter chorale away from the mask that are most like the training data.

    The network is an initial convolution followed by 16 residual Units
    (attributes unit1 ... unit16) and one fully connected layer.
    """
    def __init__(self):
        super(Net, self).__init__()
        self.initial_conv = nn.Conv2d(2*I, hidden_size, 3, padding=1)
        self.initial_batchnorm = nn.BatchNorm2d(hidden_size)
        self.initial_relu = nn.ReLU()
        # registered as unit1 ... unit16, in that order, so the parameter
        # names match those of previously saved checkpoints
        for k in range(1, 17):
            setattr(self, f'unit{k}', Unit())
        self.affine = nn.Linear(hidden_size*T*P, I*T*P)

    def forward(self, x, C):
        # x is a tensor of shape (N, I, T, P)
        # C is a tensor of 0s and 1s of shape (N, I, T)
        # returns a tensor of shape (N, I, T, P)

        # get the number of batches
        N = x.shape[0]

        # tile the array C out to a tensor of shape (N, I, T, P)
        tiled_C = C.view(N, I, T, 1)
        tiled_C = tiled_C.repeat(1, 1, 1, P)

        # mask x and combine it with the mask to produce a tensor of shape (N, 2*I, T, P)
        y = torch.cat((tiled_C*x, tiled_C), dim=1)

        # apply the convolution and relu layers
        y = self.initial_conv(y)
        y = self.initial_batchnorm(y)
        y = self.initial_relu(y)
        for k in range(1, 17):
            y = getattr(self, f'unit{k}')(y)

        # reshape before applying the fully connected layer
        y = y.view(N, hidden_size*T*P)
        y = self.affine(y)

        # reshape to (N, I, T, P)
        y = y.view(N, I, T, P)

        return y

    def pred(self, y, C):
        # y is an array of shape (I, T) with integer entries in [0, P)
        # C is an array of shape (I, T) consisting of 0s and 1s
        # the entries of y away from the support of C should be considered 'unknown'
        # returns an integer array of shape (I, T): one pitch sampled for every
        # entry from the distribution predicted by the network

        # x is shape (I, T, P) one-hot representation of y
        compressed = y.reshape(-1)
        x = np.zeros((I*T, P))
        r = np.arange(I*T)
        x[r, compressed] = 1
        x = x.reshape(I, T, P)

        # prep x and C for the plugging into the model
        x = torch.tensor(x).type(torch.FloatTensor).to(device)
        x = x.view(1, I, T, P)
        C2 = torch.tensor(C).type(torch.FloatTensor).view(1, I, T).to(device)

        # plug x and C2 into the model
        with torch.no_grad():
            out = self.forward(x, C2).view(I, T, P).cpu().numpy()
        out = out.transpose(2, 0, 1) # shape (P, I, T)

        # softmax over the pitch axis; subtracting the per-entry maximum leaves
        # the result unchanged but keeps np.exp from overflowing to inf/nan
        out = out - out.max(axis=0, keepdims=True)
        probs = np.exp(out)
        probs /= probs.sum(axis=0, keepdims=True) # shape (P, I, T)

        # inverse-CDF sampling: the sampled pitch is the first index whose
        # cumulative probability exceeds u, i.e. the number of indices whose
        # cumulative probability is <= u
        cum_probs = np.cumsum(probs, axis=0) # shape (P, I, T)
        u = np.random.rand(I, T) # shape (I, T)
        sampled = (cum_probs <= u).sum(axis=0)
        # rounding can leave the last cumulative value slightly below u;
        # fall back to the last pitch in that case instead of pitch 0
        return np.minimum(sampled, P - 1)

In [41]:
# build the network on the selected device; a model instance must exist
# before saved weights can be loaded into it with load_state_dict
model = Net().to(device) # need this in order to load the model.

In [42]:
# load the previously trained model (comment this out to start from untrained weights)
# map_location moves tensors that were saved on another device (e.g. a GPU) onto the device in use
model.load_state_dict(torch.load('model1.pt', map_location=device))

<All keys matched successfully>

In [43]:
def pad_number(n):
    """
    prepare numbers for better file storage

    Returns n as a string, left-padded with zeros to at least 5 characters
    (pad_number(7) == '00007').  Numbers that already have 5 or more digits
    are returned unpadded.
    """
    # zfill counts the actual digits, so exact powers of ten (1, 10, 100, ...)
    # get the same width as every other number
    return str(n).zfill(5)


## Decompression of model output back to the 2 1/2 measure segment.
<p>This section turns the output of the model back into 40-slot segments. We compress each segment going into the model, so we decompress it coming out of the model. Decompression does several things:
    
- expands the end of segments 0, 1, 2 and 3 and converts each of them to a 4x40 array
- fixes the end of segments 4, 5 and 6 by adding padding to convert each of them to a 4x40 array
    
This will be called just after emerging from the model and before the midi file is written    

In [44]:
def decompress_end_of_segment(this_segment):
    """
    Expand one compressed segment of shape (4, 32) to shape (4, 40).

    The first 24 slots of every voice are copied unchanged.  Each of the last
    8 slots (24..31) is written twice in a row, so that they fill slots 24..39
    of the result.
    """
    expanded = np.zeros((4,40),dtype=int) # this is the original shape of the array before compression
    for voice in range(4):
        # unchanged part of the voice
        expanded[voice, :24] = this_segment[voice][:24]
        # stretched part of the voice: every note is duplicated
        expanded[voice, 24:] = np.repeat(this_segment[voice, 24:32], 2)
    return expanded


def decompress(arr):
    """
    Turn seven compressed (4, 32) segments into an integer array of shape (7, 4, 40).

    Segments 0,1,2,3 (the first ten measures of the chorale, four phrases of
    2 1/2 measures each) are expanded with decompress_end_of_segment.
    Segments 4,5,6 are copied and padded at the end with 8 zeros per voice.
    """
    my_expanded_segment = np.zeros((7,4,40),dtype=int)

    # only the first four segments were compressed
    for s, seg in enumerate(arr[:4]):
        my_expanded_segment[s] = decompress_end_of_segment(seg)

    pad8 = np.zeros((4,8))  # pad the end of the segment with zeros
    for i in (4, 5, 6):
        my_expanded_segment[i] = np.concatenate((arr[i],pad8),axis=1)
    return my_expanded_segment

## Transpose from the key of C to the original key
This is done to restore the key of the input midi file. I found that model inputs in the key of C are harmonized much better than those in other keys. I thought this was taken care of in the model by transposing the training data to different keys, but my experience suggests otherwise.
Add the value of root (F is 5) to each note in the array, with the exception of the 0's, which have to remain 0.

In [45]:
def transpose_up_segment(my_segment,root):
    """
    Return a copy of my_segment with root added to every note.

    Entries that are not greater than 0 (the rests) are left unchanged.
    The input array is not modified.
    """
    new_segment = np.copy(my_segment) # just make a copy, you will change the non zero elements
    notes = new_segment > 0
    # plain assignment (not +=) so the result is cast back to the array's dtype
    new_segment[notes] = new_segment[notes] + root
    return(new_segment)

def transpose_up(segments,root):
    """
    Return a copy of segments in which every segment has been transposed up
    by root with transpose_up_segment.  The input array is not modified.
    """
    new_segment = np.copy(segments)
    # enumerate supplies the slot index, so every segment is written back to
    # its own position
    for s, seg in enumerate(segments):
        new_segment[s] = transpose_up_segment(seg,root)
    return(new_segment)

## Load a midi file into a numpy array
Set certain values:

- the numpy array of the whole piece is stored in the variable "sample"
- store the root key and mode (F major, for example)
- print the values of the time signature (it must be 4/4 or you will need to do some extra work), quarter note clicks, and clicks per 1/16th note
- any transpositions that must be performed to restore the original key
- print the first 5 notes in each voice
- print the shape of the variable "sample" containing the whole midi file

In [46]:
# read in a midi file, check the key, load into piano roll, set up np.array containing Nx4 sample.
# calling program should slice the returned array as needed to create two measure segments for sending into the prediction model.

def midi_to_input(midi_file):
    """
    Load a midi file into an integer array with one row per 1/16th note.

    Returns (sample, root, mode):
      sample -- array of shape (time_steps, 4).  In every row the sounding
                pitches are stored from the lowest (column 3) to the highest
                (column 0), each reduced by root so that the piece ends up in
                C.  Columns without a sounding note are 0.
      root   -- root of the first key signature in the file (0 when there is none)
      mode   -- mode of the first key signature ("major" when there is none)

    Only the first click of each 1/16th note is inspected.  When more than
    four pitches sound at once, the lowest four are kept.
    """
    music = muspy.read(midi_file)
    if music.key_signatures != []: # check if the midi file includes a key signature - some don't
        root = music.key_signatures[0].root
        mode = music.key_signatures[0].mode # major or minor
    else:
        print('Warning: no key signature found. Assuming C major')
        mode = "major"
        root = 0
    if music.time_signatures != []: # check if the midi file includes a time signature - some don't
        numerator = music.time_signatures[0].numerator
        denominator = music.time_signatures[0].denominator
    else:
        print('Warning: no time signature found. Assuming 4/4')
        numerator = 4
        denominator = 4
    # turn it into a boolean piano roll with one row for every click in the midi file
    piano_roll = muspy.to_pianoroll_representation(music,encode_velocity=False)
    q = music.resolution # quarter note value in this midi file.
    q16 = q // 4 # my desired resolution is by 1/16th notes
    print(f'time signatures: {numerator}/{denominator}')
    time_steps = piano_roll.shape[0] // q16
    print(f'music.resolution is q: {q}. q16: {q16} time_steps: {time_steps} 1/16th notes')
    sample= np.zeros(shape=(time_steps,4)).astype(int) # default is float unless .astype(int)

    # iterate over the rows of sample, so a trailing partial 1/16th note in the
    # piano roll can never index past the end of the array
    for time_interval in range(time_steps):
        click = time_interval * q16
        # column indices of the piano roll are the midi note numbers, in ascending order;
        # take at most four so the voice index cannot wrap around and overwrite the bass
        lowest_four = np.flatnonzero(piano_roll[click])[:4]
        # the lowest note goes to voice 3, the next one to voice 2, and so on
        for voice, pitch in zip(range(3, -1, -1), lowest_four):
            sample[time_interval][voice] = pitch - root # transposed by the key if not C which is 0
    return (sample,root,mode)

In [47]:
# BWV 180 "Schmucke dich, o liebe Seele" chorale - nice variety of phrase lengths.
# The whole midi file (all tracks, all notes) is read into the array "sample".
# The loader prints the time signature and transposes the notes into C by
# subtracting the root of the key from each note (F = 5).
file_name = '/home/prent/Downloads/chorales_018007b_(c)greentree.mid'

# sample is indexed by (time interval, voice)
sample, root, mode = midi_to_input(file_name)
keys = ['C ','C#','D ','D#','E ','F ','F#','G ','G#','A ','A#','B ']
print(f'{file_name}, \n{keys[root]} {mode} transposed into C and then used to create the segments')

# show the first five time intervals, one line per interval with all voices
for i, t in enumerate(sample, start=1):
    print(''.join(f'{v}  ' for v in t))
    if i > 4:
        break

print(f'sample.shape: {sample.shape}. dtype(sample): {type(sample[0,0])}')

time signatures: 4/4
music.resolution is q: 1024. q16: 256 time_steps: 320 1/16th notes
/home/prent/Downloads/chorales_018007b_(c)greentree.mid, 
F  major transposed into C and then used to create the segments
64  60  55  48  
64  60  55  48  
64  60  55  48  
64  60  55  48  
62  59  55  43  
sample.shape: (320, 4). dtype(sample): <class 'numpy.int64'>


## Divide the sample into segments based on phrase length
In this case, the first four segments that we keep are 2 1/2 measures (40 1/16th notes) long. That Bach guy was full of surprises. The first two phrases are repeated in the midi file, and the repeats are discarded for now. Segments 4 and 5 are 2 measures long, which is what the model expects. The final one, segment 6, is the closing chord. At the end of this cell, you have a variable called "segment" which contains an array of seven segments (0 through 6) of the piece, each with 40 time slots for each of 4 voices.

In [48]:
# sample is a piano roll of pitches in 1/16th note intervals, shape (320 time intervals, 4 voices)
# segment collects the phrases of the piece: (seg_num, voices, 1/16th note values)

segment = np.zeros((7,4,40),dtype=int)
pad8 = np.zeros((8,4)) # 8 zeros in each of four voices, appended to segments 4 & 5
seg_num = 0 # index of the next free slot in segment
print(f'seg_num\tlength\tstart\tend')

# phrases 0-5 of the sample are 2 1/2 measures long: 4*4*2.5 = 40 1/16th notes
phrase_len = int(4 * 4 * 2.5)
for i in range(6):
    start, end = i * phrase_len, (i + 1) * phrase_len
    if i in (2,3):
        continue # phrases 2 & 3 repeat the first two phrases, so they are not stored
    print(f'{seg_num}\t{phrase_len}\t{start}\t{end-1}')
    transfer = sample[start:end]
    segment[seg_num] = transfer.transpose()
    seg_num += 1

# seg_num 4 & 5 are two measures long (32 1/16th notes) and are zero-padded to 40 slots
phrase_len = int(4 * 4 * 2)
for i in (6, 7):
    start, end = end, end + phrase_len
    print(f'{seg_num}\t{phrase_len}\t{start}\t{end-1}')
    transfer = np.concatenate((sample[start:end],pad8),axis=0)
    segment[seg_num] = transfer.transpose()
    seg_num += 1

# seg_num 6 is the closing chord: 8 1/16th notes
phrase_len = int(4 * 2)
for i in (8,):
    start, end = end, end + phrase_len
    print(f'{seg_num}\t{phrase_len}\t{start}\t{end-1}')
    # five copies of the 8 slots, one after the other, fill the 40 slots. Ignore the later slots.
    transfer = np.tile(sample[start:end], (5, 1))
    segment[seg_num] = transfer.transpose()

seg_num	length	start	end
0	40	0	39
1	40	40	79
2	40	160	199
3	40	200	239
4	32	240	271
5	32	272	303
6	8	304	311


## Compress the 40 slot segments down to 32 slots
This is done to match the model requirements. We create a helper function that compresses the last 16 slots down to 8 by skipping every other note in the 16. This is not as crude as the clipping that was done in the model, but it loses some information that cannot be retrieved upon decompression. At the end of this process, we have a (7,4,32) array with 7 segments that are all 32 1/16th notes in length in a variable called "sub_segment".

In [49]:
# This function will take a 4,40 array and return a 4,32 array. It compresses the last 16 slots into 8 slots by skipping every other slot in the array.
def compress_end_of_segment(input_array):
    """
    Compress a segment of 40 time slots per voice down to 32 slots.

    The first 24 slots of every voice are kept as they are.  Of the last 16
    slots (24..39) every other one is kept, starting with slot 24, which
    squeezes them into the space of 8 1/16th notes.

    Returns a new array; input_array itself is left untouched.
    """
    source = np.asarray(input_array)
    # np.concatenate builds a fresh array, so nothing is written back into the
    # caller's data and the result is not a view of it
    return np.concatenate((source[:, :24], source[:, 24:40:2]), axis=1)

In [50]:
# Segments 0,1,2,3 are squeezed from 40 slots to 32 slots for all four voices and
# then zero-padded back to 40 so that they still fit into the "segment" array.
# Segments 4, 5 & 6 are not touched by the loop.
print(segment.shape)
pad8 = pad8.reshape(4, 8) # one row of 8 zeros per voice
for seg_num in range(4):
    print(f'seg_num: {seg_num} before compression')
    print(f'segment[{seg_num}]: {segment[seg_num][0]}')
    my_segment = compress_end_of_segment(segment[seg_num])
    print('after compression')
    print(f'my_segment: {my_segment[0]}')
    segment[seg_num] = np.hstack((my_segment, pad8))
# keep only the first 32 slots of every segment: that is what the model works on
sub_segment = segment[..., :32]

(7, 4, 40)
seg_num: 0 before compression
segment[0]: [64 64 64 64 62 62 62 62 60 60 60 60 62 62 62 62  0  0 65 65 67 67 67 67
 65 65 65 65 65 65 65 65 64 64 64 64 64 64 64 64]
after compression
my_segment: [64 64 64 64 62 62 62 62 60 60 60 60 62 62 62 62  0  0 65 65 67 67 67 67
 65 65 65 65 64 64 64 64]
seg_num: 1 before compression
segment[1]: [67 67 67 67 64 64 64 64 65 65 65 65 64 64 62 62  0  0 62 62 64 64 64 64
 62 62 62 62 62 62 62 62 60 60 60 60 60 60 60 60]
after compression
my_segment: [67 67 67 67 64 64 64 64 65 65 65 65 64 64 62 62  0  0 62 62 64 64 64 64
 62 62 62 62 60 60 60 60]
seg_num: 2 before compression
segment[2]: [67 67 67 67 69 69 71 71 72 72 72 72 72 72 72 72 71 71 69 69 67 67 69 69
 69 69 69 69 69 69 69 69 67 67 67 67 67 67 67 67]
after compression
my_segment: [67 67 67 67 69 69 71 71 72 72 72 72 72 72 72 72 71 71 69 69 67 67 69 69
 69 69 69 69 67 67 67 67]
seg_num: 3 before compression
segment[3]: [67 67 67 67 69 69 71 71 72 72 72 72 72 72 72 72 71 71 69 69  0  

## What we have at this point

- We have an array of seven segments in the variable sub_segment with a shape (7,4,32).
- These are compressed versions of the original chorale, with the 40 slot segments compressed down to 32 slots
- These segments can each be individually sent to the model for potential replacements, since the model expects 4 voices and 32 time steps.

## What I need to do next:

- Create a function that takes in a segment and masks one of the four voices and asks the model to synthesize the missing voice and return the replacement voice as an array. 
- store that array for later use in subsequent synthesis activities.
- repeat by masking a different voice, ideally the oldest one, and generating a replacement for that voice.
- keep at it as many times as you can.

## How will we do this:

- use the function: def harmonize(y, C, model):
- per the docs:
--  Generate an artificial Bach Chorale starting with y, and keeping the pitches where C==1.
--  Here C is an array of shape (4, 32) whose entries are 0 and 1.
--  The pitches outside of C are repeatedly resampled to generate new values.
--  For example, to harmonize the soprano line, let y be random except y[0] contains the soprano line, let C[1:] be 0 and C[0] be 1.
    

In [51]:
print(sub_segment.shape)

(7, 4, 32)


In [52]:
def quick_play(chorale):
    """
    Render a piano roll to audio and show a player widget in the notebook.

    The chorale is converted with piano_roll_to_midi, written to 'test.midi'
    and 'test.wav' (using the soundfont 'font.sf2'), and the wav file is
    displayed as an IPython Audio widget.
    """
    music = muspy.from_mido(piano_roll_to_midi(chorale)) # mido object -> muspy music
    muspy.write_midi('test.midi', music)
    muspy.write_audio('test.wav', music, 'wav', 'font.sf2', 44100)
    display(Audio('test.wav'))

In [53]:
def predict_to_numpy(segments):
    """
    Re-harmonize every segment four times over and collect the results.

    For each segment four chorales are synthesized one after the other.  A
    chorale is made by masking each voice in turn (mask_voice) and letting
    harmonize replace it, always continuing from the previous result.

    Returns an integer array of shape (7, 16, 32): for every segment the four
    4-voice chorales stacked on top of each other.
    """
    # the author measured about 19 seconds of wall clock time per harmonize call
    # NOTE(review): rests are stored as 0 in the segments, so "segment - 30"
    # turns them into -30 before they reach the model -- confirm this is intended
    #
    # axes of new_voice: segment, copy (4 stacked chorales), voice, note
    new_voice = np.zeros((7,4,4,32),dtype=int)
    for s, segment in enumerate(segments): # for each of 7 segments in the input chorale
        print(f'process segment {s}')
        current = segment - 30 # shift the midi numbers into the range the model works with
        for chorale in range(4): # make a total of four chorales to stack on top of each other
            print(f'synthesize 4 voice chorale {chorale}')
            for v, _ in enumerate(segment): # one pass per voice: mask it, then predict a new harmonization
                current = harmonize(current, mask_voice(v), model)
            new_voice[s,chorale] = current # save the current chorale in the array
    # flatten the copy and voice axes into 16 voices per segment
    return np.reshape(new_voice, (7,16,32))

In [54]:
%%time
# synthesize the chorale(s) and store each one as a .npy file in segmented_chorales/
for chorales in range(1):
    print(f'predict chorale {chorales}')
    new_voices = predict_to_numpy(sub_segment)
    filename = os.path.join('segmented_chorales', f'chorale_{chorales}.npy')
    print(f'saving new chorale to {filename}')
    np.save(filename, new_voices)

predict chorale 0
process segment 0
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
process segment 1
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
process segment 2
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
process segment 3
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
process segment 4
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
process segment 5
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
process segment 6
synthesize 4 voice chorale 0
synthesize 4 voice chorale 1
synthesize 4 voice chorale 2
synthesize 4 voice chorale 3
saving new chorale to segmented_chorales/cho