# Automated Music Composition Using GANs
## Data Cleaning and Pre-Processing

In [2]:
from music21 import converter, instrument, key, note, chord, pitch, meter, tempo, interval
import os
import shutil
import sys
import pandas as pd
import numpy as np
from imageio import imwrite, imread
from collections import defaultdict
import json
from matplotlib import pyplot as plt
from pathlib import Path

In [3]:
# Voice labels; position in this list matches the part index used when reading scores
voices = ['S', 'A', 'T', 'B']

# Two-way lookup between MIDI numbers (1-127) and their music21 pitch names
midiNumberNote = dict((number, str(pitch.Pitch(number))) for number in range(1, 128))
midiNoteNumber = dict(zip(midiNumberNote.values(), midiNumberNote.keys()))

def save_image(img, out_fn, out_path):
    """Write ``img`` as an 8-bit image called ``out_fn`` inside ``out_path``.

    The destination directory, including any missing parents, is created
    before the image is written.
    """
    target_dir = Path(out_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = os.path.join(out_path, out_fn)
    pixels = img.astype(np.uint8)  # cast to unsigned 8-bit before writing
    imwrite(destination, pixels)

## Preparing MIDI Files
### Transpose to C Major/A Minor and Convert Instruments to Piano

In [10]:
# Tonic names of the twelve keys in ascending chromatic order
_TONICS = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

# Labels of the form '<major tonic>M<minor tonic>m'.
# Parallel pairs share the same tonic for the major and the minor key.
parallels = [f'{tonic}M{tonic}m' for tonic in _TONICS]
# Relative pairs use the minor tonic nine chromatic steps above the major tonic.
relatives = [f'{tonic}M{_TONICS[(step + 9) % 12]}m' for step, tonic in enumerate(_TONICS)]

def transpose_midi(key_tp,mode):
    """Transpose every MIDI file in ``JSB-Midi`` to the key pair named by ``key_tp``.

    Each file's key is estimated with music21; pieces analysed as major are
    moved to the major tonic of ``key_tp`` and pieces analysed as minor to its
    minor tonic. Results are written as ``tp_<file>`` under
    ``JSB-Midi-TP/<mode>/JSB-Midi-TP-<key_tp>``.

    Args:
        key_tp: Label of the form ``'<major>M<minor>m'``, e.g. ``'CMAm'`` or
            ``'C#MA#m'``.
        mode: Name of the sub-folder grouping the output (the callers in this
            file pass ``'Parallel'`` or ``'Relative'``).

    Returns:
        The root MIDI directory (``'../Data/MidiFiles/'``).
    """
    midi_path = '../Data/MidiFiles/'
    in_dir = os.path.join(midi_path,'JSB-Midi')
    out_dir = os.path.join(midi_path,'JSB-Midi-TP/{}/JSB-Midi-TP-{}'.format(mode,key_tp))
    os.makedirs(out_dir, exist_ok=True)

    # Split the label into its tonics; a tonic is two characters when sharpened
    major_key = key_tp[:2]    if '#' in key_tp[:2]  else key_tp[0]
    minor_key = key_tp[-3:-1] if '#' in key_tp[-2:] else key_tp[-2]
    tp_keys = {'major':major_key,'minor':minor_key}

    print('Transposing to {} Major and {} Minor'.format(major_key,minor_key))

    # Loop through all MIDI files
    for file in os.listdir(in_dir):
        # Hidden entries (e.g. '.DS_Store') are not scores and cannot be parsed
        if file.startswith('.'):
            continue

        # Read MIDI file with music21
        score = converter.parse(os.path.join(in_dir,file))

        # Estimate the key of the music and find the interval to the target tonic
        # (named detected_key so the imported music21 `key` module is not shadowed)
        detected_key = score.analyze('key')
        i = interval.Interval(detected_key.tonic, pitch.Pitch(tp_keys[detected_key.mode]))

        # Transpose and save the MIDI file
        score.transpose(i).write('midi',os.path.join(out_dir,'tp_'+file))

    return midi_path
                
# Transpose the dataset to every parallel key pair, then to every relative key pair
for group_name, key_pairs in (('Parallel', parallels), ('Relative', relatives)):
    for k in key_pairs:
        midi_path = transpose_midi(k, group_name)


## Converting MIDI Files to Arrays

In [12]:
def midi2array(tp, mode):
    """Extract per-voice note data from a set of chorale MIDI files and save it.

    Uses the module-level ``midi_path`` (root MIDI directory) and ``voices``.
    File names are taken from ``<midi_path>/JSB-Midi``; for a transposed set
    each name is prefixed with ``tp_`` and read from
    ``<midi_path>/JSB-Midi-TP/<mode>/JSB-Midi-TP-<tp>``. Only scores with
    exactly four parts are processed; other file names are printed and skipped.
    Rests are not recorded, only ``note.Note`` elements.

    Output is written to ``../Data/ProcessedData/<mode>/<tp>``:
    ``full_chorales_<tp>.json`` plus one CSV each for durations, pitches and
    start offsets (one row per file, one column per voice).

    Args:
        tp: Key-pair label (e.g. ``'CMAm'``), or ``'notTP'`` for the
            untransposed files.
        mode: Sub-folder name used for the transposed input and the output.

    Returns:
        The chorale data as read back from the JSON file:
        ``{file: {voice: {"start", "pitch", "dur", "pitch_letter"}}}``.
    """
    source_dir = os.path.join(midi_path, 'JSB-Midi')
    if tp == "notTP":
        inpath, prefix = source_dir, ""
    else:
        inpath = os.path.join(midi_path, "JSB-Midi-TP", mode, f"JSB-Midi-TP-{tp}")
        prefix = "tp_"

    outpath = os.path.join('../Data', 'ProcessedData', mode, tp)
    os.makedirs(outpath, exist_ok=True)

    # Names as they appear in `inpath`; hidden entries are not scores
    file_names = [prefix + name for name in os.listdir(source_dir) if not name.startswith('.')]

    full_dict = {}
    for file in file_names:
        original_score = converter.parse(os.path.join(inpath, file))

        if len(original_score.parts) != 4:
            print(file)  # report scores that do not have exactly four parts
            continue

        full_dict[file] = {}
        for i, p in enumerate(voices):
            notes, notes_letter, durations, start = [], [], [], []
            for element in original_score.parts[i].flat:
                if isinstance(element, note.Note):
                    # float() keeps the values JSON-serialisable if music21
                    # hands back a Fraction for a duration or offset
                    notes.append(float(element.pitch.ps))
                    notes_letter.append(str(element.nameWithOctave))
                    durations.append(float(element.duration.quarterLength))
                    start.append(float(element.offset))

            full_dict[file][p] = {"start":start, "pitch":notes, "dur":durations, "pitch_letter":notes_letter}

    json_path = outpath + '/full_chorales_{}.json'.format(tp)
    with open(json_path, 'w') as fn:
        json.dump(full_dict, fn)

    def voice_table(field):
        # One row per file, one column per voice; each cell holds the list for
        # `field`. Skipped files keep an empty (NaN) row.
        columns = {
            v: pd.Series({name: parts[v][field] for name, parts in full_dict.items()}, dtype=object)
            for v in voices
        }
        return pd.DataFrame(columns, index=file_names, columns=voices)

    voice_table("dur").to_csv(outpath + '/JSB_durations_{}.csv'.format(tp))
    voice_table("pitch").to_csv(outpath + '/JSB_notes_{}.csv'.format(tp))
    voice_table("start").to_csv(outpath + '/JSB_start_{}.csv'.format(tp))

    # Return what was actually stored on disk
    with open(json_path, 'r') as fn:
        read_full_dict = json.load(fn)

    return read_full_dict

# Process the untransposed files first, then every relative and parallel key pair
midi2array('notTP', mode='notTP')
for group_name, key_pairs in (('Relative', relatives), ('Parallel', parallels)):
    for k in key_pairs:
        midi2array(k, mode=group_name)


## Data Exploration

In [220]:
def data_exploration(tp):
    """Plot the pitch and note-name distributions of one processed dataset.

    Reads ``ProcessedData/<tp>/full_chorales_<tp>.json``, prints the highest
    and lowest pitch, draws a histogram of all pitch numbers and a bar chart of
    how often each note name (octave stripped) occurs. Titles are only set for
    ``'CMAm'``, ``'CMCm'`` and ``'notTP'``.

    Args:
        tp: Dataset label used in the folder and file name.

    Raises:
        ValueError: If the dataset contains no notes.
    """
    outpath = "ProcessedData/{}/".format(tp)

    with open(outpath+'full_chorales_{}.json'.format(tp),'r') as fn:
        read_full_dict = json.load(fn)

    # Pool the notes of every voice of every chorale
    all_notes = []
    all_notenums = []
    for parts in read_full_dict.values():
        for data in parts.values():
            all_notes.extend(data['pitch_letter'])
            all_notenums.extend(data['pitch'])

    if not all_notenums:
        raise ValueError('No notes found in dataset {!r}'.format(tp))

    # Part of the plot titles that depends on the dataset
    title_suffix = {
        'CMAm': 'Chorales Transposed to C Major and A Minor',
        'CMCm': 'Chorales Transposed to C Major and C Minor',
        'notTP': 'Non-Transposed Chorales',
    }.get(tp)

    plt.figure()
    print('Highest Pitch in Dataset:',pitch.Pitch(max(all_notenums)))
    print('Lowest  Pitch in Dataset:',pitch.Pitch(min(all_notenums)))
    plt.hist(all_notenums)
    if title_suffix is not None:
        plt.title('Pitch Distribution of ' + title_suffix)

    # Drop the trailing octave character so that e.g. 'C4' and 'C5' both count as 'C'
    unique_notes, unique_counts = np.unique([name[:-1] for name in all_notes],return_counts=True)
    labels_histogram = dict(zip(unique_notes, unique_counts))

    note_order = ['C', 'C#', 'D', 'E-','E', 'F', 'F#', 'G', 'G#', 'A', 'B-', 'B' ]

    # A note name that never occurs gets a zero-height bar
    sorted_notecount = {n:labels_histogram.get(n, 0) for n in note_order}

    plt.figure()
    plt.bar(sorted_notecount.keys(),sorted_notecount.values())
    if title_suffix is not None:
        plt.title('Note Distribution of ' + title_suffix)

    plt.xlabel('Note')
    plt.ylabel('Occurences in Transposed Dataset')
    
    

# Explore one parallel set, one relative set and the untransposed data
for dataset in ('CMCm', 'CMAm', 'notTP'):
    data_exploration(dataset)


## Piano Roll (Image) Representation

### Array to Image

In [11]:
def _paint_voice(canvas, values, q, min_pitch):
    """Draw one voice's notes onto ``canvas`` (rows = pitch, columns = time steps)."""
    height, width = canvas.shape
    for pitch_num, dur_q, start_q in zip(values["pitch"], values["dur"], values["start"]):
        # Dividing by q and truncating to int quantises start and duration
        dur = int(dur_q / q)
        start = int(start_q / q)

        # Stop at the first note that does not fit entirely inside the image;
        # a note ending exactly on the right edge still fits
        if start + dur > width:
            break

        # Row 0 is the highest pitch. Pitches that fall outside the image are
        # skipped, so a negative row can never wrap around to the bottom rows.
        row = height - int(pitch_num - min_pitch)
        if not 0 <= row < height:
            continue

        first_col = max(start, 0)
        last_col = start + dur
        if last_col > first_col:
            canvas[row, first_col:last_col] = 255


def array2image(midi_path,image_width,monophonic,tp,mode,q,square=True,save=True):
    """Render one chorale from the processed JSON data as piano-roll image(s).

    The chorale is loaded from
    ``ProcessedData/<mode>/<tp>/full_chorales_<tp>.json``. The image has
    ``max_pitch - min_pitch`` (64) rows and ``image_width`` columns; cells
    covered by a note are set to 255. For each voice, drawing stops at the
    first note that would extend past the right edge of the image.

    Args:
        midi_path: Key of the chorale in the JSON file (its MIDI file name).
        image_width: Number of time steps (columns) in the image.
        monophonic: If true, produce one image per voice; otherwise a single
            image containing all voices.
        tp: Key-pair label of the dataset.
        mode: Dataset sub-folder (``'Relative'`` or ``'Parallel'`` in this file).
        q: Quantisation step in quarter lengths; one column spans ``q``.
        square: Unused.
        save: If true, write the image(s) under
            ``ProcessedData/<mode>/<tp>/NewImages/<image_width>/``.

    Returns:
        A ``{voice: array}`` dict when ``monophonic`` is true, otherwise the
        combined array.
    """
    # Load Arrays
    with open(os.path.join('ProcessedData',mode,tp,'full_chorales_{}.json'.format(tp)),'r') as fn:
        data = json.load(fn)[midi_path]

    min_pitch, max_pitch = 26, 90 # D1, F#6 in MIDI numbers
    shape = (max_pitch-min_pitch, image_width)
    out_dir = os.path.join('ProcessedData',mode,tp,'NewImages',str(image_width))

    if monophonic:
        # One fresh image per part
        images = {}
        for voice, values in data.items():
            canvas = np.zeros(shape)
            _paint_voice(canvas, values, q, min_pitch)
            images[voice] = canvas
            if save:
                out_fn = midi_path.replace(".mid",f"_{voice}_{tp}.png")
                save_image(canvas, out_fn, os.path.join(out_dir,'Monophonic',voice))
        return images

    # All parts drawn onto the same image
    image_arr = np.zeros(shape)
    for values in data.values():
        _paint_voice(image_arr, values, q, min_pitch)
    if save:
        out_fn = midi_path.replace(".mid",f"_{tp}.png")
        save_image(image_arr, out_fn, os.path.join(out_dir,'Polyphonic'))
    return image_arr

# The chorale names are taken from the keys of the CMAm (relative) dataset
reference_json = os.path.join('ProcessedData','Relative','CMAm','full_chorales_CMAm.json')
with open(reference_json,'r') as fn:
    files = json.load(fn).keys()

q = 0.5             # Quantisation Level
img_width = int(64) # Width of Image

# For every chorale and key pair, render the polyphonic image first, then the monophonic ones
for file in files:
    print(file, 'Relatives...', end=' ')
    for k in relatives:
        for single_voice in (False, True):
            array2image(file,img_width,single_voice,k,'Relative',q)
    print('Parallels...')
    for k in parallels:
        for single_voice in (False, True):
            array2image(file,img_width,single_voice,k,'Parallel',q)




## Data Augmentation

In [249]:
def data_augmentation(img_width,mode):
    
    '''
    Combines all transpositions into one folder

    For every key folder under ProcessedData/Parallel (hidden entries are
    skipped), the images in NewImages/<img_width>/<mode> are copied into
    ProcessedData/12keysNew/<mode>.

    img_width: image width used in the name of the image folder
    mode:      image sub-folder to collect, e.g. 'Polyphonic' or 'Monophonic/S';
               it selects both the folder read from and the folder written to
    '''

    source = 'ProcessedData/Parallel'
    dest = f'ProcessedData/12keysNew/{mode}'
    Path(dest).mkdir(parents=True, exist_ok=True)
    
    for k in os.listdir(source):
        if k.startswith('.'):
            continue
        # Read from the folder matching `mode`, so that a monophonic request
        # collects that voice's images rather than the polyphonic ones
        img_path = os.path.join(source,k,'NewImages',str(img_width),mode)
        for file in os.listdir(img_path):
            shutil.copyfile(os.path.join(img_path,file),os.path.join(dest,file))

# Collect the polyphonic images, then the monophonic images of each voice
for image_kind in ('Polyphonic', 'Monophonic/S', 'Monophonic/A', 'Monophonic/T', 'Monophonic/B'):
    data_augmentation(64, image_kind)