## Reading Annotations

In [None]:
import pandas as pd

dataset_dir = "Datasets/asap-dataset"
# Read the annotations JSON and transpose it so that each top-level key
# becomes one row (the keys are used below as performance file paths).
annotations = pd.read_json(f"{dataset_dir}/asap_annotations.json").transpose()
# Derive the score path from the performance path: same directory,
# file name replaced by "midi_score.mid".
annotations['score_filename'] = annotations.index.map(
    lambda x: f"{'/'.join(x.split('/')[:-1])}/midi_score.mid"
)
# Turn the index into a regular 'performance_filename' column.
annotations.index = annotations.index.rename('performance_filename')
annotations.reset_index(inplace=True)

annotations.head(2)

# Only keep rows of annotations where score_and_performance_aligned is True.
# The previous filter compared against False, which kept exactly the rows
# this comment says should be dropped.
# `== True` (rather than plain truthiness) is deliberate: after the transpose
# the column may have object dtype, and an element-wise comparison always
# yields a boolean mask.
annotations = annotations[annotations['score_and_performance_aligned'] == True]  # noqa: E712
annotations.head(2)

## Storing tokenised data


In [None]:
from tqdm import tqdm
from midi_processing import mid2dat_anna
import pickle

# Folder that the pickled token dictionaries are written to.
store_folder="Store/asap-dataset"

# tokenise performances
def tokenise_performances(annotations, dataset_dir="Datasets/asap-dataset"):
    """Tokenise the performance MIDI file of every row in ``annotations``.

    Args:
        annotations: DataFrame with a ``performance_filename`` column holding
            paths relative to ``dataset_dir``.
        dataset_dir: Root folder the performance paths are resolved against.

    Returns:
        Dict mapping each ``performance_filename`` to whatever
        ``mid2dat_anna`` returns for that file.
    """
    tokens_by_performance = {}
    for row_label in tqdm(annotations.index):
        name = annotations.loc[row_label]['performance_filename']
        tokens_by_performance[name] = mid2dat_anna(f"{dataset_dir}/{name}")
    return tokens_by_performance

# tokenise scores
def tokenise_scores(annotations, dataset_dir="Datasets/asap-dataset"):
    """Tokenise every distinct score MIDI file referenced by ``annotations``.

    Args:
        annotations: DataFrame with a ``score_filename`` column holding paths
            relative to ``dataset_dir``.
        dataset_dir: Root folder the score paths are resolved against.

    Returns:
        Dict mapping each unique ``score_filename`` to whatever
        ``mid2dat_anna`` returns for that file.
    """
    # Several performances share one score, so tokenise each score once.
    unique_scores = annotations['score_filename'].unique()
    return {
        name: mid2dat_anna(f"{dataset_dir}/{name}")
        for name in tqdm(unique_scores)
    }

In [None]:
# tokenise and save pickle of performances
token_dict = tokenise_performances(annotations)
with open(f'{store_folder}/perf_dict.pickle', 'wb') as handle:
    pickle.dump(token_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# tokenise and save pickle of scores
# score_dict.pickle is loaded further down in this notebook, but nothing
# wrote it: tokenise_scores was defined and never called.
score_token_dict = tokenise_scores(annotations)
with open(f'{store_folder}/score_dict.pickle', 'wb') as handle:
    pickle.dump(score_token_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
from section import section
from random import randint
import pickle
import pandas as pd

# Folder holding the pickled token dictionaries, and the dataset root that
# the annotation paths are relative to.
store_folder="Store/asap-dataset"
dataset_dir = "Datasets/asap-dataset"

# read in pickle of performances
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that were produced locally by this notebook.
with open(f'{store_folder}/perf_dict.pickle', 'rb') as handle:
    perf_dict = pickle.load(handle)
# read in pickle of scores
with open(f'{store_folder}/score_dict.pickle', 'rb') as handle:
    score_dict = pickle.load(handle)



In [None]:
from tqdm import tqdm
from datasets import MidiToken
from midi_processing import mid2dat_anna,dat2mid_anna

# read annotations
annotations = pd.read_json(f"{dataset_dir}/asap_annotations.json").transpose()
# add column for score_filename to annotations that converts row name to score_filename
annotations['score_filename'] = annotations.index.map(
    lambda x: f"{'/'.join(x.split('/')[:-1])}/midi_score.mid"
)
# rename index to performance_filename
annotations.index = annotations.index.rename('performance_filename')
annotations.reset_index(inplace=True)
# get unique performance filenames
performance_filenames = annotations['performance_filename'].unique()

# Demo: cut one aligned performance/score section out of the first row,
# write both to MIDI files and plot them. The loop stops after one row.
for index in tqdm(annotations.index):
    row = annotations.loc[index]

    # File names and full paths of the performance and its score.
    performance_filename = row['performance_filename']
    performance_file = f"{dataset_dir}/{performance_filename}"
    score_filename = row['score_filename']
    score_file = f"{dataset_dir}/{score_filename}"

    # Tokens come from the pickled dictionaries loaded earlier.
    performance_tokens = perf_dict[performance_filename]
    score_tokens = score_dict[score_filename]

    # Beat annotations for the performance and for the score.
    performance_beats = row['performance_beats']
    score_beats = row['midi_score_beats']

    # Pick a random beat to start the section at and actually hand it to
    # section(); previously it was computed and then ignored, so every
    # section started at beat 0.
    start_beat = randint(0, len(performance_beats) - 1)

    performance_section, score_section = section(
        performance_tokens,
        performance_beats,
        score_tokens,
        score_beats,
        start_beat=start_beat,
    )

    # add a SET_VELOCITY token to the beginning of the score section
    score_section.insert(0, MidiToken("SET_VELOCITY", 8))  # Could skip

    print(f"Length of performance section: {len(performance_section)}")

    # write performance section to midi file
    dat2mid_anna(performance_section, "p.mid")

    print(f"Length of score section: {len(score_section)}")
    # write score section to midi file
    dat2mid_anna(score_section, "s.mid")

    midi_image("p.mid")
    midi_image("s.mid")

    break

In [None]:
'''
1) Take a start beat from the performance and find the end beat which maximises the number of performance tokens
2) Now we know the start beat and end beat, we can find the corresponding tokens in the score
'''
    
# Save a dictionary {beat:token} for each performance and score
def section(performance_tokens, performance_beats,score_tokens, scores_beats, start_beat=0):
    """Cut matching token sections out of a performance and its score.

    The end beat is chosen by ``get_end_beat`` from the performance tokens;
    the same start/end beat numbers are then converted to token positions
    separately for the performance and for the score.

    Args:
        performance_tokens: Token sequence of the performance.
        performance_beats: Beat times of the performance.
        score_tokens: Token sequence of the score.
        scores_beats: Beat times of the score.
        start_beat: Index of the beat the section starts at.

    Returns:
        Tuple ``(performance_section, score_section)`` of token slices.
    """
    end_beat = get_end_beat(performance_tokens, start_beat, performance_beats)
    print(f"start_beat: {start_beat}")
    print(f"end_beat: {end_beat}")

    def _cut(tokens, beats):
        # Translate the shared beat range into token indices for this stream.
        first = beat2TokenPosition(start_beat, beats, tokens)
        last = beat2TokenPosition(end_beat, beats, tokens)
        return tokens[first:last]

    # Performance first, then score (tuple elements evaluate left to right).
    return _cut(performance_tokens, performance_beats), _cut(score_tokens, scores_beats)



def beat2TokenPosition(beat, beats, tokens):
    """Return the token index at which beat number ``beat`` begins.

    Walks ``tokens`` accumulating the duration of every ``TIME_SHIFT`` token
    (``token.value / 1000``; presumably milliseconds converted to the seconds
    used by ``beats`` -- confirm against the tokeniser) and returns the index
    of the last ``TIME_SHIFT`` token whose cumulative time does not exceed
    ``beats[beat]``.

    Edge cases:
        * If the very first ``TIME_SHIFT`` already passes the beat time, the
          beat lies at the start of the sequence and ``0`` is returned. The
          previous implementation indexed ``[-1]`` here and returned the
          position of the *last* ``TIME_SHIFT`` instead.
        * If the beat time is never exceeded (beat after the last token, or
          no ``TIME_SHIFT`` tokens at all), ``len(tokens)`` is returned. The
          previous implementation fell through and returned ``None``; as a
          slice end ``len(tokens)`` behaves the same, but it is also safe to
          compare and to use as a slice start.

    Args:
        beat: Index into ``beats``.
        beats: Sequence of beat times.
        tokens: Sequence of objects exposing ``.type`` and ``.value``.

    Returns:
        An integer index into ``tokens`` (``0 <= index <= len(tokens)``).

    Raises:
        IndexError: If ``beat`` is not a valid index into ``beats``.
    """
    beat_time = beats[beat]

    time_elapsed = 0
    # Index of the most recent TIME_SHIFT that is still at or before the beat;
    # 0 until one has been seen, i.e. the start of the sequence.
    previous_shift = 0
    for position, token in enumerate(tokens):
        if token.type != "TIME_SHIFT":
            continue
        time_elapsed += token.value / 1000
        if time_elapsed > beat_time:
            return previous_shift
        previous_shift = position

    return len(tokens)
    

def get_end_beat(performance_tokens, start_beat, performance_beats,max_tokens=512):
    """Return the last beat such that the section from ``start_beat`` stays
    under ``max_tokens`` performance tokens.

    The section length is measured relative to the token position of
    ``start_beat``. The previous implementation compared the *absolute* token
    position with ``max_tokens``, which only works for sections starting at
    the beginning of the piece; for a later start it returned
    ``start_beat - 1``. It also ran past the end of ``performance_beats``
    (``IndexError``) or compared ``None < int`` (``TypeError``) on pieces
    shorter than ``max_tokens``.

    Args:
        performance_tokens: Token sequence of the performance.
        start_beat: Index of the beat the section starts at.
        performance_beats: Beat times of the performance.
        max_tokens: Upper bound (exclusive) on the number of tokens between
            the start beat and the returned end beat.

    Returns:
        Index of the end beat; never smaller than ``start_beat``.
    """
    start_position = beat2TokenPosition(start_beat, performance_beats, performance_tokens)
    if start_position is None:
        # Start beat lies beyond the tokens: nothing to extend.
        return start_beat

    end_beat = start_beat
    # Bounded by the number of annotated beats so we never index past them.
    for candidate in range(start_beat + 1, len(performance_beats)):
        position = beat2TokenPosition(candidate, performance_beats, performance_tokens)
        # A None position means the beat falls after the last TIME_SHIFT.
        if position is None or position - start_position >= max_tokens:
            break
        end_beat = candidate
    return end_beat

# a function that splits tokens into a list of list of tokens corresponding to beats

def splitTokens(tokens,beats):
    """Split ``tokens`` into one chunk per pair of consecutive beats.

    ``beat2TokenPosition`` takes a beat *index* and looks the time up itself.
    The previous implementation passed the beat *times* (``beats[i]``), which
    were then used as list indices and failed for float times.

    Args:
        tokens: Token sequence to split.
        beats: Sequence of beat times.

    Returns:
        List of ``len(beats) - 1`` token slices (empty list for fewer than
        two beats); element ``i`` holds the tokens between beat ``i`` and
        beat ``i + 1``.
    """
    # Resolve each beat to a token position once, then slice between
    # neighbouring positions.
    positions = [
        beat2TokenPosition(beat_index, beats, tokens)
        for beat_index in range(len(beats))
    ]
    return [tokens[start:end] for start, end in zip(positions, positions[1:])]



In [None]:
# pretty midi read score_file
from pretty_midi import PrettyMIDI
# Load the score MIDI. `score_file`, `score_beats` and `score_tokens` are the
# globals left behind by the last executed iteration of the annotations loop,
# so that cell has to be run first. `midi_score` is not used again in the
# cells visible here.
midi_score = PrettyMIDI(score_file)

# Sanity check: print the token index that beat 5 of the score maps to,
# followed by the token stored at that index.
start_beat=5
position=beat2TokenPosition(start_beat,score_beats,score_tokens)
print(position)
print(score_tokens[position])

## Plotting
Remember instruments

In [None]:
import pretty_midi as pm
import matplotlib.pyplot as plt

# function to midi image
def midi_image(midi_file):
    """Show the piano roll of a MIDI file as a matplotlib image.

    Args:
        midi_file: Path of the MIDI file; it is formatted into a string
            before being handed to ``pretty_midi.PrettyMIDI``.
    """
    midi = pm.PrettyMIDI(f"{midi_file}")
    # Piano roll sampled at 100 frames per second, shape=(pitch, timestep).
    roll = midi.get_piano_roll(fs=100)
    # aspect='auto' stretches the image to fill the axes; origin='lower'
    # puts row 0 at the bottom.
    plt.imshow(roll, aspect='auto', origin='lower')
    plt.show()

# Plot the first performance, then the score it belongs to.
perf_index=0
for column in ('performance_filename', 'score_filename'):
    midi_image(f"{dataset_dir}/{annotations.iloc[perf_index][column]}")