Construct Data Pipeline:
Raw Data in Directory --> Extract Semantics Data --> Get Encoding --> Collect into batches --> Yield labels from expert system --> Model Training

In [3]:
import logging
import os
import sys
from itertools import chain
from pprint import pprint
from string import digits

import constraint
# Technically no need to import semantics_generator for this experiment,
# Can simply use semantics file as ground truth
# sys.path.insert(1, os.path.join(sys.path[0], '..'))
# import semantics_generator

In [4]:
# Dataset root directories; main() hands this list to get_semantics_data(),
# which scans each entry for per-sample sub-folders.
DATA_DIR = ['../data/package_aa', '../data/package_ab']
# NOTE(review): not referenced by any cell shown here - presumably the size used by
# the "Collect into batches" stage of the pipeline; confirm before relying on it.
BATCH_SIZE = 10

Convert Notes into black/non-black key encoding and semitone distances

In [5]:
# Static lookup tables for encoding notes as black/non-black keys and semitone distances.

# Pitch names (octave digit removed) that are black keys, listed as the five
# sharp/flat spellings of each key: C#/Db, D#/Eb, F#/Gb, G#/Ab, A#/Bb.
# Spellings such as E#, Fb, B# and Cb are deliberately absent: they carry an
# accidental but are not black keys.
BLACK_KEYS = {'C#', 'Db', 'D#', 'Eb', 'F#', 'Gb', 'G#', 'Ab', 'A#', 'Bb'}
# Semitone offset of each pitch name relative to the C of the same written octave.
# Cb (-1) and B# (12) fall outside 0..11 so that, once the octave offset is added,
# they land one semitone below that C and on the next C respectively.
KEY_MAP = {'Cb': -1, 'C': 0, 'C#': 1, 'Db': 1, 'D': 2, 'D#': 3, 'Eb': 3,
           'E': 4, 'Fb': 4, 'E#': 5, 'F': 5, 'F#': 6, 'Gb': 6, 'G': 7, 'G#': 8,
           'Ab': 8, 'A': 9, 'A#': 10, 'Bb': 10, 'B': 11, 'B#': 12}
# Translation table for str.translate() that deletes every decimal digit,
# i.e. strips the octave number from a note such as 'Bb5'.
REMOVE_DIGITS = str.maketrans('', '', digits)
# Number of semitones spanned by one octave.
OCTAVE_SEMITONES = 12

In [6]:
def is_black_key(key):
    """Return True when ``key`` names a black piano key, False otherwise.

    Membership is checked against the explicit BLACK_KEYS enumeration instead of
    looking for '#' or 'b' in the string, because composers sometimes write
    things like E# or Cb, which are not black keys.

    Args:
        key: pitch name with or without digits, e.g. 'Bb5' or 'Bb'.
    """
    # str.translate() with a delete-table is faster than filtering characters manually.
    pitch_name = key.translate(REMOVE_DIGITS)
    is_black = pitch_name in BLACK_KEYS
    return is_black

In [7]:
def get_semitone_distances(notes):
    """Return the signed semitone distance of every note from the note before it.

    A note without any digit is assumed to be in octave 4 (arbitrarily picked,
    as C4 is middle C), so a sequence without octave indicators is treated as
    lying in a single octave. The first note has no predecessor and therefore
    always gets distance 0.

    Args:
        notes: sequence of pitch strings such as 'Bb5' or 'C'; when a digit is
            present, the last character must be the (single-digit) octave.

    Returns:
        list of ints with the same length as ``notes``; empty for empty input.

    Raises:
        KeyError: if a pitch name is not a key of KEY_MAP.
        ValueError: if a note contains a digit but does not end with one.
    """
    # Nothing to measure; also avoids indexing into an empty sequence.
    if not notes:
        return []

    # Absolute pitch of each note in semitones, computed once per note.
    pitches = []
    for note in notes:
        # fill in missing octave digit
        if not any(ch.isdigit() for ch in note):
            note = note + '4'
        name, octave = note[:-1], int(note[-1])
        pitches.append(octave * OCTAVE_SEMITONES + KEY_MAP[name])

    # initialize first note semitone distance to 0, then take pairwise differences
    return [0] + [after - before for before, after in zip(pitches, pitches[1:])]

In [8]:
def get_encoding(tokens):
    """Encode the note tokens of a sample as model inputs.

    For the piano model we don't really care which octave we're at: each note is
    reduced to its semitone distance from the previous note and to whether it is
    a white or black key (which affects ease of playing). Note that the primus
    dataset does not have double-sharp/flat as inputs.

    Args:
        tokens: list of strings, each holding tab-separated semantic tokens
            (one string per line of a ``.semantic`` file).

    Returns:
        list of ``(is_black_key, semitone_distance)`` tuples, one per
        ``note-`` / ``gracenote-`` token, in order of appearance.
    """
    note_prefix = 'note-'
    grace_prefix = 'gracenote-'

    # combine multiple lines, if any, into one flat token stream
    flat_tokens = '\t'.join(tokens).split('\t')

    pitches = []
    for token in flat_tokens:
        # only use notes
        if token.startswith(note_prefix):
            prefix_len = len(note_prefix)
        elif token.startswith(grace_prefix):
            prefix_len = len(grace_prefix)
        else:
            continue
        # ignore note lengths: keep what precedes the first '_' and drop the prefix
        pitches.append(token.split('_')[0][prefix_len:])

    black_keys = [is_black_key(pitch) for pitch in pitches]
    semitone_distances = get_semitone_distances(pitches)

    return list(zip(black_keys, semitone_distances))
    

Heuristic-based labelling

In [18]:
# Treats this as a constraint satisfaction problem, with heuristics as strict constraints
# An alternative neighbourhood-search algorithm exists at https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/11801/Herremans%20A%20variable%20neighborhood%20search%20algorithm%202015%20Accepted.pdf
# but it is over-engineered for this purpose
def solve(enc):
    """Search for a value in 1..5 for every encoded note and return the assignment.

    One solver variable ``'v<i>'`` with domain 1..5 is created per element of
    ``enc``. Only the length of ``enc`` is used at the moment; the encoded
    values themselves do not yet influence the constraints.

    Args:
        enc: sequence of per-note encodings, e.g. the output of get_encoding().

    Returns:
        Whatever ``Problem.getSolution()`` yields: a dict mapping each variable
        name to its assigned value, or None when the solver finds no solution.
        The same value is also printed, as before.
    """
    variables = ['v{}'.format(i) for i in range(len(enc))]
    problem = constraint.Problem()
    for v in variables:
        problem.addVariable(v, range(1, 6))

    # NOTE(review): the sum-to-30 rule looks like a placeholder heuristic - confirm.
    # It used to be registered len(enc) - 1 times; one registration constrains the
    # same variables, so it is added once. As before, it is skipped when there are
    # fewer than two variables.
    if len(variables) > 1:
        problem.addConstraint(constraint.ExactSumConstraint(30))

    solution = problem.getSolution()
    # Printing is kept so the notebook output stays the same; the value is now also
    # returned so callers actually receive the labels.
    print(solution)
    return solution

Pull data from data directory/directories

In [9]:
def get_semantics_data(data_src):
    """Yield the lines of each sample's ``.semantic`` file under the given directories.

    Assumes the data path is ``data_src[i]/folder_name/folder_name.semantic``,
    in line with the primus dataset. Only the immediate sub-folders of each
    source directory are inspected.

    A source directory that cannot be walked, or a sub-folder without its
    ``.semantic`` file, is reported with a warning and skipped.

    Args:
        data_src: iterable of directory paths.

    Yields:
        list of str: the result of ``readlines()`` on one ``.semantic`` file.
    """
    for src in data_src:
        # os.walk() produces nothing for a missing directory; a bare next() would
        # raise StopIteration, which turns into RuntimeError inside a generator.
        top_level = next(os.walk(src), None)
        if top_level is None:
            logging.warning("Data directory not found: {}".format(src))
            continue

        # top_level is (dirpath, dirnames, filenames); index 1 holds the sub-folders
        for folder in top_level[1]:
            path_to_file = os.path.join(src, folder, "{}.semantic".format(folder))
            # open() is what raises FileNotFoundError, so it must be inside the try
            try:
                with open(path_to_file, "r") as f:
                    lines = f.readlines()
            except FileNotFoundError:
                logging.warning("Data file not found: {}".format(path_to_file))
                continue
            # yield after the file is closed so no handle stays open while the
            # consumer works on the data
            yield lines
    

Execution Main

In [10]:
def main():
    """Run the pipeline on the first available sample only and pretty-print it.

    The sample is encoded and handed to solve(), then the raw semantic lines and
    their encoding are printed. Does nothing when no sample is available.
    """
    # A private sentinel distinguishes "no sample at all" from any real value.
    no_sample = object()
    semantics_data = next(iter(get_semantics_data(DATA_DIR)), no_sample)
    if semantics_data is no_sample:
        return

    encoding = get_encoding(semantics_data)
    pseudo_labels = solve(encoding)
    pprint(semantics_data)
    pprint(encoding)

In [19]:
# Run the pipeline defined in main(); the text below is its recorded output.
main()

{'v0': 5, 'v1': 5, 'v10': 5, 'v11': 5, 'v12': 1, 'v13': 1, 'v2': 1, 'v3': 1, 'v4': 1, 'v5': 1, 'v6': 1, 'v7': 1, 'v8': 1, 'v9': 1}
['clef-G2\tkeySignature-EbM\ttimeSignature-3/4\tnote-Bb5_quarter\t'
 'note-Eb5_eighth\tnote-Bb5_eighth\tnote-C6_eighth\tnote-Bb5_eighth\tbarline\t'
 'note-Ab5_eighth\tnote-Ab5_eighth\trest-sixteenth\tnote-Ab5_sixteenth\t'
 'note-G5_sixteenth\tnote-Ab5_sixteenth\tnote-Bb5_sixteenth\t'
 'note-Ab5_sixteenth\tnote-G5_sixteenth\tnote-Ab5_sixteenth\tbarline\t']
[(True, 0),
 (True, -7),
 (True, 7),
 (False, 2),
 (True, -2),
 (True, -2),
 (True, 0),
 (True, 0),
 (False, -1),
 (True, 1),
 (True, 2),
 (True, -2),
 (False, -1),
 (True, 1)]
