In [1]:
from pathlib import Path
import json

# Data directory: the "data" folder inside the parent of the current working
# directory (NOTE(review): presumably the notebook is run from its own folder).
path_to_data = Path.cwd().parent / "data"

# Bach chorales

In [3]:
def read_bach_chorales(
    path, return_split=True
) -> dict[str, list[list[list[int]]]] | list[list[int]]:
    """Read Bach chorales dataset

    Args:
        path: path to jsb-chorales-16th.json
        return_split: Return. Defaults to True.

    Returns:
        If return split is True:
            dict with "train", "valid", "test", each a list of songs.
        else:
            list of songs ("train", "valid", "test" combined)
    """
    with open(path) as f:
        data: dict[str, list[list]] = json.load(f)
    if return_split:
        return data
    else:
        return [beat for song in data.values() for beat in song]


def make_monophonic(
    song: list[list[int | str]],
    num_voices=4,
    end_token: str = None,
    rest_token: str = None,
) -> list[int | str]:
    """Flatten a multi-voice song into one note sequence

    Essentially a np.reshape, but with handling of different-sized elements.

    Args:
        song: A song as a list of beats, each a list of up to `num_voices` notes (int or str)
        num_voices: Number of voices. Defaults to 4.
        end_token: If not None, `end_token` is appended at the end of each voice. Defaults to None.
        rest_token: If not None, `rest_token` is appended whenever a voice is missing (i.e. when len(beat) < num_voices). Defaults to None.

    Returns:
        list of notes
    """
    mono = []
    for i in range(num_voices):
        for beat in song:
            try:
                mono.append(beat[i])
            except IndexError:
                if rest_token is not None:
                    mono.append(rest_token)
        if end_token is not None:
            mono.append(end_token)
    return mono


# Load the dataset with its splits kept separate (return_split defaults to True).
bach = read_bach_chorales(path_to_data / "jsb-chorales-16th.json")

The `bach` dataset:

- contains `train`, `valid`, `test`
- each contains chorales, which are lists of beats
- each beat is a list of notes (one for each of 4 voices)
    - I expected each beat to always be a list of 4, but it can be anything from 0 to 4

In [4]:
# Number of chorales in each split.
[f"{split}: {len(chorales)}" for split, chorales in bach.items()]

['test: 77', 'train: 229', 'valid: 76']

In [5]:
from collections import Counter

# Training split only.
bach_tr = bach["train"]

# Tally how many notes each beat carries across the whole training split.
Counter(
    f"Beat length = {len(notes)}" for chorale in bach_tr for notes in chorale
).most_common()

[('Beat length = 4', 55097),
 ('Beat length = 0', 72),
 ('Beat length = 1', 24),
 ('Beat length = 3', 19),
 ('Beat length = 2', 16)]

For a monophonic version, each voice is treated as a separate melody, and the melodies are concatenated one after another.

In [21]:
# Take the first three beats of the first training chorale. Slicing copies the
# outer list, so replacing an element below does not modify `bach_tr`.
song = bach_tr[0][:3]
# Shorten the middle beat to two notes to exercise the missing-voice handling.
song[1] = [74, 72]
print(song)
# Without tokens, a missing voice is simply skipped.
print(make_monophonic(song))
# With tokens: "|" closes each voice and "." stands in for a missing note.
print(make_monophonic(song, end_token="|", rest_token="."))

[[74, 70, 65, 58], [74, 72], [74, 70, 65, 58]]
[74, 74, 74, 70, 72, 70, 65, 65, 58, 58]
[74, 74, 74, '|', 70, 72, 70, '|', 65, '.', 65, '|', 58, '.', 58, '|']


In [None]:
# Monophonic form of every training chorale: "|" ends a voice, "." marks a rest.
bach_tr_mono = [
    make_monophonic(chorale, end_token="|", rest_token=".")
    for chorale in bach_tr
]

In [None]:
# Flatten every chorale (all splits combined) into a monophonic sequence.
bachs = [
    make_monophonic(chorale, num_voices=4, end_token="|", rest_token=".")
    for chorale in read_bach_chorales(
        path_to_data / "jsb-chorales-16th.json", return_split=False
    )
]

In [48]:
# `bachs` is already monophonic, so the inner items are single notes or tokens.
bach_str = " ".join(str(token) for voice_seq in bachs for token in voice_seq)

In [63]:
def process_bach(path_to_raw, rest_token=".", end_token="|", num_voices=4):
    """Load every chorale and flatten each one into a monophonic sequence.

    Args:
        path_to_raw: Path to the raw chorales JSON file; passed to
            `read_bach_chorales` with `return_split=False`, so all splits
            are combined.
        rest_token: Forwarded to `make_monophonic` as `rest_token`.
            Defaults to ".".
        end_token: Forwarded to `make_monophonic` as `end_token`.
            Defaults to "|".
        num_voices: Forwarded to `make_monophonic` as `num_voices`.
            Defaults to 4.

    Returns:
        List with one monophonic note/token sequence per chorale.
    """
    songs = read_bach_chorales(path_to_raw, return_split=False)
    return [
        make_monophonic(
            song, num_voices=num_voices, end_token=end_token, rest_token=rest_token
        )
        for song in songs
    ]


# process_bach(path_to_data / "jsb-chorales-16th.json")[:4]

In [11]:
from collections import Counter

# Distinct note values that occur anywhere in the training split.
{pitch for chorale in bach["train"] for notes in chorale for pitch in notes}

{36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81}