<a href="https://colab.research.google.com/github/sauravkokane/Data-Science-Training/blob/master/Melody_generation_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Connect to google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Nessessary Libraries

In [1]:
import random
import json
import pickle

In [3]:
import os
import music21 as m21

In [4]:
import numpy as np
import pandas as pd
import tensorflow.keras as keras
import matplotlib.pyplot as plt

## File Paths

In [5]:
KERN_DATASET_PATH = "/content/drive/MyDrive/Datasets/essen"
DATASET_PATH = "/content/drive/MyDrive/Datasets/Melody_Dataset"

In [6]:
FILE_DATASET_PATH = "/content/drive/MyDrive/Datasets/Melody Generation/file_dataset.txt"
MAPPING_PATH = "/content/drive/MyDrive/Datasets/Melody Generation/mapping.json"
DELIMETER = "/ "
SEQUENCE_LENGTH = 64

In [2]:
# Inputs and outputs saving paths (pickle files)
TRAINING_INPUT_PATH = "/content/drive/MyDrive/Datasets/Melody Generation/songs_inputs.pkl"
TRAINING_ONE_HOT_ENCODED_INPUT_PATH = "/content/drive/MyDrive/Datasets/Melody Generation/one_hot_inputs.pkl"
TRAINING_TARGET_PATH = "/content/drive/MyDrive/Datasets/Melody Generation/songs_targets.pkl"

In [None]:
TEST_PATH = "/content/drive/MyDrive/Datasets/essen/europa/deutschl/test"

In [None]:
ACCEPTABLE_DURATIONS = [
    0.25,    # 16th Note
    0.5,     # 8th note
    0.75,    # dotted note
    1.0,     # quarter note
    1.5,     # dotted quarter note
    2,       # half note
    3,       # three quarter notes
    4        # full note
]

In [None]:
os.listdir(TEST_PATH)

['deut5153.krn',
 'deut5152.krn',
 'deut5155.krn',
 'deut5156.krn',
 'deut5157.krn',
 'deut5147.krn',
 'deut5146.krn',
 'deut5148.krn',
 'deut5150.krn',
 'deut5151.krn',
 'deut5149.krn',
 'deut5154.krn',
 'CKSUM']

## Preprocessing functions

In [None]:
def load_songs_in_kern(dataset_path):
    # go through all files in dataset and load them using music21
    songs = []
    for path, subdirs, files in os.walk(dataset_path):
        for file in files:
            if file[-3:] == "krn":
                song = m21.converter.parse(os.path.join(path, file))
                songs.append(song)
    return songs


In [None]:
def has_acceptable_duration(song:m21.stream.base.Score, acceptable_durations:list):
    for note in song.flatten().notesAndRests:
        if note.duration.quarterLength not in acceptable_durations:
            return False
    return True

In [None]:
def transpose(song: m21.stream.base.Score):
    # get key from the score
    try:
        parts = song.getElementsByClass(m21.stream.Part)
        measure0 = parts[0].getElementsByClass(m21.stream.Measure)
        key = measure0[0][4]
    except IndexError:
        key =  song.analyze("key")

    # Predict key using music21 if key is not present in first measure
    if not isinstance(key, m21.key.Key):
        key =  song.analyze("key")

    # get interval for transposition
    if key.mode == "major":
        interval = m21.interval.Interval(key.tonic, m21.pitch.Pitch("C"))
    elif key.mode == "minor":
        interval = m21.interval.Interval(key.tonic, m21.pitch.Pitch("A"))

    # Transpose the song by calculated interval
    transposed_song = song.transpose(interval)
    return transposed_song

In [None]:
def encode(song, time_step=0.25):
    # p = 60, d= 1.0 => [60, "_", "_", "_"]
    encoded_song = []
    for event in song.flatten().notesAndRests:
        # handle notes
        if isinstance(event, m21.note.Note):
            symbol = event.pitch.midi   # 60

        # handle rests
        elif isinstance(event, m21.note.Rest):
            symbol = "r"

        # convert the note/rest into time series notation
        steps = int(event.duration.quarterLength // time_step)

        for step in range(steps):
            if step==0:
                encoded_song.append(symbol)
            else:
                encoded_song.append("_")
    # change encoded song from list to string
    encoded_song = " ".join(map(str, encoded_song))
    return encoded_song

In [None]:
def preprocess(dataset_path):
    # Load the Folk songs
    print("Song Loading is started")
    loaded_songs = load_songs_in_kern(dataset_path)
    print(f"Loaded {len(loaded_songs)} songs.")

    for i, song in enumerate(loaded_songs):

        # Filter out the songs which have unacceptable length
        if not has_acceptable_duration(song, ACCEPTABLE_DURATIONS):
            continue
        try:
            # Transpose songs to C Major / A Minor scale
            transposed_song = transpose(song)

            # Encode songs with music time series representation
            encoded_song = encode(transposed_song)

            # Save songs in a text file
            save_path = os.path.join(DATASET_PATH, str(i))
            with open(save_path, "w") as f:
                f.write(encoded_song)
        except:
            pass

In [None]:
# preprocess(KERN_DATASET_PATH)

In [None]:
def load_encoded_song(file_path):
    with open(file_path, "r") as file:
        song = file.read()
    return song

In [None]:
def create_single_file_dataset(dataset_path, full_dataset_file_path, delimiter, sequence_length):
    """
    Loads all the songs in the dataset_path, adds a song delimiter (delimiter * sequence_length) after each song, and saves the songs in a single file located at full_dataset_file_path.

    Parameters
    ----------
    dataset_path : str
        The path to the root directory of the dataset containing songs to be processed.
    full_dataset_file_path : str
        The path to the file that will contain the whole dataset.
    delimiter : str
        The delimiter that will be used to separate the songs in the single file.
    sequence_length : int
        The length of the sequence that will be used to separate the songs in the single file.

    Returns
    -------
    str
        The string containing the songs and the delimiters.
    """
    song_delimiter = delimiter * sequence_length
    songs = []
    number_of_songs = 0
    for path, _, files in os.walk(dataset_path):
        for file in files:
            file_path = os.path.join(path, file)
            song = load_encoded_song(file_path)
            songs.append(song)
            number_of_songs += 1
    print(f"Loaded {number_of_songs} songs.")

    songs_str = ' '.join(songs + [song_delimiter])

    with open(full_dataset_file_path, 'w') as f:
        f.write(songs_str)

    return songs_str

In [None]:
def create_mapping(songs, mapping_file_path):

    """
    Creates a mapping from unique symbols in the songs to numeric indices and saves it to a file.

    This function identifies the unique vocabulary of symbols (e.g., notes, rests) in the provided
    songs, assigns each symbol a unique integer index, and saves this mapping to a specified JSON file.

    Parameters
    ----------
    songs : str
        A string representing a sequence of encoded songs, with symbols separated by spaces.
    mapping_file_path : str
        The file path where the mapping of symbols to indices will be saved in JSON format.

    Returns
    -------
    dict
        A dictionary where keys are symbols from the songs and values are their corresponding indices.
    """

    mappings = {}

    # identify the vocabulary
    songs = songs.split()
    vocabulary = list(set(songs))

    for i, note in enumerate(vocabulary):
        mappings[note] = i

    # save the mapping in a text file
    with open(mapping_file_path, "w") as f:
        json.dump(mappings, f, indent=4)

    return mappings

In [None]:
def convert_songs_to_numeric(encoded_songs, mapping_file_path):
    """
    Converts a string of encoded songs to a list of numeric values by mapping
    each symbol to its corresponding index in the provided mapping file.

    Parameters
    ----------
    encoded_songs : str
        A string representing a sequence of encoded songs, with symbols
        separated by spaces.
    mapping_file_path : str
        The file path where the mapping of symbols to indices is saved in JSON
        format.

    Returns
    -------
    list of int
        A list of numeric values corresponding to the input songs.
    """
    # Load mappings
    with open(mapping_file_path, "r") as file:
        mappings = json.load(file)

    # Split the songs into list of events
    encoded_songs = encoded_songs.split()

    # Map songs to numeric values
    numeric_songs = [mappings[symbol] for symbol in encoded_songs]

    return numeric_songs

In [None]:
def generate_training_sequences(full_dataset_file_path, mapping_file_path, sequence_length):
    """
    Generates training sequences from a file containing a sequence of songs.

    This function first loads the encoded songs from the specified file path,
    then converts them to numeric values using the provided mapping file path.
    The function then generates the input sequences and targets for training a
    sequence prediction model. The number of sequences is equal to the length
    of the input sequence minus the sequence length.

    Parameters
    ----------
    full_dataset_file_path : str
        The path to the file containing the sequence of encoded songs.
    mapping_file_path : str
        The file path where the mapping of symbols to indices is saved in JSON
        format.
    sequence_length : int
        The length of the sequences to be generated.

    Returns
    -------
    inputs : numpy.ndarray
        A 3-dimensional array of shape (number of sequences, sequence length,
        vocabulary size) containing the input sequences after one-hot encoding.
    targets : numpy.ndarray
        A 1-dimensional array of shape (number of sequences,) containing the
        targets for the input sequences.
    """
    encoded_songs = load_encoded_song(full_dataset_file_path)

    # Convert songs to numeric values
    numeric_songs = convert_songs_to_numeric(encoded_songs, mapping_file_path)

    # Generate the training sequences
    input_sequences = []
    target_values = []

    num_sequences = len(numeric_songs) - sequence_length
    for i in range(num_sequences):
        # Input sequence
        input_sequences.append(numeric_songs[i:i + sequence_length])

        # Target value
        target_values.append(numeric_songs[i + sequence_length])

    # One-hot encode the sequences
    # input dimension = (number of sequences, sequence length) =-> (number of sequences, sequence length, vocabulary size)
    # [[0, 1, 2], [1, 0, 1], [2, 1, 0]] =-> [[[1, 0, 0], [0, 1, 0], [0, 0, 1]], [[0, 1, 0], [1, 0, 0], [0, 1, 0]], [[0, 0, 1], [0, 1, 0], [1, 0, 0]]]
    vocabulary_size = len(set(numeric_songs))
    input_sequences = np.array(input_sequences, dtype=np.int8)
    # inputs = keras.utils.to_categorical(input_sequences, num_classes=vocabulary_size).astype(np.int8)
    targets = np.array(target_values)

    return input_sequences, targets



## Execution of Preprocessing

In [None]:
# songs = create_single_file_dataset(DATASET_PATH, FILE_DATASET_PATH, DELIMETER, SEQUENCE_LENGTH)

Loaded 7429 songs.


In [None]:
songs = load_encoded_song(FILE_DATASET_PATH)

In [None]:
create_mapping(songs, MAPPING_PATH)

{'73': 0,
 '48': 1,
 '79': 2,
 '66': 3,
 '90': 4,
 '60': 5,
 '72': 6,
 '50': 7,
 '53': 8,
 '43': 9,
 '98': 10,
 '95': 11,
 '74': 12,
 '67': 13,
 '82': 14,
 '76': 15,
 '61': 16,
 '71': 17,
 '49': 18,
 '83': 19,
 '70': 20,
 '84': 21,
 '59': 22,
 '55': 23,
 '62': 24,
 '85': 25,
 '56': 26,
 '64': 27,
 '58': 28,
 '69': 29,
 '81': 30,
 '78': 31,
 '75': 32,
 '52': 33,
 '45': 34,
 '57': 35,
 '51': 36,
 'r': 37,
 '91': 38,
 '_': 39,
 '68': 40,
 '54': 41,
 '77': 42,
 '80': 43,
 '/': 44,
 '93': 45,
 '63': 46,
 '86': 47,
 '47': 48,
 '65': 49,
 '88': 50}

In [None]:
inputs, targets = generate_training_sequences(FILE_DATASET_PATH, MAPPING_PATH, SEQUENCE_LENGTH)
print(type(inputs), type(targets))
print(inputs.shape, targets.shape)
# print memory size of inputs and targets
print(inputs.nbytes, targets.nbytes)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
(1139206, 64) (1139206,)
72909184 9113648


In [8]:
with open(TRAINING_INPUT_PATH, "rb") as f:
    inputs = pickle.load(f)

with open(TRAINING_TARGET_PATH, "rb") as f:
    targets = pickle.load(f)

In [9]:
inputs.shape, targets.shape

((1139206, 64), (1139206,))

In [7]:
oneHotEncodedInputs = []

In [8]:
for input in inputs:
    oneHotInput = []
    for event in input:
        zero_array = np.zeros(51, dtype=np.int8)
        zero_array[event] = 1
        oneHotInput.append(zero_array)
        del zero_array
    oneHotInput = np.array(oneHotInput, dtype=np.int8)
    oneHotEncodedInputs.append(oneHotInput)
    del oneHotInput

In [10]:
oneHotEncodedInputs = np.array(oneHotEncodedInputs, dtype=np.int8)

In [21]:
with open(TRAINING_ONE_HOT_ENCODED_INPUT_PATH, "rb") as f:
    oneHotEncodedInputs = pickle.load(f)

In [22]:
oneHotEncodedInputs.shape

(1139206, 64, 51)