# Divide Songs Into Chunks

The script below is designed to process MIDI files from the POP909 dataset by chunking them into smaller segments, ensuring each segment contains a sufficient number of notes for analysis. The code begins by specifying parameters such as the maximum number of bars per chunk (MAX_NB_BAR) and the minimum number of notes required in a chunk (MIN_NB_NOTES). It also sets up the output directory for the processed MIDI files (merged_out_dir) and iterates over folders and MIDI files within the dataset directory.

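For each MIDI file, the script determines the output directory (mirroring the song's folder inside the dataset) and checks whether chunked versions of the file already exist there. If chunks are found, the script skips that MIDI file. Otherwise, it loads the MIDI file, divides it into chunks of at most MAX_NB_BAR bars (a bar is taken to be four beats), and saves each chunk as a separate MIDI file named `<original name>_<chunk index>.mid`. Each note is placed in the chunk in which it starts, and its timing is shifted so that every chunk begins at tick 0. Chunks containing fewer than MIN_NB_NOTES notes are discarded rather than saved, so that each remaining segment contains a sufficient number of notes for meaningful analysis; files short enough to fit in a single chunk are skipped entirely. Additionally, any exception raised while processing a MIDI file is caught and reported, and the script then continues with the next file.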

Overall, this script facilitates the preprocessing of MIDI data, enabling the segmentation of large MIDI files into smaller, more manageable chunks suitable for further analysis and modeling tasks.

In [None]:
import os
from copy import deepcopy
from math import ceil
from miditoolkit import MidiFile
from tqdm import tqdm

# --- Chunking parameters ---
MAX_NB_BAR = 8  # upper bound on the length of one chunk, in bars
MIN_NB_NOTES = 20  # chunks holding fewer notes than this are not saved
dataset = "POP909"  # dataset name; also used to name the output folder

# --- Locations ---
# Folder holding the source dataset (one sub-folder per song).
# Adjust this to wherever your copy of the dataset lives.
root_folder = 'C:/Users/naomi/Thesis/Thesis/Thesis-main/POP909'

# Folder receiving the chunked MIDI files; it is created here if missing.
merged_out_dir = os.path.join(
    "C:/Users/naomi/Thesis/Thesis/Thesis-main/output",
    f"{dataset}-chunked",
)
os.makedirs(merged_out_dir, exist_ok=True)

# Iterate over folders and MIDI files
# Walk every song folder of the dataset and split each MIDI file found there
# into chunks of at most MAX_NB_BAR bars, written to a mirrored folder under
# merged_out_dir as "<name>_<chunk index>.mid".
#
# Known limitations of this chunking scheme:
#   * A bar is taken to be 4 beats, i.e. 4/4 time is assumed for every file.
#   * Only notes are re-timed. Whatever else the loaded MidiFile holds is
#     deep-copied into every chunk unchanged (not shifted to the chunk start).
#   * A note belongs to the chunk in which it starts; its end is shifted by
#     the same offset, so a note sustained past the boundary keeps its length.
#
# os.listdir returns entries in arbitrary order, so both listings are sorted
# to make the processing order (and the log) reproducible between runs.
for folder_name in sorted(os.listdir(root_folder)):
    folder_path = os.path.join(root_folder, folder_name)

    # Only sub-folders are scanned; loose files in the dataset root are ignored.
    if not os.path.isdir(folder_path):
        continue
    print(f"Processing folder: {folder_name}")

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.mid'):
            continue
        midi_file_path = os.path.join(folder_path, file_name)

        # One failing file must not abort the whole run: report and move on.
        try:
            # Mirror the song's folder inside the output directory.
            output_dir = os.path.join(merged_out_dir, folder_name)
            os.makedirs(output_dir, exist_ok=True)

            # Skip files that were already chunked. The name must be exactly
            # "<name>_<digits>.mid"; a bare prefix test would also match the
            # chunks of a different file whose name merely extends this one
            # (e.g. "001_alt_0.mid" when looking for chunks of "001").
            midi_filename = os.path.splitext(file_name)[0]
            chunk_prefix = f"{midi_filename}_"
            chunks_exist = any(
                f.startswith(chunk_prefix)
                and f.endswith('.mid')
                and f[len(chunk_prefix):-len('.mid')].isdigit()
                for f in os.listdir(output_dir)
            )
            if chunks_exist:
                print(f"Chunks for {midi_file_path} already exist, skipping...")
                continue

            # Load the MIDI and work out how many chunks it spans.
            midi = MidiFile(midi_file_path)
            ticks_per_cut = MAX_NB_BAR * midi.ticks_per_beat * 4  # 4 beats per bar
            nb_cuts = ceil(midi.max_tick / ticks_per_cut)
            if nb_cuts < 2:
                # Nothing to split: the file is not written to the output.
                print(f"{midi_file_path} fits in a single chunk, skipping...")
                continue

            print(f"Processing {midi_file_path}")

            # Build one note-free copy of the file and clone that per chunk,
            # instead of deep-copying every note nb_cuts times only to
            # discard them again.
            template = deepcopy(midi)
            for track in template.instruments:
                track.notes = []
            midis = [deepcopy(template) for _ in range(nb_cuts)]

            # Distribute the notes over the chunks, re-timed so that each
            # chunk starts at tick 0.
            for j, track in enumerate(midi.instruments):
                # Notes are not always stored in start order, so sort first.
                for note in sorted(track.notes, key=lambda n: n.start):
                    cut_id = note.start // ticks_per_cut
                    offset = cut_id * ticks_per_cut
                    note_copy = deepcopy(note)
                    note_copy.start -= offset
                    note_copy.end -= offset
                    midis[cut_id].instruments[j].notes.append(note_copy)

            # Save the chunks, dropping those with too few notes. The chunk
            # index in the file name is the position in the song, so indices
            # may have gaps where sparse chunks were dropped.
            for j, midi_short in enumerate(midis):
                nb_notes = sum(len(track.notes) for track in midi_short.instruments)
                if nb_notes < MIN_NB_NOTES:
                    continue
                output_path = os.path.join(output_dir, f"{midi_filename}_{j}.mid")
                midi_short.dump(output_path)

        except Exception as e:
            print(f"An error occurred while processing {midi_file_path}: {e}")