In [None]:
import pandas as pd
import os

In [20]:
# Root of the MAESTRO v3.0.0 dataset and its metadata table
data_dir = "./data/maestro/maestro-v3.0.0"
metadata = pd.read_csv(os.path.join(data_dir, "maestro-v3.0.0.csv"))

# Prefix every entry of the "midi_filename" column with the dataset root
midi_paths = [
    os.path.join(data_dir, rel_path) for rel_path in metadata["midi_filename"]
]

# Show the first five paths as a quick sanity check
print(midi_paths[:5])

['./data/maestro/maestro-v3.0.0/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi', './data/maestro/maestro-v3.0.0/2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MID--AUDIO_03_R2_2008_wav--2.midi', './data/maestro/maestro-v3.0.0/2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-split_07-07-17_Piano-e_3-02_wav--3.midi', './data/maestro/maestro-v3.0.0/2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MID--AUDIO_21_R1_2004_01_Track01_wav.midi', './data/maestro/maestro-v3.0.0/2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MID--AUDIO_17_R1_2006_04_Track04_wav.midi']


In [None]:
# Create the tokenized-data directory up front; exist_ok=True makes this a
# no-op (instead of an error) when the directory already exists.
output_dir = "./data/maestro/tokenized_data"
os.makedirs(output_dir, exist_ok=True)

In [None]:
import pickle
import mido

# Function to extract MIDI events (Note On/Off, Control Change, and Time)
def extract_midi_events(midi_path):
    """Read a MIDI file and return its note and control-change events.

    Args:
        midi_path: Path of the MIDI file, passed to ``mido.MidiFile``.

    Returns:
        A list of dicts ordered by absolute time. Note events have the keys
        ``type`` ('note_on' or 'note_off'), ``note``, ``velocity`` and
        ``time``; control changes have ``type`` ('control_change'),
        ``control``, ``value`` and ``time``. ``time`` is the running sum of
        the message delta times within the message's own track (ticks).
        All other message types are ignored.

    Raises:
        Whatever ``mido.MidiFile`` raises for a missing or unreadable file.
    """
    midi_file = mido.MidiFile(midi_path)

    events = []
    for track in midi_file.tracks:
        tick = 0  # Absolute time within this track, in ticks
        for msg in track:
            tick += msg.time  # msg.time is the delta since the previous message

            # Store the relevant MIDI events
            if msg.type in ('note_on', 'note_off'):
                events.append({
                    'type': msg.type,
                    'note': msg.note,
                    'velocity': msg.velocity,
                    'time': tick
                })
            elif msg.type == 'control_change':
                events.append({
                    'type': msg.type,
                    'control': msg.control,
                    'value': msg.value,
                    'time': tick
                })

    # Each track restarts its clock at 0, so appending track after track
    # leaves the combined list out of chronological order for multi-track
    # files. list.sort is stable, so events sharing a tick keep their
    # original relative order, and single-track input is left unchanged.
    events.sort(key=lambda event: event['time'])
    return events

# Function to tokenize MIDI events
def tokenize_midi_events(events):
    """Convert extracted MIDI event dicts into string tokens.

    Args:
        events: Iterable of dicts with a 'type' key. 'note_on'/'note_off'
            events need 'note' (and optionally 'velocity');
            'control_change' events need 'control' and 'value'.

    Returns:
        A list of tokens, one per recognized event, in input order:
        ``note_on_<note>``, ``note_off_<note>`` or
        ``control_change_<control>_<value>``. Events of any other type are
        skipped.
    """
    tokens = []
    for event in events:
        kind = event['type']
        if kind == 'note_on':
            # By MIDI convention a note_on with velocity 0 is a note-off.
            # The token does not carry velocity, so without this mapping the
            # release would be indistinguishable from a key press.
            if event.get('velocity') == 0:
                tokens.append(f"note_off_{event['note']}")
            else:
                tokens.append(f"note_on_{event['note']}")
        elif kind == 'note_off':
            tokens.append(f"note_off_{event['note']}")
        elif kind == 'control_change':
            tokens.append(f"control_change_{event['control']}_{event['value']}")
        # Any other type produces no token. Previously it either raised
        # UnboundLocalError or re-appended the previous event's token.
    return tokens

# Main processing function
def process_midi(midi_path, tokenizer):
    """Extract and tokenize one MIDI file on a best-effort basis.

    Args:
        midi_path: Path of the MIDI file to process.
        tokenizer: Accepted for the caller's convenience; this function
            does not use it.

    Returns:
        The token list from ``tokenize_midi_events``, or ``None`` when any
        exception occurs during extraction or tokenization (the error is
        printed rather than raised).
    """
    try:
        # Extraction and tokenization both stay inside the try so that a
        # failure in either step is reported and mapped to None.
        return tokenize_midi_events(extract_midi_events(midi_path))
    except Exception as err:
        print(f"Error processing {midi_path}: {err}")
    return None

# Tokenize a small sample of the dataset.
# `midi_paths` (built from the MAESTRO metadata CSV) and `output_dir` are
# defined in the earlier cells. They are deliberately NOT reassigned here:
# the former placeholder values ("path_to_midi_file1.mid", "path/to/output")
# overwrote the real ones, so no real file was tokenized and the pickle was
# written to the wrong directory.
tokenized_data = []
for midi_path in midi_paths[:10]:  # Process only the first 10 MIDI files
    tokens = process_midi(midi_path, tokenizer=None)  # No external tokenizer in this example
    if tokens is not None:  # process_midi returns None for files it could not read
        tokenized_data.append(tokens)

# Save the tokenized data to a pickle file
os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists
with open(os.path.join(output_dir, "tokenized_data.pkl"), "wb") as f:
    pickle.dump(tokenized_data, f)

print("MIDI files processed and tokenized")

midi files processed and tokenized
