# Imports

In [1]:
import os
import sys
from collections import defaultdict
from collections import Counter
import shutil
import numpy as np
import re
from symusic import Score, Tempo
import pickle

# Add the absolute path to the root directory
script_dir = os.path.abspath('.')
root_dir = os.path.dirname(script_dir)
sys.path.insert(0, root_dir)

from data import *

# Globals

In [2]:
dataset_root = os.path.join(script_dir, "serialized")
os.makedirs(dataset_root, exist_ok=True)

def get_midi_files(dataset_root: str) -> dict:
    '''
    Count and get the number of MIDI files in the dataset.
    Return a dictionary with the following structure:
    {
        'subdir': {
            'files': list of MIDI files,
            'count': number of MIDI files
        }
    }
    '''
    groove_extensions = ['.mid', '.MID', '.Mid']

    midi_files = {}
    print("\nMIDI file counts in each first-level subdirectory of root (recursive):")
    root_subdirs = [d for d in os.listdir(dataset_root) if os.path.isdir(os.path.join(dataset_root, d))]
    for subdir in root_subdirs:
        subdir_path = os.path.join(dataset_root, subdir)
        midi_count = 0
        files_list = []
        for root, dirs, files in os.walk(subdir_path):
            midi_files_in_dir = [os.path.join(root, f) for f in files if any(f.endswith(ext) for ext in groove_extensions)]
            midi_count += len(midi_files_in_dir)
            files_list.extend(midi_files_in_dir)
            assert len(files_list) == midi_count
            midi_files[subdir] = {
                'files': files_list,
                'count': midi_count
            }

    total_count = sum(data['count'] for data in midi_files.values())
    sorted_midi_files = dict(sorted(midi_files.items(), key=lambda x: x[1]['count'], reverse=True))
    print(f"Total MIDI files: {total_count}")

    for subdir, data in sorted_midi_files.items():
        print(f"{subdir}: {data['count']} MIDI files")

    return sorted_midi_files

# MIDIMAN Processing

## Process 50s Drummer Autumn Dataset

### Count files in each subdirectory

In [3]:
midi_dir = os.path.join(script_dir, 'raw', '50s Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 800
Indie: 200 MIDI files
Pop: 100 MIDI files
Blues: 100 MIDI files
Rock: 100 MIDI files
Soul: 100 MIDI files
Country: 100 MIDI files
Jazz: 100 MIDI files


### Create dataset

In [4]:
dataset = []
ID = 0
name = "50sAutumn"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=AUTUMN_50S_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=AUTUMN_50S_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        midi_bytes = feature.score.dumps_midi()
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'autumn_50s',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Indie: avg_dropped_notes: 0.0
dropped_set: set()
Pop: avg_dropped_notes: 0.0
dropped_set: set()
Blues: avg_dropped_notes: 0.0
dropped_set: set()
Rock: avg_dropped_notes: 0.0
dropped_set: set()
Soul: avg_dropped_notes: 0.0
dropped_set: set()
Country: avg_dropped_notes: 0.0
dropped_set: set()
Jazz: avg_dropped_notes: 0.0
dropped_set: set()
Saved 800 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/50sAutumn.pkl


### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process 60s Drummer Early Dataset

### Count files in each subdirectory

In [5]:
midi_dir = os.path.join(script_dir, 'raw', '60s Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 1000
Rock: 600 MIDI files
Soul: 200 MIDI files
Jazz: 200 MIDI files


### Create dataset

In [6]:
dataset = []
ID = 0
name = "60sEarly"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=EARLY_60S_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=EARLY_60S_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        midi_bytes = feature.score.dumps_midi()
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'early_60s',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Rock: avg_dropped_notes: 0.0
dropped_set: set()
Soul: avg_dropped_notes: 0.0
dropped_set: set()
Jazz: avg_dropped_notes: 0.0
dropped_set: set()
Saved 1000 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/60sEarly.pkl


### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process 70s Drummer Open Dataset

### Count files in each subdirectory

In [7]:
midi_dir = os.path.join(script_dir, 'raw', '70s Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 1000
Rock: 400 MIDI files
Funk: 300 MIDI files
Pop: 200 MIDI files
Reggae: 100 MIDI files


### Create dataset

In [8]:
dataset = []
ID = 0
name = "70sOpen"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=OPEN_70S_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=OPEN_70S_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        midi_bytes = feature.score.dumps_midi()
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'open_70s',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Rock: avg_dropped_notes: 0.005
dropped_set: {106}
Funk: avg_dropped_notes: 0.0
dropped_set: set()
Pop: avg_dropped_notes: 0.0
dropped_set: set()
Reggae: avg_dropped_notes: 0.0
dropped_set: set()
Saved 1000 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/70sOpen.pkl


### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process 80s Drummer Black Dataset

### Count files in each subdirectory

In [9]:
midi_dir = os.path.join(script_dir, 'raw', '80s Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 1000
Pop: 300 MIDI files
Metal: 300 MIDI files
Indie: 200 MIDI files
Punk: 200 MIDI files


### Create dataset

In [10]:
dataset = []
ID = 0
name = "80sBlack"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=BLACK_80S_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=BLACK_80S_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        midi_bytes = feature.score.dumps_midi()
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': ID,
                    'map': 'black_80s',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Pop: avg_dropped_notes: 0.0
dropped_set: set()
Metal: avg_dropped_notes: 0.0
dropped_set: set()
Indie: avg_dropped_notes: 0.0
dropped_set: set()
Punk: avg_dropped_notes: 0.0
dropped_set: set()
Saved 1000 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/80sBlack.pkl


### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process Studio Drummer Session Dataset

### Count files in each subdirectory

In [3]:
midi_dir = os.path.join(script_dir, 'raw', 'Studio Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 3305
Rock: 925 MIDI files
Pop: 900 MIDI files
Funk: 450 MIDI files
Metal: 350 MIDI files
Jazz: 350 MIDI files
Country: 330 MIDI files


### Create dataset

In [5]:
dataset = []
ID = 0
name = "sessionStudio"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=SESSION_STUDIO_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            print(f"Error processing file: {file} - {e}")
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=SESSION_STUDIO_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file} - {e}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        try:
            time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        except Exception as e:
            print(f"Error processing file: {file} - {e}")
            from symusic import TimeSignature
            if '6-8' in file:
                time_signature = "6/8"
            elif '3-4' in file:
                time_signature = "3/4"
            else:
                time_signature = "4/4"
            time_sig_tuple = tuple(map(int, time_signature.split('/')))
            feature.score.time_signatures = [TimeSignature(time=0, numerator=time_sig_tuple[0], denominator=time_sig_tuple[1])]

        assert feature.is_valid(), f"Feature is not valid: {feature.validity_report()}"
        midi_bytes = feature.score.dumps_midi()
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'session_studio',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Error processing file: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Studio Drummer MIDI files/Rock/Hard Rock/08 Groove 130BPM/15 16th Ride.mid - ('Invalid file! Failed checks:', ['track_is_drum', 'has_single_tempo'])
Error processing file: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Studio Drummer MIDI files/Rock/Hard Rock/08 Groove 130BPM/11 16th Ride.mid - ('Invalid file! Failed checks:', ['track_is_drum', 'has_single_tempo'])
Error processing file: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Studio Drummer MIDI files/Rock/Hard Rock/08 Groove 130BPM/16 16th Ride.mid - ('Invalid file! Failed checks:', ['track_is_drum', 'has_single_tempo'])
Error processing file: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Studio Drummer MIDI files/Rock/Hard Rock/08 Groove 130BPM/18 8th Cym Toms.mid - ('Invalid file! Failed checks:', ['track_is_drum', 'has_single_tempo'])
Error processing file: /Users/puruboii/Desktop/Github-Local/drum-geni

### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process Vintage Drummer Ebony Dataset


### Count files in each subdirectory

In [6]:
midi_dir = os.path.join(script_dir, 'raw', 'Vintage Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 750
Jazz: 350 MIDI files
Indie: 200 MIDI files
Soul: 100 MIDI files
Showtunes: 50 MIDI files
Funk: 50 MIDI files


### Create dataset

In [7]:
dataset = []
ID = 0
name = "ebonyVintage"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=EBONY_VINTAGE_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=EBONY_VINTAGE_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        midi_bytes = feature.score.dumps_midi()
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'ebony_vintage',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Jazz: avg_dropped_notes: 0.3142857142857143
dropped_set: {74, 98}
Indie: avg_dropped_notes: 0.0
dropped_set: set()
Soul: avg_dropped_notes: 0.0
dropped_set: set()
Showtunes: avg_dropped_notes: 0.0
dropped_set: set()
Funk: avg_dropped_notes: 0.0
dropped_set: set()
Saved 750 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/ebonyVintage.pkl


### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process Modern Drummer Sparkle Dataset


### Count files in each subdirectory

In [8]:
midi_dir = os.path.join(script_dir, 'raw', 'Modern Drummer MIDI files')
midi_files = get_midi_files(midi_dir)


MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 1000
Rock: 400 MIDI files
Metal: 200 MIDI files
RnB: 200 MIDI files
Electronic: 200 MIDI files


### Create dataset

In [9]:
dataset = []
ID = 0
name = "sparkleModern"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=SPARKLE_MODERN_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e:
            try:    
                # Single tempo fix: extract tempo from file name
                try:
                    tempo = int(re.search(r'(\d+)\s*BPM', file).group(1))
                except Exception as e1: # If tempo is not in file name, use first tempo from score
                    tempo = score.tempos[0].qpm
                score = Score.from_file(file)
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    note_list_copy.extend(score.tracks[1].notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    score.tracks.pop(1)
                feature = DrumMIDIFeature.from_score(score, drum_map=SPARKLE_MODERN_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e: # If error, print score info and exit
                print(f"Error processing file: {file}")
                print(e)
                for track in score.tracks:
                    print(f"start: {track.start()}, end: {track.end()}, is_drum: {track.is_drum}")


        style = subdir.lower()
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        midi_bytes = feature.score.dumps_midi()
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'sparkle_modern',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': midi_bytes
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Rock: avg_dropped_notes: 0.0175
dropped_set: {99}
Metal: avg_dropped_notes: 0.0
dropped_set: set()
RnB: avg_dropped_notes: 0.195
dropped_set: {20}
Electronic: avg_dropped_notes: 0.01
dropped_set: {99}
Saved 1000 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/sparkleModern.pkl


### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process 2-Bar groove

### Count files in each subdirectory

In [10]:
midi_dir = os.path.join(script_dir, 'raw', "GMD", '2bar-midionly.pkl')
with open(midi_dir, "rb") as f:
    midi_files = pickle.load(f)

midi_files = midi_files['train'] + midi_files['test'] + midi_files['validation']
print(f"2bar-midionly: {len(midi_files)} MIDI files")

2bar-midionly: 22619 MIDI files


### Create dataset

In [11]:
dataset = []
ID = 0
name = "2barGMD"

num_dropped_notes = 0
num_failures = 0
dropped_set = set()
for data in midi_files:
    style = data['style_primary']
    metadata = [data['style_secondary'], data['time_signature'], data['bpm']]
    type = data['type']
    midi_bytes = data['midi']

    try:
        feature = DrumMIDIFeature(midi_bytes, drum_map=ROLAND_TD_11_REDUCED_DRUM_MAP)
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        num_dropped_notes += feature.num_dropped_notes
        dropped_set.update(feature.dropped_set)        
    except Exception as e: 
        print(f"Error: {data['id'][:10]} - {e}")
        num_failures += 1
        continue

    dataset.append({
                'id': f"{name}_{ID}",
                'map': 'roland_td_11',
                'style': style,
                'time_signature': time_signature,
                'type': type,
                'metadata': metadata,
                'midi_bytes': feature.score.dumps_midi()
            })
    
    ID += 1
    

avg_dropped_notes = num_dropped_notes / len(dataset)
print(f"{name}: avg_dropped_notes: {avg_dropped_notes}")
print(f"dropped_set: {dropped_set}")
print(f"num_failures: {num_failures}")
out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track_is_drum'])
Error: drummer1/s - ('Invalid file! Failed checks:', ['has_single_track', 'track

### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

## Process Full groove

### Count files in each subdirectory

In [3]:
midi_dir = os.path.join(script_dir, 'raw', "GMD", 'full-midionly.pkl')
with open(midi_dir, "rb") as f:
    midi_files = pickle.load(f)

midi_files = midi_files['train'] + midi_files['test'] + midi_files['validation']
print(f"full-midionly: {len(midi_files)} MIDI files")

full-midionly: 1150 MIDI files


### Create dataset

In [10]:
dataset = []
ID = 0
name = "fullGMD"

num_dropped_notes = 0
num_failures = 0
dropped_set = set()
for data in midi_files:
    style = data['style_primary']
    metadata = [data['style_secondary'], data['time_signature'], data['bpm']]
    type = data['type']
    midi_bytes = data['midi']

    try:
        feature = DrumMIDIFeature(midi_bytes, drum_map=ROLAND_TD_11_REDUCED_DRUM_MAP)
        time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        num_dropped_notes += feature.num_dropped_notes
        dropped_set.update(feature.dropped_set)        
    except Exception as e:
        try:    
            # Multiple tempo fix: extract first tempo from score
            score = Score.from_midi(midi_bytes)
            tempo = score.tempos[0].qpm
            score.tracks[0].is_drum = True
            score.tempos = [Tempo(time=0, qpm=tempo)]
            feature = DrumMIDIFeature.from_score(score, drum_map=ROLAND_TD_11_REDUCED_DRUM_MAP)
            time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)
        except Exception as e2:
            print(f"Error: {data['id'][:10]} - {e2}")
            num_failures += 1
            continue

    dataset.append({
                'id': f"{name}_{ID}",
                'map': 'roland_td_11',
                'style': style,
                'time_signature': time_signature,
                'type': type,
                'metadata': metadata,
                'midi_bytes': feature.score.dumps_midi()
            })
    
    ID += 1
    

avg_dropped_notes = num_dropped_notes / len(dataset)
print(f"{name}: avg_dropped_notes: {avg_dropped_notes}")
print(f"dropped_set: {dropped_set}")
print(f"num_failures: {num_failures}")
out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Error: Invalid file!
Time signatures: [TimeSignature(0, 4, 4, 'Tick'), TimeSignature(0, 4, 4, 'Tick')]
Tempos: [Tempo(0, 85.00004250002125, 705882, 'Tick'), Tempo(0, 85.00004250002125, 705882, 'Tick')]
Error: Invalid file!
Time signatures: [TimeSignature(0, 4, 4, 'Tick')]
Tempos: [Tempo(0, 109.99990833340972, 545455, 'Tick'), Tempo(0, 109.99990833340972, 545455, 'Tick')]
fullGMD: avg_dropped_notes: 0.0
dropped_set: set()
num_failures: 0
Saved 1150 entries to /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/serialized/fullGMD.pkl


### Test 5 random samples

In [11]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")

style: gospel, time_signature: 4/4, type: fill
dropped_notes: 0
style: rock, time_signature: 4/4, type: beat
dropped_notes: 0
style: rock, time_signature: 4/4, type: fill
dropped_notes: 0
style: country, time_signature: 4/4, type: fill
dropped_notes: 0
style: rock, time_signature: 4/4, type: beat
dropped_notes: 0


## Process Toontrack Drum MIDI Dataset

### Count files in each subdirectory

In [12]:
midi_dir = os.path.join(script_dir, 'raw', 'Toontrack Drums MIDI Files')
midi_files = get_midi_files(midi_dir)



MIDI file counts in each first-level subdirectory of root (recursive):
Total MIDI files: 161678
Metal: 41060 MIDI files
Rock: 32254 MIDI files
Unknown: 28745 MIDI files
Latin: 9998 MIDI files
Jazz: 7884 MIDI files
Blues: 6517 MIDI files
Fusion: 5088 MIDI files
Progressive: 4981 MIDI files
Country: 4343 MIDI files
Pop: 4167 MIDI files
Electronic: 3157 MIDI files
RnB: 2731 MIDI files
Punk: 2553 MIDI files
Funk: 1863 MIDI files
Reggae: 1675 MIDI files
Soul: 1274 MIDI files
Gospel: 624 MIDI files
HipHop: 594 MIDI files
Disco: 583 MIDI files
IndieFolk: 540 MIDI files
Americana: 411 MIDI files
Indie: 375 MIDI files
Grindcore: 211 MIDI files
Djent: 32 MIDI files
Afrobeat: 18 MIDI files


### Create dataset

In [14]:
dataset = []
ID = 0
name = "toontrack"

for subdir, data in midi_files.items():
    if "avg_dropped_notes" not in data:
        data['avg_dropped_notes'] = 0

    num_dropped_notes = 0
    num_failures = 0
    dropped_set = set()
    for file in data['files']:
        try:
            feature = DrumMIDIFeature.from_file(file, drum_map=GM_EXTENDED_REDUCED_DRUM_MAP)
            num_dropped_notes += feature.num_dropped_notes
            dropped_set.update(feature.dropped_set)        
        except Exception as e1: 
            try:  
                # Single tempo fix: extract tempo from file name
                score = Score.from_file(file)
                tempo = score.tempos[0].qpm
                score.tracks[0].is_drum = True
                score.tempos = [Tempo(time=0, qpm=tempo)]
                # Multiple tracks fix: merge all tracks into one
                if len(score.tracks) > 1:
                    note_list_copy = score.tracks[0].notes.copy()
                    for i, track in enumerate(score.tracks):
                        if i == 0:
                            continue
                        note_list_copy.extend(track.notes)
                    note_list_copy.sort(key=None, reverse=False, inplace=True)
                    score.tracks[0].notes = note_list_copy
                    while len(score.tracks) > 1:
                        score.tracks.pop()

                feature = DrumMIDIFeature.from_score(score, drum_map=GM_EXTENDED_REDUCED_DRUM_MAP)
                num_dropped_notes += feature.num_dropped_notes
                dropped_set.update(feature.dropped_set)
            except Exception as e2:
                print(f"Error: {file} - {e1}")
                print(f"Error: {file} - {e2}")
                num_failures += 1
                continue

        style = subdir.lower()
        try:
            time_signature = f"{feature.score.time_signatures[0].numerator}/{feature.score.time_signatures[0].denominator}"
        except Exception as e:
            print(f"Skipping {file} due to missing or invalid time signature: {e}")
            num_failures += 1
            continue
        assert feature.is_valid(), f"Feature is not valid: {feature.validity_report()}"
        # Keep everything after the subdir
        parts = file.split(os.sep)
        metadata = parts[parts.index(subdir) + 1:]
        metadata[-1] = metadata[-1].split('.')[0]
        type = "fill" if "fill" in file.lower() else "beat"

        dataset.append({
                    'id': f"{name}_{ID}",
                    'map': 'gm_extended',
                    'style': style,
                    'time_signature': time_signature,
                    'type': type,
                    'metadata': metadata,
                    'midi_bytes': feature.score.dumps_midi()
                })
        ID += 1
    data['avg_dropped_notes'] = num_dropped_notes / len(data['files'])
    print(f"{subdir}: avg_dropped_notes: {data['avg_dropped_notes']}")
    print(f"dropped_set: {dropped_set}")
    print(f"num_failures: {num_failures}")

out_name = os.path.join(dataset_root, f"{name}.pkl")
with open(out_name, "wb") as f:
    pickle.dump(dataset, f)
print(f"Saved {len(dataset)} entries to {out_name}")

Error: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Toontrack Drums MIDI Files/Metal/NU_METAL_ESSENTIALS/Beats/178bpmStupifiedBeat3.mid - ('Invalid file! Failed checks:', ['has_single_tempo'])
Error: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Toontrack Drums MIDI Files/Metal/NU_METAL_ESSENTIALS/Beats/178bpmStupifiedBeat3.mid - pyvec::index out of range: 0
Error: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Toontrack Drums MIDI Files/Metal/NU_METAL_ESSENTIALS/Beats/165bpmFreakyBeat6.mid - ('Invalid file! Failed checks:', ['has_single_tempo'])
Error: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Toontrack Drums MIDI Files/Metal/NU_METAL_ESSENTIALS/Beats/165bpmFreakyBeat6.mid - pyvec::index out of range: 0
Error: /Users/puruboii/Desktop/Github-Local/drum-genie/dataset/raw/Toontrack Drums MIDI Files/Metal/NU_METAL_ESSENTIALS/Beats/156bpmBlindPump5.mid - ('Invalid file! Failed checks:', ['has_single_tempo'])
Error: /Users/puruboii

### Test 5 random samples

In [None]:
with open(out_name, "rb") as f:
    dataset = pickle.load(f)

for i in range(5):
    random_index = np.random.randint(0, len(dataset))
    datapoint = dataset[random_index]
    print(f"style: {datapoint['style']}, time_signature: {datapoint['time_signature']}, type: {datapoint['type']}")
    feature = DrumMIDIFeature(datapoint['midi_bytes'], drum_map=GM_REDUCED_DRUM_MAP)
    feature.play()
    print(f"dropped_notes: {feature.num_dropped_notes}")