In [None]:
!pip install kagglehub

In [9]:
global kagglehub, shutil, os
import kagglehub
import shutil
import os
import zipfile

In [11]:
# -------------------------------
# Constants
# -------------------------------
# Composer names: used as output sub-folder names and as the text matched against source paths.
COMPOSERS = ['Bach', 'Beethoven', 'Chopin', 'Mozart']
# Dataset identifier handed to kagglehub.dataset_download().
DATASET_ID = "blanderbuss/midi-classic-music"
# Root folder that receives one sub-folder of MIDI files per composer.
MIDI_DATA_DIR = '/content/midi_data'
# Folder the downloaded dataset is copied into.
DESTINATION_DIR = "/tmp"
# Folder scanned for ZIP archives and MIDI files; identical to the copy destination.
# NOTE(review): this is the shared /tmp root, so the scan also covers anything else stored
# there -- consider a dedicated sub-folder.
SOURCE_DATA_DIR = DESTINATION_DIR
# MIDI file extensions.
# NOTE(review): not referenced by the code visible in this part of the notebook.
SUPPORTED_EXT = ('.mid', '.midi')

In [15]:
# ----------------------------------
# Delete a folder and all its contents
# ----------------------------------
def delete_folder_recursive(folder_path):
    """Remove ``folder_path`` and everything below it, reporting the outcome on stdout.

    Args:
        folder_path: Path of the folder to remove.

    Returns:
        None. A missing path is reported with an ``[INFO]`` line; a failed
        removal is reported with an ``[ERROR]`` line instead of raising.
    """
    # Guard clause: nothing to remove.
    if not os.path.exists(folder_path):
        print(f"[INFO] Folder does not exist: {folder_path}")
        return

    try:
        shutil.rmtree(folder_path)
        print(f" Deleted folder and its contents: {folder_path}")
    except Exception as err:
        # Best effort: report the problem and let the notebook continue.
        print(f"[ERROR] Failed to delete {folder_path}: {err}")

# Start from a clean output folder; use the shared constant so this stays in sync with the pipeline.
delete_folder_recursive(MIDI_DATA_DIR)


 Deleted folder and its contents: /content/midi_data


In [17]:
# -------------------------------
# Download Dataset via KaggleHub
# -------------------------------
def download_dataset(dataset_id):
    """Download a dataset through KaggleHub and return what the download call reports.

    Args:
        dataset_id: Dataset identifier passed to ``kagglehub.dataset_download``.

    Returns:
        The value returned by ``kagglehub.dataset_download`` (printed below as
        the download path).
    """
    print(f"Downloading dataset: {dataset_id}")
    local_path = kagglehub.dataset_download(dataset_id)
    print(f"Dataset downloaded to temporary path: {local_path}")
    return local_path

# -------------------------------
# Copy Dataset to Destination
# -------------------------------
def copy_dataset(source_path, destination_path):
    """Copy the tree at ``source_path`` into ``destination_path``.

    The destination is created first when it does not exist. Existing
    directories inside the destination are merged into rather than rejected.

    Args:
        source_path: Directory to copy from.
        destination_path: Directory to copy into.

    Returns:
        None. A failed copy is reported with an ``[ERROR]`` line instead of raising.
    """
    destination_missing = not os.path.exists(destination_path)
    if destination_missing:
        os.makedirs(destination_path)
        print(f"Created destination folder: {destination_path}")

    try:
        # dirs_exist_ok lets the copy merge into the folder that already exists.
        shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
        print(f"Dataset copied from '{source_path}' to '{destination_path}'")
    except Exception as err:
        print(f"[ERROR] Could not copy dataset: {err}")

# ----------------------------------
# Create base output folder and composer folders
# ----------------------------------
def create_composer_folders(base_dir, composers):
    """Ensure ``base_dir`` exists and holds one sub-folder per composer.

    Folders that already exist are left untouched.

    Args:
        base_dir: Root output folder.
        composers: Iterable of composer names used as sub-folder names.
    """
    os.makedirs(base_dir, exist_ok=True)
    folder_paths = (os.path.join(base_dir, name) for name in composers)
    for folder in folder_paths:
        os.makedirs(folder, exist_ok=True)

# ----------------------------------
# Recursively extract all .zip files in a directory
# ----------------------------------
def extract_all_nested_zips(base_path):
    """Extract every ``.zip`` archive under ``base_path``, including archives inside archives.

    Each archive is unpacked into a sibling folder named after the archive
    without its extension; that folder is then scanned for further archives.
    Every archive is extracted at most once per call, even when its target
    folder already existed before the call and is therefore also reached by
    the directory walk.

    Args:
        base_path: Directory to scan.

    Returns:
        None. Invalid archives are reported with a ``[WARNING]`` line and other
        extraction failures with an ``[ERROR]`` line; neither stops the scan.
    """
    _extract_nested_zips(base_path, set())


def _extract_nested_zips(base_path, seen_archives):
    """Walk ``base_path`` and extract each archive whose resolved path is not in ``seen_archives``."""
    for root, dirs, files in os.walk(base_path):
        for file in files:
            if not file.lower().endswith('.zip'):
                continue

            zip_path = os.path.join(root, file)

            # A pre-existing extraction folder is visited twice: once by the
            # recursive call below and once by this walk. Remember handled
            # archives so the second visit does not extract them again.
            archive_key = os.path.realpath(zip_path)
            if archive_key in seen_archives:
                continue
            seen_archives.add(archive_key)

            extract_path = os.path.splitext(zip_path)[0]
            try:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_path)
                    print(f"Extracted: {zip_path} -> {extract_path}")
            except zipfile.BadZipFile:
                print(f"[WARNING] Skipped invalid ZIP: {zip_path}")
                continue
            except Exception as e:
                print(f"[ERROR] Failed to extract {zip_path}: {e}")
                continue

            # A freshly created folder is not part of the directory listing the
            # walk already took, so descend into it explicitly.
            _extract_nested_zips(extract_path, seen_archives)

# ----------------------------------
# Copy .midi files to composer folders
# ----------------------------------
def copy_midi_files_to_composer_folders(source_dir, target_dir, composers,
                                        extensions=('.mid', '.midi')):
    """Copy MIDI files found under ``source_dir`` into ``target_dir/<composer>/``.

    A file is assigned to a composer as follows:

    1. Only the path *relative to* ``source_dir`` is inspected, so the name of
       the source directory itself can never cause a match.
    2. Composer names are matched case-insensitively as whole words, so
       ``Bach`` does not match ``Offenbach``.
    3. The containing directories are checked first; the file name is only
       consulted when no directory names a composer. Within each check the
       first composer in ``composers`` wins.

    Files that match no composer are skipped.

    Args:
        source_dir: Directory tree to scan.
        target_dir: Root folder holding one existing sub-folder per composer.
        composers: Iterable of composer names.
        extensions: File-name suffixes (compared case-insensitively) treated
            as MIDI files.

    Returns:
        None. Each copy is reported on stdout; a failed copy is reported with
        an ``[ERROR]`` line and does not stop the scan.

    Note:
        The destination keeps only the file's base name, so two source files
        with the same name assigned to the same composer overwrite each other.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # (?<![^\W\d_]) / (?![^\W\d_]) mean "not preceded / followed by a letter".
    patterns = [
        (composer,
         re.compile(rf"(?<![^\W\d_]){re.escape(composer)}(?![^\W\d_])", re.IGNORECASE))
        for composer in composers
    ]
    suffixes = tuple(ext.lower() for ext in extensions)

    def match_composer(text):
        """Return the first composer whose name occurs as a whole word in ``text``."""
        for composer, pattern in patterns:
            if pattern.search(text):
                return composer
        return None

    for root, dirs, files in os.walk(source_dir):
        rel_dir = os.path.relpath(root, source_dir)
        if rel_dir == os.curdir:
            rel_dir = ''
        # Resolved once per directory: it is the same for every file inside it.
        dir_composer = match_composer(rel_dir)

        for file in files:
            if not file.lower().endswith(suffixes):
                continue

            composer = dir_composer or match_composer(file)
            if composer is None:
                continue

            full_path = os.path.join(root, file)
            destination = os.path.join(target_dir, composer, file)
            try:
                shutil.copy2(full_path, destination)
                print(f"Copied: {full_path} -> {destination}")
            except Exception as e:
                print(f"[ERROR] Failed to copy {full_path}: {e}")

# ----------------------------------
# Orchestrator function
# ----------------------------------
def organize_midi_data():
    """Build the per-composer folder tree and fill it from the copied dataset.

    Runs three stages in order, all driven by the module-level constants:
    create the output folders, unpack every (nested) ZIP archive under the
    source folder, then copy the matching MIDI files into the composer folders.
    """
    # Stage 1: output folders must exist before anything is copied into them.
    create_composer_folders(base_dir=MIDI_DATA_DIR, composers=COMPOSERS)

    # Stage 2: unpack archives so their MIDI files become visible to the copy stage.
    extract_all_nested_zips(base_path=SOURCE_DATA_DIR)

    # Stage 3: sort the MIDI files into the composer folders.
    copy_midi_files_to_composer_folders(
        source_dir=SOURCE_DATA_DIR,
        target_dir=MIDI_DATA_DIR,
        composers=COMPOSERS,
    )

# ----------------------------------
# Print summary of .mid/.midi files per composer
# ----------------------------------
def print_midi_file_summary(composer_dir, composers):
    """Print how many ``.mid`` / ``.midi`` files each composer folder contains.

    Only regular files directly inside ``composer_dir/<composer>`` are counted;
    sub-folders are not descended into. A composer whose folder is missing is
    reported with a ``[WARNING]`` line and counted as 0 instead of raising.

    Args:
        composer_dir: Root folder holding one sub-folder per composer.
        composers: Iterable of composer names.

    Returns:
        None. The summary is written to stdout.
    """
    midi_suffixes = ('.mid', '.midi')
    summary = {}

    for composer in composers:
        folder_path = os.path.join(composer_dir, composer)

        if not os.path.isdir(folder_path):
            print(f"[WARNING] Missing folder: {folder_path}")
            summary[composer] = 0
            continue

        # is_file() keeps a directory that happens to be named "*.mid" out of the count.
        with os.scandir(folder_path) as entries:
            summary[composer] = sum(
                1
                for entry in entries
                if entry.is_file()
                and os.path.splitext(entry.name)[1].lower() in midi_suffixes
            )

    print("\n MIDI File Summary per Composer:")
    print("----------------------------------")
    for composer, count in summary.items():
        print(f"{composer}: {count} files")



# -------------------------------
# Full Setup Pipeline
# -------------------------------
def setup_dataset():
    """Run the whole preparation pipeline: download, copy, then organise by composer.

    Uses the module-level ``DATASET_ID`` and ``DESTINATION_DIR`` constants.
    """
    downloaded_path = download_dataset(DATASET_ID)
    copy_dataset(source_path=downloaded_path, destination_path=DESTINATION_DIR)
    organize_midi_data()


# Run the full pipeline, then report how many MIDI files each composer folder holds.
setup_dataset()
print_midi_file_summary(MIDI_DATA_DIR, COMPOSERS)



Downloading dataset: blanderbuss/midi-classic-music
Dataset downloaded to temporary path: /kaggle/input/midi-classic-music
Dataset copied from '/kaggle/input/midi-classic-music' to '/tmp'
Extracted: /tmp/midiclassics.zip -> /tmp/midiclassics
Extracted: /tmp/midiclassics/Frescobaldi/Messa della Domenica (Orbis factor) from the Fiori Musicali.zip -> /tmp/midiclassics/Frescobaldi/Messa della Domenica (Orbis factor) from the Fiori Musicali
Extracted: /tmp/midiclassics/Ravel/La Valse.zip -> /tmp/midiclassics/Ravel/La Valse
Extracted: /tmp/midiclassics/Copland/Concerto for Clarinet.zip -> /tmp/midiclassics/Copland/Concerto for Clarinet
Extracted: /tmp/midiclassics/Copland/Appalacian Spring.zip -> /tmp/midiclassics/Copland/Appalacian Spring
Extracted: /tmp/midiclassics/Griffes/Piano Sonata.zip -> /tmp/midiclassics/Griffes/Piano Sonata
Extracted: /tmp/midiclassics/Strauss, J/The blue danube Op.314.zip -> /tmp/midiclassics/Strauss, J/The blue danube Op.314
Extracted: /tmp/midiclassics/Ginastera

In [18]:
# ----------------------------------
# Create DataFrame of all MIDI file paths per composer
# ----------------------------------
def create_midi_file_dataframe(base_dir, composers):
    """Collect the paths of all MIDI files below each composer folder into a DataFrame.

    Composer folders are searched recursively. A composer without a folder is
    reported with a ``[WARNING]`` line and skipped.

    Args:
        base_dir: Root folder holding one sub-folder per composer.
        composers: Iterable of composer names.

    Returns:
        A pandas DataFrame with a single ``filepath`` column, one row per file.

    Side effects:
        Sets the pandas option ``display.max_colwidth`` to ``None`` so full
        paths are shown when the frame is displayed.
    """
    import os
    import pandas as pd

    midi_suffixes = ('.mid', '.midi')
    collected = []

    for name in composers:
        folder = os.path.join(base_dir, name)
        if os.path.exists(folder):
            for current_dir, _subdirs, filenames in os.walk(folder):
                collected.extend(
                    os.path.join(current_dir, fname)
                    for fname in filenames
                    if fname.lower().endswith(midi_suffixes)
                )
        else:
            print(f"[WARNING] Missing folder: {folder}")

    frame = pd.DataFrame(collected, columns=['filepath'])
    pd.set_option('display.max_colwidth', None)
    return frame

# Index every MIDI file in the organised composer folders and print the resulting frame.
df = create_midi_file_dataframe(MIDI_DATA_DIR, COMPOSERS)
print(df)

                                                                          filepath
0                                             /content/midi_data/Bach/008906b_.mid
1                                             /content/midi_data/Bach/WTCII14A.MID
2                                             /content/midi_data/Bach/024437b_.mid
3     /content/midi_data/Bach/Piano version of Bachs two part inventions No.13.mid
4                                         /content/midi_data/Bach/Variation 26.mid
...                                                                            ...
1662                          /content/midi_data/Mozart/K570 Piano Sonata 1mov.mid
1663                          /content/midi_data/Mozart/Symphony n39 K543 1mov.mid
1664                          /content/midi_data/Mozart/Symphony n29 K201 2mov.mid
1665                           /content/midi_data/Mozart/Piano Sonata n09 K310.mid
1666                          /content/midi_data/Mozart/K284 Piano Sonata n06 .mid

[16