In [1]:
!pip install datasets torchcodec



## Step 1: Inspect metadata + audio structure

In [2]:
from datasets import load_dataset, Audio # Import Audio feature

ds = load_dataset("mteb/nsynth-mini")["train"]
# Cast explicitly so the "audio" column is handled by the Audio feature.
ds = ds.cast_column("audio", Audio())

# Metadata-only view: the "audio" column is dropped up front so that printing
# the labels never has to fetch the audio entry. Each row is indexed exactly
# once instead of once per column.
ds_meta = ds.remove_columns("audio")

# Inspect the first 3 metadata entries (fewer if the split has fewer rows).
for i in range(min(3, len(ds_meta))):
    print(f"\n--- Metadata Entry {i} ---")
    print(ds_meta[i])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



--- Metadata Entry 0 ---
{'note': 180871, 'sample_rate': 16000, 'pitch': 81, 'instrument_source': 0, 'instrument_family_str': 'mallet', 'instrument_str': 'mallet_acoustic_065', 'note_str': 'mallet_acoustic_065-081-127', 'qualities_str': ['fast_decay', 'percussive'], 'instrument_source_str': 'acoustic', 'velocity': 127, 'instrument_family': 5, 'instrument': 676, 'qualities': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0]}

--- Metadata Entry 1 ---
{'note': 176097, 'sample_rate': 16000, 'pitch': 106, 'instrument_source': 1, 'instrument_family_str': 'mallet', 'instrument_str': 'mallet_electronic_004', 'note_str': 'mallet_electronic_004-106-075', 'qualities_str': ['bright', 'distortion', 'nonlinear_env'], 'instrument_source_str': 'electronic', 'velocity': 75, 'instrument_family': 5, 'instrument': 192, 'qualities': [1, 0, 1, 0, 0, 0, 1, 0, 0, 0]}

--- Metadata Entry 2 ---
{'note': 82568, 'sample_rate': 16000, 'pitch': 58, 'instrument_source': 2, 'instrument_family_str': 'flute', 'instrument_str': 'flute_s

## Step 2: Load 2–3 audio examples and print instrument labels

In [3]:
import soundfile as sf

# Row indices of the clips to inspect; edit this list to look at other rows.
examples_to_load = [0, 1, 2]

for idx in examples_to_load:
    item = ds[idx]

    # Waveform samples and their sampling rate both live under the "audio" entry.
    audio = item["audio"]
    audio_array, sample_rate = audio["array"], audio["sampling_rate"]

    # Label fields are read from the row's metadata columns.
    instrument, family, source = (
        item[key]
        for key in ("instrument", "instrument_family_str", "instrument_source_str")
    )
    # NSynth-style file name (approximate); built here but not printed below.
    filename = item["note_str"] + ".wav"

    print(f"\nLoaded Audio Index {idx}")
    summary = (
        ("Instrument:", instrument),
        ("Family:", family),
        ("Source:", source),
        ("Sample Rate:", sample_rate),
        ("Audio Shape:", audio_array.shape),
    )
    for label, value in summary:
        print(label, value)


Loaded Audio Index 0
Instrument: 676
Family: mallet
Source: acoustic
Sample Rate: 16000
Audio Shape: (64000,)

Loaded Audio Index 1
Instrument: 192
Family: mallet
Source: electronic
Sample Rate: 16000
Audio Shape: (64000,)

Loaded Audio Index 2
Instrument: 608
Family: flute
Source: synthetic
Sample Rate: 16000
Audio Shape: (64000,)


## Step 3: Create Instrument Mapping Table

In [4]:
import pandas as pd

# The mapping table only needs label columns, so read rows from a view without
# the "audio" column rather than fetching each row's audio entry and
# discarding it.
ds_meta = ds.remove_columns("audio")

# Explicit column order; also guarantees the headers exist when
# `examples_to_load` is empty (a DataFrame built from [] has no columns).
mapping_columns = ["file_name", "instrument", "family", "source"]

records = []

for idx in examples_to_load:
    item = ds_meta[idx]
    record = {
        "file_name": item["note_str"] + ".wav",  # NSynth-style naming (approx)
        "instrument": item["instrument"],
        "family": item["instrument_family_str"],
        "source": item["instrument_source_str"]
    }
    records.append(record)

# Build the frame once from the collected records.
df = pd.DataFrame(records, columns=mapping_columns)
print(df)

                           file_name  instrument  family      source
0    mallet_acoustic_065-081-127.wav         676  mallet    acoustic
1  mallet_electronic_004-106-075.wav         192  mallet  electronic
2    flute_synthetic_006-058-127.wav         608   flute   synthetic
