<a href="https://colab.research.google.com/github/politeles/audio_classification/blob/main/PrepareViolinAudioDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare the dataset to use it with Hugging Face Datasets
The dataset consists of multiple samples of different violin techniques.
We recorded a set of audio files with 4 different devices:
- Rode NT1A
- Thomann instrument mic.
- Samsung Galaxy S10e.
- Samsung Tablet S22 ultra.

The audio files are organized in one folder per technique, each containing one folder per microphone. Audio files with the same name differ only in the microphone used to record them.

Also, for most of the techniques, the pitch is given in scientific pitch notation, i.e. middle C is written as C4. The violin string on which the note is played is also recorded.

Let's get started by traversing the audio files and creating the metadata from them.

In [None]:
import os
import re

In [None]:
# List the entries of the mounted /data directory to confirm the mount is visible
print(os.listdir('/data'))

['audio_samples', 'normalized_samples_1sec']


In [None]:
!pip install librosa soundfile



In [None]:
# !pip install mutagen only for mp3

In [None]:
import librosa
import soundfile as sf

In [None]:
#from mutagen.mp3 import MP3

In [None]:
# Root of the recordings; the cells below walk it as <technique>/<microphone>/<file>.wav
base_dir = '/data/audio_samples'

In [None]:
# Report the duration and sample rate of every .wav file found under
# base_dir/<technique>/<microphone>/.
for technique in os.listdir(base_dir):
    technique_path = os.path.join(base_dir, technique)
    if not os.path.isdir(technique_path):
        continue  # ignore stray files next to the technique folders
    for microphone in os.listdir(technique_path):
        microphone_path = os.path.join(technique_path, microphone)
        if not os.path.isdir(microphone_path):
            continue
        for audio_file in os.listdir(microphone_path):
            if not audio_file.endswith('.wav'):
                continue
            audio_file_path = os.path.join(microphone_path, audio_file)
            print(f"Processing: {audio_file_path}")
            # sr=None keeps the file's own sample rate instead of resampling
            y, sr = librosa.load(audio_file_path, sr=None)
            duration = librosa.get_duration(y=y, sr=sr)

            print(f"  Duration: {duration:.2f} seconds")
            # A sample rate is a frequency, so it is reported in Hz (not bps)
            print(f"  Sample Rate: {sr} Hz")

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
  Duration: 1.59 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s10right/GstrF4_sample-0.wav
  Duration: 1.00 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s10right/GstrG3.wav
  Duration: 1.35 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s10right/GstrG3_sample-0.wav
  Duration: 1.00 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s10right/GstrGb4.wav
  Duration: 1.72 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s10right/GstrGb4_sample-0.wav
  Duration: 1.00 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s22ultra/AstrA4.wav
  Duration: 1.64 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s22ultra/AstrA4_sample-0.wav
  Duration: 1.00 seconds
  Sample Rate: 48000 bps
Processing: /data/audio_samples/pizz/s22ultra/AstrB4.wav
  Duration: 1.80 seconds
  Samp

# Normalize the duration to 1 second
The code will create multiple audio samples if the file length is greater than one second and will add some silence if the audio length is less than one second.

In [None]:
def normalize_duration_and_split(audio_file_path, target_duration=1):
    """
    Cuts an audio file into consecutive clips of `target_duration` seconds.

    Every clip is written next to the source file as
    ``<name>_sample-<i><ext>`` using the source file's sample rate.

    - A file at least `target_duration` long yields one clip per complete
      `target_duration` window; the trailing remainder is discarded.
    - A shorter file yields a single ``_sample-0`` clip that is padded with
      trailing silence up to `target_duration`.

    Args:
        audio_file_path: Path of the audio file to split.
        target_duration: Clip length in seconds; must be positive.

    Raises:
        ValueError: If `target_duration` is not positive.
    """
    if target_duration <= 0:
        raise ValueError("target_duration must be a positive number of seconds")

    # sr=None keeps the file's own sample rate instead of resampling
    y, sr = librosa.load(audio_file_path, sr=None)
    current_duration = librosa.get_duration(y=y, sr=sr)

    # The output names only depend on the source path, so split it once
    base_name, ext = os.path.splitext(audio_file_path)

    num_samples = int(current_duration / target_duration)
    if num_samples == 0:
        # Too short for a full clip: zero-pad (silence) up to the target length
        # so every generated sample has the same duration.
        padded = librosa.util.fix_length(y, size=int(target_duration * sr))
        sf.write(f"{base_name}_sample-0{ext}", padded, sr)
        return

    for i in range(num_samples):
        start_time = i * target_duration
        end_time = start_time + target_duration

        # Convert the window from seconds to sample indices and cut it out
        audio_segment = y[int(start_time * sr):int(end_time * sr)]

        sf.write(f"{base_name}_sample-{i}{ext}", audio_segment, sr)


In [None]:
def cleanup_samples(base_dir):
    """
    Deletes generated sample files from a <technique>/<microphone> tree.

    Only regular files located exactly two directory levels below `base_dir`
    and whose complete name has the form ``<anything>_sample-<n>.wav`` are
    removed. Names that merely start with that pattern (for example
    ``x_sample-0.wav.bak``) are left untouched.

    Args:
        base_dir: The base directory containing the audio samples.
    """
    # fullmatch: the whole file name must fit, not just a prefix of it
    sample_name = re.compile(r".*_sample-\d+\.wav")

    for technique in os.listdir(base_dir):
        technique_path = os.path.join(base_dir, technique)
        if not os.path.isdir(technique_path):
            continue
        for microphone in os.listdir(technique_path):
            microphone_path = os.path.join(technique_path, microphone)
            if not os.path.isdir(microphone_path):
                continue
            for audio_file in os.listdir(microphone_path):
                if not sample_name.fullmatch(audio_file):
                    continue
                audio_file_path = os.path.join(microphone_path, audio_file)
                if not os.path.isfile(audio_file_path):
                    continue  # os.remove cannot delete a directory
                print(f"Removing: {audio_file_path}")
                os.remove(audio_file_path)

In [None]:
# Remove every *_sample-<n>.wav file under base_dir so only the original recordings remain
cleanup_samples(base_dir)

Removing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-0.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-1.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-2.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-3.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-4.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-0.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-1.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-10.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-11.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-12.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-13.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-14.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-15.wav
Removing: /data/audio_samples/bariolage/rodemic/bariolage2

In [None]:
# Split every original recording under base_dir/<technique>/<microphone>/
# into samples of the default target duration (1 second).
for technique in os.listdir(base_dir):
    technique_path = os.path.join(base_dir, technique)
    if not os.path.isdir(technique_path):
        continue
    for microphone in os.listdir(technique_path):
        microphone_path = os.path.join(technique_path, microphone)
        if not os.path.isdir(microphone_path):
            continue
        for audio_file in os.listdir(microphone_path):
            if not audio_file.endswith('.wav'):
                continue
            # Skip samples produced by an earlier run: splitting them again
            # would create files named like *_sample-0_sample-0.wav.
            if re.fullmatch(r".*_sample-\d+\.wav", audio_file):
                continue
            audio_file_path = os.path.join(microphone_path, audio_file)
            print(f"Processing: {audio_file_path}")
            normalize_duration_and_split(audio_file_path)

Processing: /data/audio_samples/bariolage/rodemic/bariolage1.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage3.wav
Processing: /data/audio_samples/bariolage/s10left/bariolage1.wav
Processing: /data/audio_samples/bariolage/s10left/bariolage2.wav
Processing: /data/audio_samples/bariolage/s10left/bariolage3.wav
Processing: /data/audio_samples/bariolage/s10right/bariolage1.wav
Processing: /data/audio_samples/bariolage/s10right/bariolage2.wav
Processing: /data/audio_samples/bariolage/s10right/bariolage3.wav
Processing: /data/audio_samples/bariolage/s22ultra/bariolage1.wav
Processing: /data/audio_samples/bariolage/s22ultra/bariolage2.wav
Processing: /data/audio_samples/bariolage/s22ultra/bariolage3.wav
Processing: /data/audio_samples/bariolage/thomannmic/bariolage1.wav
Processing: /data/audio_samples/bariolage/thomannmic/bariolage2.wav
Processing: /data/audio_samples/bariolage/thomannmic/bariolage3.wav
Processing

# Extract metadata
We will keep the following:
 - technique
 - microphone
 - audio file
 - string and note (only when they are encoded in the file name)

In [None]:
def extract_metadata(audio_file_path):
    """Builds the metadata record of one audio file from its path.

    The two directories directly above the file give the technique and the
    microphone. The file name is searched for ``<string>str<note>`` (for
    example ``EstrG5``): when found, the lower-cased string letter and the
    note are reported, otherwise both are None.
    """
    components = audio_file_path.split(os.sep)
    technique_dir = components[-3]
    microphone_dir = components[-2]
    file_name = components[-1]

    # Group 1 is the string letter, group 2 the note name with its octave
    found = re.search(r"([AGED])str([a-gA-G][bs]?[\d]+)", file_name)
    string_name = found.group(1).lower() if found else None
    note_name = found.group(2) if found else None

    return {
        'technique': technique_dir,
        'microphone': microphone_dir,
        'string': string_name,
        'note': note_name,
    }

In [None]:
# Quick check on a file name that encodes both the string and the note
extract_metadata('/data/audio_samples/sulponticello/s10right/EstrG5_2.wav')

{'technique': 'sulponticello',
 'microphone': 's10right',
 'string': 'e',
 'note': 'G5'}

# Create a Hugging Face dataset

In [None]:
!pip install datasets



In [None]:
import datasets

In [None]:
from datasets import ClassLabel, Features, Value, Audio

In [None]:
# Start from an empty dataset that only declares the columns to fill in
dataset = datasets.Dataset.from_dict(
    {column: [] for column in ('audio', 'technique', 'microphone', 'string', 'note')}
)

In [None]:
# Collect one row per generated sample (*_sample-<n>.wav) and build `dataset`
# in a single step. Appending with add_item would rebuild the dataset for every
# row, and re-running the cell would append the same rows a second time.
sample_columns = {
    'audio': [],
    'technique': [],
    'microphone': [],
    'string': [],
    'note': [],
}

# sorted() makes the row order independent of the filesystem listing order
for technique in sorted(os.listdir(base_dir)):
    technique_path = os.path.join(base_dir, technique)
    if not os.path.isdir(technique_path):
        continue
    for microphone in sorted(os.listdir(technique_path)):
        microphone_path = os.path.join(technique_path, microphone)
        if not os.path.isdir(microphone_path):
            continue
        for audio_file in sorted(os.listdir(microphone_path)):
            # Only the generated samples go into the dataset
            if not re.fullmatch(r".*_sample-\d+\.wav", audio_file):
                continue
            audio_file_path = os.path.join(microphone_path, audio_file)
            print(f"Processing: {audio_file_path}")
            metadata = extract_metadata(audio_file_path)
            sample_columns['audio'].append(audio_file_path)
            for key in ('technique', 'microphone', 'string', 'note'):
                sample_columns[key].append(metadata[key])

dataset = datasets.Dataset.from_dict(sample_columns)

Processing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-0.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-1.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-2.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-3.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage1_sample-4.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-0.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-1.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-10.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-11.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-12.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-13.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-14.wav
Processing: /data/audio_samples/bariolage/rodemic/bariolage2_sample-15.wav
Processing: /data/audio_samples/

In [None]:
from google.colab import userdata
# `userdata.get` is a Colab-only API. Outside Colab, provide HF_TOKEN through
# your own environment instead of running this cell.
os.environ.update(HF_TOKEN=userdata.get("HF_TOKEN"))

In [None]:
from datasets import ClassLabel

In [None]:
def _class_names(column):
    """Returns the label names of `column` in a stable, sorted order."""
    feature = dataset.features[column]
    if isinstance(feature, ClassLabel):
        # The column was already cast (cell re-run): its values are integer
        # ids now, so reuse the names stored in the feature.
        return feature.names
    # Sorting keeps the label ids independent of the order the rows were added
    return sorted(dataset.unique(column))


features = Features({
    'audio': Audio(sampling_rate=48000),
    'technique': ClassLabel(names=_class_names('technique')),
    'microphone': ClassLabel(names=_class_names('microphone')),
    'string': Value('string'),
    'note': Value('string'),
})

# Update dataset schema with the defined features
dataset = dataset.cast(features)

Casting the dataset:   0%|          | 0/2515 [00:00<?, ? examples/s]

In [None]:
# Inspect the first example after the cast
dataset[0]

{'audio': {'path': '/data/audio_samples/bariolage/rodemic/bariolage1_sample-0.wav',
  'array': array([-0.01071167, -0.01193237, -0.01132202, ...,  0.0479126 ,
         -0.03659058, -0.10913086]),
  'sampling_rate': 48000},
 'technique': 0,
 'microphone': 0,
 'string': None,
 'note': None}

In [None]:
# Show the resulting schema of the dataset
dataset.features

{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'technique': ClassLabel(names=['bariolage', 'chords', 'doublestops', 'legato', 'martele', 'pizz', 'ricochet', 'spicatto', 'sulponticello'], id=None),
 'microphone': ClassLabel(names=['rodemic', 's10left', 's10right', 's22ultra', 'thomannmic'], id=None),
 'string': Value(dtype='string', id=None),
 'note': Value(dtype='string', id=None)}

In [None]:
# Upload the dataset to the Hub repository "politeles/vads"
# NOTE(review): presumably authenticates with HF_TOKEN from the environment — confirm
dataset.push_to_hub("politeles/vads")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/2515 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/politeles/vads/commit/148500d3527eb0ed0da82e6d45daae39ba1b4856', commit_message='Upload dataset', commit_description='', oid='148500d3527eb0ed0da82e6d45daae39ba1b4856', pr_url=None, pr_revision=None, pr_num=None)