In [3]:
import os
import numpy as np
import librosa
import pandas as pd
from tqdm import tqdm


In [4]:
# --- Dataset location -------------------------------------------------------
DATA_PATH = "IRMAS-TrainingData"

# --- Waveform settings ------------------------------------------------------
SAMPLE_RATE = 22050                        # Hz; rate requested when loading audio
DURATION = 3.0                             # clip length in seconds
SAMPLES = int(DURATION * SAMPLE_RATE)      # clip length in samples

# --- Mel-spectrogram settings -----------------------------------------------
N_MELS = 128         # number of mel bands
N_FFT = 2048         # FFT window size, in samples
HOP_LENGTH = 512     # step between successive frames, in samples


In [5]:
# Collect (file_path, label) pairs for every .wav file below DATA_PATH.
# Each clip is labelled with the name of the folder that directly contains it.
dataset = []

for root, dirs, files in os.walk(DATA_PATH):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # `dirs` in place (honoured by the default top-down walk) and the file
    # names keeps the sample order reproducible across machines and runs.
    dirs.sort()
    label = os.path.basename(root)   # folder name = label
    for file in sorted(files):
        if file.lower().endswith(".wav"):
            file_path = os.path.join(root, file)
            dataset.append((file_path, label))

# Fail with a clear message instead of an IndexError on dataset[0] below
# when the folder is missing or contains no audio.
if not dataset:
    raise FileNotFoundError(f"No .wav files found under {DATA_PATH!r}")

print("Total audio files:", len(dataset))
print("Example:", dataset[0])


Total audio files: 6705
Example: ('IRMAS-TrainingData\\IRMAS-TrainingData\\cel\\008__[cel][nod][cla]0058__1.wav', 'cel')


In [6]:
# Unique class names in alphabetical order; a name's position in this list
# is the integer id it is mapped to.
labels = sorted({name for _, name in dataset})
label_map = dict(zip(labels, range(len(labels))))

print("Label map:", label_map)
print("Number of classes:", len(label_map))


Label map: {'cel': 0, 'cla': 1, 'flu': 2, 'gac': 3, 'gel': 4, 'org': 5, 'pia': 6, 'sax': 7, 'tru': 8, 'vio': 9, 'voi': 10}
Number of classes: 11


In [7]:
def preprocess_audio(file_path):
    """Turn one audio file into a fixed-length log-mel spectrogram.

    The clip is loaded as mono at SAMPLE_RATE, cut or zero-padded at the end
    to exactly SAMPLES samples, normalized with ``librosa.util.normalize``,
    and converted to a mel power spectrogram (N_FFT, HOP_LENGTH, N_MELS,
    20 Hz up to half the sample rate).

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.

    Returns
    -------
    numpy.ndarray
        Mel spectrogram in decibels, expressed relative to its own maximum
        (``ref=np.max``).
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)

    # Force a length of exactly SAMPLES: drop any excess first, then
    # zero-pad the tail if the clip is still too short.
    signal = signal[:SAMPLES]
    missing = SAMPLES - len(signal)
    if missing > 0:
        signal = np.pad(signal, (0, missing))

    signal = librosa.util.normalize(signal)

    mel_options = dict(
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        fmin=20,
        fmax=rate // 2,
    )
    power_spec = librosa.feature.melspectrogram(y=signal, sr=rate, **mel_options)

    # Convert power to a log (dB) scale.
    return librosa.power_to_db(power_spec, ref=np.max)


In [8]:
# Silence cryptography's deprecation warnings so they do not clutter the
# output of later cells.
import warnings

try:
    from cryptography.utils import CryptographyDeprecationWarning
except ImportError:
    # Nothing else in the visible cells uses `cryptography`; if it is not
    # installed there is no warning to silence, so just carry on rather than
    # aborting the notebook.
    pass
else:
    warnings.filterwarnings("ignore", category=CryptographyDeprecationWarning)

In [9]:
# Build the feature list X and the integer label list y, one entry per clip.
X = []
y = []
skipped = []            # paths of clips dropped for being too short

TARGET_FRAMES = 128     # number of time frames kept per spectrogram

print("Processing audio files...")

for file_path, label in tqdm(dataset):
    mel_db = preprocess_audio(file_path)

    # Ensure fixed shape: clips with too few frames are dropped (and
    # recorded, so the drop is visible), longer ones are truncated.
    if mel_db.shape[1] < TARGET_FRAMES:
        skipped.append(file_path)
        continue

    mel_db = mel_db[:, :TARGET_FRAMES]

    X.append(mel_db)
    y.append(label_map[label])

print("Finished processing")
if skipped:
    print(f"Skipped {len(skipped)} file(s) with fewer than {TARGET_FRAMES} frames")


Processing audio files...


100%|██████████████████████████████████████████████████████████████████████████████| 6705/6705 [03:13<00:00, 34.58it/s]

Finished processing





In [10]:
# Turn the per-clip lists into arrays.
X = np.stack(X)             # (N, 128, 128) on the first run
y = np.array(y)             # (N,)

# Add channel dimension for CNN -> (N, 128, 128, 1).
# Guarded on ndim so that re-running this cell does not keep appending
# extra trailing axes to an X that already has its channel dimension.
if X.ndim == 3:
    X = X[..., np.newaxis]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (6705, 128, 128, 1)
y shape: (6705,)


In [12]:
# Standardize each sample independently: subtract its own mean and divide by
# its own standard deviation, both taken over axes 1 and 2. The small
# constant keeps the division safe when a sample's deviation is zero.
X = np.divide(
    np.subtract(X, X.mean(axis=(1, 2), keepdims=True)),
    np.add(X.std(axis=(1, 2), keepdims=True), 1e-6),
)


In [13]:
# Write the features and the labels to .npy files in the working directory.
for out_name, array in (("X_mel.npy", X), ("y_labels.npy", y)):
    np.save(out_name, array)

print("✅ Preprocessed data saved")


✅ Preprocessed data saved
