Multilabel testing: running the trained IRMAS multilabel instrument classifier on the IRMAS testing data

In [None]:
#Install & download IRMAS Testing Data (KaggleHub)
!pip install -q kagglehub

import kagglehub
import os

TEST_DATA_PATH = kagglehub.dataset_download("rajeevriya/irmas-testingdata")
print("Testing dataset path:", TEST_DATA_PATH)

os.listdir(TEST_DATA_PATH)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rajeevriya/irmas-testingdata?dataset_version_number=1...


100%|██████████| 7.29G/7.29G [04:15<00:00, 30.7MB/s]

Extracting files...





Testing dataset path: /root/.cache/kagglehub/datasets/rajeevriya/irmas-testingdata/versions/1


['IRMAS-TestingData']

In [None]:
#Imports
import tempfile

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tqdm import tqdm

In [None]:
# --- Audio preprocessing ---
TARGET_SR = 16_000         # sample rate (Hz) every clip is resampled to
TARGET_DURATION = 5.0      # clip length (seconds) after padding / truncation
MIN_SILENCE_THRESH = 1e-2  # normalised |amplitude| must exceed this to count as sound

# --- Mel-spectrogram ---
N_MELS = 128               # number of mel bands
HOP_LENGTH = 512           # hop length (samples) between spectrogram frames
IMG_SIZE = 128             # spectrogram images are loaded as IMG_SIZE x IMG_SIZE

# --- Prediction ---
THRESHOLD = 0.2            # a label is predicted when its score is >= this value

In [None]:
# Label names, indexed by model output position.
# The order MUST match the one used when the multilabel model was trained.
INSTRUMENTS = [
    "cello",
    "clarinet",
    "flute",
    "guitar",
    "organ",
    "piano",
    "saxophone",
    "trumpet",
    "violin",
    "voice",
    "others",
]

In [None]:
# Load trained multilabel model
# The default is the Colab Google Drive location; set the IRMAS_MODEL_PATH
# environment variable to load the model from somewhere else.
MODEL_PATH = os.environ.get(
    "IRMAS_MODEL_PATH", "/content/drive/MyDrive/irmas_multilabel_cnn.keras"
)

# Fail fast with an actionable message (e.g. Google Drive not mounted yet)
# instead of surfacing a lower-level loader error.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(
        f"Model file not found: {MODEL_PATH}. Mount Google Drive or set "
        "IRMAS_MODEL_PATH to the saved .keras file."
    )

model = tf.keras.models.load_model(MODEL_PATH)
model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
#Audio Preprocessing Function (REUSED)
def preprocess_audio(file_path):
    """Load an audio file and turn it into a fixed-length, normalised mono clip.

    Steps, in order:
      1. Load as mono at the file's native sample rate.
      2. Resample to TARGET_SR when the native rate differs.
      3. Peak-normalise so the largest absolute amplitude is 1 (skipped for
         an all-zero signal).
      4. Trim leading/trailing samples whose absolute amplitude does not
         exceed MIN_SILENCE_THRESH.
      5. Zero-pad at the end, or truncate, to exactly
         int(TARGET_SR * TARGET_DURATION) samples.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        Tuple ``(audio, sr)``: the processed 1-D sample array and its sample
        rate, which is always TARGET_SR.
    """
    audio, sr = librosa.load(file_path, sr=None, mono=True)

    if sr != TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
        sr = TARGET_SR

    max_amp = np.max(np.abs(audio))
    if max_amp > 0:
        audio = audio / max_amp

    non_silent = np.where(np.abs(audio) > MIN_SILENCE_THRESH)[0]
    if len(non_silent) > 0:
        # Slice ends are exclusive: +1 keeps the last non-silent sample.
        # (Without it that sample is dropped, and a clip with a single
        # non-silent sample is trimmed to nothing.)
        audio = audio[non_silent[0]: non_silent[-1] + 1]

    target_len = int(TARGET_SR * TARGET_DURATION)
    if len(audio) < target_len:
        # np.pad defaults to constant zero padding, appended at the end.
        audio = np.pad(audio, (0, target_len - len(audio)))
    else:
        audio = audio[:target_len]

    return audio, sr

In [None]:
#Audio → Mel-Spectrogram Image
def audio_to_mel_image(audio, sr):
    """Render an audio clip as a mel-spectrogram image array.

    The clip is converted to a dB-scaled mel spectrogram, drawn with
    matplotlib (axes hidden, "magma" colormap), saved as a PNG, and read back
    resized to IMG_SIZE x IMG_SIZE.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        Image array with pixel values divided by 255.0. With load_img's
        default colour mode the shape is presumably (IMG_SIZE, IMG_SIZE, 3)
        -- confirm against the model's input shape.
    """
    mel = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Render into a private temporary directory rather than a fixed
    # "temp.png" in the working directory: nothing is left behind and
    # concurrent runs cannot overwrite each other's image.
    with tempfile.TemporaryDirectory() as tmp_dir:
        png_path = os.path.join(tmp_dir, "mel.png")

        fig = plt.figure(figsize=(3, 3))
        try:
            plt.axis("off")
            librosa.display.specshow(
                mel_db, sr=sr, hop_length=HOP_LENGTH, cmap="magma"
            )
            plt.tight_layout(pad=0)
            fig.savefig(png_path, bbox_inches="tight", pad_inches=0)
        finally:
            # Always release the figure, even when rendering fails; otherwise
            # figures accumulate when this is called in a loop that catches
            # the exception and moves on.
            plt.close(fig)

        # Read the image back while the temporary file still exists.
        img = load_img(png_path, target_size=(IMG_SIZE, IMG_SIZE))
        img = img_to_array(img) / 255.0

    return img

In [None]:
#Collect ALL Test Audio Files
# os.walk yields entries in a filesystem-dependent order; sorting makes the
# order of predictions (and which file is "first") reproducible across runs.
audio_files = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk(TEST_DATA_PATH)
    for name in names
    if name.lower().endswith(".wav")  # case-insensitive extension match
)

print("Total test audio files:", len(audio_files))

Total test audio files: 2874


In [None]:
#Run Multilabel Predictions (CORE STEP)
results = []        # one dict per successfully processed file
skipped_files = []  # (path, error repr) for every file that failed

for path in tqdm(audio_files):
    try:
        audio, sr = preprocess_audio(path)
        img = audio_to_mel_image(audio, sr)

        # Add a leading batch axis for the model, then take the single row
        # of per-label scores back out.
        pred = model.predict(img[np.newaxis, ...], verbose=0)[0]

        # Multilabel decision: keep every label whose score reaches THRESHOLD.
        predicted_labels = [
            INSTRUMENTS[i]
            for i, score in enumerate(pred)
            if score >= THRESHOLD
        ]

        results.append({
            "file": os.path.basename(path),
            "predicted_instruments": ", ".join(predicted_labels),
            "raw_scores": pred.tolist()
        })

    except Exception as e:
        # Best-effort run: one bad file must not abort the whole batch, but
        # keep a record so missing rows in the output can be explained.
        skipped_files.append((path, repr(e)))
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipped: {path} {e}")

print(f"Predicted {len(results)} files; skipped {len(skipped_files)}.")

100%|██████████| 2874/2874 [14:09<00:00,  3.38it/s]


In [None]:
#Save Predictions to CSV (FINAL OUTPUT)
df_results = pd.DataFrame(results)

# Note: "raw_scores" holds a Python list per row, so it is written to the CSV
# as the list's text representation.
df_results.to_csv("instrunet_multilabel_test_predictions.csv", index=False)
print("Saved predictions to instrunet_multilabel_test_predictions.csv")

# Preview goes last: a notebook only renders the final expression of a cell.
df_results.head()

Saved predictions to instrunet_multilabel_test_predictions.csv


In [None]:
# Show the file name, thresholded labels and rounded scores of the first row
first_row = df_results.iloc[0]
rounded_scores = np.round(first_row["raw_scores"], 3)

print("File:", first_row["file"])
print("Predicted instruments:", first_row["predicted_instruments"])
print("Raw scores:", rounded_scores)

File: brian eno - apollo- atmospheres and soundtracks - 01 - under stars-2.wav
Predicted instruments: flute, trumpet, violin
Raw scores: [0.028 0.178 0.466 0.045 0.097 0.049 0.041 0.363 0.247 0.15  0.072]
