# **Inference Pipeline (Top-K aggregation)**

In [1]:
# Imports & Model Loading
# =====================================================
import numpy as np
import librosa
import tensorflow as tf
import json
import os
from tensorflow.keras.models import load_model

# Absolute Google Drive location of the trained model.
# NOTE(review): this only resolves when Drive is mounted at /content/drive;
# the mount step is not shown in this notebook — confirm it runs beforehand.
MODEL_PATH = "/content/drive/MyDrive/multilabel_cnn_improved.keras"
# Loaded once here; `model` is then read as a global by predict_segments.
model = load_model(MODEL_PATH)
print("Model loaded successfully")

Model loaded successfully


In [2]:
# Configuration (MUST match training)
# =====================================================
SR = 16000  # sample rate (Hz) passed to librosa.load and to the mel-spectrogram

SEGMENT_DURATION = 2.0   # seconds per analysis window
HOP_DURATION = 1.0       # seconds between consecutive window starts

# The same window and hop expressed in samples (consumed by segment_audio).
SEGMENT_SAMPLES = int(SEGMENT_DURATION * SR)
HOP_SAMPLES = int(HOP_DURATION * SR)

N_MELS = 128  # number of mel bands requested from librosa
HOP_LENGTH = 512  # hop_length (in samples) passed to the mel-spectrogram
IMG_SIZE = 128  # each spectrogram is resized to IMG_SIZE x IMG_SIZE

TOP_K = 3   # Top-K aggregation parameter (default k of aggregate_predictions_topk)

In [3]:
# Instruments
# =====================================================
# Class codes. Position i in this list is paired with probability index i
# in apply_thresholds and in the per-segment JSON export.
# NOTE(review): the order presumably mirrors the label order used during
# training — confirm before editing or reordering.
INSTRUMENTS = [
    "cel", "cla", "flu", "gac", "gel",
    "org", "pia", "sax", "tru", "vio", "voi"
]

In [4]:
# Per-instrument thresholds (tuned)
# =====================================================
# Decision cut-off per instrument code: apply_thresholds marks an instrument
# as predicted when its aggregated probability is >= the value stored here.
# Every code listed in INSTRUMENTS needs an entry, because the lookup uses
# plain [] indexing and raises KeyError otherwise.
PER_CLASS_THRESHOLDS = {
    "cel": 0.30,
    "cla": 0.30,
    "flu": 0.30,
    "gac": 0.40,
    "gel": 0.40,
    "org": 0.30,
    "pia": 0.35,
    "sax": 0.35,
    "tru": 0.35,
    "vio": 0.30,
    "voi": 0.35
}

In [5]:
# Audio Loading
# =====================================================
def load_and_preprocess_audio(audio_path):
    """Load an audio file as mono at SR Hz and peak-normalize it.

    Parameters
    ----------
    audio_path : str
        Path of the audio file handed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        1-D waveform scaled so that its largest absolute sample is 1.0.
        An all-zero (silent) signal is returned unscaled.

    Raises
    ------
    ValueError
        If the decoded file contains no samples.
    """
    audio, _ = librosa.load(audio_path, sr=SR, mono=True)

    # An empty signal would otherwise fail inside np.max with an opaque
    # "zero-size array" message; report the offending file instead.
    if audio.size == 0:
        raise ValueError(f"No audio samples could be read from '{audio_path}'")

    # Peak normalization (peak computed once; skipped for pure silence to
    # avoid dividing by zero).
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    return audio

In [6]:
# Segmentation
# =====================================================
def segment_audio(audio):
    """Cut a waveform into fixed-length, overlapping windows.

    Windows are SEGMENT_SAMPLES long and start every HOP_SAMPLES samples.
    Trailing samples that do not fill a complete window are dropped.

    A non-empty clip shorter than one window is zero-padded at the end to
    exactly one window, so that short recordings still produce a prediction
    instead of yielding no segments at all.
    NOTE(review): confirm zero-padding matches how short clips were handled
    during training.

    Parameters
    ----------
    audio : array-like
        1-D sequence of audio samples.

    Returns
    -------
    (list, list)
        ``segments`` (one array per window) and ``times`` (start time of each
        window in seconds). Both lists are empty when ``audio`` is empty.
    """
    total = len(audio)
    if total == 0:
        return [], []

    if total < SEGMENT_SAMPLES:
        # Right-pad with silence up to a single full window.
        padded = np.pad(audio, (0, SEGMENT_SAMPLES - total))
        return [padded], [0.0]

    starts = range(0, total - SEGMENT_SAMPLES + 1, HOP_SAMPLES)
    segments = [audio[s:s + SEGMENT_SAMPLES] for s in starts]
    times = [s / SR for s in starts]

    return segments, times

In [7]:
# Mel Feature Extraction
# =====================================================
def segment_to_mel(segment):
    """Turn one audio segment into a 3-channel mel-spectrogram image.

    The segment is converted to a mel power spectrogram, expressed in dB
    relative to its own maximum, resized to IMG_SIZE x IMG_SIZE, copied into
    three identical channels and min-max scaled.

    Returns an array of shape (IMG_SIZE, IMG_SIZE, 3).
    """
    power_spec = librosa.feature.melspectrogram(
        y=segment, sr=SR, n_mels=N_MELS, hop_length=HOP_LENGTH
    )
    log_spec = librosa.power_to_db(power_spec, ref=np.max)

    # tf.image.resize needs a channel axis, so add one before resizing.
    resized = tf.image.resize(
        log_spec[..., np.newaxis], (IMG_SIZE, IMG_SIZE)
    ).numpy()

    # Replicate the single channel three times along the last axis.
    image = np.repeat(resized, 3, axis=-1)

    # Min-max scale; the epsilon keeps a constant image from dividing by zero.
    lowest = image.min()
    highest = image.max()
    return (image - lowest) / (highest - lowest + 1e-6)

In [8]:
# Segment-wise Prediction
# =====================================================
def predict_segments(segments, batch_size=32):
    """Run the global ``model`` over every segment and collect its outputs.

    Segments are converted with ``segment_to_mel`` and sent to the model in
    batches rather than one ``model.predict`` call per segment, which avoids
    the per-call overhead of ``predict``. Outputs may differ from the
    one-at-a-time results only at floating-point rounding level.

    Parameters
    ----------
    segments : sequence
        Audio segments as produced by ``segment_audio``.
    batch_size : int, optional
        Number of segments converted and predicted together (default 32).
        Bounds peak memory for long recordings.

    Returns
    -------
    numpy.ndarray
        One row of model outputs per segment. For an empty ``segments`` the
        result has shape ``(0, len(INSTRUMENTS))`` so that it is still
        two-dimensional for downstream code.
    """
    if len(segments) == 0:
        return np.empty((0, len(INSTRUMENTS)), dtype=np.float32)

    outputs = []
    for start in range(0, len(segments), batch_size):
        chunk = segments[start:start + batch_size]
        batch = np.stack([segment_to_mel(seg) for seg in chunk])
        outputs.append(model.predict(batch, batch_size=batch_size, verbose=0))

    return np.concatenate(outputs, axis=0)

In [9]:
# Top-K Aggregation
# =====================================================
def aggregate_predictions_topk(segment_probs, k=None, mode="mean"):
    """Aggregate segment-level probabilities into one score per class.

    For every class (column) the ``k`` highest segment probabilities are
    selected and then combined.

    Parameters
    ----------
    segment_probs : array-like, shape (num_segments, num_classes)
        Per-segment class probabilities.
    k : int, optional
        How many of the highest segment scores to keep per class. When
        omitted, the module-level ``TOP_K`` is read at call time, so re-running
        the configuration cell takes effect without redefining this function.
        Values larger than ``num_segments`` use every segment.
    mode : str, optional
        ``"mean"`` averages the kept scores; any other value takes their
        maximum.

    Returns
    -------
    numpy.ndarray, shape (num_classes,)
        Aggregated probability per class.

    Raises
    ------
    ValueError
        If ``segment_probs`` is not two-dimensional, contains no segments,
        or ``k`` is smaller than 1.
    """
    if k is None:
        k = TOP_K

    segment_probs = np.asarray(segment_probs)
    if segment_probs.ndim != 2:
        raise ValueError(
            "segment_probs must have shape (num_segments, num_classes), "
            f"got shape {segment_probs.shape}"
        )

    num_segments = segment_probs.shape[0]
    if num_segments == 0:
        raise ValueError("segment_probs contains no segments to aggregate")

    # A slice of [-0:] would silently select every segment and a negative k
    # would drop values, so reject both explicitly.
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    k = min(int(k), num_segments)

    # Sort each class column in ascending order and keep its last k rows.
    topk_vals = np.sort(segment_probs, axis=0)[-k:]

    if mode == "mean":
        return topk_vals.mean(axis=0)
    return topk_vals.max(axis=0)


In [10]:
# Apply Per-Instrument Thresholds
# =====================================================
def apply_thresholds(probs):
    """Build the per-instrument decision dictionary.

    ``probs`` is indexed by position, with index ``i`` belonging to
    ``INSTRUMENTS[i]``. Each instrument maps to its probability (as a float)
    and a 0/1 flag telling whether that probability reaches the instrument's
    entry in ``PER_CLASS_THRESHOLDS``.
    """
    def _decision(idx, name):
        # Look up the cut-off first, then read the score for this class.
        cutoff = PER_CLASS_THRESHOLDS[name]
        return {
            "probability": float(probs[idx]),
            "predicted": int(probs[idx] >= cutoff)
        }

    return {name: _decision(idx, name) for idx, name in enumerate(INSTRUMENTS)}

In [11]:
# Upload Audio File
# =====================================================
from google.colab import files

# Opens the Colab upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()

# A cancelled upload leaves nothing to index; fail with an actionable message
# instead of a bare IndexError.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded; re-run this cell and choose an audio file."
    )

# Only the first uploaded file is analysed.
audio_path = next(iter(uploaded))

Saving 01 - Inolvidable-9.wav to 01 - Inolvidable-9.wav


In [12]:
# RUN INFERENCE
# =====================================================
audio = load_and_preprocess_audio(audio_path)
segments, times = segment_audio(audio)

# Without any segment the aggregation step has nothing to work on; stop here
# with a clear message rather than failing deep inside the pipeline.
if len(segments) == 0:
    raise ValueError(
        f"'{audio_path}' produced no analysis segments "
        "(the audio is empty or too short)."
    )

# Per-segment class probabilities, then one aggregated score per class.
segment_probs = predict_segments(segments)
agg_probs = aggregate_predictions_topk(segment_probs)

In [13]:
# JSON Output
# =====================================================
# One record per analysed window: its position in time and the probability
# the model assigned to every instrument.
segments_json = [
    {
        "segment_index": i,
        "start_time_sec": round(t, 2),
        "end_time_sec": round(t + SEGMENT_DURATION, 2),
        "probabilities": {
            inst: float(probs[j]) for j, inst in enumerate(INSTRUMENTS)
        }
    }
    for i, (t, probs) in enumerate(zip(times, segment_probs))
]

final_output = {
    "metadata": {
        "task": "Music Instrument Recognition",
        "model_type": "CNN",
        "classification": "Multilabel",
        "input_features": "Mel-spectrogram",
        "num_classes": len(INSTRUMENTS)
    },
    "model_configuration": {
        "segment_duration_sec": SEGMENT_DURATION,
        "hop_duration_sec": HOP_DURATION,
        "aggregation_method": f"Top-K Aggregation (K={TOP_K}, mean)",
        "threshold_type": "Per-instrument (tuned)"
    },
    "audio_file": audio_path,
    "sample_rate": SR,
    "segments": segments_json,
    "aggregated_prediction": apply_thresholds(agg_probs)
}

# Derive the output name from the file stem. A plain str.replace(".wav", ...)
# leaves the path untouched for other extensions (e.g. ".mp3", ".WAV"), which
# would make the JSON overwrite the uploaded audio file.
json_path = os.path.splitext(audio_path)[0] + "_inference_topk.json"
with open(json_path, "w") as f:
    json.dump(final_output, f, indent=4)

print("Inference completed")
print("Saved:", json_path)

Inference completed
Saved: 01 - Inolvidable-9_inference_topk.json


# **Conclusion**
Although the ground truth labels indicate the presence of piano and voice in the audio, the proposed Top-K aggregation–based inference pipeline did not predict these instruments. This behavior can be attributed to the weak-label nature of the dataset and the relatively low energy of piano and voice compared to dominant instruments such as trumpet and saxophone.

The results indicate that the model is more sensitive to instruments with strong and consistent spectral characteristics, while softer or background instruments are under-detected. This limitation persists even after applying Top-K aggregation, suggesting that inference-level aggregation alone cannot fully mitigate biases introduced during training due to class imbalance and dominant timbral features.

Nevertheless, the Top-K aggregation strategy provides more stable and robust predictions than temporal max pooling, because it reduces the influence of spurious segment-level activations. Therefore, the Top-K–based inference pipeline is considered appropriate for this project, with clearly documented limitations.