In [1]:
#Imports
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import load_model

In [2]:
# Load the trained multilabel model from its saved .keras file
MODEL_PATH = "/content/drive/MyDrive/multilabel_cnn_improved.keras"
model = load_model(MODEL_PATH)

In [3]:
# Parameters

# --- Audio ---
SR = 16000  # sample rate used when loading audio

# --- Segmentation ---
SEGMENT_DURATION = 2.0  # window length, in seconds
HOP_DURATION = 1.0      # step between window starts, in seconds (50% overlap)

# The same window and step, expressed in samples
SEGMENT_SAMPLES, HOP_SAMPLES = (
    int(seconds * SR) for seconds in (SEGMENT_DURATION, HOP_DURATION)
)

# --- Spectrogram ---
N_MELS = 128      # number of mel bands
HOP_LENGTH = 512  # hop length passed to the mel-spectrogram computation
IMG_SIZE = 128    # side length of the square spectrogram image

# --- Decision ---
THRESHOLD = 0.5   # a label is predicted when its probability is >= this value

In [4]:
#Audio Segmentation Function
def segment_audio(audio, segment_samples, hop_samples):
    """Split a 1-D audio signal into fixed-length, possibly overlapping windows.

    Args:
        audio: 1-D sequence of samples.
        segment_samples: Length of every returned segment, in samples.
        hop_samples: Offset between the starts of consecutive segments,
            in samples.

    Returns:
        A list of segments, each exactly ``segment_samples`` long.
        * Empty input gives an empty list.
        * A non-empty clip shorter than one window gives a single segment,
          zero-padded at the end, so short clips still receive a prediction.
        * Otherwise windows start at 0, hop, 2*hop, ...; trailing samples that
          do not fill a whole window are dropped.

    Raises:
        ValueError: If ``segment_samples`` or ``hop_samples`` is not positive.
    """
    if segment_samples <= 0 or hop_samples <= 0:
        raise ValueError(
            "segment_samples and hop_samples must be positive, got "
            f"segment_samples={segment_samples}, hop_samples={hop_samples}"
        )

    total = len(audio)
    if total == 0:
        return []

    if total < segment_samples:
        # Too short for even one full window: pad with trailing zeros instead
        # of silently returning no segments at all.
        padded = np.pad(np.asarray(audio), (0, segment_samples - total))
        return [padded]

    last_start = total - segment_samples
    return [
        audio[start:start + segment_samples]
        for start in range(0, last_start + 1, hop_samples)
    ]

In [5]:
#Segment → Mel Spectrogram
def segment_to_mel(segment, sr):
    """Convert one audio segment into a min-max scaled, 3-channel mel image.

    Args:
        segment: Audio samples of a single segment.
        sr: Sample rate of ``segment``.

    Returns:
        NumPy array of shape (IMG_SIZE, IMG_SIZE, 3): the dB-scaled mel
        spectrogram, resized, copied across three channels and min-max scaled.
    """
    power_spec = librosa.feature.melspectrogram(
        y=segment, sr=sr, n_mels=N_MELS, hop_length=HOP_LENGTH
    )

    # Decibels relative to this segment's own peak power
    db_spec = librosa.power_to_db(power_spec, ref=np.max)

    # Add a channel axis so the spectrogram can be resized as an image
    target_shape = (IMG_SIZE, IMG_SIZE)
    resized = tf.image.resize(db_spec[..., np.newaxis], target_shape).numpy()

    # Replicate the single channel three times
    three_channel = np.repeat(resized, 3, axis=-1)

    # Min-max scaling; the epsilon guards against a constant image
    lowest = three_channel.min()
    highest = three_channel.max()
    return (three_channel - lowest) / (highest - lowest + 1e-6)

In [6]:
#Segment-wise Prediction (NO AGGREGATION)
def predict_segments(audio_path):
    """Run the model on every segment of an audio file, without aggregation.

    The file is loaded as mono at ``SR``, cut into windows with
    ``segment_audio``, and each window is converted with ``segment_to_mel``.
    All windows are scored in a single batched ``model.predict`` call rather
    than one call per segment, which avoids the per-call overhead of invoking
    ``predict`` inside a Python loop.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        ``(segments, predictions)`` where ``segments`` is the list of audio
        windows and ``predictions`` is a list of
        ``(segment_index, probabilities, labels)`` tuples; ``labels`` is
        ``probabilities >= THRESHOLD`` cast to int. Both lists are empty when
        the audio yields no segments.
    """
    audio, sr = librosa.load(audio_path, sr=SR, mono=True)

    segments = segment_audio(audio, SEGMENT_SAMPLES, HOP_SAMPLES)
    if not segments:
        # Nothing to score; np.stack would fail on an empty list.
        return segments, []

    # One image per segment, stacked along a new leading batch axis
    batch = np.stack([segment_to_mel(seg, sr) for seg in segments])

    # NOTE(review): assumes the model has a single output with one row of
    # label probabilities per input image — same assumption as the original
    # per-segment `model.predict(...)[0]`.
    batch_probs = model.predict(batch, verbose=0)

    predictions = [
        (i, probs, (probs >= THRESHOLD).astype(int))
        for i, probs in enumerate(batch_probs)
    ]

    return segments, predictions

In [7]:
#TEMPORAL MAX POOLING (AGGREGATION)
def temporal_max_pooling(predictions, threshold=None):
    """Aggregate segment predictions into one clip-level prediction.

    For every label, the clip-level probability is the maximum of that
    label's probability over all segments.

    Args:
        predictions: Iterable of ``(segment_index, probabilities, labels)``
            tuples; only ``probabilities`` is used.
        threshold: Probability at or above which a label is predicted.
            Defaults to the notebook-level ``THRESHOLD`` when ``None``.

    Returns:
        ``(pooled_probs, pooled_preds)``: the per-label maximum probabilities
        and the corresponding 0/1 integer labels.

    Raises:
        ValueError: If ``predictions`` is empty (e.g. the clip produced no
            segments), since there is nothing to pool.
    """
    if threshold is None:
        threshold = THRESHOLD

    segment_probs = [probs for _, probs, _ in predictions]
    if not segment_probs:
        raise ValueError(
            "temporal_max_pooling received no segment predictions; "
            "the audio may be shorter than one segment"
        )

    pooled_probs = np.max(np.array(segment_probs), axis=0)
    pooled_preds = (pooled_probs >= threshold).astype(int)
    return pooled_probs, pooled_preds

In [8]:
# Test audio: the clip passed to predict_segments when the pipeline is run
TEST_AUDIO = "/content/drive/MyDrive/irmas_multilabel_audio/mix_0.wav"

In [9]:
# Run the pipeline: segment the test clip and score every segment
segments, predictions = predict_segments(TEST_AUDIO)
# Report how many segments were scored (one prediction tuple per segment)
print(f"\nTotal segments: {len(predictions)}")


Total segments: 4


In [10]:
# Output without aggregation: show each segment's result on its own
print("\nWITHOUT AGGREGATION (Segment-wise predictions)\n")

# Each entry is (segment index, label probabilities, thresholded 0/1 labels)
for i, probs, preds in predictions:
    print(f"Segment {i}")
    # Probabilities are rounded to 3 decimals for display only
    print("Probabilities:", np.round(probs, 3))
    print("Predicted labels:", preds)
    print("-" * 50)


WITHOUT AGGREGATION (Segment-wise predictions)

Segment 0
Probabilities: [0.041 0.057 0.072 0.029 0.073 0.019 0.052 0.474 0.752 0.082 0.011]
Predicted labels: [0 0 0 0 0 0 0 0 1 0 0]
--------------------------------------------------
Segment 1
Probabilities: [0.048 0.056 0.081 0.035 0.089 0.017 0.059 0.48  0.714 0.089 0.015]
Predicted labels: [0 0 0 0 0 0 0 0 1 0 0]
--------------------------------------------------
Segment 2
Probabilities: [0.059 0.141 0.12  0.071 0.043 0.055 0.112 0.431 0.866 0.13  0.014]
Predicted labels: [0 0 0 0 0 0 0 0 1 0 0]
--------------------------------------------------
Segment 3
Probabilities: [0.49  0.489 0.491 0.491 0.492 0.49  0.487 0.493 0.493 0.487 0.49 ]
Predicted labels: [0 0 0 0 0 0 0 0 0 0 0]
--------------------------------------------------


In [11]:
# Output with max-pooling aggregation: one clip-level result
# Per-label maximum over all segment probabilities, then thresholded
agg_probs, agg_preds = temporal_max_pooling(predictions)

print("\nWITH TEMPORAL MAX POOLING (Clip-level prediction)\n")
# Probabilities are rounded to 3 decimals for display only
print("Aggregated probabilities:", np.round(agg_probs, 3))
print("Aggregated predicted labels:", agg_preds)


WITH TEMPORAL MAX POOLING (Clip-level prediction)

Aggregated probabilities: [0.49  0.489 0.491 0.491 0.492 0.49  0.487 0.493 0.866 0.487 0.49 ]
Aggregated predicted labels: [0 0 0 0 0 0 0 0 1 0 0]


In [12]:
# Reflection (what changed, is it worth it?)
print("\n=== REFLECTION ===")
print(f"Segment-level outputs before aggregation: {len(predictions)}")
print("Clip-level outputs after aggregation: 1")

# NOTE(review): the branch below depends only on the number of segments; the
# printed message is not derived from the segment or aggregated predictions.
if len(predictions) > 1:
    print("Reflection: Aggregation is worthwhile as it converts multiple unstable segment predictions into a single stable clip-level decision.")
else:
    print("Reflection: Aggregation has limited impact due to few segments.")


=== REFLECTION ===
Segment-level outputs before aggregation: 4
Clip-level outputs after aggregation: 1
Reflection: Aggregation is worthwhile as it converts multiple unstable segment predictions into a single stable clip-level decision.


# **Conclusion**
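Applying temporal max pooling collapsed the four segment-wise predictions into a single clip-level output. Strong evidence from informative segments was preserved: the detected class (index 8) keeps its peak probability of 0.866 from segment 2, and segment 3, which on its own predicted no label, did not suppress the detection. One caveat is visible in the outputs above: segment 3 produced near-uniform probabilities (about 0.49) for every class, and max pooling carries those values into the clip-level probabilities of the other ten classes, leaving them just below the 0.5 threshold. Aggregation is therefore a worthwhile addition that needs no retraining of the model, but its robustness should be confirmed on more clips than this single example.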