Evaluation of the trained multilabel model

In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [2]:
# Paths & parameters
# All three artefacts sit under one Drive folder, so the common root is
# written once and the individual paths are derived from it.
# NOTE(review): "/content/drive/MyDrive" looks like a Colab Drive mount -
# confirm the drive is mounted before running the cells that read from it.
DRIVE_ROOT = "/content/drive/MyDrive"

MODEL_PATH = DRIVE_ROOT + "/irmas_multilabel_cnn.keras"  # model file passed to load_model
LABELS_CSV = DRIVE_ROOT + "/multilabel_labels.csv"       # label table read with pandas
MEL_DIR = DRIVE_ROOT + "/irmas_multilabel_mels"          # folder holding the .png inputs

# Height and width (in pixels) every image is resized to on load.
IMG_SIZE = 128

In [3]:
# Load the trained model
# Restore the saved Keras model from MODEL_PATH. The confirmation line is
# only printed when loading finished without raising.
model = tf.keras.models.load_model(filepath=MODEL_PATH)
print("✅ Model loaded successfully")

✅ Model loaded successfully


  saveable.load_own_variables(weights_store.get(inner_path))


In [4]:
# Load dataset (images + labels)
# Every CSV row names a clip in its "file" column; all remaining columns are
# treated as the entries of that clip's multilabel target vector.
labels_df = pd.read_csv(LABELS_CSV)

# Pick the label columns by name rather than by position, so the targets stay
# correct even if "file" is not the first column of the CSV.
label_columns = [col for col in labels_df.columns if col != "file"]
label_matrix = labels_df[label_columns].to_numpy(dtype="float32")

X, y = [], []
missing_images = []

for file_name, label_row in zip(labels_df["file"], label_matrix):
    # The image is looked up under the clip name with ".wav" swapped for ".png".
    img_path = os.path.join(MEL_DIR, file_name.replace(".wav", ".png"))
    if not os.path.exists(img_path):
        # Remember skipped rows so that a wrong MEL_DIR or an incomplete image
        # export is reported instead of silently shrinking the dataset.
        missing_images.append(img_path)
        continue

    img = load_img(img_path, target_size=(IMG_SIZE, IMG_SIZE))
    img = img_to_array(img) / 255.0  # divide by 255 (8-bit pixel range) to scale into [0, 1]

    X.append(img)
    y.append(label_row)

if missing_images:
    print(
        f"WARNING: skipped {len(missing_images)} of {len(labels_df)} rows "
        f"because the image was not found (first: {missing_images[0]})"
    )

if not X:
    # Without this check the empty arrays only fail later, in the split or
    # evaluation cells, with an error that does not point at the real cause.
    raise FileNotFoundError(
        f"No images listed in {LABELS_CSV} were found under {MEL_DIR}"
    )

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (1000, 128, 128, 3)
y shape: (1000, 11)


In [5]:
# Train / validation split (for evaluation)
# Hold out a fixed fraction of the samples with a fixed seed. Only X_val and
# y_val are consumed by the evaluation cells visible in this notebook.
# NOTE(review): this reproduces the hold-out set used during training only if
# the training run split identically ordered data with the same test_size and
# random_state - confirm, otherwise some of these samples were trained on.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

In [6]:
# Keras evaluation metrics
# Score the validation split with the loss and metrics the model was compiled
# with, then print each value under its label.
# NOTE(review): the unpacking assumes evaluate() yields exactly four values in
# the order loss, binary accuracy, precision, recall - confirm against the
# compile() call of the training run.
keras_scores = model.evaluate(X_val, y_val)
loss, binary_acc, precision, recall = keras_scores

print("\n=== KERAS EVALUATION ===")
for caption, score in (
    ("Loss:", loss),
    ("Binary Accuracy:", binary_acc),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(caption, score)

7/7 ━━━━━━━━━━━━━━━━━━━━ 2s 239ms/step - binary_accuracy: 0.7825 - loss: 0.4984 - precision_2: 0.7238 - recall_2: 0.0417

=== KERAS EVALUATION ===
Loss: 0.5006890296936035
Binary Accuracy: 0.7809090614318848
Precision: 0.7083333134651184
Recall: 0.03455284610390663


In [7]:
# Threshold-based evaluation (important for multilabel)
# Get the model's per-label scores for the validation split, turn them into
# 0/1 decisions with a fixed cut-off, and score those decisions with
# micro-averaged precision, recall and F1.
y_pred = model.predict(X_val)

THRESHOLD = 0.3
is_positive = y_pred >= THRESHOLD
y_pred_bin = is_positive.astype(int)

# The same averaging mode is shared by all three metrics.
micro_avg = {"average": "micro"}
precision_t = precision_score(y_val, y_pred_bin, **micro_avg)
recall_t = recall_score(y_val, y_pred_bin, **micro_avg)
f1_t = f1_score(y_val, y_pred_bin, **micro_avg)

print("\n=== THRESHOLD-BASED METRICS ===")
for caption, value in (
    ("Threshold:", THRESHOLD),
    ("Precision:", precision_t),
    ("Recall:", recall_t),
    ("F1-Score:", f1_t),
):
    print(caption, value)

7/7 ━━━━━━━━━━━━━━━━━━━━ 2s 245ms/step

=== THRESHOLD-BASED METRICS ===
Threshold: 0.3
Precision: 0.3582089552238806
Recall: 0.43902439024390244
F1-Score: 0.39452054794520547


In [8]:
# Test multiple thresholds
# Re-binarise the predictions already stored in y_pred at each candidate
# cut-off and print the micro-averaged F1 obtained with it.
# NOTE(review): in the recorded run the best F1 is at the lowest value of the
# grid, so the optimum may lie below 0.2 - consider extending the grid.
THRESHOLD_GRID = (0.2, 0.3, 0.4, 0.5)

for t in THRESHOLD_GRID:
    y_bin = np.where(y_pred >= t, 1, 0)
    f1 = f1_score(y_val, y_bin, average="micro")
    print(f"Threshold {t} → F1: {f1:.3f}")

Threshold 0.2 → F1: 0.421
Threshold 0.3 → F1: 0.395
Threshold 0.4 → F1: 0.204
Threshold 0.5 → F1: 0.066


In [10]:
# Final report at the selected threshold
# Binarise the stored predictions with the chosen cut-off and compute the
# micro-averaged precision, recall and F1 that are reported.
# Note: `precision` and `recall` are rebound here, replacing the values the
# Keras evaluation cell stored under the same names.
FINAL_THRESHOLD = 0.2
y_pred_bin = (y_pred >= FINAL_THRESHOLD).astype(int)

precision, recall, f1 = (
    metric(y_val, y_pred_bin, average="micro")
    for metric in (precision_score, recall_score, f1_score)
)

for caption, value in (
    ("Final Threshold:", FINAL_THRESHOLD),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(caption, value)

Final Threshold: 0.2
Precision: 0.2796780684104628
Recall: 0.8475609756097561
F1-score: 0.4205748865355522


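After the threshold analysis, 0.2 was selected as the final decision threshold and used for reporting performance, because it gave the highest micro-averaged F1 (0.421) of the four values tested (0.2, 0.3, 0.4, 0.5). Note that 0.2 is the lowest value that was tried, and that it was chosen on the same validation split used for reporting, so the reported scores are likely optimistic.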