# Imports

In [None]:
import os
import gc

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import h5py
from pathlib import Path
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K

from model_evaluation import *

2025-05-07 19:32:27.034125: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746646347.051874    9824 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746646347.057222    9824 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746646347.073855    9824 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746646347.073870    9824 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746646347.073873    9824 computation_placer.cc:177] computation placer alr

In [2]:
# TensorFlow / joblib CPU budget
# Caps this notebook at 75% of the machine's logical cores: loky worker
# processes via LOKY_MAX_CPU_COUNT, and TensorFlow's inter-op / intra-op
# thread pools at half of that budget each.
import multiprocessing

# Total number of CPU cores reported for this machine
total_cores = multiprocessing.cpu_count()

# 75% of the available cores (rounded down), but never fewer than one
cores_to_use = max(1, int(total_cores * 0.75))

# Read by loky (joblib's process backend) when it sizes its worker pool
os.environ["LOKY_MAX_CPU_COUNT"] = str(cores_to_use)

print(f"Using {cores_to_use} out of {total_cores} available CPU cores (75%)")

# Each TensorFlow pool gets half of the budget. Clamp to >= 1: a value of 0
# tells TensorFlow to pick the thread count itself, which would silently
# remove the cap when only a single core is budgeted.
tf_threads_per_pool = max(1, cores_to_use // 2)

# NOTE(review): TensorFlow only accepts these settings before its runtime has
# been initialized, so this cell should run right after the imports — confirm
# when re-running it in a live kernel.
tf.config.threading.set_inter_op_parallelism_threads(tf_threads_per_pool)
tf.config.threading.set_intra_op_parallelism_threads(tf_threads_per_pool)

Using 12 out of 16 available CPU cores (75%)


# Data Settings

In [None]:
# Species codes for the birds available in the HDF5 spectrogram dataset,
# each followed by its common name.
ALL_BIRD_SPECIES = [
    "amecro",  # American Crow
    "amerob",  # American Robin
    "bewwre",  # Bewick's Wren
    "bkcchi",  # Black-capped Chickadee
    "daejun",  # Dark-eyed Junco
    "houfin",  # House Finch
    "houspa",  # House Sparrow
    "norfli",  # Northern Flicker
    "rewbla",  # Red-winged Blackbird
    "sonspa",  # Song Sparrow
    "spotow",  # Spotted Towhee
    "whcspa",  # White-crowned Sparrow
]

# Location of the HDF5 file that stores the bird spectrograms
SPEC_FILE_PATH = Path("../data/bird_spectrograms.hdf5")

# --- Training hyperparameters ---
# Upper bound on epochs per fit (the training loops also use early stopping)
EPOCHS = 50
# Number of cross-validation folds (choose 3, 4, or 5)
NUM_CV_FOLDS = 3
# Batch sizes swept by the training loops (one full CV run per value)
BATCH_SIZE = [64, 128, 32]
# Optimizer learning rates swept by the training loops
LEARNING_RATE = [0.0001, 0.0005, 0.001]

# Column names of the per-fold metrics table, in the order the values are stored
EVALUATION_METRICS = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]

# Load Data
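We need to transpose the data from (128, 517, sample_size) into (sample_size, 128, 517) and then add a trailing channel axis, giving (sample_size, 128, 517, 1). Keras convolution layers default to channels-last input of shape (N, H, W, C), unlike the (N, C, H, W) layout used by some other frameworks.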

Here:
- N = number of samples
- C = number of channels (1 for grayscale spectrograms)
- H = height (128)
- W = width (517)

In [None]:
# Collect one spectrogram array and one label vector per dataset in the file.
# The integer label of a dataset is its position in the file's key order.
spectrogram_chunks = []
label_chunks = []

with h5py.File(SPEC_FILE_PATH, 'r') as spec_file:
    for class_index, dataset_key in enumerate(spec_file.keys()):
        # Move the last axis (samples) to the front:
        # (128, 517, N) -> (N, 128, 517)
        samples_first = spec_file[dataset_key][:].transpose(2, 0, 1)
        sample_count = samples_first.shape[0]

        spectrogram_chunks.append(samples_first)
        label_chunks.append(np.full((sample_count,), class_index, dtype=np.int32))

# Stack all datasets along the sample axis and append a channel axis:
# (N_total, 128, 517) -> (N_total, 128, 517, 1)
X = np.concatenate(spectrogram_chunks, axis=0)[..., np.newaxis]
y = np.concatenate(label_chunks)  # (N_total,)

# The per-dataset pieces are no longer needed once concatenated
del spectrogram_chunks, label_chunks

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

(893, 128, 517, 1)
(893, 1)


# CNN Model Architecture

This convolutional neural network (CNN) performs **multi-class classification** (a softmax over `num_classes` classes) on spectrogram inputs of shape `(128, 517, 1)`, where:
- `128` is the number of frequency bins (height),
- `517` is the number of time steps (width),
- `1` is the channel dimension (grayscale).

---

### General Steps for CNN
According to a very long discussion on the order of layers, BN, DropOut, and Pooling on [stackoverflow](https://stackoverflow.com/questions/39691902/ordering-of-batch-normalization-and-dropout) <br>
Generally, this layering is the consensus. <br>

Conv → BatchNorm → ReLU → Dropout → MaxPool

# CNN Model #1

In [None]:
def _conv_block(x, filters, dropout_rate):
    """Apply one Conv -> BatchNorm -> ReLU -> Dropout -> MaxPool stage to `x`.

    Args:
        x: Input tensor of the stage.
        filters: Number of 3x3 convolution filters.
        dropout_rate: Dropout rate applied after the activation.

    Returns:
        The output tensor after 2x2 max pooling with stride 2.
    """
    x = layers.Conv2D(filters, kernel_size=3, strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.Dropout(dropout_rate)(x)
    return layers.MaxPooling2D(pool_size=2, strides=2)(x)


def _build_bird_cnn(conv_filters, num_classes, input_shape, dropout_rate, learning_rate):
    """Build and compile a CNN with one conv stage per entry of `conv_filters`.

    The conv stages are followed by global average pooling, dropout, and a
    softmax Dense layer with `num_classes` units. The model is compiled with
    Adam, sparse categorical cross-entropy (integer class labels), and an
    accuracy metric.
    """
    inputs = layers.Input(shape=input_shape)

    x = inputs
    for filters in conv_filters:
        x = _conv_block(x, filters, dropout_rate)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model


def build_bird_binary_cnn_v1(num_classes, input_shape=(128, 517, 1), dropout_rate=0.2, learning_rate=0.0001):
    """Build and compile the 2-stage CNN (32 and 64 filters).

    Despite "binary" in the name, the output layer is a softmax over
    `num_classes` units trained with sparse categorical cross-entropy, so the
    model handles any number of classes given integer labels.

    Args:
        num_classes: Number of units in the softmax output layer (required).
        input_shape: Shape of one input sample, without the batch axis.
        dropout_rate: Dropout rate used after every conv stage and before the
            output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras model.
    """
    return _build_bird_cnn((32, 64), num_classes, input_shape, dropout_rate, learning_rate)


def build_bird_binary_cnn_v2(num_classes, input_shape=(128, 517, 1), dropout_rate=0.2, learning_rate=0.0001):
    """Build and compile the 4-stage CNN (32, 64, 128 and 256 filters).

    Same head, loss and optimizer as `build_bird_binary_cnn_v1`; only the
    number of conv stages differs. Despite "binary" in the name, the output is
    a softmax over `num_classes` units.

    Args:
        num_classes: Number of units in the softmax output layer (required).
        input_shape: Shape of one input sample, without the batch axis.
        dropout_rate: Dropout rate used after every conv stage and before the
            output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras model.
    """
    return _build_bird_cnn((32, 64, 128, 256), num_classes, input_shape, dropout_rate, learning_rate)

# Graph Plotting

In [18]:
def _save_metric_plot(epochs_range, train_values, val_values, metric_name, out_file):
    """Draw one training-vs-validation curve, save it to `out_file`, show it, then close it."""
    plt.figure(figsize=(8, 6))
    plt.plot(epochs_range, train_values, label=f'Training {metric_name}')
    plt.plot(epochs_range, val_values, label=f'Validation {metric_name}')
    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.grid(True)
    plt.savefig(out_file)
    plt.show()
    # Close explicitly: this is called once per fold, so figures would pile up otherwise
    plt.close()


def build_plot_training_history(history, path, tag):
    """
    Plots training and validation accuracy and loss from one training history.

    Two figures are produced, saved as `{path}/{tag}_acc.png` and
    `{path}/{tag}_loss.png`, and displayed. The output directory is created
    if it does not exist yet.

    Args:
        history: Training history object whose `.history` dict contains the
            keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        path: Directory the two PNG files are written to.
        tag: File-name prefix identifying the run (e.g. the fold tag).
    """
    # Extract values from history object
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs_range = range(1, len(acc) + 1)

    # savefig does not create missing directories, so make sure the target exists
    if path:
        os.makedirs(path, exist_ok=True)

    _save_metric_plot(epochs_range, acc, val_acc, 'Accuracy', f'{path}/{tag}_acc.png')
    _save_metric_plot(epochs_range, loss, val_loss, 'Loss', f'{path}/{tag}_loss.png')

# Model Training and Evaluation Loop

For CNNs, these four essential steps are important. <br>
I have explained more about backpropagation in this [Medium article](https://medium.com/data-science-collective/why-backpropagation-is-so-important-for-models-in-machine-learning-4736591b24b3)

- Forward pass: Feed input through the network to make a prediction.
- Loss calculation: It compares those predictions to the actual labels and computes the loss.
- Backpropagation: Use the chain rule to find how much each weight influenced the loss.
- Weight update: Apply the gradients using an optimizer like Adam.
---

## Reasoning behind K-Fold Cross-Validation
Since the dataset is relatively small, it may be best to go with just a KFold Cross Validation rather than doing a train-test split + cross-validation. <br>

That is the reason why I went with K-fold cross validation.

---

# 2 Layer Model Run
- 3 folds (stratified)
- Batch sizes = [64, 128, 32] (one full cross-validation run per value)
- Dropout = 0.2
- Learning rates = [0.0001, 0.0005, 0.001] (one full cross-validation run per value)
- 50 epochs with early stopping.

In [None]:
# Stratified K-Fold CV over the batch-size x learning-rate grid (2-layer model)
kfold = StratifiedKFold(n_splits=NUM_CV_FOLDS, shuffle=True, random_state=42)

# Width of the softmax output: one unit per distinct integer label in y
num_classes = len(np.unique(y))

# Plots and per-configuration CSV files are written here
result_path = "../output/multi-class_results"
os.makedirs(result_path, exist_ok=True)

for batch_size in BATCH_SIZE:
    for learning_rate in LEARNING_RATE:
        version_key_prefix = f"multi_2layer_{batch_size}_{learning_rate}_{EPOCHS}"
        # One row per fold, one column per entry of EVALUATION_METRICS
        metrics = pd.DataFrame(columns=EVALUATION_METRICS)

        for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
            # Identifies this fold in plot file names and in the metrics table
            fold_tag = f"{version_key_prefix}_fold_{fold + 1}"
            print(f"\nFold {fold + 1} | Train size: {len(train_idx)}, Test size: {len(test_idx)}")

            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            print(X_train.shape[1:])
            # Initialize model
            model = build_bird_binary_cnn_v1(
                num_classes=num_classes,
                input_shape=X_train.shape[1:],
                dropout_rate=0.2,
                learning_rate=learning_rate,
            )

            # Callbacks
            early_stopping = EarlyStopping(monitor="val_loss", patience=10, mode="min", restore_best_weights=True)
            reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6, verbose=1)

            # Train
            # NOTE(review): the held-out fold is also the validation set that drives
            # early stopping and LR reduction, so the metrics below are measured on
            # data that influenced training decisions.
            history = model.fit(
                X_train, y_train,
                validation_data=(X_test, y_test),
                epochs=EPOCHS,
                batch_size=batch_size,
                callbacks=[early_stopping, reduce_lr],
                verbose=1
            )

            # Optional: Training history visualization
            build_plot_training_history(history, result_path, fold_tag)

            # Evaluate
            y_probs = model.predict(X_test)
            y_pred = np.argmax(y_probs, axis=1)
            y_true = y_test.flatten()

            # Confusion Matrix
            cm = confusion_matrix(y_true, y_pred)
            print("Confusion Matrix:\n", cm)

            # Classification Report
            report = classification_report(y_true, y_pred, digits=4)
            print("Classification Report:\n", report)

            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred, average='macro')
            rec = recall_score(y_true, y_pred, average='macro')
            f1 = f1_score(y_true, y_pred, average='macro')

            # Same order as EVALUATION_METRICS
            metrics.loc[fold_tag] = [acc, prec, rec, f1]

            # Cleanup: release the Keras graph and the model before the next fold
            K.clear_session()
            del model
            gc.collect()

        # Save all folds' metrics for this configuration into a single CSV file
        result_subset = metrics.loc[[f"{version_key_prefix}_fold_{i+1}" for i in range(NUM_CV_FOLDS)]]
        output_path = f"{result_path}/2layer_{version_key_prefix}_results.csv"
        result_subset.to_csv(output_path, index_label="Fold")

# 4 Layer Model Run
- 3 folds (stratified)
- Batch sizes = [64, 128, 32] (one full cross-validation run per value)
- Dropout = 0.2
- Learning rates = [0.0001, 0.0005, 0.001] (one full cross-validation run per value)
- 50 epochs with early stopping.

In [None]:
# Stratified K-Fold CV over the batch-size x learning-rate grid (4-layer model)
kfold = StratifiedKFold(n_splits=NUM_CV_FOLDS, shuffle=True, random_state=42)

# Width of the softmax output: one unit per distinct integer label in y
num_classes = len(np.unique(y))

# Plots and per-configuration CSV files are written here
result_path = "../output/multi-class_results"
os.makedirs(result_path, exist_ok=True)

for batch_size in BATCH_SIZE:
    for learning_rate in LEARNING_RATE:
        version_key_prefix = f"multi_4layer_{batch_size}_{learning_rate}_{EPOCHS}"
        # One row per fold, one column per entry of EVALUATION_METRICS
        metrics = pd.DataFrame(columns=EVALUATION_METRICS)

        for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
            # Defined up front: the plotting call below needs it, and computing it
            # only after evaluation would reuse the previous fold's tag.
            fold_tag = f"{version_key_prefix}_fold_{fold + 1}"
            print(f"\nFold {fold + 1} | Train size: {len(train_idx)}, Test size: {len(test_idx)}")

            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            print(X_train.shape[1:])
            # Initialize model
            model = build_bird_binary_cnn_v2(
                num_classes=num_classes,
                input_shape=X_train.shape[1:],
                dropout_rate=0.2,
                learning_rate=learning_rate,
            )

            # Callbacks
            early_stopping = EarlyStopping(monitor="val_loss", patience=10, mode="min", restore_best_weights=True)
            reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6, verbose=1)

            # Train
            # NOTE(review): the held-out fold is also the validation set that drives
            # early stopping and LR reduction, so the metrics below are measured on
            # data that influenced training decisions.
            history = model.fit(
                X_train, y_train,
                validation_data=(X_test, y_test),
                epochs=EPOCHS,
                batch_size=batch_size,
                callbacks=[early_stopping, reduce_lr],
                verbose=1
            )

            # Optional: Training history visualization
            build_plot_training_history(history, result_path, fold_tag)

            # Evaluate
            y_probs = model.predict(X_test)
            y_pred = np.argmax(y_probs, axis=1)
            y_true = y_test.flatten()

            # Confusion Matrix
            cm = confusion_matrix(y_true, y_pred)
            print("Confusion Matrix:\n", cm)

            # Classification Report
            report = classification_report(y_true, y_pred, digits=4)
            print("Classification Report:\n", report)

            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred, average='macro')
            rec = recall_score(y_true, y_pred, average='macro')
            f1 = f1_score(y_true, y_pred, average='macro')

            # Same order as EVALUATION_METRICS
            metrics.loc[fold_tag] = [acc, prec, rec, f1]

            # Cleanup: release the Keras graph and the model before the next fold
            K.clear_session()
            del model
            gc.collect()

        # Save all folds' metrics for this configuration into a single CSV file
        result_subset = metrics.loc[[f"{version_key_prefix}_fold_{i+1}" for i in range(NUM_CV_FOLDS)]]
        output_path = f"{result_path}/4layer_{version_key_prefix}_results.csv"
        result_subset.to_csv(output_path, index_label="Fold")


Fold 1 | Train size: 595, Test size: 298
(128, 517, 1)
Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2s/step - accuracy: 0.5769 - loss: 0.7374 - val_accuracy: 0.2953 - val_loss: 1.1499 - learning_rate: 1.0000e-04
Epoch 2/50
[1m 4/19[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m35s[0m 2s/step - accuracy: 0.6947 - loss: 0.6040