In [27]:
import os
import numpy as np
import pandas as pd
from scipy.io import wavfile
import librosa
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import sys
from mediapipe.tasks import python
from mediapipe.tasks.python import audio
from mediapipe.tasks.python.components import containers
import logging
import json
from datetime import datetime
import tensorflow as tf

In [23]:
# Quieten native and Python-side logging from TensorFlow / MediaPipe.
# NOTE(review): the import cell pulls in tensorflow before this cell runs;
# confirm these environment variables are still honoured when set this late.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',   # Only show fatal errors
    'MEDIAPIPE_DISABLE_GPU': '0',  # '0' leaves the GPU path enabled
})

# Raise the threshold of the Python loggers to ERROR.
for noisy_logger in ('mediapipe', 'tensorflow'):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

In [24]:
# Environment diagnostics: which NumPy build is loaded and what the import path is.
for caption, detail in (
    ("NumPy version", np.__version__),
    ("NumPy location", np.__file__),
    ("Python path", sys.path),
):
    print(f"{caption}: {detail}")

NumPy version: 1.26.4
NumPy location: /home/regina/Desktop/birdclef2025/.venv/lib/python3.10/site-packages/numpy/__init__.py
Python path: ['/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/regina/Desktop/birdclef2025/.venv/lib/python3.10/site-packages', '/home/regina/Desktop/birdclef2025/.venv/lib/python3.10/site-packages/setuptools/_vendor', '/tmp/tmpx5qi7fj4']


In [25]:
# Output folder for per-epoch metrics and checkpoints; created if missing.
results_dir = 'yamnet_training_results'
os.makedirs(results_dir, exist_ok=True)

# One independent list per tracked series; the training loop appends one
# entry to each list per epoch.
results = {
    series: []
    for series in ('epochs', 'train_loss', 'val_loss', 'accuracy', 'timestamp')
}

In [26]:
# TensorFlow build / device diagnostics. Each probe is evaluated lazily so the
# queries run in the same order as the lines they print.
for caption, probe in (
    ("TensorFlow version:", lambda: tf.__version__),
    ("GPU available:", lambda: tf.config.list_physical_devices('GPU')),
    ("Built with CUDA:", lambda: tf.test.is_built_with_cuda()),
):
    print(caption, probe())

TensorFlow version: 2.19.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Built with CUDA: True


In [6]:
# Fetch the float32 YAMNet TFLite model into the working directory as
# `yamnet.tflite` (-O sets the output file name, -q silences wget's output).
# The feature extractor below loads the model from this relative path.
!wget -O yamnet.tflite -q https://storage.googleapis.com/mediapipe-models/audio_classifier/yamnet/float32/1/yamnet.tflite

In [28]:
def load_audio_file(file_path, target_sr=16000):
    """Read an audio file as a mono signal at ``target_sr`` Hz.

    Decoding uses ``soundfile`` and resampling uses ``scipy.signal.resample``,
    so librosa is not involved.

    Args:
        file_path: Path passed straight to ``soundfile.read``.
        target_sr: Sample rate, in Hz, the returned signal should have.

    Returns:
        Tuple ``(samples, target_sr)``. Input with more than one dimension is
        averaged over axis 1; the signal is resampled only when the file's
        own sample rate differs from ``target_sr``.
    """
    import soundfile as sf

    samples, native_sr = sf.read(file_path)

    # Down-mix: average over the second axis when one is present.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Already at the requested rate: nothing more to do.
    if native_sr == target_sr:
        return samples, target_sr

    # Simple resampling to the sample count that keeps the duration unchanged
    # (not as good as librosa but works without numba).
    from scipy import signal
    n_out = int((len(samples) / native_sr) * target_sr)
    return signal.resample(samples, n_out), target_sr

In [29]:

# Dataset locations, all derived from one base directory
base_dir = "/data/birdclef/birdclef-2025"
train_audio_dir, train_csv_path, taxonomy_path = (
    os.path.join(base_dir, leaf)
    for leaf in ("train_audio", "train.csv", "taxonomy.csv")
)

# Metadata tables
train_df = pd.read_csv(train_csv_path)
taxonomy_df = pd.read_csv(taxonomy_path)

# Two-way lookup between species label and class index; indices follow the
# order in which labels are returned by `unique()`.
unique_labels = train_df['primary_label'].unique()
idx_to_label = dict(enumerate(unique_labels))
label_to_idx = {label: idx for idx, label in idx_to_label.items()}

print(f"Total number of species: {len(unique_labels)}")

_YAMNET_NUM_CLASSES = 521  # YAMNet has 521 classes


def _iter_classification_heads(result):
    """Yield every object in ``result`` that may carry a ``categories`` list.

    ``result`` may be a single result object or a list of them (one per audio
    window). Items exposing ``.classifications`` are unwrapped into their
    individual heads; anything else is yielded unchanged.
    """
    clip_results = result if isinstance(result, (list, tuple)) else [result]
    for clip_result in clip_results:
        heads = getattr(clip_result, 'classifications', None)
        if heads is None:
            # Not a result wrapper: treat the item itself as a category holder.
            yield clip_result
        else:
            yield from heads


def _scores_from_categories(categories, max_categories):
    """Scatter category scores into a dense vector indexed by class id."""
    vector = np.zeros(_YAMNET_NUM_CLASSES)
    for position, category in enumerate(categories):
        if position >= max_categories:
            break
        class_idx = getattr(category, 'index', None)
        # Fall back to the list position when the category has no usable index.
        class_idx = position if class_idx is None else int(class_idx)
        score = getattr(category, 'score', None)
        if score is None:
            score = 0.1  # Fallback
        if 0 <= class_idx < _YAMNET_NUM_CLASSES:
            vector[class_idx] = score
    return vector


def extract_yamnet_embeddings(audio_path, max_segments=10):
    """Build a 521-dim YAMNet class-score vector for one audio file.

    The file is loaded as 16 kHz mono, classified with the MediaPipe YAMNet
    audio classifier in AUDIO_CLIPS mode, and the category scores of every
    returned classification head are written into a dense vector indexed by
    class id. The vectors are then averaged.

    Note that the classifier is created with ``max_results=5``, so each head
    contributes at most five non-zero scores; these are classifier scores,
    not the network's internal embedding.

    Args:
        audio_path: Path of the audio file to process.
        max_segments: Despite its name, the maximum number of categories read
            from each classification head (name kept for compatibility).

    Returns:
        ``np.ndarray`` of shape ``(521,)``. An all-zero vector is returned
        when the classifier yields nothing or when any step raises; the error
        is printed rather than propagated.
    """
    try:
        # Load and preprocess audio
        audio_data, sample_rate = load_audio_file(audio_path, target_sr=16000)

        # Create classifier options
        base_options = python.BaseOptions(model_asset_path='yamnet.tflite')
        options = audio.AudioClassifierOptions(
            base_options=base_options,
            max_results=5,
            running_mode=audio.RunningMode.AUDIO_CLIPS
        )

        with audio.AudioClassifier.create_from_options(options) as classifier:
            audio_data_obj = containers.AudioData.create_from_array(
                audio_data.astype(float), sample_rate)
            result = classifier.classify(audio_data_obj)

            # classify() returns one result per audio window and each result
            # nests its categories under `.classifications`; looking for
            # `.categories` on the result itself finds nothing and would leave
            # every feature at zero.
            per_head_features = []
            for head in _iter_classification_heads(result):
                categories = getattr(head, 'categories', None)
                if categories is None:
                    categories = getattr(head, 'classification_list', None) or []
                per_head_features.append(
                    _scores_from_categories(categories, max_segments))

        if not per_head_features:
            return np.zeros(_YAMNET_NUM_CLASSES)

        # Average over all windows / heads
        return np.mean(per_head_features, axis=0)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort a run.
        print(f"Error processing {audio_path}: {e}")
        return np.zeros(_YAMNET_NUM_CLASSES)  # Return zeros if processing fails



Total number of species: 206


In [31]:
# Create a custom dataset
class BirdCLEFDataset(Dataset):
    """Map-style dataset yielding ``(feature vector, class index)`` pairs.

    Features come from ``extract_yamnet_embeddings`` and are computed the
    first time an index is requested, then kept in memory. The extraction is
    deterministic per file, so later epochs reuse the stored vector instead
    of running the audio classifier again.

    Args:
        file_paths: Sequence of audio file paths.
        labels: Sequence of integer class indices aligned with ``file_paths``.

    Note:
        A vector produced by a failed extraction (all zeros) is cached like
        any other; build a new dataset to retry such files.
    """

    def __init__(self, file_paths, labels):
        self.file_paths = file_paths
        self.labels = labels
        # idx -> feature array as returned by extract_yamnet_embeddings
        self._feature_cache = {}

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(float32 feature tensor, int64 label tensor)`` for ``idx``."""
        features = self._feature_cache.get(idx)
        if features is None:
            features = extract_yamnet_embeddings(self.file_paths[idx])
            self._feature_cache[idx] = features
        label = self.labels[idx]
        # torch.tensor copies, so callers never alias the cached array.
        return torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# Create a simple classifier model
class BirdCLEFClassifier(nn.Module):
    """Single-hidden-layer perceptron: Linear -> ReLU -> Dropout(0.5) -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are registered in forward order.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return unnormalised class logits for the batch ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)

# Build the (audio path, class index) lists from a small random sample of train.csv
print("Preparing dataset...")

# Keep the subset small while experimenting
sample_size = 100  # Adjust this based on your needs
sampled_df = train_df.sample(sample_size, random_state=42)

file_paths = []
labels = []

sampled_rows = zip(sampled_df['primary_label'], sampled_df['filename'])
for species_id, filename in tqdm(sampled_rows, total=len(sampled_df)):
    file_path = os.path.join(train_audio_dir, filename)

    # Rows whose audio file is not present on disk are skipped.
    if not os.path.exists(file_path):
        continue

    file_paths.append(file_path)
    labels.append(label_to_idx[species_id])


Preparing dataset...


100%|██████████| 100/100 [00:00<00:00, 20314.35it/s]


In [32]:
# Hold out 20% of the samples for validation (plain random split, no stratification)
train_files, val_files, train_labels, val_labels = train_test_split(
    file_paths,
    labels,
    test_size=0.2,
    random_state=42,
)

for split_name, split_files in (("Training", train_files), ("Validation", val_files)):
    print(f"{split_name} samples: {len(split_files)}")

# Wrap each split in a dataset; only the training loader shuffles
train_dataset = BirdCLEFDataset(train_files, train_labels)
val_dataset = BirdCLEFDataset(val_files, val_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

# Network dimensions: 521 inputs (YAMNet has 521 classes), one output per species
input_dim, hidden_dim, output_dim = 521, 256, len(unique_labels)

# Model, loss function, and optimizer
model = BirdCLEFClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


Training samples: 80
Validation samples: 20


In [None]:
# Training loop: one optimisation pass and one validation pass per epoch.
# Metrics are appended to `results`; metrics and a checkpoint are written to
# `results_dir` every 5 epochs and after the last epoch.
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    train_loss_sum = 0.0

    # Loop variables are named batch_* so they do not overwrite the
    # module-level `labels` list built during dataset preparation.
    # Less verbose progress bar
    for batch_features, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}",
                leave=False, ncols=80):
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item()

    # Mean of the per-batch losses, so the value does not scale with the
    # number of batches and is comparable with val_loss.
    train_loss = train_loss_sum / max(len(train_loader), 1)

    # ---- Validation pass ----
    model.eval()
    val_loss_sum = 0.0
    all_preds = []
    all_labels = []
    all_probs = []  # Store probabilities for each class

    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)

            # Validation loss was previously never computed, which made every
            # later use of `val_loss` raise NameError.
            val_loss_sum += criterion(outputs, batch_labels).item()

            # Get predicted class
            probs = torch.softmax(outputs, dim=1)
            _, preds = torch.max(probs, 1)

            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    val_loss = val_loss_sum / max(len(val_loader), 1)

    # Calculate metrics (0.0 when the validation set is empty)
    if all_labels:
        accuracy = sum(1 for x, y in zip(all_preds, all_labels) if x == y) / len(all_labels)
    else:
        accuracy = 0.0

    # Store results
    results['epochs'].append(epoch + 1)
    results['train_loss'].append(float(train_loss))
    results['val_loss'].append(float(val_loss))
    results['accuracy'].append(float(accuracy))
    results['timestamp'].append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Accuracy={accuracy:.4f}")

    # Save results every 5 epochs and at the final epoch
    if (epoch + 1) % 5 == 0 or epoch == num_epochs - 1:
        # Save results to JSON
        results_file = os.path.join(results_dir, f'results_epoch_{epoch+1}.json')
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=4)

        # Save model checkpoint
        model_file = os.path.join(results_dir, f'model_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'accuracy': accuracy
        }, model_file)

        print(f"Saved results and model checkpoint at epoch {epoch+1}")
    # Only print the full report at the end
    if epoch == num_epochs - 1:
        print("\nFinal Classification Report:")
        print(classification_report(all_labels, all_preds, zero_division=0))
print("Training complete!")

Using device: cuda


Epoch 1/5:   0%|                                          | 0/5 [00:00<?, ?it/s]I0000 00:00:1743624334.177553   53352 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743624334.212082   57926 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1743624334.217410   57929 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1743624334.779262   53352 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743624334.806182   57934 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1743624334.817132   57937 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1743624335.12339

Epoch 1/5
Train Loss: 5.2145, Val Loss: 5.2528
Epoch 1/5: Train Loss=5.2145, Val Loss=5.2528, Accuracy=0.0000
Accuracy: 0.0


Epoch 2/5:   0%|                                          | 0/5 [00:00<?, ?it/s]I0000 00:00:1743624376.920027   53352 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743624376.945240   58726 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1743624376.953636   58730 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1743624377.039520   53352 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743624377.062768   58734 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1743624377.072785   58737 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1743624377.53921

Epoch 2/5
Train Loss: 5.1796, Val Loss: 5.2336
Epoch 2/5: Train Loss=5.1796, Val Loss=5.2336, Accuracy=0.0000
Accuracy: 0.0


Epoch 3/5:   0%|                                          | 0/5 [00:00<?, ?it/s]I0000 00:00:1743624419.514431   53352 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743624419.537767   59530 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1743624419.542903   59532 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1743624420.780231   53352 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1743624420.806078   59539 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1743624420.811114   59541 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1743624421.03096

KeyboardInterrupt: 