In [14]:
import os
import sys
import logging
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
from datetime import datetime
from collections import Counter
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Custom stdout filter to suppress MediaPipe logs
class MediaPipeFilter:
    """File-like wrapper around ``sys.stdout`` that drops noisy MediaPipe/GL lines.

    Any message containing one of ``NOISE_MARKERS`` is discarded; everything
    else is forwarded unchanged to the wrapped stream (``self.terminal``).

    NOTE(review): the outputs recorded in this notebook still show the EGL /
    GL-version / feedback-manager lines, so they presumably reach the output
    through a path other than Python's ``sys.stdout`` -- confirm whether this
    filter is effective for them.
    """

    # Substrings that mark a message as log noise to be dropped.
    NOISE_MARKERS = ('EGL', 'GL version', 'NVIDIA', 'Feedback manager')

    def __init__(self):
        stream = sys.stdout
        # Re-running the notebook cell would otherwise wrap a filter in another
        # filter (one extra layer per run). Unwrap until the real stream is found.
        # The check is by class name because re-running the cell also re-creates
        # the class object, which would defeat isinstance().
        while type(stream).__name__ == 'MediaPipeFilter' and hasattr(stream, 'terminal'):
            stream = stream.terminal
        self.terminal = stream

    def write(self, message):
        """Forward ``message`` unless it contains a noise marker.

        Returns the number of characters accepted, as file objects do; dropped
        messages report their full length so callers treat them as written.
        """
        if any(marker in message for marker in self.NOISE_MARKERS):
            return len(message)
        written = self.terminal.write(message)
        return len(message) if written is None else written

    def flush(self):
        """Flush the wrapped stream."""
        self.terminal.flush()

    def __getattr__(self, name):
        # Only reached when normal lookup fails: delegate the rest of the stream
        # interface (isatty, encoding, fileno, ...) to the wrapped stream.
        if name == 'terminal':
            # Avoid infinite recursion if the instance was never initialised.
            raise AttributeError(name)
        return getattr(self.terminal, name)

# Route everything printed from here on through the filtering wrapper.
sys.stdout = MediaPipeFilter()

# Environment switches intended to quieten the native libraries' logging.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'MEDIAPIPE_DISABLE_LOG': '1',
})

# Turn off Python's logging module for every level up to and including CRITICAL.
logging.disable(level=logging.CRITICAL)


In [16]:
# Download the YAMNet model once; later runs reuse the local copy.
if not os.path.exists('yamnet.tflite'):
    print("Downloading YAMNet model...")
    import urllib.request
    # Fetch into a temporary name and rename only after the transfer finished,
    # so an interrupted download never leaves a truncated 'yamnet.tflite' that
    # the existence check above would accept as complete on the next run.
    partial_path = 'yamnet.tflite.part'
    try:
        urllib.request.urlretrieve('https://storage.googleapis.com/mediapipe-models/audio_classifier/yamnet/float32/1/yamnet.tflite', partial_path)
        os.replace(partial_path, 'yamnet.tflite')
    finally:
        # After a successful rename nothing is left to delete; after a failure
        # this removes the incomplete file.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Import MediaPipe
from mediapipe.tasks import python
from mediapipe.tasks.python import audio
from mediapipe.tasks.python.components import containers


In [17]:
# Define paths
# The dataset root defaults to the original location but can be redirected by
# setting the BIRDCLEF_DATA_DIR environment variable before running the
# notebook, so the code does not have to be edited on another machine.
base_dir = os.environ.get("BIRDCLEF_DATA_DIR", "/data/birdclef/birdclef-2025")
train_audio_dir = os.path.join(base_dir, "train_audio")
train_csv_path = os.path.join(base_dir, "train.csv")
taxonomy_path = os.path.join(base_dir, "taxonomy.csv")

# Create results directory (JSON metrics, checkpoints and the pickled ensemble go here)
results_dir = 'training_results/yamnet_training_results_boost_deeper'
os.makedirs(results_dir, exist_ok=True)

# Load metadata
print("Loading metadata...")
train_df = pd.read_csv(train_csv_path)
taxonomy_df = pd.read_csv(taxonomy_path)

In [18]:
# Give every distinct primary_label an integer class index, numbered in the
# order the labels come back from unique(), and keep the reverse lookup too.
unique_labels = train_df['primary_label'].unique()
idx_to_label = dict(enumerate(unique_labels))
label_to_idx = {species: class_idx for class_idx, species in idx_to_label.items()}

print(f"Total number of species: {len(unique_labels)}")

In [19]:
# Function to load audio without librosa (to avoid numba dependency)
def load_audio_file(file_path, target_sr=16000):
    """Read an audio file as a mono waveform at ``target_sr`` Hz, without librosa.

    Args:
        file_path: Path handed to ``soundfile.read``.
        target_sr: Sample rate (Hz) of the returned waveform.

    Returns:
        A tuple ``(audio_data, target_sr)``. Multi-channel input is averaged
        across channels; input at a different rate is resampled with
        ``scipy.signal.resample``. If reading or resampling raises, the error is
        printed and ``audio_data`` is an EMPTY array.

    The failure value used to be 1000 zero samples. That is longer than the
    ``len(audio) < 100`` guard used by the feature extractor in this notebook,
    so unreadable files were silently classified as silence and counted as
    successes. An empty array lets that guard reject them.
    """
    import soundfile as sf

    try:
        # Load audio with soundfile
        audio_data, sr = sf.read(file_path)

        # Convert to mono by averaging the channel axis
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Resample if needed (simple FFT-based method; no numba required)
        if sr != target_sr:
            from scipy import signal
            duration = len(audio_data) / sr
            new_length = int(duration * target_sr)
            audio_data = signal.resample(audio_data, new_length)

        return audio_data, target_sr
    except Exception as e:
        print(f"Error loading {os.path.basename(file_path)}: {e}")
        # Empty waveform = "nothing usable"; callers can detect it by length.
        return np.zeros(0), target_sr

# Notebook-wide bookkeeping updated by the feature extractor
error_files = dict()                      # exception class name -> a few example file names
processed_files = successful_files = 0    # attempts made / attempts that produced features

In [20]:
# Function to extract YAMNet embeddings
def _segment_categories(segment_result):
    """Return the category objects carried by one entry of a classification result.

    Handles entries that expose ``categories`` (or ``classification_list``)
    directly, and entries that nest them one level deeper under
    ``classifications[i].categories``.
    """
    if hasattr(segment_result, 'categories'):
        return list(segment_result.categories)
    if hasattr(segment_result, 'classification_list'):
        return list(segment_result.classification_list)
    if hasattr(segment_result, 'classifications'):
        # Per MediaPipe's Python API, classify() returns a list of per-segment
        # results whose categories live under .classifications[i].categories.
        # Without this branch every such segment contributed an all-zero vector.
        nested = []
        for head in segment_result.classifications:
            nested.extend(getattr(head, 'categories', None) or [])
        return nested
    return []


def extract_yamnet_embeddings(audio_path, max_segments=10):
    """Turn one audio file into a 521-dimensional YAMNet class-score vector.

    The file is loaded at 16 kHz and classified with the MediaPipe audio
    classifier built from ``yamnet.tflite``. The scores of the returned
    categories are written into a length-521 vector per segment, and the
    segment vectors are averaged.

    Args:
        audio_path: Path of the audio file to process.
        max_segments: Despite its name, caps how many categories are read from
            each segment (kept for backward compatibility). With
            ``max_results=5`` below, the default of 10 never truncates anything.

    Returns:
        ``numpy.ndarray`` of shape ``(521,)``. All zeros when processing fails
        or when the result has an unrecognised structure.

    Side effects:
        Increments the global ``processed_files`` on every call and
        ``successful_files`` when a feature vector was produced. On failure,
        records up to three example file names per exception type in the
        global ``error_files``.

    NOTE(review): a new classifier is created for every file. Reusing one
    instance would likely be much faster -- confirm it is safe before changing.
    """
    global processed_files, successful_files
    processed_files += 1

    try:
        # Load and preprocess audio
        audio_data, sample_rate = load_audio_file(audio_path, target_sr=16000)

        if len(audio_data) < 100:  # Skip very short or empty audio
            raise ValueError("Audio too short")

        # Create classifier options
        base_options = python.BaseOptions(model_asset_path='yamnet.tflite')
        options = audio.AudioClassifierOptions(
            base_options=base_options,
            max_results=5,
            running_mode=audio.RunningMode.AUDIO_CLIPS
        )

        # Create the classifier
        with audio.AudioClassifier.create_from_options(options) as classifier:
            # Convert to AudioData format
            audio_data_obj = containers.AudioData.create_from_array(
                audio_data.astype(float), sample_rate)

            # Classify audio
            result = classifier.classify(audio_data_obj)

            features = np.zeros(521)  # YAMNet has 521 classes

            # The result is either a single object with .classifications or a
            # list with one entry per segment.
            if hasattr(result, 'classifications'):
                classifications_list = result.classifications
            elif isinstance(result, list):
                classifications_list = result
            else:
                # Unknown structure: return zeros (not counted as a success)
                return features

            all_segment_features = []
            for segment_result in classifications_list:
                segment_features = np.zeros(521)

                for i, category in enumerate(_segment_categories(segment_result)):
                    if i >= max_segments:
                        break
                    # Fall back to the position / a small constant score when
                    # the category object lacks the expected attributes.
                    class_idx = int(category.index) if hasattr(category, 'index') else i
                    score = category.score if hasattr(category, 'score') else 0.1

                    if 0 <= class_idx < 521:
                        segment_features[class_idx] = score

                all_segment_features.append(segment_features)

            # If we have one or more segments, average them
            if all_segment_features:
                features = np.mean(all_segment_features, axis=0)

            successful_files += 1
            return features
    except Exception as e:
        error_type = str(type(e).__name__)
        examples = error_files.setdefault(error_type, [])
        if len(examples) < 3:  # Collect at most 3 examples per error type
            examples.append(os.path.basename(audio_path))
        return np.zeros(521)  # Return zeros if processing fails


In [21]:
# Create a custom dataset
class BirdCLEFDataset(Dataset):
    """Dataset pairing audio file paths with integer class labels.

    Feature vectors are computed on first access with
    ``extract_yamnet_embeddings`` and memoised in ``self.cache`` (keyed by file
    path), so later accesses of the same item skip the extraction.
    """

    def __init__(self, file_paths, labels):
        self.file_paths, self.labels = file_paths, labels
        self.cache = {}  # file path -> feature vector

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as float32 and int64 tensors."""
        path = self.file_paths[idx]

        try:
            vector = self.cache[path]
        except KeyError:
            # First visit: compute the features and remember them.
            vector = self.cache[path] = extract_yamnet_embeddings(path)

        target = self.labels[idx]
        return (
            torch.tensor(vector, dtype=torch.float32),
            torch.tensor(target, dtype=torch.long),
        )

# Create a simple classifier model
class BirdCLEFClassifier(nn.Module):
    """Feed-forward classifier: input_dim -> 512 -> 256 -> 128 -> output_dim.

    Each of the three hidden stages is Linear -> BatchNorm1d -> LeakyReLU(0.1)
    -> Dropout(dropout_rate); the final Linear layer produces raw logits.
    Submodules are named ``fc{n}``, ``bn{n}``, ``leaky{n}``, ``dropout{n}``
    (n = 1..3) plus ``fc4``.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.5):
        super().__init__()
        widths = (input_dim, 512, 256, 128)
        # Build the hidden stages in order; assigning through setattr registers
        # each submodule under the same attribute name as a plain assignment would.
        for stage, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f'fc{stage}', nn.Linear(n_in, n_out))
            setattr(self, f'bn{stage}', nn.BatchNorm1d(n_out))
            setattr(self, f'leaky{stage}', nn.LeakyReLU(0.1))
            setattr(self, f'dropout{stage}', nn.Dropout(dropout_rate))

        # Output layer (logits)
        self.fc4 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, output_dim)`` logits."""
        x = self.dropout1(self.leaky1(self.bn1(self.fc1(x))))
        x = self.dropout2(self.leaky2(self.bn2(self.fc2(x))))
        x = self.dropout3(self.leaky3(self.bn3(self.fc3(x))))
        return self.fc4(x)
        
# Prepare dataset
print("Preparing dataset...")
file_paths, labels = [], []

# Work on the full training table. For a quicker experiment, substitute a
# random subset instead, e.g.:
#   sampled_df = train_df.sample(500, random_state=42)
sampled_df = train_df

In [22]:
# Start from empty lists so that re-running this cell rebuilds the file list
# instead of appending every file a second time.
file_paths = []
labels = []

# Keep only the rows whose audio file actually exists on disk.
for species_id, filename in tqdm(
        zip(sampled_df['primary_label'], sampled_df['filename']),
        total=len(sampled_df), desc="Finding files"):
    file_path = os.path.join(train_audio_dir, filename)

    if os.path.exists(file_path):
        file_paths.append(file_path)
        labels.append(label_to_idx[species_id])

print(f"Found {len(file_paths)} valid files")

# Summarise the class balance (nothing is filtered out here)
label_counts = Counter(labels)
print("Class distribution:")
for label, count in label_counts.most_common(5):
    print(f"  {idx_to_label[label]}: {count} files")
remaining_classes = len(label_counts) - 5
if remaining_classes > 0:  # avoid printing a zero or negative count
    print(f"  ... and {remaining_classes} more classes")

Finding files: 100%|██████████| 28564/28564 [00:00<00:00, 28748.42it/s]


In [23]:
# Hold out 20% of the files for validation; the fixed seed keeps the split repeatable.
(train_files, val_files,
 train_labels, val_labels) = train_test_split(
    file_paths,
    labels,
    random_state=42,
    test_size=0.2,
)

print(f"Training samples: {len(train_files)}")
print(f"Validation samples: {len(val_files)}")


In [24]:
# Wrap each split in a dataset; features are extracted lazily on first access.
val_dataset = BirdCLEFDataset(val_files, val_labels)
train_dataset = BirdCLEFDataset(train_files, train_labels)

# Sanity check: pull the first training item and show its basic statistics.
print("Checking sample data...")
features, label = train_dataset[0]
print(f"Features shape: {features.shape}")
print(f"Features min/max/mean: {features.min():.4f}/{features.max():.4f}/{features.mean():.4f}")
print(f"Label: {label} (corresponds to: {idx_to_label[label.item()]})")


I0000 00:00:1744082131.131059    9494 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744082131.168631    9666 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1744082131.174793    9669 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [25]:

# Count how many of the first (up to) ten training items came back as an
# all-zero feature vector.
zero_features = 0
for i in range(min(10, len(train_dataset))):
    features, _ = train_dataset[i]
    zero_features += int(bool((features == 0).all()))
print(f"Zero feature vectors in first 10 samples: {zero_features}")


I0000 00:00:1744082136.706399    9494 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744082136.740631    9673 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1744082136.749369    9674 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1744082137.269345    9494 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744082137.299176    9680 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1744082137.304718    9683 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1744082138.013760    9494 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5

In [26]:
# Train on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Batches of 16; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Model dimensions
input_dim = 521  # YAMNet has 521 classes
output_dim = len(unique_labels)

# Inverse-frequency class weights: total / (number of classes present * class count)
class_counts = Counter(train_labels)
total_samples = len(train_labels)
class_weights = {}
for cls, count in class_counts.items():
    class_weights[cls] = total_samples / (len(class_counts) * count)

# One weight per output class; classes absent from the training split get 1.0.
weight_tensor = torch.tensor(
    [class_weights.get(i, 1.0) for i in range(output_dim)],
    dtype=torch.float32,
).to(device)

# Show the weights of the first few classes
print("Class weight examples:")
for i in range(min(5, output_dim)):
    species_name = idx_to_label.get(i, "Unknown")
    weight = weight_tensor[i].item()
    count = class_counts.get(i, 0)
    print(f"  {species_name}: weight={weight:.2f} (count={count})")

# Model, class-weighted loss and optimizer
model = BirdCLEFClassifier(input_dim, output_dim)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = optim.Adam(model.parameters(), lr=0.01)  # Higher learning rate

# Per-epoch history; each key maps to a list that grows by one entry per epoch.
results = {
    key: []
    for key in ('epochs', 'train_loss', 'val_loss', 'accuracy', 'timestamp')
}

In [None]:
# Training loop
num_epochs = 20  # Increase epochs


model.to(device)

print("\nStarting training...")
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    # Training pass with a compact progress bar.
    # The batch variables are deliberately NOT called `features` / `labels`:
    # `labels` is also the notebook-level list of per-file class indices, and
    # overwriting it with a tensor breaks earlier cells if they are re-run.
    progress = tqdm(train_loader,
                    desc=f"Epoch {epoch+1}/{num_epochs}",
                    leave=False, ncols=80)
    for batch_idx, (batch_features, batch_labels) in enumerate(progress):
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Print diagnostic info for the first few batches of the first epoch
        if epoch == 0 and batch_idx < 3:
            print(f"\nDiagnostic - Batch {batch_idx}:")
            print(f"Features stats: min={batch_features.min().item():.4f}, max={batch_features.max().item():.4f}")
            print(f"Outputs: {outputs[0][:5].tolist()}")  # First 5 logits of first sample
            print(f"Label counts in batch: {torch.bincount(batch_labels, minlength=5)[:5]}")

    # Validation pass (no gradient tracking, eval-mode BatchNorm/Dropout)
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features = batch_features.to(device)
            batch_labels = batch_labels.to(device)
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item()

            # Softmax is monotonic, so the arg-max of the logits is the predicted class.
            preds = outputs.argmax(dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch_labels.cpu().tolist())

    # Average the summed batch losses
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)

    # Accuracy over the whole validation set
    correct = sum(1 for x, y in zip(all_preds, all_labels) if x == y)
    total = len(all_labels)
    accuracy = correct / total if total > 0 else 0

    # Store results
    results['epochs'].append(epoch + 1)
    results['train_loss'].append(float(train_loss))
    results['val_loss'].append(float(val_loss))
    results['accuracy'].append(float(accuracy))
    results['timestamp'].append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Accuracy={accuracy:.4f}, Correct={correct}/{total}")

    # Print prediction distribution; very few predicted classes signals collapse
    pred_counts = Counter(all_preds)
    print(f"Prediction distribution: {len(pred_counts)} classes predicted")
    if len(pred_counts) < 5:
        print("WARNING: Model is only predicting a few classes!")
        for pred, count in pred_counts.most_common():
            print(f"  Class {idx_to_label[pred]}: {count} predictions ({count/len(all_preds)*100:.1f}%)")

    # Save results every 5 epochs and after the final epoch
    if (epoch + 1) % 5 == 0 or epoch == num_epochs - 1:
        # Save the metric history so far to JSON
        results_file = os.path.join(results_dir, f'results_epoch_{epoch+1}.json')
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=4)

        # Save model checkpoint (weights, optimizer state and this epoch's metrics)
        model_file = os.path.join(results_dir, f'model_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'accuracy': accuracy
        }, model_file)

        print(f"Saved results and model checkpoint at epoch {epoch+1}")

print("Training complete!")
# print(f"Processed {processed_files} files, {successful_files} successful ({successful_files/processed_files*100:.1f}%)")

# # Print error summary
# print("\nError Summary:")
# for error_type, files in error_files.items():
#     print(f"{error_type}: {len(files)} files. Examples: {', '.join(files[:3])}")


print("\nTraining boosted ensemble...")


def _collect_features(dataset, files, targets):
    """Stack the feature vector of every file in ``files`` next to its label.

    Vectors already present in ``dataset.cache`` are reused; missing ones are
    extracted with ``extract_yamnet_embeddings`` and stored in the cache.

    Returns:
        ``(features, targets)`` as NumPy arrays, one row / entry per file.
    """
    rows = []
    for file_path in tqdm(files, total=len(files)):
        if file_path not in dataset.cache:
            dataset.cache[file_path] = extract_yamnet_embeddings(file_path)
        rows.append(dataset.cache[file_path])
    return np.array(rows), np.array(targets)


# Extract features for all training samples
print("Extracting features for boosting...")
train_features, train_targets = _collect_features(train_dataset, train_files, train_labels)

# Do the same for validation set
print("Extracting validation features...")
val_features, val_targets = _collect_features(val_dataset, val_files, val_labels)

# Train AdaBoost ensemble
print("Training AdaBoost ensemble...")
base_estimator = DecisionTreeClassifier(max_depth=3)
boosted_model = AdaBoostClassifier(
    # The depth-3 tree was previously created but never handed to AdaBoost,
    # which therefore fell back to its default base learner. It is passed
    # positionally because the keyword name differs between scikit-learn
    # releases (`base_estimator` in older ones, `estimator` in newer ones).
    base_estimator,
    n_estimators=5,
    learning_rate=0.1,
    random_state=42
)

boosted_model.fit(train_features, train_targets)

# Evaluate boosted model: fraction of validation files predicted correctly
boost_preds = boosted_model.predict(val_features)
boost_accuracy = float(np.mean(boost_preds == val_targets))
print(f"Boosted model accuracy: {boost_accuracy:.4f}")

# Save the boosted model
# NOTE: only load this pickle from a trusted location; unpickling runs arbitrary code.
import pickle
with open(os.path.join(results_dir, 'boosted_model.pkl'), 'wb') as f:
    pickle.dump(boosted_model, f)

Epoch 1/20:   0%|                                      | 0/1429 [00:00<?, ?it/s]I0000 00:00:1744082157.646911    9494 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744082157.672139    9744 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1744082157.678592    9746 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1744082157.940656    9494 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1744082157.969458    9751 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 NVIDIA 570.124.06), renderer: NVIDIA GeForce RTX 4070 Ti/PCIe/SSE2
W0000 00:00:1744082157.975045    9753 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1744082158.22625