In [12]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from collections import Counter
import json
from datetime import datetime
import pandas as pd

In [6]:
def extract_melspectrogram(audio, sr, n_mels=128, n_fft=2048, hop_length=512):
    """Return the log-scaled (dB) mel spectrogram of an audio signal.

    Args:
        audio: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``audio``.
        n_mels: Number of mel bands.
        n_fft: FFT window length.
        hop_length: Number of samples between successive frames.

    Returns:
        The mel power spectrogram converted to decibels, using the
        spectrogram's own maximum as the reference (``ref=np.max``).
    """
    # Gather the analysis settings once, then hand them to librosa.
    mel_kwargs = dict(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    power_spec = librosa.feature.melspectrogram(**mel_kwargs)

    # Power -> dB, relative to the loudest bin of this clip.
    return librosa.power_to_db(power_spec, ref=np.max)

# Let's visualize a spectrogram to understand what we're working with
def plot_spectrogram(spec, title=None, sr=22050, hop_length=512):
    """Display a mel spectrogram with a dB colour bar.

    Args:
        spec: 2-D spectrogram to draw (e.g. the output of
            ``extract_melspectrogram``).
        title: Optional figure title; omitted when falsy.
        sr: Sampling rate used to compute ``spec``; needed for correct
            time/frequency axis ticks.
        hop_length: Hop length used to compute ``spec``; needed for a correct
            time axis.
    """
    # `import librosa` alone does not load the display submodule on librosa
    # releases without lazy loading, so import it explicitly where it is used.
    import librosa.display

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(
        spec,
        sr=sr,
        hop_length=hop_length,
        x_axis='time',
        y_axis='mel',
        cmap='viridis',
    )
    plt.colorbar(format='%+2.0f dB')
    if title:
        plt.title(title)
    plt.tight_layout()
    plt.show()

In [7]:
class SpectrogramDataset(Dataset):
    """Dataset that turns audio files into fixed-size log-mel spectrogram tensors.

    Each item is a ``(spec, label)`` pair: ``spec`` is a float32 tensor of
    shape ``[1, n_mels, time]`` and ``label`` is a long tensor with the class
    index. Clips are zero-padded or trimmed to ``duration`` seconds before the
    spectrogram is computed, and successfully computed spectrograms are kept
    in an in-memory dict keyed by file path.

    Args:
        file_paths: Sequence of audio file paths.
        labels: Sequence of integer class indices, aligned with ``file_paths``.
        sr: Sampling rate the audio is loaded at.
        duration: Clip length in seconds (may be fractional).
        n_mels: Number of mel bands.
        n_fft: FFT window length.
        hop_length: Samples between successive frames.
        transform: Optional callable applied to the spectrogram array before
            it is converted to a tensor.
    """

    def __init__(self, file_paths, labels, sr=22050, duration=5, n_mels=128,
                 n_fft=2048, hop_length=512, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.duration = duration
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.transform = transform
        self.cache = {}  # Cache spectrograms to avoid recomputing

    def __len__(self):
        return len(self.file_paths)

    def _target_samples(self, sr):
        """Number of samples each clip is padded/trimmed to, always an int.

        A fractional ``duration`` would otherwise yield a float, which
        ``np.pad`` and slicing reject.
        """
        return int(round(self.duration * sr))

    def _compute_spectrogram(self, audio_path):
        """Load one file, force it to the target length and return its log-mel spectrogram."""
        audio, sr = librosa.load(audio_path, sr=self.sr)

        target_length = self._target_samples(sr)
        if len(audio) < target_length:
            # Pad with trailing zeros if too short
            audio = np.pad(audio, (0, target_length - len(audio)))
        else:
            # Trim if too long
            audio = audio[:target_length]

        return extract_melspectrogram(
            audio, sr, n_mels=self.n_mels,
            n_fft=self.n_fft, hop_length=self.hop_length
        )

    def _fallback_spectrogram(self):
        """All-zero spectrogram used when a file cannot be processed.

        The frame count is ``1 + samples // hop_length`` so that the placeholder
        batches together with real items (presumably matching librosa's
        centered framing -- confirm if STFT options change).
        """
        n_frames = 1 + self._target_samples(self.sr) // self.hop_length
        return np.zeros((self.n_mels, n_frames))

    def __getitem__(self, idx):
        audio_path = self.file_paths[idx]
        label = self.labels[idx]

        # Use cached spectrogram if available
        if audio_path in self.cache:
            spec = self.cache[audio_path]
        else:
            try:
                spec = self._compute_spectrogram(audio_path)
                # Cache for future use (failures are not cached, so they are retried)
                self.cache[audio_path] = spec
            except Exception as e:
                # Best effort: report the problem and substitute an empty
                # spectrogram instead of aborting the whole epoch.
                print(f"Error processing {audio_path}: {e}")
                spec = self._fallback_spectrogram()

        # Apply transformations if any
        if self.transform:
            spec = self.transform(spec)

        # Convert to tensor (add channel dimension)
        spec_tensor = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)  # Shape: [1, n_mels, time]
        label_tensor = torch.tensor(label, dtype=torch.long)

        return spec_tensor, label_tensor

In [8]:
class AudioCNN(nn.Module):
    """Four-block 2-D CNN classifier for single-channel spectrograms.

    Input is expected as ``[batch, 1, n_mels, n_frames]``; the output is
    ``[batch, num_classes]`` raw logits.

    Args:
        num_classes: Number of output classes.
        n_mels: Height (mel bands) of the input spectrogram.
        n_frames: Width (time frames) of the input spectrogram. The default of
            216 corresponds to 5 s at sr=22050 with hop_length=512
            (``1 + 110250 // 512``).

    Raises:
        ValueError: If the input is too small to survive four 2x poolings, or
            if ``forward`` receives a spectrogram whose size does not match
            ``n_mels``/``n_frames``.
    """

    def __init__(self, num_classes, n_mels=128, n_frames=216):
        super().__init__()

        # Convolutional layers: conv -> batch norm -> ReLU -> 2x2 max-pool, four times
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(kernel_size=2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(kernel_size=2)

        self.conv4 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        self.pool4 = nn.MaxPool2d(kernel_size=2)

        # The padded 3x3 convolutions keep the spatial size; each pooling halves
        # it (rounding down), so four poolings divide each axis by 16.
        # Defaults: 128 // 16 = 8 and 216 // 16 = 13 -> 256 * 8 * 13.
        self.flat_features = 256 * (n_mels // 16) * (n_frames // 16)
        if self.flat_features <= 0:
            raise ValueError(
                f"Input of size ({n_mels}, {n_frames}) is too small for four 2x poolings"
            )

        # Fully connected layers
        self.fc1 = nn.Linear(self.flat_features, 512)
        self.bn5 = nn.BatchNorm1d(512)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        # Convolutional layers
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.pool4(F.relu(self.bn4(self.conv4(x))))

        # Flatten per sample. Keeping the batch dimension explicit prevents a
        # size mismatch from being silently absorbed into the batch axis.
        x = x.view(x.size(0), -1)
        if x.size(1) != self.flat_features:
            raise ValueError(
                f"Expected {self.flat_features} features per sample after pooling, "
                f"got {x.size(1)}; check n_mels/n_frames against the input shape"
            )

        # Fully connected layers
        x = F.relu(self.bn5(self.fc1(x)))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [18]:
# Prepare dataset: collect (file path, class index) pairs for a sample of the training set
print("Preparing dataset...")
file_paths = []
labels = []

# Define paths
base_dir = "/data/birdclef/birdclef-2025"
train_audio_dir = os.path.join(base_dir, "train_audio")
train_csv_path = os.path.join(base_dir, "train.csv")
taxonomy_path = os.path.join(base_dir, "taxonomy.csv")

# Create results directory
results_dir = 'training_results/mel_cnn_model_2'
os.makedirs(results_dir, exist_ok=True)

# Load metadata
print("Loading metadata...")
train_df = pd.read_csv(train_csv_path)
taxonomy_df = pd.read_csv(taxonomy_path)

# Draw a reproducible subsample. The request is capped at the number of rows
# because DataFrame.sample raises when asked for more rows than exist;
# set sample_size = len(train_df) to use the full training set.
sample_size = 500  # Adjust based on your needs
sampled_df = train_df.sample(min(sample_size, len(train_df)), random_state=42)

# Create a mapping from primary_label to class index.
# It is built from the full table so indices do not depend on the sample drawn.
unique_labels = train_df['primary_label'].unique()
label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}

print(f"Total number of species: {len(unique_labels)}")

# Keep only the sampled recordings whose audio file is actually on disk
for species_id, filename in tqdm(zip(sampled_df['primary_label'], sampled_df['filename']),
                                 total=len(sampled_df), desc="Finding files"):
    file_path = os.path.join(train_audio_dir, filename)

    if os.path.exists(file_path):
        file_paths.append(file_path)
        labels.append(label_to_idx[species_id])

print(f"Found {len(file_paths)} valid files")

# Summarize the class distribution (no classes are filtered out here)
label_counts = Counter(labels)
top_n = 5
print("Class distribution:")
for label, count in label_counts.most_common(top_n):
    print(f"  {idx_to_label[label]}: {count} files")
remaining_classes = len(label_counts) - top_n
if remaining_classes > 0:
    # Only mention the tail when there is one; avoids printing a negative count
    print(f"  ... and {remaining_classes} more classes")

Preparing dataset...
Loading metadata...
Total number of species: 206


Finding files: 100%|██████████| 500/500 [00:00<00:00, 25641.94it/s]

Found 500 valid files
Class distribution:
  grekis: 20 files
  amekes: 13 files
  compau: 13 files
  soulap1: 13 files
  gycwor1: 12 files
  ... and 123 more classes





In [16]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [17]:
# Hold out 20% of the files for validation (fixed seed keeps the split reproducible)
train_files, val_files, train_labels, val_labels = train_test_split(
    file_paths,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a spectrogram dataset (default audio/spectrogram settings)
train_dataset = SpectrogramDataset(file_paths=train_files, labels=train_labels)
val_dataset = SpectrogramDataset(file_paths=val_files, labels=val_labels)

# Batch the datasets; only the training data is reshuffled each epoch
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Inverse-frequency class weights: total / (number of seen classes * class count).
# Classes that never occur in the training split keep a neutral weight of 1.0.
class_counts = Counter(train_labels)
total_samples = len(train_labels)
class_weights = {
    cls: total_samples / (len(class_counts) * count)
    for cls, count in class_counts.items()
}
weight_tensor = torch.tensor(
    [class_weights.get(i, 1.0) for i in range(len(unique_labels))],
    dtype=torch.float32,
).to(device)

# Model, weighted loss, optimizer, and a scheduler that halves the learning
# rate after 3 epochs without improvement of the monitored value
model = AudioCNN(num_classes=len(unique_labels)).to(device)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)


In [19]:
# Training configuration
num_epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Per-epoch history; every key maps to its own list, extended once per epoch
results = {
    key: []
    for key in ('epochs', 'train_loss', 'val_loss', 'accuracy', 'timestamp')
}

# Function to find the latest checkpoint
def find_latest_checkpoint(results_dir):
    """Return the path of the highest-epoch model checkpoint in ``results_dir``.

    Only files named ``cnn_model_epoch_<N>.pt`` with a purely numeric ``<N>``
    are considered; look-alikes such as ``cnn_model_epoch_best.pt`` are
    ignored rather than causing a ``ValueError``.

    Args:
        results_dir: Directory to scan.

    Returns:
        The path of the matching file with the largest epoch number, or
        ``None`` if the directory does not exist or holds no such file.
    """
    prefix, suffix = 'cnn_model_epoch_', '.pt'

    # A missing directory simply means there is nothing to resume from.
    if not os.path.isdir(results_dir):
        return None

    latest_epoch, latest_name = -1, None
    for name in os.listdir(results_dir):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue

        epoch_part = name[len(prefix):-len(suffix)]
        if not (epoch_part.isascii() and epoch_part.isdigit()):
            continue

        epoch_num = int(epoch_part)
        if epoch_num > latest_epoch:
            latest_epoch, latest_name = epoch_num, name

    if latest_name is None:
        return None

    # Return the file name as found on disk (preserves e.g. zero padding)
    return os.path.join(results_dir, latest_name)

# Check if we have a checkpoint to resume from
latest_checkpoint = find_latest_checkpoint(results_dir)
start_epoch = 0

# (Re-)initialize model, criterion, optimizer and scheduler so that re-running
# this cell starts from fresh weights unless a checkpoint is restored below.
model = AudioCNN(num_classes=len(unique_labels)).to(device)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Load checkpoint if it exists
if latest_checkpoint:
    print(f"Resuming from checkpoint: {latest_checkpoint}")
    checkpoint = torch.load(latest_checkpoint, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Restore the LR schedule as well; checkpoints written before scheduler
    # state was saved simply lack this key.
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    start_epoch = checkpoint['epoch']

    # Also load results if available
    results_file = os.path.join(results_dir, f'cnn_results_epoch_{start_epoch}.json')
    if os.path.exists(results_file):
        with open(results_file, 'r') as f:
            results = json.load(f)

    print(f"Resuming from epoch {start_epoch}")
else:
    print("Starting training from scratch")

# Train from the epoch after the restored one (or from 0) up to num_epochs
for epoch in range(start_epoch, num_epochs):
    model.train()
    train_loss = 0.0

    # The batch variable is deliberately not called `labels`: that name holds
    # the full label list built during dataset preparation, and rebinding it
    # here would break a later re-run of the train/validation split.
    for specs, batch_labels in tqdm(train_loader,
                                    desc=f"Epoch {epoch+1}/{num_epochs}",
                                    leave=False):
        specs, batch_labels = specs.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(specs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for specs, batch_labels in val_loader:
            specs, batch_labels = specs.to(device), batch_labels.to(device)
            outputs = model(specs)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch_labels.cpu().tolist())

    # Calculate metrics (mean of per-batch losses); max(..., 1) guards empty loaders
    train_loss /= max(len(train_loader), 1)
    val_loss /= max(len(val_loader), 1)

    correct = sum(1 for pred, target in zip(all_preds, all_labels) if pred == target)
    total = len(all_labels)
    accuracy = correct / total if total > 0 else 0

    # Update learning rate based on validation loss
    scheduler.step(val_loss)

    # Store results
    results['epochs'].append(epoch + 1)
    results['train_loss'].append(float(train_loss))
    results['val_loss'].append(float(val_loss))
    results['accuracy'].append(float(accuracy))
    results['timestamp'].append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Print progress
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}, Accuracy={accuracy:.4f}")

    # Save results and model every 5 epochs and after the final epoch
    if (epoch + 1) % 5 == 0 or epoch == num_epochs - 1:
        results_file = os.path.join(results_dir, f'cnn_results_epoch_{epoch+1}.json')
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=4)

        model_file = os.path.join(results_dir, f'cnn_model_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_loss': train_loss,
            'val_loss': val_loss,
            'accuracy': accuracy
        }, model_file)

        print(f"Saved results and model checkpoint at epoch {epoch+1}")

print("Training complete!")

Using device: cuda
Starting training from scratch


                                                          

Epoch 1/1: Train Loss=5.5113, Val Loss=6.0547, Accuracy=0.0600
Saved results and model checkpoint at epoch 1
Training complete!
