In [2]:
import pandas as pd
import numpy as np
import librosa
import torch

from IPython.display import Audio
import random
import glob
import os
import csv
import io

import utils

In [3]:
RANDOM_SEED = 21

# Set seed for experiment reproducibility
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [4]:
is_in_kaggle_env = utils.get_is_in_kaggle_env()

We are running code on Localhost


In [5]:
data_path = '/kaggle/input/birdclef-2023/' if is_in_kaggle_env else 'data/'

In [6]:
device = 'cpu' if is_in_kaggle_env else utils.determine_device()

We are using device: cpu


In [7]:
if not is_in_kaggle_env and not os.path.exists('data'):
    !kaggle competitions download -c 'birdclef-2023'
    !mkdir data
    !unzip -q birdclef-2023.zip -d data
    !rm birdclef-2023.zip

## Data Exploration

In [8]:
audio_abe, sr_abe = librosa.load(f"{data_path}/train_audio/abethr1/XC128013.ogg")
audio_abh, sr_abh = librosa.load(f"{data_path}/train_audio/abhori1/XC127317.ogg")

In [9]:
Audio(data=audio_abe, rate=sr_abe)

In [10]:
Audio(data=audio_abh, rate=sr_abh)

In [11]:
train_metadata = pd.read_csv(f"{data_path}/train_metadata.csv")

In [12]:
train_metadata.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg


In [18]:
competition_classes = sorted(train_metadata.primary_label.unique())
competition_classes[:5]

['abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1']

## Data Preparation

In [15]:
def ensure_sample_rate(waveform, original_sample_rate, desired_sample_rate=32000):
    # 32000 sample rate is required by the competition
    if original_sample_rate != desired_sample_rate:
        waveform = librosa.resample(waveform, orig_sr=original_sample_rate, target_sr=desired_sample_rate)
    return desired_sample_rate, waveform

---

# WIP ...

---

In [14]:
def frame_audio(
      audio_array: np.ndarray,
      window_size_s: float = 5.0,
      hop_size_s: float = 5.0,
      sample_rate = 32000,
      ) -> np.ndarray:
    
    """Helper function for framing audio for inference."""
    if window_size_s is None or window_size_s < 0:
        return audio_array[np.newaxis, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = tf.signal.frame(audio_array, frame_length, hop_length, pad_end=True)
    return framed_audio

Below we load one training sample - use the Audio function to listen to the samples inside the notebook!

In [16]:
audio, sample_rate = librosa.load(f"{data_path}/train_audio/afghor1/XC156639.ogg")
sample_rate, wav_data = ensure_sample_rate(audio, sample_rate)
Audio(wav_data, rate=sample_rate)

## Step 5: Make predictions

Each test sample is cut into 5-second chunks. We use the pretrained model to return probabilities for all 10k birds included in the model, then pull out the classes used in this competition to create a final submission row. Note that we are NOT doing anything special to handle the 3 missing classes; those will need fine-tuning / transfer learning, which will be handled in a separate notebook.

In [17]:
fixed_tm = frame_audio(wav_data)
logits, embeddings = model.infer_tf(fixed_tm[:1])
probabilities = tf.nn.softmax(logits)
argmax = np.argmax(probabilities)
print(f"The audio is from the class {classes[argmax]} (element:{argmax} in the label.csv file), with probability of {probabilities[0][argmax]}")

NameError: name 'tf' is not defined

In [None]:
def predict_for_sample(filename, sample_submission, frame_limit_secs=None):
    file_id = filename.split(".ogg")[0].split("/")[-1]
    
    audio, sample_rate = librosa.load(filename)
    sample_rate, wav_data = ensure_sample_rate(audio, sample_rate)
    
    fixed_tm = frame_audio(wav_data)
    frame_length = int(5 * sample_rate)
    
    all_logits, all_embeddings = model.infer(torch.tensor(fixed_tm[:1]))
    for window in fixed_tm[1:]:
        if frame_limit_secs and frame_length / sample_rate > frame_limit_secs:
            continue
        
        logits, embeddings = model.infer(torch.tensor(window[np.newaxis, :]))
        all_logits = torch.cat([all_logits, logits], dim=0)
        frame_length += int(5 * sample_rate)
    
    all_probabilities = []
    for frame_logits in all_logits:
        probabilities = torch.nn.functional.softmax(frame_logits, dim=0).numpy()
        
        ## set the appropriate row in the sample submission
        sample_submission.loc[sample_submission.row_id == file_id + "_" + str(frame_length // sample_rate), competition_classes] = probabilities[competition_class_map]
        frame_length += int(5 * sample_rate)

## Step 6: Generate a submission

Now we process all of the test samples as discussed above, creating output rows, and saving them in the provided `sample_submission.csv`. Finally, we save these rows to our final output file: `submission.csv`. This is the file that gets submitted and scored when you submit the notebook.

In [None]:
test_samples = list(glob.glob("/kaggle/input/birdclef-2023/test_soundscapes/*.ogg"))
test_samples

In [None]:
sample_sub = pd.read_csv(f"{data_path}/sample_submission.csv")
sample_sub[competition_classes] = sample_sub[competition_classes].astype(np.float32)
sample_sub.head()

In [None]:
frame_limit_secs = 15 if sample_sub.shape[0] == 3 else None
for sample_filename in test_samples:
    predict_for_sample(sample_filename, sample_sub, frame_limit_secs=15)

In [None]:
sample_sub

In [None]:
sample_sub.to_csv(f"{data_path}/submission.csv", index=False)