In this notebook, we'll use a pre-trained machine learning model to generate a submission to the [BirdCLEF 2023 competition](https://www.kaggle.com/c/birdclef-2023). The goal of the competition is to identify Eastern African bird species by sound.

# Step 1: Setup

In [1]:
from fastkaggle import iskaggle, setup_comp

## Paths

In [2]:
from pathlib import Path

# Competition root as returned by fastkaggle's setup_comp
# (evaluates to Path('birdclef-2023') in the local run recorded in this notebook).
comp_path = setup_comp('birdclef-2023')

# Debug switch; not read anywhere in the visible part of this notebook.
debug_on = False
# Offline model / package locations, expressed relative to the competition root.
# NOTE(review): the learner is later loaded from comp_path/model_fname, not from
# model_path — confirm which location is intended.
model_path = comp_path/'birdclef-2023-offline-models'
model_fname = 'birdclef-2023-efficientnet_b0-20230320-232256.pkl'
package_path = comp_path/'birdclef-2023-offline-packages'
# Sub-directory of the input data that holds the soundscapes to score.
sub_dir = 'test_soundscapes'

# Sample rate (Hz) that audio is resampled to in the cells below.
required_test_audio_sample_rate_hz = 16000

In [3]:
comp_path

Path('birdclef-2023')

In [4]:
!pwd

/home/sdhca/Code/kaggle


In [5]:
# Choose the directory layout for the environment the notebook is running in.
if iskaggle:
    # Kaggle kernels: fixed, absolute mount points.
    input_path, working_path, temp_path = (
        Path('/kaggle/input/birdclef-2023'),
        Path('/kaggle/working'),
        Path('/kaggle/temp'),
    )
else:
    # Local runs: everything lives under the competition directory.
    input_path, working_path, temp_path = (
        comp_path/'birdclef-2023',
        comp_path/'working',
        comp_path/'temp',
    )

# Create both scratch directories; an already-existing directory is not an error.
working_path.mkdir(exist_ok=True)
temp_path.mkdir(exist_ok=True)

## Import libraries

In [6]:
import pandas as pd
import numpy as np
import glob

import fastai
from fastai.vision.all import load_learner
print(f'fastai: {fastai.__version__}')
import torchaudio
# import torchaudio.transforms as T
print(f'torchaudio: {torchaudio.__version__}')

import fastaudio
from fastaudio.core.all import *
from fastaudio.augment.all import *
print(f'fastaudio: {fastaudio.__version__}')

# import tensorflow_io as tfio

  warn(f"Failed to load image Python extension: {e}")


fastai: 2.7.12
torchaudio: 2.0.1+cu117
fastaudio: 1.0.3


# Step 2: Preprocessing functions

## Todo
1. [ ] Generate submission skeleton -- intended for parallel execution. Come back to this idea later
2. [ ] Pre-process
3. [ ] Predict
4. [ ] Populate submission
5. [ ] Parallelize

In [7]:
# Collect the '.ogg' files found under the test-soundscape directory.
test_samples = get_files(input_path/sub_dir, '.ogg')

In [8]:
test_samples[:5]

(#5) [Path('birdclef-2023/birdclef-2023/test_soundscapes/soundscape_29205.ogg'),Path('birdclef-2023/birdclef-2023/test_soundscapes/soundscape_29204.ogg'),Path('birdclef-2023/birdclef-2023/test_soundscapes/soundscape_29202.ogg'),Path('birdclef-2023/birdclef-2023/test_soundscapes/soundscape_29206.ogg'),Path('birdclef-2023/birdclef-2023/test_soundscapes/soundscape_29203.ogg')]

In [9]:
# audio_metadata_df = get_audio_files_metadata(test_samples)
# audio_metadata_df.head().T

In [10]:
# https://discuss.pytorch.org/t/pytorch-equivalent-to-tf-signal-frame/123239
def torch_frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
    """
    equivalent of tf.signal.frame

    Slice ``signal`` into (possibly overlapping) frames along ``axis``.

    Args:
        signal: torch tensor to frame.
        frame_length: number of samples in each frame.
        frame_step: number of samples between the starts of consecutive frames.
        pad_end: when True, the end of ``axis`` is padded with ``pad_value`` so
            that exactly ``ceil(signal_length / frame_step)`` complete frames
            are produced (the tf.signal.frame rule). No padding is added when
            the frames already tile the signal exactly.
        pad_value: constant used for the padding.
        axis: dimension to frame; negative values count from the end.

    Returns:
        The result of ``Tensor.unfold``: ``axis`` becomes the frame index and a
        new trailing dimension of size ``frame_length`` holds each frame's
        samples. (For ``axis=-1`` this is ``[..., num_frames, frame_length]``.)
    """
    # Local import: ``F`` is not imported explicitly in the visible notebook cells.
    import torch.nn.functional as F

    signal_length = signal.shape[axis]
    if pad_end:
        # Ceil division: every frame that starts inside the signal is kept.
        num_frames = -(-signal_length // frame_step)
        pad_size = max(0, frame_length + frame_step * (num_frames - 1) - signal_length)
        if pad_size != 0:
            # F.pad takes (left, right) pairs starting from the LAST dimension,
            # so skip one (0, 0) pair per dimension that comes after ``axis``.
            dims_after_axis = signal.ndim - 1 - (axis % signal.ndim)
            pad_spec = [0, 0] * dims_after_axis + [0, pad_size]
            signal = F.pad(signal, pad_spec, "constant", pad_value)
    frames = signal.unfold(axis, frame_length, frame_step)
    return frames

In [11]:
def predict_for_sample(filename, sample_submission, frame_limit_secs=None):
    """Score one soundscape file and write the results into ``sample_submission``.

    The audio is loaded, resampled with ``ensure_sample_rate`` (default rate),
    framed with ``frame_audio`` (default 5 s windows), and each window is
    passed to ``model.infer_tf``. The softmax of each window's logits is
    written to the submission row whose ``row_id`` is
    ``"<file stem>_<window end time in seconds>"``.

    NOTE(review): this function depends on names that are not defined in the
    visible notebook cells — ``model.infer_tf``, ``tf``, ``competition_classes``
    and ``competition_class_map``. It looks like a leftover from a
    TensorFlow-based version and will not run against the fastai learner
    loaded in this notebook until those are ported — confirm before use.
    NOTE(review): resampling/framing here use the 32000 Hz defaults, while the
    rest of the notebook works at ``required_test_audio_sample_rate_hz`` —
    confirm which rate the model expects.

    Args:
        filename: path to the ``.ogg`` file, as ``str`` or ``pathlib.Path``.
        sample_submission: DataFrame with a ``row_id`` column and one column
            per entry of ``competition_classes``; modified in place.
        frame_limit_secs: optional limit; windows after the first are scored
            only while their running end-time counter does not exceed it.

    Returns:
        None. The result is the in-place update of ``sample_submission``.
    """
    window_secs = 5

    # Path(...) accepts both str and Path; the Path objects produced by
    # get_files have no ``.split`` method, which the old string-only code needed.
    file_id = Path(filename).name.split(".ogg")[0]

    audio, sample_rate = torchaudio.load(filename)
    wav_data, sample_rate = ensure_sample_rate(audio, sample_rate)

    fixed_tm = frame_audio(wav_data)

    # The first window is always scored; embeddings are not needed here.
    first_logits, _ = model.infer_tf(fixed_tm[:1])
    logits_per_window = [first_logits]
    frame = window_secs
    for window in fixed_tm[1:]:
        if frame_limit_secs and frame > frame_limit_secs:
            # The counter only advances on scored windows, so once the limit
            # is exceeded no later window can be scored either.
            break
        logits, _ = model.infer_tf(window[np.newaxis, :])
        logits_per_window.append(logits)
        frame += window_secs
    # Concatenate once instead of re-allocating the array on every window.
    all_logits = np.concatenate(logits_per_window, axis=0)

    for window_index, frame_logits in enumerate(all_logits, start=1):
        probabilities = tf.nn.softmax(frame_logits).numpy()

        ## set the appropriate row in the sample submission
        row_id = file_id + "_" + str(window_index * window_secs)
        sample_submission.loc[sample_submission.row_id == row_id, competition_classes] = probabilities[competition_class_map]

In [12]:
def frame_audio(
      audio_array: np.ndarray,
      window_size_s: float = 5.0,
      hop_size_s: float = 5.0,
      sample_rate = 32000,
      ) -> np.ndarray:
    """Split audio into fixed-length windows for inference.

    Window and hop durations (seconds) are converted to sample counts with
    ``sample_rate`` and passed to ``torch_frame`` with end padding enabled.
    When ``window_size_s`` is ``None`` or negative, no framing is done and the
    input is returned with one extra leading dimension.

    NOTE(review): the annotations say ``np.ndarray``, but the framing path
    goes through ``torch_frame``, which calls ``.unfold`` on its input — so a
    torch tensor is presumably expected there; confirm.
    """
    framing_disabled = window_size_s is None or window_size_s < 0
    if framing_disabled:
        return audio_array[np.newaxis, :]

    samples_per_window = int(window_size_s * sample_rate)
    samples_per_hop = int(hop_size_s * sample_rate)
    return torch_frame(audio_array, samples_per_window, samples_per_hop, pad_end=True)

def ensure_sample_rate(waveform, original_sample_rate,
                       desired_sample_rate=32000):
    """Return ``waveform`` at ``desired_sample_rate``, resampling only if needed.

    Returns a ``(waveform, desired_sample_rate)`` pair. The waveform is passed
    through untouched when the two rates already match; otherwise it is
    converted with ``torchaudio.functional.resample``.
    """
    if original_sample_rate == desired_sample_rate:
        return waveform, desired_sample_rate

    resampled = torchaudio.functional.resample(waveform, original_sample_rate, desired_sample_rate)
    return resampled, desired_sample_rate

Below we load one training sample - use the Audio function to listen to the samples inside the notebook!

In [13]:
# Test example
# Load one training recording; the first print shows the sample rate reported by torchaudio.load.
audio, sample_rate = torchaudio.load(input_path/'train_audio/afghor1/XC156639.ogg')
print(sample_rate)
# Resample to the rate configured for this notebook; the second print shows the new rate.
audio, sample_rate = ensure_sample_rate(audio, sample_rate, required_test_audio_sample_rate_hz)
print(sample_rate)
# Last expression of the cell: lets the reader listen to the clip inside the notebook.
# NOTE(review): `Audio` is not imported explicitly in the visible cells — presumably it
# arrives through one of the star imports; confirm.
Audio(audio, rate=sample_rate)

32000
16000


In [14]:
# Build a resampler targeting the notebook's sample rate, a basic spectrogram
# configuration, and an audio-to-spectrogram transform created from that configuration.
# NOTE(review): these names presumably come from the fastaudio star imports, and none
# of the three objects is used in the visible cells — confirm they are still needed.
resample = Resample(sr_new=required_test_audio_sample_rate_hz)
cfg = AudioConfig.BasicSpectrogram()
a2s = AudioToSpec.from_cfg(cfg)

# Step 3: Make predictions
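Each test sample is cut into 5-second chunks. We use the pretrained model to return probabilities for all 10k birds included in the model, then pull out the classes used in this competition to create a final submission row. Note that we are NOT doing anything special to handle the 3 missing classes; those will need fine-tuning / transfer learning, which will be handled in a separate notebook.

**Note:** the paragraph above describes a 10k-class pretrained classifier. The fastai learner loaded in this notebook returns 264 values per prediction (see the `pred[2].shape` output below), so the class-subsetting step and the missing-class caveat may not apply to it — TODO confirm.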

In [15]:
def _get_label(df, label_col):
    return df[label_col]

get_label = partial(_get_label, label_col='primary_label')

In [16]:
# Load the exported fastai learner from the competition directory.
# NOTE(review): learner exports are pickle-based, so only load files from a trusted source.
# NOTE(review): the file is read from comp_path/model_fname; the `model_path` variable
# defined in the configuration cell is not used here — confirm the intended location.
model = load_learner(comp_path/model_fname)

### Todo
 1. [ ] Change tensor shape

In [38]:
# Frame the clip with frame_audio's default 5 s window/hop and show the resulting shape.
# Note that `fixed_tm` is only inspected here; it is not what gets passed to the model.
fixed_tm = frame_audio(audio, sample_rate=required_test_audio_sample_rate_hz)
print(fixed_tm.shape)
# Wrap the whole, unframed clip as an AudioTensor and run a single prediction on it.
at = AudioTensor(audio, required_test_audio_sample_rate_hz)
print(at.shape)
pred = model.predict(at)
# pred is indexed as a tuple; its third element is a 264-element tensor in the recorded
# output — presumably the per-class probabilities (confirm).
print(pred[2].shape)
pred
# Leftover from an earlier TensorFlow-based version, kept for reference
# (`tf` and `classes` are not defined in the visible cells):
# logits, embeddings = model.predict(at)
# probabilities = tf.nn.softmax(logits)
# argmax = np.argmax(probabilities)
# print(f"The audio is from the class {classes[argmax]} (element:{argmax} in the label.csv file), with probability of {probabilities[0][argmax]}")

torch.Size([1, 8, 80000])
torch.Size([1, 595174])


torch.Size([264])


('afghor1',
 TensorBase(8),
 TensorBase([3.0045e-07, 1.7298e-07, 9.7948e-08, 9.4855e-08, 1.6228e-08,
             1.3662e-03, 7.8479e-06, 3.3193e-07, 9.9701e-01, 4.1942e-06,
             1.3537e-07, 1.2262e-06, 6.2920e-07, 4.0432e-07, 5.4425e-07,
             4.9912e-04, 2.3548e-05, 5.5927e-06, 1.0697e-06, 3.2409e-08,
             2.9427e-09, 4.4840e-06, 9.7612e-07, 2.6217e-07, 8.8118e-09,
             2.6116e-07, 2.0804e-05, 4.2236e-08, 2.2602e-08, 1.4014e-05,
             8.3746e-06, 1.0531e-08, 9.9602e-06, 1.0875e-06, 1.8268e-07,
             4.0172e-07, 8.0370e-07, 1.0473e-06, 3.9215e-06, 5.7890e-06,
             1.9409e-07, 8.6254e-07, 3.3210e-06, 2.5123e-06, 3.5946e-07,
             4.8016e-08, 5.3317e-07, 6.2069e-08, 3.2999e-08, 5.3405e-07,
             4.6109e-09, 5.1930e-07, 2.6663e-06, 3.3647e-08, 9.1167e-10,
             1.5576e-06, 3.1000e-07, 1.1907e-06, 3.5291e-06, 3.3628e-08,
             9.4868e-10, 6.6906e-08, 4.3598e-08, 2.5220e-07, 1.4357e-06,
             6.8828e-08

In [29]:
AudioTensor?

[0;31mInit signature:[0m [0mAudioTensor[0m[0;34m([0m[0mx[0m[0;34m,[0m [0msr[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Semantic torch tensor that represents an audio.
Contains all of the functionality of a normal tensor,
but additionally can be created from files and has
extra properties. Also knows how to show itself.
[0;31mFile:[0m           ~/Code/fastaudio/src/fastaudio/core/signal.py
[0;31mType:[0m           _TensorMeta
[0;31mSubclasses:[0m     
