In this notebook, we'll use a pre-trained machine learning model to generate a submission to the [BirdClef2023 competition](https://www.kaggle.com/c/birdclef-2023).  The goal of the competition is to identify Eastern African bird species by sound.

# Step 1: Setup
The convnext model was too slow, so I've gone back to the resnet model (offline models pinned to version 3)

In [1]:
!ls /kaggle/input

birdclef-2023  birdclef-2023-offline-models  birdclef-2023-offline-packages


## Config

In [2]:
from pathlib import Path

# When True, the debug-only cells run (they load a training clip, print shapes and
# per-row predictions) and two training clips are prepended to the test file list.
debug_on = False
# Only referenced by the commented-out transform-removal hack further down.
remove_tfms = False
# Dataset holding the exported learner, and the file that `load_learner` reads.
# NOTE(review): the setup note above mentions a resnet model, but this filename names
# eca_nfnet_l0 - confirm which model is actually intended.
model_path = Path('/kaggle/input/birdclef-2023-offline-models')
model_fname = 'birdclef-2023-eca_nfnet_l0-20230511-231507.pkl'
# Dataset holding the wheel files installed offline by the install cell.
package_path = Path('/kaggle/input/birdclef-2023-offline-packages')

# Sample rate every clip is resampled to before it is framed and passed to the model.
required_test_audio_sample_rate_hz = 32000

# Competition data root (used to locate the debug training clip).
input_path = Path('/kaggle/input/birdclef-2023')
# NOTE(review): working_path, temp_path and sub_dir are not read by any other cell visible
# in this notebook; temp_path is nevertheless created on disk here.
working_path = Path('/kaggle/working')
temp_path = Path('/kaggle/temp')
temp_path.mkdir(parents=True, exist_ok=True)
sub_dir = 'test_soundscapes'

## Installs & Imports

`fastaudio` should be installed last because the other wheel files change dependency versions relative to what comes pre-installed.

In [3]:
# !cd {package_path}; pip install *.whl --dry-run
# Log which wheel files are available in the offline packages dataset.
!echo "Installing:"
!cd {package_path}; ls *.whl;
!echo "--------------------------------------------------------------------------------"
# First pass: install every wheel except fastaudio and soundfile. --no-index keeps pip
# offline and --no-deps stops it from trying to resolve dependencies.
!cd {package_path}; pip install --no-index --no-deps `ls *.whl | egrep -v 'fastaudio|soundfile'`
# Second pass: fastaudio on its own, after the other wheels. The soundfile wheel is
# excluded above and is not installed by this cell.
!cd {package_path}; pip install --no-index --no-deps fastaudio*.whl

Installing:
colorednoise-2.1.0-py3-none-any.whl
fastaudio-1.0.2.post0.dev9gb80fa16.dirty-py2.py3-none-any.whl
resampy-0.4.2-py3-none-any.whl
soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl
--------------------------------------------------------------------------------
Processing ./colorednoise-2.1.0-py3-none-any.whl
Processing ./resampy-0.4.2-py3-none-any.whl
Installing collected packages: resampy, colorednoise
Successfully installed colorednoise-2.1.0 resampy-0.4.2
[0mProcessing ./fastaudio-1.0.2.post0.dev9gb80fa16.dirty-py2.py3-none-any.whl
Installing collected packages: fastaudio
Successfully installed fastaudio-1.0.2.post0.dev9+gb80fa16.dirty
[0m

In [4]:
# import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_io as tfio

import pandas as pd
import numpy as np
# import soundfile
# print(f'soundfile: {soundfile.__version__}')
# import librosa
# print(f'librosa: {librosa.__version__}')
import glob

import fastai
from fastai.vision.all import load_learner
print(f'fastai: {fastai.__version__}')
import torchaudio
print(f'torchaudio: {torchaudio.__version__}')

import fastaudio
from fastaudio.core.all import *
from fastaudio.augment.all import *
print(f'fastaudio: {fastaudio.__version__}')
# import csv
# import io

from collections import OrderedDict
from IPython.display import Audio

fastai: 2.7.11
torchaudio: 0.13.0+cpu
fastaudio: 1.0.3


# Step 2: Preprocessing functions

In [5]:
# https://discuss.pytorch.org/t/pytorch-equivalent-to-tf-signal-frame/123239
def torch_frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
    """
    Slice `signal` into frames along `axis` (torch counterpart of tf.signal.frame).

    Args:
        signal: tensor to frame.
        frame_length: number of samples in each frame.
        frame_step: number of samples between the starts of consecutive frames.
        pad_end: when True, the end of `signal` is padded with `pad_value` so that
            every window starting inside the signal is returned, giving
            ceil(signal_length / frame_step) frames. When False, only windows that
            fit entirely inside the signal are returned.
        pad_value: constant used for the padding.
        axis: dimension to frame (negative values count from the end).

    Returns:
        The result of `Tensor.unfold`: `axis` becomes the frame index and a new
        trailing dimension of size `frame_length` holds the samples of each frame.

    Note:
        A signal whose length is already an exact multiple of the step is no longer
        given an extra frame made only of padding. An empty signal with
        `pad_end=True` still yields one fully padded frame, because `unfold`
        cannot produce zero frames from an empty dimension.
    """
    if pad_end:
        signal_length = signal.shape[axis]
        # Ceil division: one frame per window start that lies inside the signal.
        num_frames = max(1, -(-signal_length // frame_step))
        # Length needed for the last window to be complete, minus what we have.
        pad_size = max(0, (num_frames - 1) * frame_step + frame_length - signal_length)
        if pad_size != 0:
            dim = axis % signal.ndim
            # F.pad takes (left, right) pairs starting from the LAST dimension, so the
            # spec must reach back to `dim`; its final entry is the right pad of `dim`.
            pad_spec = [0] * (2 * (signal.ndim - dim))
            pad_spec[-1] = pad_size
            signal = F.pad(signal, pad_spec, "constant", pad_value)
    return signal.unfold(axis, frame_length, frame_step)

In [6]:
def frame_audio(
      audio_array: "torch.Tensor",
      window_size_s: float = 5.0,
      hop_size_s: float = 5.0,
      sample_rate = 32000,
      repeat: "int | None" = None
      ) -> "torch.Tensor":
    """Cut a waveform into fixed-length windows for inference.

    Args:
        audio_array: waveform tensor; the last dimension is framed.
            Assumes the (channels, samples) layout returned by `torchaudio.load`
            with a single channel - TODO confirm for multi-channel input, where the
            leading dimension is not squeezed away.
        window_size_s: window length in seconds. `None` or a non-positive value
            disables framing and returns the whole clip with a new leading axis.
        hop_size_s: distance between window starts in seconds.
        sample_rate: samples per second, used to convert seconds to sample counts.
        repeat: when truthy, each window is tiled `repeat` times along its sample
            axis (so every frame becomes `repeat` times longer).

    Returns:
        Tensor of windows; the final window is zero-padded to full length.
    """
    # A zero-length window cannot be framed, so treat 0 like "no framing" as well.
    if window_size_s is None or window_size_s <= 0:
        return audio_array[None, :]
    frame_length = int(window_size_s * sample_rate)
    hop_length = int(hop_size_s * sample_rate)
    framed_audio = torch_frame(audio_array, frame_length, hop_length, pad_end=True)
    # Drop the leading dimension when it has size 1 so iteration yields one window at a time.
    framed_audio = torch.squeeze(framed_audio, 0)
    if repeat:
        framed_audio = framed_audio.repeat(1, repeat)
    return framed_audio

def ensure_sample_rate(waveform, original_sample_rate,
                       desired_sample_rate=32000):
    """Return `waveform` at `desired_sample_rate`, resampling only when the rates differ.

    Returns:
        A `(waveform, desired_sample_rate)` tuple. The waveform is passed through
        untouched when it is already at the desired rate.
    """
    already_at_target = original_sample_rate == desired_sample_rate
    if already_at_target:
        return waveform, desired_sample_rate
    resampled = torchaudio.functional.resample(waveform, original_sample_rate, desired_sample_rate)
    return resampled, desired_sample_rate

Below we load one training sample - use the Audio function to listen to the samples inside the notebook!

In [7]:
if debug_on:
    from IPython.display import display

    # Test example: load one training clip and bring it to the inference sample rate.
    audio, sample_rate = torchaudio.load(input_path/'train_audio/barswa/XC113914.ogg')
    print(sample_rate)
    audio, sample_rate = ensure_sample_rate(audio, sample_rate, required_test_audio_sample_rate_hz)
    print(sample_rate)
    # A bare expression inside an `if` block is not rendered by the notebook, so the
    # player has to be displayed explicitly.
    display(Audio(audio, rate=sample_rate))

    # Sanity-check the framing helper on the same clip.
    print(audio.shape)
    test_framed = frame_audio(audio, sample_rate=sample_rate, repeat=2)
    print(test_framed.shape)
    print(test_framed.__class__)

# Step 3: Prediction functions
Each test sample is cut into 5-second chunks. We use the pretrained model to return probabilities for all 10k birds included in the model, then pull out the classes used in this competition to create a final submission row. Note that we are NOT doing anything special to handle the 3 missing classes; those will need fine-tuning / transfer learning, which will be handled in a separate notebook.

In [8]:
def cmAP(preds, labels, num_classes=264):
    """Macro-averaged average precision computed from hard (argmax) predictions.

    The top-scoring class of each row of `preds` is one-hot encoded and scored
    against the one-hot encoding of `labels`.
    Assumes `preds` is (batch, num_classes) and `labels` holds integer class
    indices - TODO confirm against the training script.

    NOTE(review): `average_precision_score` is not imported in the visible cells
    (presumably sklearn.metrics) - confirm before calling this directly.
    """
    target_one_hot = F.one_hot(labels, num_classes=num_classes).cpu().numpy()
    top_class = torch.argmax(preds, dim=1)
    hard_scores = F.one_hot(top_class, num_classes=num_classes).cpu().numpy()
    return average_precision_score(target_one_hot, hard_scores, average='macro')

def cmAP_probs(preds, labels, num_classes=264):
    """Macro-averaged average precision computed from softmax probabilities.

    `preds` is softmaxed over dim 1 and scored against the one-hot encoding of
    `labels`.
    Assumes `preds` holds raw scores of shape (batch, num_classes) and `labels`
    holds integer class indices - TODO confirm.

    NOTE(review): `average_precision_score` is not imported in the visible cells
    (presumably sklearn.metrics) - confirm before calling this directly.
    """
    target_one_hot = F.one_hot(labels, num_classes=num_classes).cpu().numpy()
    class_probs = F.softmax(preds, dim=1).cpu().numpy()
    return average_precision_score(target_one_hot, class_probs, average='macro')

def padded_cmAP(preds, labels, num_classes=264, padding_factor=1):
    """Padded macro average precision computed from hard (argmax) predictions.

    `padding_factor` rows of ones are appended to both the one-hot predictions
    and the one-hot labels, so every class column contains at least
    `padding_factor` positives (presumably so classes absent from the batch can
    still be scored - TODO confirm).

    NOTE(review): `average_precision_score` is not imported in the visible cells
    (presumably sklearn.metrics) - confirm before calling this directly.
    """
    ones_rows = torch.ones((padding_factor, num_classes))
    hard_one_hot = F.one_hot(torch.argmax(preds, dim=1), num_classes=num_classes).cpu()
    target_one_hot = F.one_hot(labels, num_classes=num_classes).cpu()
    padded_scores = torch.cat((hard_one_hot, ones_rows)).numpy()
    padded_targets = torch.cat((target_one_hot, ones_rows)).numpy()
    return average_precision_score(padded_targets, padded_scores, average='macro')

def padded_cmAP_probs(preds, labels, num_classes=264, padding_factor=1):
    """Padded macro average precision computed from softmax probabilities.

    `padding_factor` rows of ones are appended to both the softmax output and
    the one-hot labels, so every class column contains at least
    `padding_factor` positives (presumably so classes absent from the batch can
    still be scored - TODO confirm).

    NOTE(review): `average_precision_score` is not imported in the visible cells
    (presumably sklearn.metrics) - confirm before calling this directly.
    """
    ones_rows = torch.ones((padding_factor, num_classes))
    class_probs = F.softmax(preds, dim=1).cpu()
    target_one_hot = F.one_hot(labels, num_classes=num_classes).cpu()
    padded_scores = torch.cat((class_probs, ones_rows)).numpy()
    padded_targets = torch.cat((target_one_hot, ones_rows)).numpy()
    return average_precision_score(padded_targets, padded_scores, average='macro')

def error_rate_multi(preds, labels, thresh=0.5, sigmoid=True):
    """Return one minus `accuracy_multi` for the same arguments.

    `thresh` and `sigmoid` are forwarded positionally to `accuracy_multi`
    (presumably fastai's multi-label accuracy metric - TODO confirm).
    """
    accuracy = accuracy_multi(preds, labels, thresh, sigmoid)
    return 1 - accuracy

In [9]:
def _get_label(df, label_col):
    return df[label_col]

# Label getter with the column fixed to 'primary_label'.
# NOTE(review): this name and the metric functions above are not called anywhere in this
# notebook; presumably they must exist under these exact names so the exported learner
# can be restored - confirm against the training script.
get_label = partial(_get_label, label_col='primary_label')

# Restore the exported fastai learner used for every prediction below.
# NOTE(review): the export is a pickle file; only load model files from a trusted source.
model = load_learner(model_path/model_fname)

In [10]:
# Show the per-item transforms attached to the validation DataLoader, i.e. what runs on
# each clip at inference time.
model.dls.valid.after_item.fs

[Resample:
 encodes: (AudioTensor,object) -> encodes
 decodes: ,
 DownmixMono:
 encodes: (AudioTensor,object) -> encodes
 decodes: ,
 AudioToSpec:
 encodes: (AudioTensor,object) -> encodes
 decodes: ,
 ToTensor:
 encodes: (PILMask,object) -> encodes
 (PILBase,object) -> encodes
 decodes: ]

## Ugly hack
Some FastAudio transforms wouldn't work on GPU, so I put them in item_tfms instead of batch_tfms. They shouldn't be used for inference, so this hack removes them.

Starting Apr. 22, moved the hack to the model training script so the model is exported with the desired transforms for inference.

In [11]:
# if remove_tfms:
#     model.dls.valid.after_item.fs.pop(3)
#     model.dls.valid.after_item.fs.pop(3)
# model.dls.valid.after_item.fs

In [12]:
# Debug only: run the model on each window of the training clip loaded earlier.
# Relies on `audio` from the earlier debug cell, so it only works when debug_on was
# already True when that cell ran.
if debug_on:
    fixed_tm = frame_audio(audio, sample_rate=required_test_audio_sample_rate_hz)
    # print(fixed_tm.shape)
    for window in fixed_tm:
        # Add a leading dimension of size 1 in front of the window's samples.
        window = window.unsqueeze(0)
    #     print(window.shape)
        at = AudioTensor(window, required_test_audio_sample_rate_hz)
        pred = model.predict(at)
        # Prints pred[0] followed by the max and argmax of pred[2]
        # (presumably the decoded label and the per-class probabilities - TODO confirm).
        print(f'{pred[0]}:, {torch.max(pred[2])}, {torch.argmax(pred[2])}')

In [13]:
def predict_for_sample(filename, required_sample_rate, pred_dict, frame_limit_secs=None):
    """Predict every 5-second window of one audio file and store the results.

    Args:
        filename: path of the audio file (string or path-like).
        required_sample_rate: sample rate the audio is resampled to before framing.
        pred_dict: mapping updated in place; one entry per window, keyed
            `<file stem>_<window end time in seconds>`, holding a NumPy copy of
            `preds[2]` from `model.predict`.
        frame_limit_secs: when truthy, windows whose end time exceeds this many
            seconds are not predicted.

    Uses the module-level `model` learner.
    """
    # Derive the id from the file name only, so a directory containing ".ogg" in its
    # name (or a Path argument) cannot corrupt it.
    file_id = Path(filename).stem
    audio, sample_rate = torchaudio.load(filename)
    wav_data, sample_rate = ensure_sample_rate(audio, sample_rate, required_sample_rate)
    fixed_tm = frame_audio(wav_data, sample_rate=sample_rate)
    # End time (seconds) of the current window; frame_audio's default window and hop are 5 s.
    frame = 5
    for window in fixed_tm:
        # Windows arrive in time order, so once the limit is passed nothing later qualifies.
        if frame_limit_secs and frame > frame_limit_secs:
            break
        window = window.unsqueeze(0)
        at = AudioTensor(window, required_sample_rate)
        with model.no_bar():
            preds = model.predict(at)

        # Todo: make sure training columns and sample submission columns are in the same order
        pred_dict[f'{file_id}_{frame}'] = np.copy(preds[2].numpy())
        frame += 5

# Step 4: Generate a submission

In [14]:
!ls /kaggle/input/birdclef-2023/train_audio/barswa | head

XC113914.ogg
XC129647.ogg
XC132406.ogg
XC133096.ogg
XC133802.ogg
XC134349.ogg
XC135474.ogg
XC139171.ogg
XC141346.ogg
XC141678.ogg


In [15]:
# Build the list of soundscapes from the configured paths rather than repeating the literal.
# glob returns files in arbitrary order, so sort them to make the row order reproducible.
test_samples = sorted(glob.glob(str(input_path / sub_dir / '*.ogg')))

if debug_on:
    # Prepend two training clips so the debug output has known-species examples.
    test_samples.insert(0, str(input_path / 'train_audio/barswa/XC113914.ogg'))
    test_samples.insert(1, str(input_path / 'train_audio/barswa/XC129647.ogg'))

test_samples[:5]

['/kaggle/input/birdclef-2023/test_soundscapes/soundscape_29201.ogg']

In [16]:
# train_metadata = pd.read_csv("/kaggle/input/birdclef-2023/train_metadata.csv")
# competition_classes = sorted(train_metadata.primary_label.unique())
# The sample submission supplies the column layout (row_id followed by one column per
# species) that is later copied onto the prediction frame.
sample_sub_df = pd.read_csv("/kaggle/input/birdclef-2023/sample_submission.csv")
sample_sub_df.head()
# sample_sub_df[competition_classes] = sample_sub[competition_classes].astype(np.float32)
# sample_sub.head()

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,soundscape_29201_10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,soundscape_29201_15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Serial prediction

In [17]:
# Windows ending after this many seconds are skipped (e.g. a trailing padding-only window).
# The earlier `15 if ... else None` value was computed but never passed on - the call
# always used a literal 600 - so the effective limit is now kept in one place.
# NOTE(review): 600 is presumably the soundscape length in seconds - confirm.
frame_limit_secs = 600

# Filled in place by predict_for_sample: row_id -> per-class prediction array.
predictions = OrderedDict()
for sample_filename in test_samples:
    predict_for_sample(
        sample_filename,
        required_test_audio_sample_rate_hz,
        predictions,
        frame_limit_secs=frame_limit_secs,
    )

## Parallel prediction

In [18]:
# torch.set_num_threads(1)
# from fastcore.parallel import parallel_gen

# class PredictToOrderedDict():
#     def __init__(self, required_sample_rate: int=32000, frame_limit_secs: int=600):
#         self.sr = required_sample_rate
#         self.limit = frame_limit_secs
#         self.predictions = OrderedDict()
        
#     def __call__(self, batch):
#         for sample in batch:
#             predict_for_sample(sample, self.sr, self.predictions, self.limit)
#         yield self.predictions

In [19]:
# %%timeit -n 1
# results = list(parallel_gen(PredictToOrderedDict, test_samples, n_workers=0))

In [20]:
# class _C:
#     def __call__(self, o): return ((i+1) for i in o)

# items = range(5)

# res = L(parallel_gen(_C, items, n_workers=0))
# idxs,dat1 = zip(*res.sorted(itemgetter(0)))
# test_eq(dat1, range(1,6))

# res = L(parallel_gen(_C, items, n_workers=3))
# idxs,dat2 = zip(*res.sorted(itemgetter(0)))
# test_eq(dat2, dat1)

## Create submission file

In [21]:
# One row per predicted window: the dict keys become the `row_id` column and each
# prediction array is spread across the remaining columns.
sub_df = (
    pd.DataFrame.from_dict(predictions, orient='index')
    .rename_axis('row_id')
    .reset_index()
)
# Columns are relabelled purely by position from the sample submission.
# NOTE(review): this assumes the model's class order matches the sample submission's
# column order - confirm against the training vocabulary.
sub_df.columns = sample_sub_df.columns

In [22]:
sub_df.head()

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.001647948,0.000342,0.002274,2.5e-05,0.0003997433,0.001299993,0.004900931,0.000636,0.002335,...,8.9e-05,1.8e-05,0.0003677559,4.3e-05,8.8e-05,0.000577,0.001068,0.000136,0.001791,0.001834079
1,soundscape_29201_10,5.817847e-05,0.010233,1.6e-05,0.002161,0.001262999,6.628463e-05,0.0002829918,0.00137,0.000124,...,0.000311,6.9e-05,0.0002970379,0.002,7e-05,0.000313,0.000266,0.00128,0.00011,0.001399656
2,soundscape_29201_15,1.71286e-06,4.4e-05,1.7e-05,1e-05,3.20977e-07,0.0001061228,1.317631e-06,3e-06,8e-06,...,1e-06,9e-06,2.130616e-05,9.6e-05,8e-06,0.000252,4.5e-05,1.6e-05,5.6e-05,0.0004708157
3,soundscape_29201_20,7.663779e-06,7.3e-05,5.9e-05,0.000227,0.0004312716,9.863844e-06,3.511371e-06,0.000172,4.4e-05,...,0.000186,2.2e-05,1.158271e-05,0.000903,0.000157,0.000196,9e-06,8.5e-05,0.000272,2.264296e-06
4,soundscape_29201_25,7.184537e-07,3e-06,4.1e-05,3.3e-05,0.0001686615,9.759963e-07,1.643771e-07,1.6e-05,1.5e-05,...,4e-05,2e-06,9.071156e-07,8.4e-05,6e-06,3.3e-05,2e-06,2e-06,1.8e-05,6.582222e-07


In [23]:
# Debug only: print the submission frame as plain text.
if debug_on:
    print(sub_df)

In [24]:
# Write the submission into the current working directory; index=False keeps the
# positional index out of the file so `row_id` is the first column.
sub_df.to_csv("submission.csv", index=False)

In [25]:
# if debug_on:
#     !head submission.csv

In [26]:
# Debug only: position of the highest-scoring class in the first row
# (0-based among the class columns, because the leading row_id entry is sliced off).
if debug_on:
    print(np.argmax(sub_df.iloc[0][1:]))

In [27]:
# Debug only: print the top-scoring class and its probability for every submission row.
if debug_on:
    for i in range(len(sub_df)):
        # +1 turns the position among the class columns into a column position in sub_df
        # (column 0 is row_id), so the printed "class" number is offset by one from the
        # 0-based index printed by the previous cell.
        argmax_loc = np.argmax(sub_df.iloc[i][1:]) + 1
        argmax_prob = sub_df.iloc[i, argmax_loc]
        argmax_class = sub_df.columns[argmax_loc]
        print(f'{sub_df.iloc[i, 0]}: {100*argmax_prob:.1f}% {argmax_class} (class {argmax_loc})')